{ "best_global_step": 18390, "best_metric": 0.1670130044221878, "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_multirc_1756735871/checkpoint-18390", "epoch": 10.0, "eval_steps": 6130, "global_step": 122590, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00040786361040867934, "grad_norm": 128.10255432128906, "learning_rate": 1.6314544416347175e-08, "loss": 13.3028, "num_input_tokens_seen": 3824, "step": 5 }, { "epoch": 0.0008157272208173587, "grad_norm": 154.21292114257812, "learning_rate": 3.670772493678114e-08, "loss": 13.1692, "num_input_tokens_seen": 8032, "step": 10 }, { "epoch": 0.001223590831226038, "grad_norm": 141.57974243164062, "learning_rate": 5.710090545721511e-08, "loss": 13.505, "num_input_tokens_seen": 12464, "step": 15 }, { "epoch": 0.0016314544416347174, "grad_norm": 163.12156677246094, "learning_rate": 7.749408597764908e-08, "loss": 12.8809, "num_input_tokens_seen": 17056, "step": 20 }, { "epoch": 0.0020393180520433965, "grad_norm": 161.04200744628906, "learning_rate": 9.788726649808304e-08, "loss": 13.3366, "num_input_tokens_seen": 21376, "step": 25 }, { "epoch": 0.002447181662452076, "grad_norm": 158.90042114257812, "learning_rate": 1.1828044701851702e-07, "loss": 13.2903, "num_input_tokens_seen": 26896, "step": 30 }, { "epoch": 0.0028550452728607553, "grad_norm": 132.90487670898438, "learning_rate": 1.3867362753895098e-07, "loss": 13.3055, "num_input_tokens_seen": 31696, "step": 35 }, { "epoch": 0.0032629088832694347, "grad_norm": 154.15957641601562, "learning_rate": 1.5906680805938495e-07, "loss": 12.5954, "num_input_tokens_seen": 37248, "step": 40 }, { "epoch": 0.003670772493678114, "grad_norm": 179.69326782226562, "learning_rate": 1.7945998857981894e-07, "loss": 12.5271, "num_input_tokens_seen": 41584, "step": 45 }, { "epoch": 0.004078636104086793, "grad_norm": 151.7662811279297, "learning_rate": 1.9985316910025287e-07, "loss": 12.4165, "num_input_tokens_seen": 45776, "step": 50 }, { "epoch": 0.004486499714495473, "grad_norm": 155.37709045410156, "learning_rate": 2.2024634962068687e-07, "loss": 12.1219, "num_input_tokens_seen": 50528, "step": 55 }, { "epoch": 0.004894363324904152, "grad_norm": 148.14727783203125, "learning_rate": 2.4063953014112083e-07, "loss": 11.9554, "num_input_tokens_seen": 55488, "step": 60 }, { "epoch": 0.005302226935312832, "grad_norm": 157.15585327148438, "learning_rate": 2.610327106615548e-07, "loss": 11.9004, "num_input_tokens_seen": 60720, "step": 65 }, { "epoch": 0.005710090545721511, "grad_norm": 148.4269256591797, "learning_rate": 2.8142589118198876e-07, "loss": 11.9313, "num_input_tokens_seen": 65344, "step": 70 }, { "epoch": 0.0061179541561301905, "grad_norm": 155.89920043945312, "learning_rate": 3.018190717024227e-07, "loss": 11.2097, "num_input_tokens_seen": 71248, "step": 75 }, { "epoch": 0.0065258177665388694, "grad_norm": 156.2261962890625, "learning_rate": 3.222122522228567e-07, "loss": 11.0832, "num_input_tokens_seen": 75600, "step": 80 }, { "epoch": 0.006933681376947548, "grad_norm": 153.85423278808594, "learning_rate": 3.4260543274329065e-07, "loss": 10.8107, "num_input_tokens_seen": 80800, "step": 85 }, { "epoch": 0.007341544987356228, "grad_norm": 148.4920196533203, "learning_rate": 3.629986132637246e-07, "loss": 10.1902, "num_input_tokens_seen": 85232, "step": 90 }, { "epoch": 0.007749408597764907, "grad_norm": 164.8092041015625, "learning_rate": 3.833917937841586e-07, "loss": 9.907, "num_input_tokens_seen": 90128, "step": 95 }, { "epoch": 0.008157272208173586, "grad_norm": 149.53932189941406, "learning_rate": 4.037849743045926e-07, "loss": 9.3368, "num_input_tokens_seen": 95584, "step": 100 }, { "epoch": 0.008565135818582266, "grad_norm": 142.064453125, "learning_rate": 4.2417815482502656e-07, "loss": 9.2755, "num_input_tokens_seen": 100320, "step": 105 }, { "epoch": 0.008972999428990946, "grad_norm": 148.3561248779297, "learning_rate": 4.4457133534546047e-07, "loss": 8.6328, "num_input_tokens_seen": 105104, "step": 110 }, { "epoch": 0.009380863039399626, "grad_norm": 153.8699188232422, "learning_rate": 4.649645158658945e-07, "loss": 8.3791, "num_input_tokens_seen": 109088, "step": 115 }, { "epoch": 0.009788726649808304, "grad_norm": 142.484375, "learning_rate": 4.853576963863285e-07, "loss": 7.9823, "num_input_tokens_seen": 114144, "step": 120 }, { "epoch": 0.010196590260216984, "grad_norm": 130.0813446044922, "learning_rate": 5.057508769067624e-07, "loss": 7.5332, "num_input_tokens_seen": 118512, "step": 125 }, { "epoch": 0.010604453870625663, "grad_norm": 121.34651184082031, "learning_rate": 5.261440574271964e-07, "loss": 6.793, "num_input_tokens_seen": 123024, "step": 130 }, { "epoch": 0.011012317481034341, "grad_norm": 115.00995635986328, "learning_rate": 5.465372379476303e-07, "loss": 6.4393, "num_input_tokens_seen": 127024, "step": 135 }, { "epoch": 0.011420181091443021, "grad_norm": 133.9142608642578, "learning_rate": 5.669304184680643e-07, "loss": 6.0207, "num_input_tokens_seen": 132112, "step": 140 }, { "epoch": 0.011828044701851701, "grad_norm": 109.19799041748047, "learning_rate": 5.873235989884983e-07, "loss": 5.7598, "num_input_tokens_seen": 136656, "step": 145 }, { "epoch": 0.012235908312260381, "grad_norm": 102.66749572753906, "learning_rate": 6.077167795089322e-07, "loss": 4.8394, "num_input_tokens_seen": 141696, "step": 150 }, { "epoch": 0.012643771922669059, "grad_norm": 115.88455963134766, "learning_rate": 6.281099600293662e-07, "loss": 4.7248, "num_input_tokens_seen": 145808, "step": 155 }, { "epoch": 0.013051635533077739, "grad_norm": 93.86137390136719, "learning_rate": 6.485031405498002e-07, "loss": 4.2974, "num_input_tokens_seen": 150432, "step": 160 }, { "epoch": 0.013459499143486419, "grad_norm": 92.2104721069336, "learning_rate": 6.688963210702341e-07, "loss": 3.9694, "num_input_tokens_seen": 155136, "step": 165 }, { "epoch": 0.013867362753895097, "grad_norm": 87.72124481201172, "learning_rate": 6.892895015906681e-07, "loss": 3.7012, "num_input_tokens_seen": 159760, "step": 170 }, { "epoch": 0.014275226364303777, "grad_norm": 85.49071502685547, "learning_rate": 7.096826821111022e-07, "loss": 3.2703, "num_input_tokens_seen": 164864, "step": 175 }, { "epoch": 0.014683089974712456, "grad_norm": 75.05280303955078, "learning_rate": 7.30075862631536e-07, "loss": 2.7423, "num_input_tokens_seen": 169808, "step": 180 }, { "epoch": 0.015090953585121136, "grad_norm": 60.738460540771484, "learning_rate": 7.5046904315197e-07, "loss": 2.4598, "num_input_tokens_seen": 174640, "step": 185 }, { "epoch": 0.015498817195529814, "grad_norm": 64.68683624267578, "learning_rate": 7.708622236724041e-07, "loss": 2.1246, "num_input_tokens_seen": 178784, "step": 190 }, { "epoch": 0.015906680805938492, "grad_norm": 62.9332160949707, "learning_rate": 7.91255404192838e-07, "loss": 1.7554, "num_input_tokens_seen": 183904, "step": 195 }, { "epoch": 0.016314544416347172, "grad_norm": 54.12619400024414, "learning_rate": 8.11648584713272e-07, "loss": 1.537, "num_input_tokens_seen": 189296, "step": 200 }, { "epoch": 0.016722408026755852, "grad_norm": 48.98590087890625, "learning_rate": 8.320417652337058e-07, "loss": 1.4371, "num_input_tokens_seen": 194640, "step": 205 }, { "epoch": 0.017130271637164532, "grad_norm": 70.805908203125, "learning_rate": 8.524349457541398e-07, "loss": 1.2504, "num_input_tokens_seen": 199328, "step": 210 }, { "epoch": 0.017538135247573212, "grad_norm": 44.8169059753418, "learning_rate": 8.728281262745738e-07, "loss": 0.9866, "num_input_tokens_seen": 204576, "step": 215 }, { "epoch": 0.01794599885798189, "grad_norm": 49.73596954345703, "learning_rate": 8.932213067950078e-07, "loss": 0.9292, "num_input_tokens_seen": 209952, "step": 220 }, { "epoch": 0.01835386246839057, "grad_norm": 32.43190383911133, "learning_rate": 9.136144873154418e-07, "loss": 0.7745, "num_input_tokens_seen": 214000, "step": 225 }, { "epoch": 0.01876172607879925, "grad_norm": 26.848127365112305, "learning_rate": 9.340076678358758e-07, "loss": 0.721, "num_input_tokens_seen": 218256, "step": 230 }, { "epoch": 0.019169589689207928, "grad_norm": 22.0032958984375, "learning_rate": 9.544008483563097e-07, "loss": 0.5482, "num_input_tokens_seen": 222736, "step": 235 }, { "epoch": 0.019577453299616607, "grad_norm": 46.969058990478516, "learning_rate": 9.747940288767436e-07, "loss": 0.5186, "num_input_tokens_seen": 227328, "step": 240 }, { "epoch": 0.019985316910025287, "grad_norm": 29.771120071411133, "learning_rate": 9.951872093971777e-07, "loss": 0.45, "num_input_tokens_seen": 231728, "step": 245 }, { "epoch": 0.020393180520433967, "grad_norm": 22.684938430786133, "learning_rate": 1.0155803899176115e-06, "loss": 0.4444, "num_input_tokens_seen": 237088, "step": 250 }, { "epoch": 0.020801044130842647, "grad_norm": 17.871313095092773, "learning_rate": 1.0359735704380456e-06, "loss": 0.3798, "num_input_tokens_seen": 241216, "step": 255 }, { "epoch": 0.021208907741251327, "grad_norm": 27.557416915893555, "learning_rate": 1.0563667509584797e-06, "loss": 0.4832, "num_input_tokens_seen": 246608, "step": 260 }, { "epoch": 0.021616771351660007, "grad_norm": 20.96791648864746, "learning_rate": 1.0767599314789135e-06, "loss": 0.4004, "num_input_tokens_seen": 251616, "step": 265 }, { "epoch": 0.022024634962068683, "grad_norm": 21.793527603149414, "learning_rate": 1.0971531119993476e-06, "loss": 0.3924, "num_input_tokens_seen": 256752, "step": 270 }, { "epoch": 0.022432498572477363, "grad_norm": 30.348018646240234, "learning_rate": 1.1175462925197814e-06, "loss": 0.3711, "num_input_tokens_seen": 260944, "step": 275 }, { "epoch": 0.022840362182886043, "grad_norm": 78.30055236816406, "learning_rate": 1.1379394730402153e-06, "loss": 0.5242, "num_input_tokens_seen": 265712, "step": 280 }, { "epoch": 0.023248225793294722, "grad_norm": 35.641395568847656, "learning_rate": 1.1583326535606494e-06, "loss": 0.3542, "num_input_tokens_seen": 270848, "step": 285 }, { "epoch": 0.023656089403703402, "grad_norm": 23.16868019104004, "learning_rate": 1.1787258340810832e-06, "loss": 0.4078, "num_input_tokens_seen": 275296, "step": 290 }, { "epoch": 0.024063953014112082, "grad_norm": 16.39275360107422, "learning_rate": 1.1991190146015173e-06, "loss": 0.383, "num_input_tokens_seen": 280224, "step": 295 }, { "epoch": 0.024471816624520762, "grad_norm": 27.138965606689453, "learning_rate": 1.2195121951219514e-06, "loss": 0.3688, "num_input_tokens_seen": 284240, "step": 300 }, { "epoch": 0.024879680234929438, "grad_norm": 62.29192352294922, "learning_rate": 1.2399053756423852e-06, "loss": 0.4248, "num_input_tokens_seen": 289296, "step": 305 }, { "epoch": 0.025287543845338118, "grad_norm": 20.228775024414062, "learning_rate": 1.260298556162819e-06, "loss": 0.3957, "num_input_tokens_seen": 294672, "step": 310 }, { "epoch": 0.025695407455746798, "grad_norm": 49.2585563659668, "learning_rate": 1.2806917366832532e-06, "loss": 0.3851, "num_input_tokens_seen": 299696, "step": 315 }, { "epoch": 0.026103271066155478, "grad_norm": 45.84053421020508, "learning_rate": 1.301084917203687e-06, "loss": 0.3676, "num_input_tokens_seen": 304880, "step": 320 }, { "epoch": 0.026511134676564158, "grad_norm": 21.188247680664062, "learning_rate": 1.321478097724121e-06, "loss": 0.4137, "num_input_tokens_seen": 309360, "step": 325 }, { "epoch": 0.026918998286972837, "grad_norm": 37.1287956237793, "learning_rate": 1.3418712782445552e-06, "loss": 0.3492, "num_input_tokens_seen": 315120, "step": 330 }, { "epoch": 0.027326861897381517, "grad_norm": 53.40016174316406, "learning_rate": 1.362264458764989e-06, "loss": 0.351, "num_input_tokens_seen": 320000, "step": 335 }, { "epoch": 0.027734725507790194, "grad_norm": 44.689937591552734, "learning_rate": 1.382657639285423e-06, "loss": 0.3875, "num_input_tokens_seen": 325216, "step": 340 }, { "epoch": 0.028142589118198873, "grad_norm": 73.82674407958984, "learning_rate": 1.403050819805857e-06, "loss": 0.3662, "num_input_tokens_seen": 329312, "step": 345 }, { "epoch": 0.028550452728607553, "grad_norm": 45.25686264038086, "learning_rate": 1.4234440003262908e-06, "loss": 0.4455, "num_input_tokens_seen": 333888, "step": 350 }, { "epoch": 0.028958316339016233, "grad_norm": 19.425838470458984, "learning_rate": 1.4438371808467249e-06, "loss": 0.3891, "num_input_tokens_seen": 338432, "step": 355 }, { "epoch": 0.029366179949424913, "grad_norm": 29.500385284423828, "learning_rate": 1.464230361367159e-06, "loss": 0.3169, "num_input_tokens_seen": 343280, "step": 360 }, { "epoch": 0.029774043559833593, "grad_norm": 49.53668212890625, "learning_rate": 1.4846235418875928e-06, "loss": 0.373, "num_input_tokens_seen": 347824, "step": 365 }, { "epoch": 0.030181907170242273, "grad_norm": 46.028587341308594, "learning_rate": 1.5050167224080269e-06, "loss": 0.4737, "num_input_tokens_seen": 353088, "step": 370 }, { "epoch": 0.03058977078065095, "grad_norm": 108.37428283691406, "learning_rate": 1.525409902928461e-06, "loss": 0.436, "num_input_tokens_seen": 357488, "step": 375 }, { "epoch": 0.03099763439105963, "grad_norm": 35.635963439941406, "learning_rate": 1.5458030834488946e-06, "loss": 0.4433, "num_input_tokens_seen": 362400, "step": 380 }, { "epoch": 0.03140549800146831, "grad_norm": 17.6068058013916, "learning_rate": 1.5661962639693287e-06, "loss": 0.3557, "num_input_tokens_seen": 367056, "step": 385 }, { "epoch": 0.031813361611876985, "grad_norm": 46.64408493041992, "learning_rate": 1.586589444489763e-06, "loss": 0.342, "num_input_tokens_seen": 372400, "step": 390 }, { "epoch": 0.03222122522228567, "grad_norm": 33.90852737426758, "learning_rate": 1.6069826250101966e-06, "loss": 0.2979, "num_input_tokens_seen": 377184, "step": 395 }, { "epoch": 0.032629088832694345, "grad_norm": 37.093101501464844, "learning_rate": 1.6273758055306304e-06, "loss": 0.4253, "num_input_tokens_seen": 381808, "step": 400 }, { "epoch": 0.03303695244310303, "grad_norm": 16.703540802001953, "learning_rate": 1.6477689860510647e-06, "loss": 0.508, "num_input_tokens_seen": 386832, "step": 405 }, { "epoch": 0.033444816053511704, "grad_norm": 12.378348350524902, "learning_rate": 1.6681621665714984e-06, "loss": 0.3748, "num_input_tokens_seen": 391776, "step": 410 }, { "epoch": 0.03385267966392039, "grad_norm": 31.21529197692871, "learning_rate": 1.6885553470919327e-06, "loss": 0.3711, "num_input_tokens_seen": 397440, "step": 415 }, { "epoch": 0.034260543274329064, "grad_norm": 24.14615821838379, "learning_rate": 1.7089485276123665e-06, "loss": 0.3773, "num_input_tokens_seen": 402416, "step": 420 }, { "epoch": 0.03466840688473774, "grad_norm": 34.95779800415039, "learning_rate": 1.7293417081328006e-06, "loss": 0.3258, "num_input_tokens_seen": 407312, "step": 425 }, { "epoch": 0.035076270495146424, "grad_norm": 20.636564254760742, "learning_rate": 1.7497348886532344e-06, "loss": 0.3385, "num_input_tokens_seen": 411920, "step": 430 }, { "epoch": 0.0354841341055551, "grad_norm": 49.91535186767578, "learning_rate": 1.7701280691736683e-06, "loss": 0.4635, "num_input_tokens_seen": 417712, "step": 435 }, { "epoch": 0.03589199771596378, "grad_norm": 13.839750289916992, "learning_rate": 1.7905212496941024e-06, "loss": 0.3751, "num_input_tokens_seen": 422672, "step": 440 }, { "epoch": 0.03629986132637246, "grad_norm": 19.982593536376953, "learning_rate": 1.8109144302145362e-06, "loss": 0.4394, "num_input_tokens_seen": 427376, "step": 445 }, { "epoch": 0.03670772493678114, "grad_norm": 15.658137321472168, "learning_rate": 1.8313076107349703e-06, "loss": 0.3632, "num_input_tokens_seen": 432608, "step": 450 }, { "epoch": 0.03711558854718982, "grad_norm": 35.24448013305664, "learning_rate": 1.8517007912554042e-06, "loss": 0.3763, "num_input_tokens_seen": 437568, "step": 455 }, { "epoch": 0.0375234521575985, "grad_norm": 26.772878646850586, "learning_rate": 1.8720939717758384e-06, "loss": 0.3315, "num_input_tokens_seen": 441776, "step": 460 }, { "epoch": 0.03793131576800718, "grad_norm": 26.543962478637695, "learning_rate": 1.8924871522962723e-06, "loss": 0.3333, "num_input_tokens_seen": 446784, "step": 465 }, { "epoch": 0.038339179378415855, "grad_norm": 30.119369506835938, "learning_rate": 1.912880332816706e-06, "loss": 0.4821, "num_input_tokens_seen": 451008, "step": 470 }, { "epoch": 0.03874704298882454, "grad_norm": 25.84795379638672, "learning_rate": 1.93327351333714e-06, "loss": 0.3421, "num_input_tokens_seen": 455680, "step": 475 }, { "epoch": 0.039154906599233215, "grad_norm": 29.637754440307617, "learning_rate": 1.953666693857574e-06, "loss": 0.3845, "num_input_tokens_seen": 461136, "step": 480 }, { "epoch": 0.0395627702096419, "grad_norm": 31.406719207763672, "learning_rate": 1.974059874378008e-06, "loss": 0.3186, "num_input_tokens_seen": 465376, "step": 485 }, { "epoch": 0.039970633820050575, "grad_norm": 31.8857479095459, "learning_rate": 1.994453054898442e-06, "loss": 0.4009, "num_input_tokens_seen": 470016, "step": 490 }, { "epoch": 0.04037849743045926, "grad_norm": 57.410621643066406, "learning_rate": 2.0148462354188763e-06, "loss": 0.4373, "num_input_tokens_seen": 475392, "step": 495 }, { "epoch": 0.040786361040867934, "grad_norm": 18.596036911010742, "learning_rate": 2.03523941593931e-06, "loss": 0.315, "num_input_tokens_seen": 480000, "step": 500 }, { "epoch": 0.04119422465127661, "grad_norm": 21.442176818847656, "learning_rate": 2.0556325964597436e-06, "loss": 0.4273, "num_input_tokens_seen": 484224, "step": 505 }, { "epoch": 0.041602088261685294, "grad_norm": 37.85723876953125, "learning_rate": 2.076025776980178e-06, "loss": 0.3514, "num_input_tokens_seen": 488656, "step": 510 }, { "epoch": 0.04200995187209397, "grad_norm": 19.732952117919922, "learning_rate": 2.0964189575006117e-06, "loss": 0.3272, "num_input_tokens_seen": 493328, "step": 515 }, { "epoch": 0.042417815482502653, "grad_norm": 35.35569763183594, "learning_rate": 2.116812138021046e-06, "loss": 0.3637, "num_input_tokens_seen": 497536, "step": 520 }, { "epoch": 0.04282567909291133, "grad_norm": 26.690099716186523, "learning_rate": 2.13720531854148e-06, "loss": 0.3659, "num_input_tokens_seen": 502544, "step": 525 }, { "epoch": 0.04323354270332001, "grad_norm": 19.940181732177734, "learning_rate": 2.157598499061914e-06, "loss": 0.3671, "num_input_tokens_seen": 508256, "step": 530 }, { "epoch": 0.04364140631372869, "grad_norm": 55.083377838134766, "learning_rate": 2.1779916795823476e-06, "loss": 0.4007, "num_input_tokens_seen": 512672, "step": 535 }, { "epoch": 0.044049269924137366, "grad_norm": 42.40034866333008, "learning_rate": 2.1983848601027816e-06, "loss": 0.4318, "num_input_tokens_seen": 517200, "step": 540 }, { "epoch": 0.04445713353454605, "grad_norm": 27.408470153808594, "learning_rate": 2.2187780406232157e-06, "loss": 0.3718, "num_input_tokens_seen": 522384, "step": 545 }, { "epoch": 0.044864997144954726, "grad_norm": 60.97392654418945, "learning_rate": 2.2391712211436494e-06, "loss": 0.3853, "num_input_tokens_seen": 527248, "step": 550 }, { "epoch": 0.04527286075536341, "grad_norm": 19.619749069213867, "learning_rate": 2.259564401664084e-06, "loss": 0.3621, "num_input_tokens_seen": 532128, "step": 555 }, { "epoch": 0.045680724365772085, "grad_norm": 17.713308334350586, "learning_rate": 2.2799575821845175e-06, "loss": 0.3537, "num_input_tokens_seen": 536960, "step": 560 }, { "epoch": 0.04608858797618077, "grad_norm": 28.080060958862305, "learning_rate": 2.3003507627049516e-06, "loss": 0.3457, "num_input_tokens_seen": 541936, "step": 565 }, { "epoch": 0.046496451586589445, "grad_norm": 22.557966232299805, "learning_rate": 2.3207439432253856e-06, "loss": 0.325, "num_input_tokens_seen": 546512, "step": 570 }, { "epoch": 0.04690431519699812, "grad_norm": 22.878562927246094, "learning_rate": 2.3411371237458193e-06, "loss": 0.446, "num_input_tokens_seen": 550416, "step": 575 }, { "epoch": 0.047312178807406804, "grad_norm": 18.57056427001953, "learning_rate": 2.3615303042662534e-06, "loss": 0.4253, "num_input_tokens_seen": 555520, "step": 580 }, { "epoch": 0.04772004241781548, "grad_norm": 27.81142234802246, "learning_rate": 2.3819234847866874e-06, "loss": 0.2961, "num_input_tokens_seen": 560176, "step": 585 }, { "epoch": 0.048127906028224164, "grad_norm": 34.8035888671875, "learning_rate": 2.4023166653071215e-06, "loss": 0.3577, "num_input_tokens_seen": 565360, "step": 590 }, { "epoch": 0.04853576963863284, "grad_norm": 39.132850646972656, "learning_rate": 2.422709845827555e-06, "loss": 0.4003, "num_input_tokens_seen": 570048, "step": 595 }, { "epoch": 0.048943633249041524, "grad_norm": 37.20161437988281, "learning_rate": 2.4431030263479896e-06, "loss": 0.3818, "num_input_tokens_seen": 574864, "step": 600 }, { "epoch": 0.0493514968594502, "grad_norm": 30.174400329589844, "learning_rate": 2.4634962068684233e-06, "loss": 0.4252, "num_input_tokens_seen": 580656, "step": 605 }, { "epoch": 0.049759360469858877, "grad_norm": 38.754554748535156, "learning_rate": 2.4838893873888574e-06, "loss": 0.3477, "num_input_tokens_seen": 585936, "step": 610 }, { "epoch": 0.05016722408026756, "grad_norm": 15.445249557495117, "learning_rate": 2.5042825679092914e-06, "loss": 0.4227, "num_input_tokens_seen": 590768, "step": 615 }, { "epoch": 0.050575087690676236, "grad_norm": 17.307544708251953, "learning_rate": 2.524675748429725e-06, "loss": 0.3769, "num_input_tokens_seen": 594736, "step": 620 }, { "epoch": 0.05098295130108492, "grad_norm": 18.07362174987793, "learning_rate": 2.545068928950159e-06, "loss": 0.3717, "num_input_tokens_seen": 600288, "step": 625 }, { "epoch": 0.051390814911493596, "grad_norm": 17.995147705078125, "learning_rate": 2.5654621094705932e-06, "loss": 0.3448, "num_input_tokens_seen": 604784, "step": 630 }, { "epoch": 0.05179867852190228, "grad_norm": 24.591646194458008, "learning_rate": 2.5858552899910273e-06, "loss": 0.3388, "num_input_tokens_seen": 609792, "step": 635 }, { "epoch": 0.052206542132310955, "grad_norm": 19.101886749267578, "learning_rate": 2.606248470511461e-06, "loss": 0.2943, "num_input_tokens_seen": 614736, "step": 640 }, { "epoch": 0.05261440574271963, "grad_norm": 40.091583251953125, "learning_rate": 2.6266416510318954e-06, "loss": 0.413, "num_input_tokens_seen": 619776, "step": 645 }, { "epoch": 0.053022269353128315, "grad_norm": 54.009578704833984, "learning_rate": 2.647034831552329e-06, "loss": 0.4177, "num_input_tokens_seen": 623840, "step": 650 }, { "epoch": 0.05343013296353699, "grad_norm": 6.730740547180176, "learning_rate": 2.6674280120727627e-06, "loss": 0.2319, "num_input_tokens_seen": 628176, "step": 655 }, { "epoch": 0.053837996573945675, "grad_norm": 67.18085479736328, "learning_rate": 2.687821192593197e-06, "loss": 0.5004, "num_input_tokens_seen": 632624, "step": 660 }, { "epoch": 0.05424586018435435, "grad_norm": 70.97489166259766, "learning_rate": 2.708214373113631e-06, "loss": 0.826, "num_input_tokens_seen": 637856, "step": 665 }, { "epoch": 0.054653723794763034, "grad_norm": 14.07853889465332, "learning_rate": 2.728607553634065e-06, "loss": 0.2843, "num_input_tokens_seen": 642544, "step": 670 }, { "epoch": 0.05506158740517171, "grad_norm": 13.787659645080566, "learning_rate": 2.749000734154499e-06, "loss": 0.3182, "num_input_tokens_seen": 647248, "step": 675 }, { "epoch": 0.05546945101558039, "grad_norm": 20.085857391357422, "learning_rate": 2.769393914674933e-06, "loss": 0.4128, "num_input_tokens_seen": 652064, "step": 680 }, { "epoch": 0.05587731462598907, "grad_norm": 45.49406814575195, "learning_rate": 2.7897870951953667e-06, "loss": 0.4097, "num_input_tokens_seen": 656128, "step": 685 }, { "epoch": 0.05628517823639775, "grad_norm": 22.449670791625977, "learning_rate": 2.8101802757158008e-06, "loss": 0.4128, "num_input_tokens_seen": 660528, "step": 690 }, { "epoch": 0.05669304184680643, "grad_norm": 21.77682113647461, "learning_rate": 2.830573456236235e-06, "loss": 0.348, "num_input_tokens_seen": 664928, "step": 695 }, { "epoch": 0.057100905457215106, "grad_norm": 16.248912811279297, "learning_rate": 2.8509666367566685e-06, "loss": 0.2762, "num_input_tokens_seen": 669584, "step": 700 }, { "epoch": 0.05750876906762379, "grad_norm": 16.730371475219727, "learning_rate": 2.871359817277103e-06, "loss": 0.3519, "num_input_tokens_seen": 674992, "step": 705 }, { "epoch": 0.057916632678032466, "grad_norm": 9.971351623535156, "learning_rate": 2.8917529977975366e-06, "loss": 0.2789, "num_input_tokens_seen": 680320, "step": 710 }, { "epoch": 0.05832449628844114, "grad_norm": 19.078466415405273, "learning_rate": 2.9121461783179707e-06, "loss": 0.6819, "num_input_tokens_seen": 684352, "step": 715 }, { "epoch": 0.058732359898849826, "grad_norm": 13.314475059509277, "learning_rate": 2.9325393588384048e-06, "loss": 0.6013, "num_input_tokens_seen": 689280, "step": 720 }, { "epoch": 0.0591402235092585, "grad_norm": 19.691442489624023, "learning_rate": 2.9529325393588384e-06, "loss": 0.3907, "num_input_tokens_seen": 694352, "step": 725 }, { "epoch": 0.059548087119667185, "grad_norm": 17.371828079223633, "learning_rate": 2.9733257198792725e-06, "loss": 0.442, "num_input_tokens_seen": 700016, "step": 730 }, { "epoch": 0.05995595073007586, "grad_norm": 9.948322296142578, "learning_rate": 2.9937189003997066e-06, "loss": 0.4012, "num_input_tokens_seen": 705456, "step": 735 }, { "epoch": 0.060363814340484545, "grad_norm": 22.975605010986328, "learning_rate": 3.0141120809201406e-06, "loss": 0.3734, "num_input_tokens_seen": 709824, "step": 740 }, { "epoch": 0.06077167795089322, "grad_norm": 18.660913467407227, "learning_rate": 3.0345052614405743e-06, "loss": 0.2942, "num_input_tokens_seen": 715200, "step": 745 }, { "epoch": 0.0611795415613019, "grad_norm": 14.203372955322266, "learning_rate": 3.0548984419610084e-06, "loss": 0.2871, "num_input_tokens_seen": 719632, "step": 750 }, { "epoch": 0.06158740517171058, "grad_norm": 12.215680122375488, "learning_rate": 3.0752916224814424e-06, "loss": 0.3615, "num_input_tokens_seen": 724608, "step": 755 }, { "epoch": 0.06199526878211926, "grad_norm": 26.449430465698242, "learning_rate": 3.095684803001876e-06, "loss": 0.2737, "num_input_tokens_seen": 729040, "step": 760 }, { "epoch": 0.06240313239252794, "grad_norm": 35.44478988647461, "learning_rate": 3.1160779835223106e-06, "loss": 0.409, "num_input_tokens_seen": 733392, "step": 765 }, { "epoch": 0.06281099600293662, "grad_norm": 41.311702728271484, "learning_rate": 3.1364711640427446e-06, "loss": 0.5818, "num_input_tokens_seen": 737792, "step": 770 }, { "epoch": 0.0632188596133453, "grad_norm": 5.528947830200195, "learning_rate": 3.1568643445631783e-06, "loss": 0.6721, "num_input_tokens_seen": 741696, "step": 775 }, { "epoch": 0.06362672322375397, "grad_norm": 9.532819747924805, "learning_rate": 3.1772575250836123e-06, "loss": 0.4599, "num_input_tokens_seen": 746128, "step": 780 }, { "epoch": 0.06403458683416266, "grad_norm": 10.559427261352539, "learning_rate": 3.197650705604046e-06, "loss": 0.341, "num_input_tokens_seen": 750304, "step": 785 }, { "epoch": 0.06444245044457134, "grad_norm": 11.672725677490234, "learning_rate": 3.2180438861244796e-06, "loss": 0.6283, "num_input_tokens_seen": 754816, "step": 790 }, { "epoch": 0.06485031405498001, "grad_norm": 14.96315860748291, "learning_rate": 3.238437066644914e-06, "loss": 0.2673, "num_input_tokens_seen": 759696, "step": 795 }, { "epoch": 0.06525817766538869, "grad_norm": 15.872020721435547, "learning_rate": 3.258830247165348e-06, "loss": 0.4021, "num_input_tokens_seen": 764112, "step": 800 }, { "epoch": 0.06566604127579738, "grad_norm": 32.46170425415039, "learning_rate": 3.279223427685782e-06, "loss": 0.3017, "num_input_tokens_seen": 768432, "step": 805 }, { "epoch": 0.06607390488620606, "grad_norm": 11.891767501831055, "learning_rate": 3.299616608206216e-06, "loss": 0.3505, "num_input_tokens_seen": 773184, "step": 810 }, { "epoch": 0.06648176849661473, "grad_norm": 33.49330139160156, "learning_rate": 3.3200097887266504e-06, "loss": 0.3543, "num_input_tokens_seen": 778304, "step": 815 }, { "epoch": 0.06688963210702341, "grad_norm": 22.372821807861328, "learning_rate": 3.340402969247084e-06, "loss": 0.3555, "num_input_tokens_seen": 782864, "step": 820 }, { "epoch": 0.06729749571743208, "grad_norm": 28.632034301757812, "learning_rate": 3.3607961497675177e-06, "loss": 0.4817, "num_input_tokens_seen": 788048, "step": 825 }, { "epoch": 0.06770535932784078, "grad_norm": 15.63804817199707, "learning_rate": 3.3811893302879518e-06, "loss": 0.3097, "num_input_tokens_seen": 792800, "step": 830 }, { "epoch": 0.06811322293824945, "grad_norm": 26.38883399963379, "learning_rate": 3.4015825108083854e-06, "loss": 0.3746, "num_input_tokens_seen": 797424, "step": 835 }, { "epoch": 0.06852108654865813, "grad_norm": 12.32161808013916, "learning_rate": 3.42197569132882e-06, "loss": 0.3296, "num_input_tokens_seen": 802880, "step": 840 }, { "epoch": 0.0689289501590668, "grad_norm": 18.342147827148438, "learning_rate": 3.442368871849254e-06, "loss": 0.2096, "num_input_tokens_seen": 808144, "step": 845 }, { "epoch": 0.06933681376947548, "grad_norm": 18.312191009521484, "learning_rate": 3.4627620523696876e-06, "loss": 0.5851, "num_input_tokens_seen": 812288, "step": 850 }, { "epoch": 0.06974467737988417, "grad_norm": 19.723119735717773, "learning_rate": 3.4831552328901217e-06, "loss": 0.3456, "num_input_tokens_seen": 817504, "step": 855 }, { "epoch": 0.07015254099029285, "grad_norm": 15.206892967224121, "learning_rate": 3.5035484134105554e-06, "loss": 0.2982, "num_input_tokens_seen": 822064, "step": 860 }, { "epoch": 0.07056040460070152, "grad_norm": 31.398386001586914, "learning_rate": 3.52394159393099e-06, "loss": 0.3177, "num_input_tokens_seen": 827328, "step": 865 }, { "epoch": 0.0709682682111102, "grad_norm": 19.17327308654785, "learning_rate": 3.5443347744514235e-06, "loss": 0.4756, "num_input_tokens_seen": 831824, "step": 870 }, { "epoch": 0.07137613182151889, "grad_norm": 15.711431503295898, "learning_rate": 3.5647279549718576e-06, "loss": 0.4158, "num_input_tokens_seen": 835776, "step": 875 }, { "epoch": 0.07178399543192757, "grad_norm": 17.141433715820312, "learning_rate": 3.585121135492291e-06, "loss": 0.3566, "num_input_tokens_seen": 840928, "step": 880 }, { "epoch": 0.07219185904233624, "grad_norm": 25.88591766357422, "learning_rate": 3.6055143160127257e-06, "loss": 0.3574, "num_input_tokens_seen": 845872, "step": 885 }, { "epoch": 0.07259972265274492, "grad_norm": 26.599428176879883, "learning_rate": 3.6259074965331598e-06, "loss": 0.3807, "num_input_tokens_seen": 850288, "step": 890 }, { "epoch": 0.0730075862631536, "grad_norm": 27.97245979309082, "learning_rate": 3.6463006770535934e-06, "loss": 0.3548, "num_input_tokens_seen": 855200, "step": 895 }, { "epoch": 0.07341544987356229, "grad_norm": 23.754533767700195, "learning_rate": 3.6666938575740275e-06, "loss": 0.3931, "num_input_tokens_seen": 859808, "step": 900 }, { "epoch": 0.07382331348397096, "grad_norm": 12.2180814743042, "learning_rate": 3.687087038094461e-06, "loss": 0.4878, "num_input_tokens_seen": 865648, "step": 905 }, { "epoch": 0.07423117709437964, "grad_norm": 15.376197814941406, "learning_rate": 3.7074802186148956e-06, "loss": 0.361, "num_input_tokens_seen": 869840, "step": 910 }, { "epoch": 0.07463904070478831, "grad_norm": 8.543769836425781, "learning_rate": 3.7278733991353293e-06, "loss": 0.4301, "num_input_tokens_seen": 874928, "step": 915 }, { "epoch": 0.075046904315197, "grad_norm": 7.28000545501709, "learning_rate": 3.7482665796557633e-06, "loss": 0.3402, "num_input_tokens_seen": 878976, "step": 920 }, { "epoch": 0.07545476792560568, "grad_norm": 9.105621337890625, "learning_rate": 3.768659760176197e-06, "loss": 0.3258, "num_input_tokens_seen": 883968, "step": 925 }, { "epoch": 0.07586263153601436, "grad_norm": 27.99003028869629, "learning_rate": 3.789052940696631e-06, "loss": 0.366, "num_input_tokens_seen": 889296, "step": 930 }, { "epoch": 0.07627049514642303, "grad_norm": 9.675555229187012, "learning_rate": 3.8094461212170656e-06, "loss": 0.3201, "num_input_tokens_seen": 894672, "step": 935 }, { "epoch": 0.07667835875683171, "grad_norm": 23.157642364501953, "learning_rate": 3.829839301737499e-06, "loss": 0.3358, "num_input_tokens_seen": 899792, "step": 940 }, { "epoch": 0.0770862223672404, "grad_norm": 20.219043731689453, "learning_rate": 3.850232482257933e-06, "loss": 0.3326, "num_input_tokens_seen": 905072, "step": 945 }, { "epoch": 0.07749408597764908, "grad_norm": 17.70180892944336, "learning_rate": 3.8706256627783665e-06, "loss": 0.3336, "num_input_tokens_seen": 909328, "step": 950 }, { "epoch": 0.07790194958805775, "grad_norm": 15.271267890930176, "learning_rate": 3.891018843298801e-06, "loss": 0.3405, "num_input_tokens_seen": 913904, "step": 955 }, { "epoch": 0.07830981319846643, "grad_norm": 17.13546371459961, "learning_rate": 3.9114120238192355e-06, "loss": 0.3054, "num_input_tokens_seen": 918304, "step": 960 }, { "epoch": 0.0787176768088751, "grad_norm": 7.323614120483398, "learning_rate": 3.931805204339669e-06, "loss": 0.4227, "num_input_tokens_seen": 923408, "step": 965 }, { "epoch": 0.0791255404192838, "grad_norm": 30.991575241088867, "learning_rate": 3.952198384860103e-06, "loss": 0.3508, "num_input_tokens_seen": 928736, "step": 970 }, { "epoch": 0.07953340402969247, "grad_norm": 14.993364334106445, "learning_rate": 3.972591565380537e-06, "loss": 0.3619, "num_input_tokens_seen": 933488, "step": 975 }, { "epoch": 0.07994126764010115, "grad_norm": 16.246843338012695, "learning_rate": 3.992984745900971e-06, "loss": 0.5009, "num_input_tokens_seen": 938864, "step": 980 }, { "epoch": 0.08034913125050983, "grad_norm": 8.900763511657715, "learning_rate": 4.013377926421405e-06, "loss": 0.336, "num_input_tokens_seen": 943216, "step": 985 }, { "epoch": 0.08075699486091852, "grad_norm": 13.691402435302734, "learning_rate": 4.033771106941839e-06, "loss": 0.4033, "num_input_tokens_seen": 946784, "step": 990 }, { "epoch": 0.08116485847132719, "grad_norm": 12.195028305053711, "learning_rate": 4.054164287462272e-06, "loss": 0.3182, "num_input_tokens_seen": 952048, "step": 995 }, { "epoch": 0.08157272208173587, "grad_norm": 13.911835670471191, "learning_rate": 4.074557467982706e-06, "loss": 0.3268, "num_input_tokens_seen": 956944, "step": 1000 }, { "epoch": 0.08198058569214454, "grad_norm": 12.756331443786621, "learning_rate": 4.094950648503141e-06, "loss": 0.3151, "num_input_tokens_seen": 961520, "step": 1005 }, { "epoch": 0.08238844930255322, "grad_norm": 24.846942901611328, "learning_rate": 4.1153438290235745e-06, "loss": 0.3564, "num_input_tokens_seen": 966336, "step": 1010 }, { "epoch": 0.08279631291296191, "grad_norm": 14.117002487182617, "learning_rate": 4.1357370095440086e-06, "loss": 0.3642, "num_input_tokens_seen": 970384, "step": 1015 }, { "epoch": 0.08320417652337059, "grad_norm": 18.375364303588867, "learning_rate": 4.156130190064443e-06, "loss": 0.2954, "num_input_tokens_seen": 975168, "step": 1020 }, { "epoch": 0.08361204013377926, "grad_norm": 14.703852653503418, "learning_rate": 4.176523370584877e-06, "loss": 0.312, "num_input_tokens_seen": 979600, "step": 1025 }, { "epoch": 0.08401990374418794, "grad_norm": 24.94498634338379, "learning_rate": 4.196916551105311e-06, "loss": 0.3574, "num_input_tokens_seen": 985120, "step": 1030 }, { "epoch": 0.08442776735459662, "grad_norm": 50.81404113769531, "learning_rate": 4.217309731625745e-06, "loss": 0.5803, "num_input_tokens_seen": 990112, "step": 1035 }, { "epoch": 0.08483563096500531, "grad_norm": 13.0558443069458, "learning_rate": 4.237702912146178e-06, "loss": 0.3675, "num_input_tokens_seen": 994176, "step": 1040 }, { "epoch": 0.08524349457541398, "grad_norm": 9.750115394592285, "learning_rate": 4.258096092666612e-06, "loss": 0.3181, "num_input_tokens_seen": 998832, "step": 1045 }, { "epoch": 0.08565135818582266, "grad_norm": 23.890283584594727, "learning_rate": 4.278489273187047e-06, "loss": 0.3696, "num_input_tokens_seen": 1003712, "step": 1050 }, { "epoch": 0.08605922179623134, "grad_norm": 28.86678123474121, "learning_rate": 4.29888245370748e-06, "loss": 0.4015, "num_input_tokens_seen": 1008592, "step": 1055 }, { "epoch": 0.08646708540664003, "grad_norm": 9.588279724121094, "learning_rate": 4.319275634227914e-06, "loss": 0.4569, "num_input_tokens_seen": 1013280, "step": 1060 }, { "epoch": 0.0868749490170487, "grad_norm": 10.20378303527832, "learning_rate": 4.339668814748348e-06, "loss": 0.3663, "num_input_tokens_seen": 1018144, "step": 1065 }, { "epoch": 0.08728281262745738, "grad_norm": 19.31324577331543, "learning_rate": 4.3600619952687825e-06, "loss": 0.367, "num_input_tokens_seen": 1023648, "step": 1070 }, { "epoch": 0.08769067623786606, "grad_norm": 12.592637062072754, "learning_rate": 4.3804551757892165e-06, "loss": 0.3457, "num_input_tokens_seen": 1029328, "step": 1075 }, { "epoch": 0.08809853984827473, "grad_norm": 19.07027244567871, "learning_rate": 4.400848356309651e-06, "loss": 0.3614, "num_input_tokens_seen": 1033040, "step": 1080 }, { "epoch": 0.08850640345868342, "grad_norm": 17.678987503051758, "learning_rate": 4.421241536830084e-06, "loss": 0.3578, "num_input_tokens_seen": 1038000, "step": 1085 }, { "epoch": 0.0889142670690921, "grad_norm": 10.334424018859863, "learning_rate": 4.441634717350518e-06, "loss": 0.3772, "num_input_tokens_seen": 1043024, "step": 1090 }, { "epoch": 0.08932213067950077, "grad_norm": 13.162654876708984, "learning_rate": 4.462027897870953e-06, "loss": 0.4476, "num_input_tokens_seen": 1048128, "step": 1095 }, { "epoch": 0.08972999428990945, "grad_norm": 10.57528018951416, "learning_rate": 4.482421078391386e-06, "loss": 0.3601, "num_input_tokens_seen": 1053920, "step": 1100 }, { "epoch": 0.09013785790031813, "grad_norm": 18.986106872558594, "learning_rate": 4.50281425891182e-06, "loss": 0.3337, "num_input_tokens_seen": 1059184, "step": 1105 }, { "epoch": 0.09054572151072682, "grad_norm": 14.079607963562012, "learning_rate": 4.523207439432254e-06, "loss": 0.3349, "num_input_tokens_seen": 1064832, "step": 1110 }, { "epoch": 0.0909535851211355, "grad_norm": 10.5726318359375, "learning_rate": 4.543600619952687e-06, "loss": 0.4207, "num_input_tokens_seen": 1069392, "step": 1115 }, { "epoch": 0.09136144873154417, "grad_norm": 6.386050224304199, "learning_rate": 4.563993800473122e-06, "loss": 0.3339, "num_input_tokens_seen": 1074512, "step": 1120 }, { "epoch": 0.09176931234195285, "grad_norm": 19.959495544433594, "learning_rate": 4.584386980993556e-06, "loss": 0.3597, "num_input_tokens_seen": 1078768, "step": 1125 }, { "epoch": 0.09217717595236154, "grad_norm": 13.764420509338379, "learning_rate": 4.60478016151399e-06, "loss": 0.3032, "num_input_tokens_seen": 1083632, "step": 1130 }, { "epoch": 0.09258503956277021, "grad_norm": 7.896544456481934, "learning_rate": 4.625173342034424e-06, "loss": 0.3934, "num_input_tokens_seen": 1087792, "step": 1135 }, { "epoch": 0.09299290317317889, "grad_norm": 7.855512619018555, "learning_rate": 4.645566522554859e-06, "loss": 0.3757, "num_input_tokens_seen": 1092592, "step": 1140 }, { "epoch": 0.09340076678358757, "grad_norm": 12.084761619567871, "learning_rate": 4.665959703075292e-06, "loss": 0.3128, "num_input_tokens_seen": 1097696, "step": 1145 }, { "epoch": 0.09380863039399624, "grad_norm": 12.555508613586426, "learning_rate": 4.686352883595726e-06, "loss": 0.3769, "num_input_tokens_seen": 1102480, "step": 1150 }, { "epoch": 0.09421649400440493, "grad_norm": 9.674520492553711, "learning_rate": 4.70674606411616e-06, "loss": 0.4355, "num_input_tokens_seen": 1107392, "step": 1155 }, { "epoch": 0.09462435761481361, "grad_norm": 12.857894897460938, "learning_rate": 4.727139244636593e-06, "loss": 0.3276, "num_input_tokens_seen": 1112096, "step": 1160 }, { "epoch": 0.09503222122522229, "grad_norm": 7.665470600128174, "learning_rate": 4.747532425157028e-06, "loss": 0.3525, "num_input_tokens_seen": 1116080, "step": 1165 }, { "epoch": 0.09544008483563096, "grad_norm": 6.792557716369629, "learning_rate": 4.767925605677462e-06, "loss": 0.3301, "num_input_tokens_seen": 1120656, "step": 1170 }, { "epoch": 0.09584794844603964, "grad_norm": 10.890433311462402, "learning_rate": 4.788318786197895e-06, "loss": 0.3728, "num_input_tokens_seen": 1126064, "step": 1175 }, { "epoch": 0.09625581205644833, "grad_norm": 13.602570533752441, "learning_rate": 4.8087119667183295e-06, "loss": 0.3046, "num_input_tokens_seen": 1131056, "step": 1180 }, { "epoch": 0.096663675666857, "grad_norm": 14.95585823059082, "learning_rate": 4.8291051472387635e-06, "loss": 0.4235, "num_input_tokens_seen": 1136432, "step": 1185 }, { "epoch": 0.09707153927726568, "grad_norm": 14.471333503723145, "learning_rate": 4.849498327759198e-06, "loss": 0.34, "num_input_tokens_seen": 1141152, "step": 1190 }, { "epoch": 0.09747940288767436, "grad_norm": 6.3764967918396, "learning_rate": 4.869891508279632e-06, "loss": 0.3487, "num_input_tokens_seen": 1145504, "step": 1195 }, { "epoch": 0.09788726649808305, "grad_norm": 8.552063941955566, "learning_rate": 4.890284688800066e-06, "loss": 0.4066, "num_input_tokens_seen": 1150512, "step": 1200 }, { "epoch": 0.09829513010849172, "grad_norm": 9.378458976745605, "learning_rate": 4.910677869320499e-06, "loss": 0.3722, "num_input_tokens_seen": 1155312, "step": 1205 }, { "epoch": 0.0987029937189004, "grad_norm": 22.355392456054688, "learning_rate": 4.931071049840934e-06, "loss": 0.3944, "num_input_tokens_seen": 1160080, "step": 1210 }, { "epoch": 0.09911085732930908, "grad_norm": 16.40981674194336, "learning_rate": 4.951464230361368e-06, "loss": 0.3142, "num_input_tokens_seen": 1164432, "step": 1215 }, { "epoch": 0.09951872093971775, "grad_norm": 23.028911590576172, "learning_rate": 4.971857410881801e-06, "loss": 0.344, "num_input_tokens_seen": 1169920, "step": 1220 }, { "epoch": 0.09992658455012644, "grad_norm": 16.708932876586914, "learning_rate": 4.992250591402235e-06, "loss": 0.3673, "num_input_tokens_seen": 1175152, "step": 1225 }, { "epoch": 0.10033444816053512, "grad_norm": 12.453238487243652, "learning_rate": 5.012643771922669e-06, "loss": 0.3622, "num_input_tokens_seen": 1180016, "step": 1230 }, { "epoch": 0.1007423117709438, "grad_norm": 21.68697738647461, "learning_rate": 5.033036952443103e-06, "loss": 0.3755, "num_input_tokens_seen": 1184448, "step": 1235 }, { "epoch": 0.10115017538135247, "grad_norm": 9.087939262390137, "learning_rate": 5.0534301329635375e-06, "loss": 0.427, "num_input_tokens_seen": 1189200, "step": 1240 }, { "epoch": 0.10155803899176115, "grad_norm": 9.924860000610352, "learning_rate": 5.0738233134839715e-06, "loss": 0.3417, "num_input_tokens_seen": 1193632, "step": 1245 }, { "epoch": 0.10196590260216984, "grad_norm": 7.190810203552246, "learning_rate": 5.094216494004405e-06, "loss": 0.3482, "num_input_tokens_seen": 1198592, "step": 1250 }, { "epoch": 0.10237376621257852, "grad_norm": 11.661046981811523, "learning_rate": 5.114609674524839e-06, "loss": 0.3206, "num_input_tokens_seen": 1202992, "step": 1255 }, { "epoch": 0.10278162982298719, "grad_norm": 10.791495323181152, "learning_rate": 5.135002855045274e-06, "loss": 0.3261, "num_input_tokens_seen": 1207792, "step": 1260 }, { "epoch": 0.10318949343339587, "grad_norm": 10.833209991455078, "learning_rate": 5.155396035565707e-06, "loss": 0.42, "num_input_tokens_seen": 1212464, "step": 1265 }, { "epoch": 0.10359735704380456, "grad_norm": 21.24329948425293, "learning_rate": 5.175789216086141e-06, "loss": 0.3224, "num_input_tokens_seen": 1216768, "step": 1270 }, { "epoch": 0.10400522065421323, "grad_norm": 6.382613182067871, "learning_rate": 5.196182396606575e-06, "loss": 0.3561, "num_input_tokens_seen": 1221264, "step": 1275 }, { "epoch": 0.10441308426462191, "grad_norm": 9.04773998260498, "learning_rate": 5.216575577127009e-06, "loss": 0.2657, "num_input_tokens_seen": 1224816, "step": 1280 }, { "epoch": 0.10482094787503059, "grad_norm": 28.40637969970703, "learning_rate": 5.236968757647443e-06, "loss": 0.4159, "num_input_tokens_seen": 1229712, "step": 1285 }, { "epoch": 0.10522881148543926, "grad_norm": 8.35102367401123, "learning_rate": 5.257361938167877e-06, "loss": 0.342, "num_input_tokens_seen": 1234368, "step": 1290 }, { "epoch": 0.10563667509584795, "grad_norm": 6.182595729827881, "learning_rate": 5.2777551186883105e-06, "loss": 0.3663, "num_input_tokens_seen": 1238992, "step": 1295 }, { "epoch": 0.10604453870625663, "grad_norm": 20.801673889160156, "learning_rate": 5.298148299208745e-06, "loss": 0.4059, "num_input_tokens_seen": 1243968, "step": 1300 }, { "epoch": 0.1064524023166653, "grad_norm": 6.84321403503418, "learning_rate": 5.318541479729179e-06, "loss": 0.3358, "num_input_tokens_seen": 1248848, "step": 1305 }, { "epoch": 0.10686026592707398, "grad_norm": 10.903738021850586, "learning_rate": 5.338934660249613e-06, "loss": 0.4695, "num_input_tokens_seen": 1254048, "step": 1310 }, { "epoch": 0.10726812953748266, "grad_norm": 4.572890281677246, "learning_rate": 5.359327840770047e-06, "loss": 0.4215, "num_input_tokens_seen": 1258848, "step": 1315 }, { "epoch": 0.10767599314789135, "grad_norm": 7.51182222366333, "learning_rate": 5.379721021290481e-06, "loss": 0.3504, "num_input_tokens_seen": 1262960, "step": 1320 }, { "epoch": 0.10808385675830003, "grad_norm": 9.593701362609863, "learning_rate": 5.400114201810914e-06, "loss": 0.3794, "num_input_tokens_seen": 1267296, "step": 1325 }, { "epoch": 0.1084917203687087, "grad_norm": 11.12649154663086, "learning_rate": 5.420507382331349e-06, "loss": 0.3299, "num_input_tokens_seen": 1271168, "step": 1330 }, { "epoch": 0.10889958397911738, "grad_norm": 18.1518611907959, "learning_rate": 5.440900562851783e-06, "loss": 0.3399, "num_input_tokens_seen": 1276288, "step": 1335 }, { "epoch": 0.10930744758952607, "grad_norm": 12.361876487731934, "learning_rate": 5.461293743372216e-06, "loss": 0.4007, "num_input_tokens_seen": 1281312, "step": 1340 }, { "epoch": 0.10971531119993475, "grad_norm": 10.608939170837402, "learning_rate": 5.48168692389265e-06, "loss": 0.2946, "num_input_tokens_seen": 1286752, "step": 1345 }, { "epoch": 0.11012317481034342, "grad_norm": 18.71640968322754, "learning_rate": 5.5020801044130845e-06, "loss": 0.4885, "num_input_tokens_seen": 1291088, "step": 1350 }, { "epoch": 0.1105310384207521, "grad_norm": 8.706215858459473, "learning_rate": 5.5224732849335185e-06, "loss": 0.3307, "num_input_tokens_seen": 1296432, "step": 1355 }, { "epoch": 0.11093890203116077, "grad_norm": 19.138877868652344, "learning_rate": 5.542866465453953e-06, "loss": 0.3849, "num_input_tokens_seen": 1301088, "step": 1360 }, { "epoch": 0.11134676564156946, "grad_norm": 5.218441009521484, "learning_rate": 5.563259645974387e-06, "loss": 0.3541, "num_input_tokens_seen": 1305168, "step": 1365 }, { "epoch": 0.11175462925197814, "grad_norm": 5.504722595214844, "learning_rate": 5.58365282649482e-06, "loss": 0.319, "num_input_tokens_seen": 1309984, "step": 1370 }, { "epoch": 0.11216249286238682, "grad_norm": 5.614574909210205, "learning_rate": 5.604046007015255e-06, "loss": 0.2782, "num_input_tokens_seen": 1314688, "step": 1375 }, { "epoch": 0.1125703564727955, "grad_norm": 4.652282238006592, "learning_rate": 5.624439187535689e-06, "loss": 0.4304, "num_input_tokens_seen": 1319440, "step": 1380 }, { "epoch": 0.11297822008320417, "grad_norm": 17.05095672607422, "learning_rate": 5.644832368056122e-06, "loss": 0.4237, "num_input_tokens_seen": 1324704, "step": 1385 }, { "epoch": 0.11338608369361286, "grad_norm": 13.999674797058105, "learning_rate": 5.665225548576556e-06, "loss": 0.4364, "num_input_tokens_seen": 1329776, "step": 1390 }, { "epoch": 0.11379394730402154, "grad_norm": 13.145926475524902, "learning_rate": 5.68561872909699e-06, "loss": 0.3427, "num_input_tokens_seen": 1334368, "step": 1395 }, { "epoch": 0.11420181091443021, "grad_norm": 5.40587043762207, "learning_rate": 5.706011909617424e-06, "loss": 0.3843, "num_input_tokens_seen": 1338960, "step": 1400 }, { "epoch": 0.11460967452483889, "grad_norm": 6.143148422241211, "learning_rate": 5.726405090137858e-06, "loss": 0.3807, "num_input_tokens_seen": 1343968, "step": 1405 }, { "epoch": 0.11501753813524758, "grad_norm": 4.835149765014648, "learning_rate": 5.7467982706582925e-06, "loss": 0.3792, "num_input_tokens_seen": 1348400, "step": 1410 }, { "epoch": 0.11542540174565626, "grad_norm": 12.357966423034668, "learning_rate": 5.767191451178726e-06, "loss": 0.3324, "num_input_tokens_seen": 1353760, "step": 1415 }, { "epoch": 0.11583326535606493, "grad_norm": 7.003427028656006, "learning_rate": 5.787584631699161e-06, "loss": 0.3792, "num_input_tokens_seen": 1359184, "step": 1420 }, { "epoch": 0.11624112896647361, "grad_norm": 9.161282539367676, "learning_rate": 5.807977812219594e-06, "loss": 0.3515, "num_input_tokens_seen": 1364192, "step": 1425 }, { "epoch": 0.11664899257688228, "grad_norm": 7.235124111175537, "learning_rate": 5.828370992740028e-06, "loss": 0.3014, "num_input_tokens_seen": 1369152, "step": 1430 }, { "epoch": 0.11705685618729098, "grad_norm": 7.298038482666016, "learning_rate": 5.848764173260462e-06, "loss": 0.3038, "num_input_tokens_seen": 1373456, "step": 1435 }, { "epoch": 0.11746471979769965, "grad_norm": 4.988548278808594, "learning_rate": 5.869157353780896e-06, "loss": 0.2584, "num_input_tokens_seen": 1377808, "step": 1440 }, { "epoch": 0.11787258340810833, "grad_norm": 3.856151580810547, "learning_rate": 5.88955053430133e-06, "loss": 0.2463, "num_input_tokens_seen": 1382848, "step": 1445 }, { "epoch": 0.118280447018517, "grad_norm": 1.4249448776245117, "learning_rate": 5.909943714821764e-06, "loss": 0.2849, "num_input_tokens_seen": 1387408, "step": 1450 }, { "epoch": 0.11868831062892568, "grad_norm": 49.076812744140625, "learning_rate": 5.930336895342198e-06, "loss": 0.9702, "num_input_tokens_seen": 1391600, "step": 1455 }, { "epoch": 0.11909617423933437, "grad_norm": 31.448915481567383, "learning_rate": 5.9507300758626315e-06, "loss": 1.2784, "num_input_tokens_seen": 1396192, "step": 1460 }, { "epoch": 0.11950403784974305, "grad_norm": 14.766374588012695, "learning_rate": 5.9711232563830655e-06, "loss": 0.4461, "num_input_tokens_seen": 1400720, "step": 1465 }, { "epoch": 0.11991190146015172, "grad_norm": 32.050872802734375, "learning_rate": 5.9915164369035e-06, "loss": 0.3879, "num_input_tokens_seen": 1406000, "step": 1470 }, { "epoch": 0.1203197650705604, "grad_norm": 15.345243453979492, "learning_rate": 6.011909617423934e-06, "loss": 0.3679, "num_input_tokens_seen": 1411056, "step": 1475 }, { "epoch": 0.12072762868096909, "grad_norm": 16.105541229248047, "learning_rate": 6.032302797944368e-06, "loss": 0.3896, "num_input_tokens_seen": 1416144, "step": 1480 }, { "epoch": 0.12113549229137777, "grad_norm": 34.0405387878418, "learning_rate": 6.052695978464802e-06, "loss": 0.3815, "num_input_tokens_seen": 1421632, "step": 1485 }, { "epoch": 0.12154335590178644, "grad_norm": 18.508880615234375, "learning_rate": 6.073089158985236e-06, "loss": 0.578, "num_input_tokens_seen": 1426720, "step": 1490 }, { "epoch": 0.12195121951219512, "grad_norm": 24.71353530883789, "learning_rate": 6.09348233950567e-06, "loss": 0.5367, "num_input_tokens_seen": 1431504, "step": 1495 }, { "epoch": 0.1223590831226038, "grad_norm": 11.414009094238281, "learning_rate": 6.113875520026103e-06, "loss": 0.3579, "num_input_tokens_seen": 1436896, "step": 1500 }, { "epoch": 0.12276694673301249, "grad_norm": 11.342082977294922, "learning_rate": 6.134268700546537e-06, "loss": 0.31, "num_input_tokens_seen": 1442288, "step": 1505 }, { "epoch": 0.12317481034342116, "grad_norm": 9.130901336669922, "learning_rate": 6.154661881066971e-06, "loss": 0.4535, "num_input_tokens_seen": 1446832, "step": 1510 }, { "epoch": 0.12358267395382984, "grad_norm": 12.250860214233398, "learning_rate": 6.175055061587405e-06, "loss": 0.3309, "num_input_tokens_seen": 1451952, "step": 1515 }, { "epoch": 0.12399053756423851, "grad_norm": 18.91921043395996, "learning_rate": 6.1954482421078395e-06, "loss": 0.4005, "num_input_tokens_seen": 1457168, "step": 1520 }, { "epoch": 0.12439840117464719, "grad_norm": 11.613739967346191, "learning_rate": 6.2158414226282735e-06, "loss": 0.3538, "num_input_tokens_seen": 1462176, "step": 1525 }, { "epoch": 0.12480626478505588, "grad_norm": 7.678050994873047, "learning_rate": 6.236234603148708e-06, "loss": 0.3331, "num_input_tokens_seen": 1466720, "step": 1530 }, { "epoch": 0.12521412839546456, "grad_norm": 8.063640594482422, "learning_rate": 6.256627783669142e-06, "loss": 0.387, "num_input_tokens_seen": 1471264, "step": 1535 }, { "epoch": 0.12562199200587323, "grad_norm": 13.517932891845703, "learning_rate": 6.277020964189576e-06, "loss": 0.53, "num_input_tokens_seen": 1476208, "step": 1540 }, { "epoch": 0.1260298556162819, "grad_norm": 15.049299240112305, "learning_rate": 6.297414144710009e-06, "loss": 0.3819, "num_input_tokens_seen": 1480528, "step": 1545 }, { "epoch": 0.1264377192266906, "grad_norm": 8.874709129333496, "learning_rate": 6.317807325230443e-06, "loss": 0.4204, "num_input_tokens_seen": 1484528, "step": 1550 }, { "epoch": 0.12684558283709926, "grad_norm": 8.206560134887695, "learning_rate": 6.338200505750877e-06, "loss": 0.3382, "num_input_tokens_seen": 1489376, "step": 1555 }, { "epoch": 0.12725344644750794, "grad_norm": 7.021800994873047, "learning_rate": 6.358593686271311e-06, "loss": 0.3214, "num_input_tokens_seen": 1493664, "step": 1560 }, { "epoch": 0.12766131005791664, "grad_norm": 4.583018779754639, "learning_rate": 6.378986866791744e-06, "loss": 0.3235, "num_input_tokens_seen": 1497568, "step": 1565 }, { "epoch": 0.12806917366832532, "grad_norm": 10.32192325592041, "learning_rate": 6.3993800473121785e-06, "loss": 0.4455, "num_input_tokens_seen": 1502688, "step": 1570 }, { "epoch": 0.128477037278734, "grad_norm": 3.1015636920928955, "learning_rate": 6.419773227832613e-06, "loss": 0.3893, "num_input_tokens_seen": 1507984, "step": 1575 }, { "epoch": 0.12888490088914267, "grad_norm": 12.08314037322998, "learning_rate": 6.4401664083530475e-06, "loss": 0.356, "num_input_tokens_seen": 1512736, "step": 1580 }, { "epoch": 0.12929276449955135, "grad_norm": 7.81959867477417, "learning_rate": 6.4605595888734815e-06, "loss": 0.3437, "num_input_tokens_seen": 1517408, "step": 1585 }, { "epoch": 0.12970062810996003, "grad_norm": 14.043208122253418, "learning_rate": 6.480952769393915e-06, "loss": 0.3353, "num_input_tokens_seen": 1521072, "step": 1590 }, { "epoch": 0.1301084917203687, "grad_norm": 7.032672882080078, "learning_rate": 6.501345949914349e-06, "loss": 0.3317, "num_input_tokens_seen": 1525200, "step": 1595 }, { "epoch": 0.13051635533077738, "grad_norm": 6.726536273956299, "learning_rate": 6.521739130434783e-06, "loss": 0.3876, "num_input_tokens_seen": 1530512, "step": 1600 }, { "epoch": 0.13092421894118605, "grad_norm": 18.540367126464844, "learning_rate": 6.542132310955217e-06, "loss": 0.3986, "num_input_tokens_seen": 1535232, "step": 1605 }, { "epoch": 0.13133208255159476, "grad_norm": 9.008016586303711, "learning_rate": 6.56252549147565e-06, "loss": 0.3191, "num_input_tokens_seen": 1540176, "step": 1610 }, { "epoch": 0.13173994616200344, "grad_norm": 12.994604110717773, "learning_rate": 6.582918671996084e-06, "loss": 0.324, "num_input_tokens_seen": 1544720, "step": 1615 }, { "epoch": 0.1321478097724121, "grad_norm": 13.540788650512695, "learning_rate": 6.603311852516519e-06, "loss": 0.3999, "num_input_tokens_seen": 1549312, "step": 1620 }, { "epoch": 0.1325556733828208, "grad_norm": 5.088010311126709, "learning_rate": 6.623705033036953e-06, "loss": 0.3516, "num_input_tokens_seen": 1554352, "step": 1625 }, { "epoch": 0.13296353699322946, "grad_norm": 5.263687610626221, "learning_rate": 6.644098213557387e-06, "loss": 0.3574, "num_input_tokens_seen": 1559552, "step": 1630 }, { "epoch": 0.13337140060363814, "grad_norm": 9.252799987792969, "learning_rate": 6.6644913940778205e-06, "loss": 0.3819, "num_input_tokens_seen": 1564016, "step": 1635 }, { "epoch": 0.13377926421404682, "grad_norm": 10.301292419433594, "learning_rate": 6.684884574598255e-06, "loss": 0.3214, "num_input_tokens_seen": 1569232, "step": 1640 }, { "epoch": 0.1341871278244555, "grad_norm": 11.436758995056152, "learning_rate": 6.705277755118689e-06, "loss": 0.3476, "num_input_tokens_seen": 1574032, "step": 1645 }, { "epoch": 0.13459499143486417, "grad_norm": 8.021950721740723, "learning_rate": 6.725670935639123e-06, "loss": 0.3268, "num_input_tokens_seen": 1577984, "step": 1650 }, { "epoch": 0.13500285504527287, "grad_norm": 11.268006324768066, "learning_rate": 6.746064116159556e-06, "loss": 0.4418, "num_input_tokens_seen": 1582800, "step": 1655 }, { "epoch": 0.13541071865568155, "grad_norm": 7.930236339569092, "learning_rate": 6.76645729667999e-06, "loss": 0.3358, "num_input_tokens_seen": 1587312, "step": 1660 }, { "epoch": 0.13581858226609023, "grad_norm": 10.703290939331055, "learning_rate": 6.786850477200425e-06, "loss": 0.3425, "num_input_tokens_seen": 1591808, "step": 1665 }, { "epoch": 0.1362264458764989, "grad_norm": 10.428373336791992, "learning_rate": 6.807243657720859e-06, "loss": 0.3667, "num_input_tokens_seen": 1596672, "step": 1670 }, { "epoch": 0.13663430948690758, "grad_norm": 6.9453816413879395, "learning_rate": 6.827636838241293e-06, "loss": 0.3197, "num_input_tokens_seen": 1601504, "step": 1675 }, { "epoch": 0.13704217309731626, "grad_norm": 7.300814628601074, "learning_rate": 6.848030018761726e-06, "loss": 0.267, "num_input_tokens_seen": 1605776, "step": 1680 }, { "epoch": 0.13745003670772493, "grad_norm": 6.710818290710449, "learning_rate": 6.86842319928216e-06, "loss": 0.4249, "num_input_tokens_seen": 1610576, "step": 1685 }, { "epoch": 0.1378579003181336, "grad_norm": 2.4412903785705566, "learning_rate": 6.8888163798025944e-06, "loss": 0.1719, "num_input_tokens_seen": 1615040, "step": 1690 }, { "epoch": 0.13826576392854228, "grad_norm": 21.650630950927734, "learning_rate": 6.909209560323028e-06, "loss": 0.7144, "num_input_tokens_seen": 1620288, "step": 1695 }, { "epoch": 0.13867362753895096, "grad_norm": 17.85610008239746, "learning_rate": 6.929602740843462e-06, "loss": 0.4913, "num_input_tokens_seen": 1626176, "step": 1700 }, { "epoch": 0.13908149114935967, "grad_norm": 8.261235237121582, "learning_rate": 6.949995921363896e-06, "loss": 0.4297, "num_input_tokens_seen": 1631408, "step": 1705 }, { "epoch": 0.13948935475976834, "grad_norm": 13.750737190246582, "learning_rate": 6.97038910188433e-06, "loss": 0.368, "num_input_tokens_seen": 1636336, "step": 1710 }, { "epoch": 0.13989721837017702, "grad_norm": 7.77808952331543, "learning_rate": 6.990782282404765e-06, "loss": 0.3219, "num_input_tokens_seen": 1641184, "step": 1715 }, { "epoch": 0.1403050819805857, "grad_norm": 5.204422473907471, "learning_rate": 7.011175462925199e-06, "loss": 0.3577, "num_input_tokens_seen": 1645968, "step": 1720 }, { "epoch": 0.14071294559099437, "grad_norm": 7.529096603393555, "learning_rate": 7.031568643445632e-06, "loss": 0.3242, "num_input_tokens_seen": 1650608, "step": 1725 }, { "epoch": 0.14112080920140305, "grad_norm": 5.574354648590088, "learning_rate": 7.051961823966066e-06, "loss": 0.2857, "num_input_tokens_seen": 1655008, "step": 1730 }, { "epoch": 0.14152867281181172, "grad_norm": 6.936734199523926, "learning_rate": 7.0723550044865e-06, "loss": 0.3891, "num_input_tokens_seen": 1660368, "step": 1735 }, { "epoch": 0.1419365364222204, "grad_norm": 4.852234363555908, "learning_rate": 7.0927481850069335e-06, "loss": 0.4078, "num_input_tokens_seen": 1666080, "step": 1740 }, { "epoch": 0.14234440003262908, "grad_norm": 3.877595901489258, "learning_rate": 7.1131413655273675e-06, "loss": 0.3266, "num_input_tokens_seen": 1671568, "step": 1745 }, { "epoch": 0.14275226364303778, "grad_norm": 13.045038223266602, "learning_rate": 7.133534546047802e-06, "loss": 0.3563, "num_input_tokens_seen": 1677664, "step": 1750 }, { "epoch": 0.14316012725344646, "grad_norm": 3.281592845916748, "learning_rate": 7.153927726568236e-06, "loss": 0.37, "num_input_tokens_seen": 1681824, "step": 1755 }, { "epoch": 0.14356799086385513, "grad_norm": 6.29805850982666, "learning_rate": 7.174320907088671e-06, "loss": 0.3514, "num_input_tokens_seen": 1687008, "step": 1760 }, { "epoch": 0.1439758544742638, "grad_norm": 9.258400917053223, "learning_rate": 7.194714087609105e-06, "loss": 0.3183, "num_input_tokens_seen": 1691648, "step": 1765 }, { "epoch": 0.14438371808467249, "grad_norm": 17.373769760131836, "learning_rate": 7.215107268129538e-06, "loss": 0.3553, "num_input_tokens_seen": 1695664, "step": 1770 }, { "epoch": 0.14479158169508116, "grad_norm": 18.57534408569336, "learning_rate": 7.235500448649972e-06, "loss": 0.4682, "num_input_tokens_seen": 1699424, "step": 1775 }, { "epoch": 0.14519944530548984, "grad_norm": 12.416900634765625, "learning_rate": 7.255893629170406e-06, "loss": 0.3851, "num_input_tokens_seen": 1704720, "step": 1780 }, { "epoch": 0.14560730891589851, "grad_norm": 7.459508419036865, "learning_rate": 7.276286809690839e-06, "loss": 0.3928, "num_input_tokens_seen": 1709168, "step": 1785 }, { "epoch": 0.1460151725263072, "grad_norm": 6.059111595153809, "learning_rate": 7.296679990211273e-06, "loss": 0.4243, "num_input_tokens_seen": 1713408, "step": 1790 }, { "epoch": 0.1464230361367159, "grad_norm": 4.9333367347717285, "learning_rate": 7.317073170731707e-06, "loss": 0.3404, "num_input_tokens_seen": 1718448, "step": 1795 }, { "epoch": 0.14683089974712457, "grad_norm": 3.0741639137268066, "learning_rate": 7.3374663512521414e-06, "loss": 0.3472, "num_input_tokens_seen": 1722640, "step": 1800 }, { "epoch": 0.14723876335753325, "grad_norm": 9.929667472839355, "learning_rate": 7.357859531772576e-06, "loss": 0.3861, "num_input_tokens_seen": 1727936, "step": 1805 }, { "epoch": 0.14764662696794192, "grad_norm": 8.04268741607666, "learning_rate": 7.3782527122930104e-06, "loss": 0.3213, "num_input_tokens_seen": 1732896, "step": 1810 }, { "epoch": 0.1480544905783506, "grad_norm": 10.285965919494629, "learning_rate": 7.398645892813444e-06, "loss": 0.3544, "num_input_tokens_seen": 1738208, "step": 1815 }, { "epoch": 0.14846235418875928, "grad_norm": 3.853783130645752, "learning_rate": 7.419039073333878e-06, "loss": 0.3444, "num_input_tokens_seen": 1743712, "step": 1820 }, { "epoch": 0.14887021779916795, "grad_norm": 13.421109199523926, "learning_rate": 7.439432253854312e-06, "loss": 0.3449, "num_input_tokens_seen": 1747712, "step": 1825 }, { "epoch": 0.14927808140957663, "grad_norm": 4.780526638031006, "learning_rate": 7.459825434374745e-06, "loss": 0.3479, "num_input_tokens_seen": 1753184, "step": 1830 }, { "epoch": 0.1496859450199853, "grad_norm": 4.435236930847168, "learning_rate": 7.480218614895179e-06, "loss": 0.3642, "num_input_tokens_seen": 1758064, "step": 1835 }, { "epoch": 0.150093808630394, "grad_norm": 9.219344139099121, "learning_rate": 7.500611795415613e-06, "loss": 0.3573, "num_input_tokens_seen": 1762288, "step": 1840 }, { "epoch": 0.1505016722408027, "grad_norm": 5.112336158752441, "learning_rate": 7.521004975936046e-06, "loss": 0.3724, "num_input_tokens_seen": 1767232, "step": 1845 }, { "epoch": 0.15090953585121136, "grad_norm": 12.018974304199219, "learning_rate": 7.5413981564564805e-06, "loss": 0.4074, "num_input_tokens_seen": 1772208, "step": 1850 }, { "epoch": 0.15131739946162004, "grad_norm": 3.543149471282959, "learning_rate": 7.561791336976916e-06, "loss": 0.3503, "num_input_tokens_seen": 1777008, "step": 1855 }, { "epoch": 0.15172526307202872, "grad_norm": 4.066500186920166, "learning_rate": 7.5821845174973494e-06, "loss": 0.3493, "num_input_tokens_seen": 1782176, "step": 1860 }, { "epoch": 0.1521331266824374, "grad_norm": 9.782663345336914, "learning_rate": 7.6025776980177835e-06, "loss": 0.3596, "num_input_tokens_seen": 1787168, "step": 1865 }, { "epoch": 0.15254099029284607, "grad_norm": 11.904740333557129, "learning_rate": 7.622970878538218e-06, "loss": 0.3782, "num_input_tokens_seen": 1792704, "step": 1870 }, { "epoch": 0.15294885390325474, "grad_norm": 4.267374515533447, "learning_rate": 7.64336405905865e-06, "loss": 0.3011, "num_input_tokens_seen": 1796752, "step": 1875 }, { "epoch": 0.15335671751366342, "grad_norm": 14.446444511413574, "learning_rate": 7.663757239579085e-06, "loss": 0.3656, "num_input_tokens_seen": 1802144, "step": 1880 }, { "epoch": 0.1537645811240721, "grad_norm": 6.582821846008301, "learning_rate": 7.684150420099519e-06, "loss": 0.3465, "num_input_tokens_seen": 1806208, "step": 1885 }, { "epoch": 0.1541724447344808, "grad_norm": 6.145726203918457, "learning_rate": 7.704543600619953e-06, "loss": 0.3464, "num_input_tokens_seen": 1811296, "step": 1890 }, { "epoch": 0.15458030834488948, "grad_norm": 4.1327619552612305, "learning_rate": 7.724936781140387e-06, "loss": 0.4537, "num_input_tokens_seen": 1816320, "step": 1895 }, { "epoch": 0.15498817195529815, "grad_norm": 6.509510517120361, "learning_rate": 7.745329961660821e-06, "loss": 0.3337, "num_input_tokens_seen": 1821648, "step": 1900 }, { "epoch": 0.15539603556570683, "grad_norm": 9.702773094177246, "learning_rate": 7.765723142181255e-06, "loss": 0.4321, "num_input_tokens_seen": 1825200, "step": 1905 }, { "epoch": 0.1558038991761155, "grad_norm": 18.477996826171875, "learning_rate": 7.78611632270169e-06, "loss": 0.4585, "num_input_tokens_seen": 1830608, "step": 1910 }, { "epoch": 0.15621176278652418, "grad_norm": 2.988567590713501, "learning_rate": 7.806509503222123e-06, "loss": 0.3202, "num_input_tokens_seen": 1836000, "step": 1915 }, { "epoch": 0.15661962639693286, "grad_norm": 5.397668838500977, "learning_rate": 7.826902683742557e-06, "loss": 0.4033, "num_input_tokens_seen": 1841680, "step": 1920 }, { "epoch": 0.15702749000734154, "grad_norm": 6.059991359710693, "learning_rate": 7.847295864262992e-06, "loss": 0.3932, "num_input_tokens_seen": 1845968, "step": 1925 }, { "epoch": 0.1574353536177502, "grad_norm": 4.7747979164123535, "learning_rate": 7.867689044783424e-06, "loss": 0.342, "num_input_tokens_seen": 1850576, "step": 1930 }, { "epoch": 0.15784321722815892, "grad_norm": 3.6761856079101562, "learning_rate": 7.888082225303858e-06, "loss": 0.3279, "num_input_tokens_seen": 1855456, "step": 1935 }, { "epoch": 0.1582510808385676, "grad_norm": 9.116640090942383, "learning_rate": 7.908475405824292e-06, "loss": 0.3565, "num_input_tokens_seen": 1859488, "step": 1940 }, { "epoch": 0.15865894444897627, "grad_norm": 6.749592304229736, "learning_rate": 7.928868586344728e-06, "loss": 0.3097, "num_input_tokens_seen": 1864576, "step": 1945 }, { "epoch": 0.15906680805938495, "grad_norm": 8.590089797973633, "learning_rate": 7.949261766865162e-06, "loss": 0.3959, "num_input_tokens_seen": 1869056, "step": 1950 }, { "epoch": 0.15947467166979362, "grad_norm": 8.29529857635498, "learning_rate": 7.969654947385594e-06, "loss": 0.3257, "num_input_tokens_seen": 1873696, "step": 1955 }, { "epoch": 0.1598825352802023, "grad_norm": 5.423698902130127, "learning_rate": 7.990048127906028e-06, "loss": 0.3159, "num_input_tokens_seen": 1878768, "step": 1960 }, { "epoch": 0.16029039889061097, "grad_norm": 5.567906379699707, "learning_rate": 8.010441308426462e-06, "loss": 0.4039, "num_input_tokens_seen": 1883024, "step": 1965 }, { "epoch": 0.16069826250101965, "grad_norm": 4.438025951385498, "learning_rate": 8.030834488946896e-06, "loss": 0.3909, "num_input_tokens_seen": 1888080, "step": 1970 }, { "epoch": 0.16110612611142833, "grad_norm": 9.804730415344238, "learning_rate": 8.05122766946733e-06, "loss": 0.4175, "num_input_tokens_seen": 1893200, "step": 1975 }, { "epoch": 0.16151398972183703, "grad_norm": 6.4839982986450195, "learning_rate": 8.071620849987765e-06, "loss": 0.4008, "num_input_tokens_seen": 1898304, "step": 1980 }, { "epoch": 0.1619218533322457, "grad_norm": 5.871853828430176, "learning_rate": 8.092014030508199e-06, "loss": 0.3352, "num_input_tokens_seen": 1902656, "step": 1985 }, { "epoch": 0.16232971694265438, "grad_norm": 5.792938709259033, "learning_rate": 8.112407211028633e-06, "loss": 0.3509, "num_input_tokens_seen": 1907536, "step": 1990 }, { "epoch": 0.16273758055306306, "grad_norm": 7.505515098571777, "learning_rate": 8.132800391549067e-06, "loss": 0.3886, "num_input_tokens_seen": 1911472, "step": 1995 }, { "epoch": 0.16314544416347174, "grad_norm": 10.946763038635254, "learning_rate": 8.153193572069501e-06, "loss": 0.4157, "num_input_tokens_seen": 1916336, "step": 2000 }, { "epoch": 0.1635533077738804, "grad_norm": 2.436859607696533, "learning_rate": 8.173586752589935e-06, "loss": 0.3467, "num_input_tokens_seen": 1921488, "step": 2005 }, { "epoch": 0.1639611713842891, "grad_norm": 2.7489120960235596, "learning_rate": 8.193979933110369e-06, "loss": 0.3548, "num_input_tokens_seen": 1926864, "step": 2010 }, { "epoch": 0.16436903499469777, "grad_norm": 4.631455421447754, "learning_rate": 8.214373113630801e-06, "loss": 0.3366, "num_input_tokens_seen": 1931184, "step": 2015 }, { "epoch": 0.16477689860510644, "grad_norm": 7.43580961227417, "learning_rate": 8.234766294151235e-06, "loss": 0.3561, "num_input_tokens_seen": 1935152, "step": 2020 }, { "epoch": 0.16518476221551512, "grad_norm": 3.9111075401306152, "learning_rate": 8.25515947467167e-06, "loss": 0.3081, "num_input_tokens_seen": 1940672, "step": 2025 }, { "epoch": 0.16559262582592382, "grad_norm": 5.56207799911499, "learning_rate": 8.275552655192104e-06, "loss": 0.3726, "num_input_tokens_seen": 1945872, "step": 2030 }, { "epoch": 0.1660004894363325, "grad_norm": 7.644935131072998, "learning_rate": 8.295945835712538e-06, "loss": 0.3537, "num_input_tokens_seen": 1950720, "step": 2035 }, { "epoch": 0.16640835304674118, "grad_norm": 4.625678062438965, "learning_rate": 8.316339016232973e-06, "loss": 0.3235, "num_input_tokens_seen": 1955472, "step": 2040 }, { "epoch": 0.16681621665714985, "grad_norm": 6.140848159790039, "learning_rate": 8.336732196753406e-06, "loss": 0.2979, "num_input_tokens_seen": 1960832, "step": 2045 }, { "epoch": 0.16722408026755853, "grad_norm": 11.794724464416504, "learning_rate": 8.35712537727384e-06, "loss": 0.3365, "num_input_tokens_seen": 1965264, "step": 2050 }, { "epoch": 0.1676319438779672, "grad_norm": 29.812477111816406, "learning_rate": 8.377518557794274e-06, "loss": 0.5237, "num_input_tokens_seen": 1969872, "step": 2055 }, { "epoch": 0.16803980748837588, "grad_norm": 10.252077102661133, "learning_rate": 8.397911738314708e-06, "loss": 0.3273, "num_input_tokens_seen": 1974544, "step": 2060 }, { "epoch": 0.16844767109878456, "grad_norm": 11.37096881866455, "learning_rate": 8.418304918835142e-06, "loss": 0.3846, "num_input_tokens_seen": 1979280, "step": 2065 }, { "epoch": 0.16885553470919323, "grad_norm": 3.5610291957855225, "learning_rate": 8.438698099355576e-06, "loss": 0.3172, "num_input_tokens_seen": 1984032, "step": 2070 }, { "epoch": 0.16926339831960194, "grad_norm": 9.096935272216797, "learning_rate": 8.45909127987601e-06, "loss": 0.3119, "num_input_tokens_seen": 1989696, "step": 2075 }, { "epoch": 0.16967126193001061, "grad_norm": 3.803318738937378, "learning_rate": 8.479484460396443e-06, "loss": 0.3863, "num_input_tokens_seen": 1993664, "step": 2080 }, { "epoch": 0.1700791255404193, "grad_norm": 7.321660041809082, "learning_rate": 8.499877640916878e-06, "loss": 0.3304, "num_input_tokens_seen": 1998752, "step": 2085 }, { "epoch": 0.17048698915082797, "grad_norm": 6.256577491760254, "learning_rate": 8.520270821437312e-06, "loss": 0.3383, "num_input_tokens_seen": 2003456, "step": 2090 }, { "epoch": 0.17089485276123664, "grad_norm": 5.912494659423828, "learning_rate": 8.540664001957746e-06, "loss": 0.3125, "num_input_tokens_seen": 2008576, "step": 2095 }, { "epoch": 0.17130271637164532, "grad_norm": 11.679612159729004, "learning_rate": 8.56105718247818e-06, "loss": 0.422, "num_input_tokens_seen": 2013424, "step": 2100 }, { "epoch": 0.171710579982054, "grad_norm": 4.675047874450684, "learning_rate": 8.581450362998613e-06, "loss": 0.3514, "num_input_tokens_seen": 2017648, "step": 2105 }, { "epoch": 0.17211844359246267, "grad_norm": 5.183383941650391, "learning_rate": 8.601843543519047e-06, "loss": 0.3679, "num_input_tokens_seen": 2021344, "step": 2110 }, { "epoch": 0.17252630720287135, "grad_norm": 3.7035017013549805, "learning_rate": 8.622236724039481e-06, "loss": 0.3417, "num_input_tokens_seen": 2026576, "step": 2115 }, { "epoch": 0.17293417081328005, "grad_norm": 5.791747093200684, "learning_rate": 8.642629904559915e-06, "loss": 0.3174, "num_input_tokens_seen": 2031216, "step": 2120 }, { "epoch": 0.17334203442368873, "grad_norm": 5.196982383728027, "learning_rate": 8.66302308508035e-06, "loss": 0.2135, "num_input_tokens_seen": 2036784, "step": 2125 }, { "epoch": 0.1737498980340974, "grad_norm": 8.294468879699707, "learning_rate": 8.683416265600785e-06, "loss": 0.3474, "num_input_tokens_seen": 2040864, "step": 2130 }, { "epoch": 0.17415776164450608, "grad_norm": 5.887025833129883, "learning_rate": 8.703809446121217e-06, "loss": 0.2486, "num_input_tokens_seen": 2046080, "step": 2135 }, { "epoch": 0.17456562525491476, "grad_norm": 21.11735725402832, "learning_rate": 8.724202626641651e-06, "loss": 0.3601, "num_input_tokens_seen": 2049856, "step": 2140 }, { "epoch": 0.17497348886532343, "grad_norm": 15.627104759216309, "learning_rate": 8.744595807162086e-06, "loss": 0.4212, "num_input_tokens_seen": 2054592, "step": 2145 }, { "epoch": 0.1753813524757321, "grad_norm": 32.199928283691406, "learning_rate": 8.76498898768252e-06, "loss": 0.4743, "num_input_tokens_seen": 2059504, "step": 2150 }, { "epoch": 0.1757892160861408, "grad_norm": 15.664324760437012, "learning_rate": 8.785382168202954e-06, "loss": 0.4211, "num_input_tokens_seen": 2064016, "step": 2155 }, { "epoch": 0.17619707969654946, "grad_norm": 23.20318031311035, "learning_rate": 8.805775348723388e-06, "loss": 0.2578, "num_input_tokens_seen": 2069008, "step": 2160 }, { "epoch": 0.17660494330695814, "grad_norm": 9.86707592010498, "learning_rate": 8.82616852924382e-06, "loss": 1.1456, "num_input_tokens_seen": 2074224, "step": 2165 }, { "epoch": 0.17701280691736684, "grad_norm": 4.344518661499023, "learning_rate": 8.846561709764254e-06, "loss": 0.4011, "num_input_tokens_seen": 2078496, "step": 2170 }, { "epoch": 0.17742067052777552, "grad_norm": 6.21725606918335, "learning_rate": 8.866954890284688e-06, "loss": 0.3182, "num_input_tokens_seen": 2083200, "step": 2175 }, { "epoch": 0.1778285341381842, "grad_norm": 13.69442367553711, "learning_rate": 8.887348070805124e-06, "loss": 0.4865, "num_input_tokens_seen": 2088016, "step": 2180 }, { "epoch": 0.17823639774859287, "grad_norm": 9.914393424987793, "learning_rate": 8.907741251325558e-06, "loss": 0.2967, "num_input_tokens_seen": 2093824, "step": 2185 }, { "epoch": 0.17864426135900155, "grad_norm": 6.43015718460083, "learning_rate": 8.928134431845992e-06, "loss": 0.3828, "num_input_tokens_seen": 2100048, "step": 2190 }, { "epoch": 0.17905212496941023, "grad_norm": 4.25566291809082, "learning_rate": 8.948527612366425e-06, "loss": 0.3557, "num_input_tokens_seen": 2105616, "step": 2195 }, { "epoch": 0.1794599885798189, "grad_norm": 9.30568790435791, "learning_rate": 8.968920792886859e-06, "loss": 0.373, "num_input_tokens_seen": 2109536, "step": 2200 }, { "epoch": 0.17986785219022758, "grad_norm": 2.982161283493042, "learning_rate": 8.989313973407293e-06, "loss": 0.3342, "num_input_tokens_seen": 2113968, "step": 2205 }, { "epoch": 0.18027571580063625, "grad_norm": 6.420356750488281, "learning_rate": 9.009707153927727e-06, "loss": 0.3865, "num_input_tokens_seen": 2118848, "step": 2210 }, { "epoch": 0.18068357941104496, "grad_norm": 5.854020595550537, "learning_rate": 9.03010033444816e-06, "loss": 0.4394, "num_input_tokens_seen": 2124144, "step": 2215 }, { "epoch": 0.18109144302145364, "grad_norm": 4.748794078826904, "learning_rate": 9.050493514968595e-06, "loss": 0.251, "num_input_tokens_seen": 2128592, "step": 2220 }, { "epoch": 0.1814993066318623, "grad_norm": 13.913771629333496, "learning_rate": 9.070886695489029e-06, "loss": 0.3909, "num_input_tokens_seen": 2134304, "step": 2225 }, { "epoch": 0.181907170242271, "grad_norm": 3.2631995677948, "learning_rate": 9.091279876009463e-06, "loss": 0.4273, "num_input_tokens_seen": 2139600, "step": 2230 }, { "epoch": 0.18231503385267966, "grad_norm": 6.1844096183776855, "learning_rate": 9.111673056529897e-06, "loss": 0.3575, "num_input_tokens_seen": 2144384, "step": 2235 }, { "epoch": 0.18272289746308834, "grad_norm": 5.57953405380249, "learning_rate": 9.132066237050331e-06, "loss": 0.2912, "num_input_tokens_seen": 2149088, "step": 2240 }, { "epoch": 0.18313076107349702, "grad_norm": 8.142049789428711, "learning_rate": 9.152459417570765e-06, "loss": 0.3915, "num_input_tokens_seen": 2153664, "step": 2245 }, { "epoch": 0.1835386246839057, "grad_norm": 4.383854866027832, "learning_rate": 9.1728525980912e-06, "loss": 0.3136, "num_input_tokens_seen": 2158576, "step": 2250 }, { "epoch": 0.18394648829431437, "grad_norm": 9.75634765625, "learning_rate": 9.193245778611632e-06, "loss": 0.3806, "num_input_tokens_seen": 2162880, "step": 2255 }, { "epoch": 0.18435435190472307, "grad_norm": 3.4251532554626465, "learning_rate": 9.213638959132066e-06, "loss": 0.2845, "num_input_tokens_seen": 2168240, "step": 2260 }, { "epoch": 0.18476221551513175, "grad_norm": 10.324609756469727, "learning_rate": 9.2340321396525e-06, "loss": 0.35, "num_input_tokens_seen": 2173040, "step": 2265 }, { "epoch": 0.18517007912554043, "grad_norm": 4.69254207611084, "learning_rate": 9.254425320172936e-06, "loss": 0.3147, "num_input_tokens_seen": 2176960, "step": 2270 }, { "epoch": 0.1855779427359491, "grad_norm": 27.802886962890625, "learning_rate": 9.27481850069337e-06, "loss": 0.3265, "num_input_tokens_seen": 2181248, "step": 2275 }, { "epoch": 0.18598580634635778, "grad_norm": 23.3270206451416, "learning_rate": 9.295211681213804e-06, "loss": 0.3517, "num_input_tokens_seen": 2185744, "step": 2280 }, { "epoch": 0.18639366995676646, "grad_norm": 5.264839172363281, "learning_rate": 9.315604861734236e-06, "loss": 0.408, "num_input_tokens_seen": 2191232, "step": 2285 }, { "epoch": 0.18680153356717513, "grad_norm": 7.058464527130127, "learning_rate": 9.33599804225467e-06, "loss": 0.3129, "num_input_tokens_seen": 2196416, "step": 2290 }, { "epoch": 0.1872093971775838, "grad_norm": 6.896816253662109, "learning_rate": 9.356391222775104e-06, "loss": 0.3616, "num_input_tokens_seen": 2200336, "step": 2295 }, { "epoch": 0.18761726078799248, "grad_norm": 4.955200672149658, "learning_rate": 9.376784403295538e-06, "loss": 0.3281, "num_input_tokens_seen": 2204384, "step": 2300 }, { "epoch": 0.18802512439840116, "grad_norm": 6.750766754150391, "learning_rate": 9.397177583815972e-06, "loss": 0.4448, "num_input_tokens_seen": 2208944, "step": 2305 }, { "epoch": 0.18843298800880987, "grad_norm": 3.096465826034546, "learning_rate": 9.417570764336406e-06, "loss": 0.2773, "num_input_tokens_seen": 2213504, "step": 2310 }, { "epoch": 0.18884085161921854, "grad_norm": 11.921808242797852, "learning_rate": 9.43796394485684e-06, "loss": 0.3814, "num_input_tokens_seen": 2217888, "step": 2315 }, { "epoch": 0.18924871522962722, "grad_norm": 5.535558700561523, "learning_rate": 9.458357125377275e-06, "loss": 0.2684, "num_input_tokens_seen": 2222352, "step": 2320 }, { "epoch": 0.1896565788400359, "grad_norm": 6.542778491973877, "learning_rate": 9.478750305897709e-06, "loss": 0.4992, "num_input_tokens_seen": 2227504, "step": 2325 }, { "epoch": 0.19006444245044457, "grad_norm": 8.096564292907715, "learning_rate": 9.499143486418143e-06, "loss": 0.3518, "num_input_tokens_seen": 2232384, "step": 2330 }, { "epoch": 0.19047230606085325, "grad_norm": 7.121298789978027, "learning_rate": 9.519536666938577e-06, "loss": 0.3188, "num_input_tokens_seen": 2237392, "step": 2335 }, { "epoch": 0.19088016967126192, "grad_norm": 8.801871299743652, "learning_rate": 9.53992984745901e-06, "loss": 0.4105, "num_input_tokens_seen": 2242528, "step": 2340 }, { "epoch": 0.1912880332816706, "grad_norm": 3.713839292526245, "learning_rate": 9.560323027979443e-06, "loss": 0.3663, "num_input_tokens_seen": 2247456, "step": 2345 }, { "epoch": 0.19169589689207928, "grad_norm": 5.456655025482178, "learning_rate": 9.580716208499877e-06, "loss": 0.3241, "num_input_tokens_seen": 2252672, "step": 2350 }, { "epoch": 0.19210376050248798, "grad_norm": 4.944878578186035, "learning_rate": 9.601109389020311e-06, "loss": 0.3472, "num_input_tokens_seen": 2257712, "step": 2355 }, { "epoch": 0.19251162411289666, "grad_norm": 5.346912860870361, "learning_rate": 9.621502569540745e-06, "loss": 0.4145, "num_input_tokens_seen": 2262464, "step": 2360 }, { "epoch": 0.19291948772330533, "grad_norm": 6.771721363067627, "learning_rate": 9.641895750061181e-06, "loss": 0.3286, "num_input_tokens_seen": 2267344, "step": 2365 }, { "epoch": 0.193327351333714, "grad_norm": 9.396222114562988, "learning_rate": 9.662288930581615e-06, "loss": 0.4063, "num_input_tokens_seen": 2272480, "step": 2370 }, { "epoch": 0.19373521494412269, "grad_norm": 10.925207138061523, "learning_rate": 9.682682111102048e-06, "loss": 0.4234, "num_input_tokens_seen": 2276960, "step": 2375 }, { "epoch": 0.19414307855453136, "grad_norm": 4.91389799118042, "learning_rate": 9.703075291622482e-06, "loss": 0.2735, "num_input_tokens_seen": 2280768, "step": 2380 }, { "epoch": 0.19455094216494004, "grad_norm": 3.7251367568969727, "learning_rate": 9.723468472142916e-06, "loss": 0.3233, "num_input_tokens_seen": 2285248, "step": 2385 }, { "epoch": 0.19495880577534871, "grad_norm": 5.173948287963867, "learning_rate": 9.74386165266335e-06, "loss": 0.2525, "num_input_tokens_seen": 2289360, "step": 2390 }, { "epoch": 0.1953666693857574, "grad_norm": 4.604894161224365, "learning_rate": 9.764254833183784e-06, "loss": 0.4015, "num_input_tokens_seen": 2293984, "step": 2395 }, { "epoch": 0.1957745329961661, "grad_norm": 3.8132987022399902, "learning_rate": 9.784648013704218e-06, "loss": 0.3781, "num_input_tokens_seen": 2297776, "step": 2400 }, { "epoch": 0.19618239660657477, "grad_norm": 3.6095807552337646, "learning_rate": 9.80504119422465e-06, "loss": 0.3132, "num_input_tokens_seen": 2302000, "step": 2405 }, { "epoch": 0.19659026021698345, "grad_norm": 9.062275886535645, "learning_rate": 9.825434374745086e-06, "loss": 0.2562, "num_input_tokens_seen": 2307568, "step": 2410 }, { "epoch": 0.19699812382739212, "grad_norm": 16.442426681518555, "learning_rate": 9.84582755526552e-06, "loss": 0.4257, "num_input_tokens_seen": 2311840, "step": 2415 }, { "epoch": 0.1974059874378008, "grad_norm": 10.517233848571777, "learning_rate": 9.866220735785954e-06, "loss": 0.3181, "num_input_tokens_seen": 2316880, "step": 2420 }, { "epoch": 0.19781385104820948, "grad_norm": 9.619904518127441, "learning_rate": 9.886613916306388e-06, "loss": 0.3361, "num_input_tokens_seen": 2322128, "step": 2425 }, { "epoch": 0.19822171465861815, "grad_norm": 8.159174919128418, "learning_rate": 9.907007096826822e-06, "loss": 0.2732, "num_input_tokens_seen": 2327264, "step": 2430 }, { "epoch": 0.19862957826902683, "grad_norm": 5.234193325042725, "learning_rate": 9.927400277347255e-06, "loss": 0.4193, "num_input_tokens_seen": 2332640, "step": 2435 }, { "epoch": 0.1990374418794355, "grad_norm": 6.7484517097473145, "learning_rate": 9.947793457867689e-06, "loss": 0.3629, "num_input_tokens_seen": 2337216, "step": 2440 }, { "epoch": 0.19944530548984418, "grad_norm": 3.426959276199341, "learning_rate": 9.968186638388123e-06, "loss": 0.3032, "num_input_tokens_seen": 2341712, "step": 2445 }, { "epoch": 0.1998531691002529, "grad_norm": 4.0439982414245605, "learning_rate": 9.988579818908557e-06, "loss": 0.2564, "num_input_tokens_seen": 2346352, "step": 2450 }, { "epoch": 0.20026103271066156, "grad_norm": 1.4390459060668945, "learning_rate": 1.0008972999428991e-05, "loss": 0.4387, "num_input_tokens_seen": 2352096, "step": 2455 }, { "epoch": 0.20066889632107024, "grad_norm": 1.5740493535995483, "learning_rate": 1.0029366179949425e-05, "loss": 0.5022, "num_input_tokens_seen": 2357184, "step": 2460 }, { "epoch": 0.20107675993147892, "grad_norm": 1.5872985124588013, "learning_rate": 1.004975936046986e-05, "loss": 0.6173, "num_input_tokens_seen": 2362832, "step": 2465 }, { "epoch": 0.2014846235418876, "grad_norm": 4.02196741104126, "learning_rate": 1.0070152540990293e-05, "loss": 0.3957, "num_input_tokens_seen": 2367904, "step": 2470 }, { "epoch": 0.20189248715229627, "grad_norm": 3.59627103805542, "learning_rate": 1.0090545721510727e-05, "loss": 0.401, "num_input_tokens_seen": 2372928, "step": 2475 }, { "epoch": 0.20230035076270494, "grad_norm": 3.03694224357605, "learning_rate": 1.0110938902031161e-05, "loss": 0.2771, "num_input_tokens_seen": 2377504, "step": 2480 }, { "epoch": 0.20270821437311362, "grad_norm": 4.293562412261963, "learning_rate": 1.0131332082551595e-05, "loss": 0.3063, "num_input_tokens_seen": 2381968, "step": 2485 }, { "epoch": 0.2031160779835223, "grad_norm": 9.294218063354492, "learning_rate": 1.015172526307203e-05, "loss": 0.5437, "num_input_tokens_seen": 2386432, "step": 2490 }, { "epoch": 0.203523941593931, "grad_norm": 7.813790321350098, "learning_rate": 1.0172118443592462e-05, "loss": 0.3536, "num_input_tokens_seen": 2391312, "step": 2495 }, { "epoch": 0.20393180520433968, "grad_norm": 7.637277126312256, "learning_rate": 1.0192511624112896e-05, "loss": 0.3301, "num_input_tokens_seen": 2396144, "step": 2500 }, { "epoch": 0.20433966881474835, "grad_norm": 7.977924346923828, "learning_rate": 1.0212904804633332e-05, "loss": 0.3331, "num_input_tokens_seen": 2400240, "step": 2505 }, { "epoch": 0.20474753242515703, "grad_norm": 2.7906675338745117, "learning_rate": 1.0233297985153766e-05, "loss": 0.4773, "num_input_tokens_seen": 2404896, "step": 2510 }, { "epoch": 0.2051553960355657, "grad_norm": 11.615259170532227, "learning_rate": 1.02536911656742e-05, "loss": 0.3319, "num_input_tokens_seen": 2410112, "step": 2515 }, { "epoch": 0.20556325964597438, "grad_norm": 11.294833183288574, "learning_rate": 1.0274084346194634e-05, "loss": 0.3221, "num_input_tokens_seen": 2415648, "step": 2520 }, { "epoch": 0.20597112325638306, "grad_norm": 6.060369491577148, "learning_rate": 1.0294477526715066e-05, "loss": 0.3969, "num_input_tokens_seen": 2420784, "step": 2525 }, { "epoch": 0.20637898686679174, "grad_norm": 6.391024112701416, "learning_rate": 1.03148707072355e-05, "loss": 0.359, "num_input_tokens_seen": 2425488, "step": 2530 }, { "epoch": 0.2067868504772004, "grad_norm": 2.743928909301758, "learning_rate": 1.0335263887755934e-05, "loss": 0.387, "num_input_tokens_seen": 2429664, "step": 2535 }, { "epoch": 0.20719471408760912, "grad_norm": 2.244645118713379, "learning_rate": 1.0355657068276369e-05, "loss": 0.3149, "num_input_tokens_seen": 2434576, "step": 2540 }, { "epoch": 0.2076025776980178, "grad_norm": 6.469843864440918, "learning_rate": 1.0376050248796803e-05, "loss": 0.3595, "num_input_tokens_seen": 2439744, "step": 2545 }, { "epoch": 0.20801044130842647, "grad_norm": 2.198047637939453, "learning_rate": 1.0396443429317237e-05, "loss": 0.354, "num_input_tokens_seen": 2444576, "step": 2550 }, { "epoch": 0.20841830491883515, "grad_norm": 5.180971622467041, "learning_rate": 1.041683660983767e-05, "loss": 0.3574, "num_input_tokens_seen": 2449552, "step": 2555 }, { "epoch": 0.20882616852924382, "grad_norm": 2.81728458404541, "learning_rate": 1.0437229790358105e-05, "loss": 0.3382, "num_input_tokens_seen": 2454512, "step": 2560 }, { "epoch": 0.2092340321396525, "grad_norm": 2.9755470752716064, "learning_rate": 1.0457622970878539e-05, "loss": 0.3485, "num_input_tokens_seen": 2459312, "step": 2565 }, { "epoch": 0.20964189575006117, "grad_norm": 4.938985347747803, "learning_rate": 1.0478016151398973e-05, "loss": 0.3507, "num_input_tokens_seen": 2463744, "step": 2570 }, { "epoch": 0.21004975936046985, "grad_norm": 3.8915627002716064, "learning_rate": 1.0498409331919407e-05, "loss": 0.3055, "num_input_tokens_seen": 2468592, "step": 2575 }, { "epoch": 0.21045762297087853, "grad_norm": 3.759429931640625, "learning_rate": 1.0518802512439841e-05, "loss": 0.3042, "num_input_tokens_seen": 2473584, "step": 2580 }, { "epoch": 0.2108654865812872, "grad_norm": 13.191893577575684, "learning_rate": 1.0539195692960273e-05, "loss": 0.3286, "num_input_tokens_seen": 2478976, "step": 2585 }, { "epoch": 0.2112733501916959, "grad_norm": 15.952300071716309, "learning_rate": 1.0559588873480708e-05, "loss": 0.3751, "num_input_tokens_seen": 2484000, "step": 2590 }, { "epoch": 0.21168121380210458, "grad_norm": 4.902406692504883, "learning_rate": 1.0579982054001142e-05, "loss": 0.4326, "num_input_tokens_seen": 2488112, "step": 2595 }, { "epoch": 0.21208907741251326, "grad_norm": 9.403731346130371, "learning_rate": 1.0600375234521577e-05, "loss": 0.2793, "num_input_tokens_seen": 2492416, "step": 2600 }, { "epoch": 0.21249694102292194, "grad_norm": 4.439281463623047, "learning_rate": 1.0620768415042011e-05, "loss": 0.4068, "num_input_tokens_seen": 2497984, "step": 2605 }, { "epoch": 0.2129048046333306, "grad_norm": 6.853982448577881, "learning_rate": 1.0641161595562444e-05, "loss": 0.3635, "num_input_tokens_seen": 2502960, "step": 2610 }, { "epoch": 0.2133126682437393, "grad_norm": 6.9331440925598145, "learning_rate": 1.0661554776082878e-05, "loss": 0.3237, "num_input_tokens_seen": 2508096, "step": 2615 }, { "epoch": 0.21372053185414797, "grad_norm": 20.286115646362305, "learning_rate": 1.0681947956603312e-05, "loss": 0.4943, "num_input_tokens_seen": 2512640, "step": 2620 }, { "epoch": 0.21412839546455664, "grad_norm": 10.124003410339355, "learning_rate": 1.0702341137123746e-05, "loss": 0.3528, "num_input_tokens_seen": 2517648, "step": 2625 }, { "epoch": 0.21453625907496532, "grad_norm": 2.302018404006958, "learning_rate": 1.072273431764418e-05, "loss": 0.3431, "num_input_tokens_seen": 2521408, "step": 2630 }, { "epoch": 0.21494412268537402, "grad_norm": 4.291892051696777, "learning_rate": 1.0743127498164614e-05, "loss": 0.3102, "num_input_tokens_seen": 2526400, "step": 2635 }, { "epoch": 0.2153519862957827, "grad_norm": 2.159266233444214, "learning_rate": 1.0763520678685048e-05, "loss": 0.3287, "num_input_tokens_seen": 2531744, "step": 2640 }, { "epoch": 0.21575984990619138, "grad_norm": 3.1269237995147705, "learning_rate": 1.0783913859205482e-05, "loss": 0.3448, "num_input_tokens_seen": 2536944, "step": 2645 }, { "epoch": 0.21616771351660005, "grad_norm": 4.266423225402832, "learning_rate": 1.0804307039725916e-05, "loss": 0.3288, "num_input_tokens_seen": 2541728, "step": 2650 }, { "epoch": 0.21657557712700873, "grad_norm": 3.0660650730133057, "learning_rate": 1.082470022024635e-05, "loss": 0.3118, "num_input_tokens_seen": 2546080, "step": 2655 }, { "epoch": 0.2169834407374174, "grad_norm": 4.065108776092529, "learning_rate": 1.0845093400766785e-05, "loss": 0.3476, "num_input_tokens_seen": 2551056, "step": 2660 }, { "epoch": 0.21739130434782608, "grad_norm": 10.762778282165527, "learning_rate": 1.0865486581287219e-05, "loss": 0.5236, "num_input_tokens_seen": 2556544, "step": 2665 }, { "epoch": 0.21779916795823476, "grad_norm": 4.199944972991943, "learning_rate": 1.0885879761807653e-05, "loss": 0.2868, "num_input_tokens_seen": 2561216, "step": 2670 }, { "epoch": 0.21820703156864343, "grad_norm": 2.4883158206939697, "learning_rate": 1.0906272942328085e-05, "loss": 0.3313, "num_input_tokens_seen": 2566288, "step": 2675 }, { "epoch": 0.21861489517905214, "grad_norm": 4.848094940185547, "learning_rate": 1.0926666122848519e-05, "loss": 0.35, "num_input_tokens_seen": 2570592, "step": 2680 }, { "epoch": 0.21902275878946081, "grad_norm": 3.0355098247528076, "learning_rate": 1.0947059303368953e-05, "loss": 0.2831, "num_input_tokens_seen": 2575440, "step": 2685 }, { "epoch": 0.2194306223998695, "grad_norm": 3.407423257827759, "learning_rate": 1.0967452483889389e-05, "loss": 0.4419, "num_input_tokens_seen": 2581216, "step": 2690 }, { "epoch": 0.21983848601027817, "grad_norm": 2.272261142730713, "learning_rate": 1.0987845664409823e-05, "loss": 0.3732, "num_input_tokens_seen": 2586112, "step": 2695 }, { "epoch": 0.22024634962068684, "grad_norm": 5.6865105628967285, "learning_rate": 1.1008238844930255e-05, "loss": 0.3538, "num_input_tokens_seen": 2591168, "step": 2700 }, { "epoch": 0.22065421323109552, "grad_norm": 6.169642448425293, "learning_rate": 1.102863202545069e-05, "loss": 0.3224, "num_input_tokens_seen": 2596112, "step": 2705 }, { "epoch": 0.2210620768415042, "grad_norm": 6.034750461578369, "learning_rate": 1.1049025205971124e-05, "loss": 0.3069, "num_input_tokens_seen": 2600144, "step": 2710 }, { "epoch": 0.22146994045191287, "grad_norm": 3.3087098598480225, "learning_rate": 1.1069418386491558e-05, "loss": 0.3048, "num_input_tokens_seen": 2605552, "step": 2715 }, { "epoch": 0.22187780406232155, "grad_norm": 2.7534003257751465, "learning_rate": 1.1089811567011992e-05, "loss": 0.2952, "num_input_tokens_seen": 2610752, "step": 2720 }, { "epoch": 0.22228566767273025, "grad_norm": 5.686404705047607, "learning_rate": 1.1110204747532426e-05, "loss": 0.3682, "num_input_tokens_seen": 2615840, "step": 2725 }, { "epoch": 0.22269353128313893, "grad_norm": 41.32060241699219, "learning_rate": 1.113059792805286e-05, "loss": 0.3853, "num_input_tokens_seen": 2620272, "step": 2730 }, { "epoch": 0.2231013948935476, "grad_norm": 46.115196228027344, "learning_rate": 1.1150991108573292e-05, "loss": 0.465, "num_input_tokens_seen": 2625168, "step": 2735 }, { "epoch": 0.22350925850395628, "grad_norm": 6.400082111358643, "learning_rate": 1.1171384289093728e-05, "loss": 0.2526, "num_input_tokens_seen": 2630496, "step": 2740 }, { "epoch": 0.22391712211436496, "grad_norm": 11.78439712524414, "learning_rate": 1.1191777469614162e-05, "loss": 0.2948, "num_input_tokens_seen": 2636128, "step": 2745 }, { "epoch": 0.22432498572477363, "grad_norm": 5.447180271148682, "learning_rate": 1.1212170650134596e-05, "loss": 0.3393, "num_input_tokens_seen": 2641440, "step": 2750 }, { "epoch": 0.2247328493351823, "grad_norm": 6.664244651794434, "learning_rate": 1.123256383065503e-05, "loss": 0.2525, "num_input_tokens_seen": 2646256, "step": 2755 }, { "epoch": 0.225140712945591, "grad_norm": 2.6851415634155273, "learning_rate": 1.1252957011175464e-05, "loss": 0.263, "num_input_tokens_seen": 2651120, "step": 2760 }, { "epoch": 0.22554857655599966, "grad_norm": 22.60161781311035, "learning_rate": 1.1273350191695897e-05, "loss": 0.4232, "num_input_tokens_seen": 2655904, "step": 2765 }, { "epoch": 0.22595644016640834, "grad_norm": 8.78441333770752, "learning_rate": 1.129374337221633e-05, "loss": 0.3161, "num_input_tokens_seen": 2660816, "step": 2770 }, { "epoch": 0.22636430377681704, "grad_norm": 1.6991007328033447, "learning_rate": 1.1314136552736765e-05, "loss": 0.4386, "num_input_tokens_seen": 2665120, "step": 2775 }, { "epoch": 0.22677216738722572, "grad_norm": 11.280922889709473, "learning_rate": 1.1334529733257199e-05, "loss": 0.4537, "num_input_tokens_seen": 2670080, "step": 2780 }, { "epoch": 0.2271800309976344, "grad_norm": 18.48232650756836, "learning_rate": 1.1354922913777635e-05, "loss": 0.612, "num_input_tokens_seen": 2675136, "step": 2785 }, { "epoch": 0.22758789460804307, "grad_norm": 24.24309730529785, "learning_rate": 1.1375316094298067e-05, "loss": 0.2133, "num_input_tokens_seen": 2680528, "step": 2790 }, { "epoch": 0.22799575821845175, "grad_norm": 13.526183128356934, "learning_rate": 1.1395709274818501e-05, "loss": 0.3193, "num_input_tokens_seen": 2684560, "step": 2795 }, { "epoch": 0.22840362182886043, "grad_norm": 14.159772872924805, "learning_rate": 1.1416102455338935e-05, "loss": 0.3558, "num_input_tokens_seen": 2689536, "step": 2800 }, { "epoch": 0.2288114854392691, "grad_norm": 14.038448333740234, "learning_rate": 1.143649563585937e-05, "loss": 0.4197, "num_input_tokens_seen": 2695184, "step": 2805 }, { "epoch": 0.22921934904967778, "grad_norm": 5.100596904754639, "learning_rate": 1.1456888816379803e-05, "loss": 0.4298, "num_input_tokens_seen": 2700896, "step": 2810 }, { "epoch": 0.22962721266008645, "grad_norm": 5.210004806518555, "learning_rate": 1.1477281996900237e-05, "loss": 0.3027, "num_input_tokens_seen": 2705568, "step": 2815 }, { "epoch": 0.23003507627049516, "grad_norm": 10.044856071472168, "learning_rate": 1.1497675177420671e-05, "loss": 0.3989, "num_input_tokens_seen": 2710208, "step": 2820 }, { "epoch": 0.23044293988090384, "grad_norm": 5.990285873413086, "learning_rate": 1.1518068357941104e-05, "loss": 0.4589, "num_input_tokens_seen": 2714896, "step": 2825 }, { "epoch": 0.2308508034913125, "grad_norm": 5.2214813232421875, "learning_rate": 1.153846153846154e-05, "loss": 0.3752, "num_input_tokens_seen": 2719376, "step": 2830 }, { "epoch": 0.2312586671017212, "grad_norm": 6.73740291595459, "learning_rate": 1.1558854718981974e-05, "loss": 0.3625, "num_input_tokens_seen": 2724368, "step": 2835 }, { "epoch": 0.23166653071212986, "grad_norm": 2.1805100440979004, "learning_rate": 1.1579247899502408e-05, "loss": 0.3959, "num_input_tokens_seen": 2728496, "step": 2840 }, { "epoch": 0.23207439432253854, "grad_norm": 5.6786651611328125, "learning_rate": 1.1599641080022842e-05, "loss": 0.3625, "num_input_tokens_seen": 2731984, "step": 2845 }, { "epoch": 0.23248225793294722, "grad_norm": 5.5058512687683105, "learning_rate": 1.1620034260543274e-05, "loss": 0.355, "num_input_tokens_seen": 2736528, "step": 2850 }, { "epoch": 0.2328901215433559, "grad_norm": 2.284275770187378, "learning_rate": 1.1640427441063708e-05, "loss": 0.3673, "num_input_tokens_seen": 2739984, "step": 2855 }, { "epoch": 0.23329798515376457, "grad_norm": 7.5538859367370605, "learning_rate": 1.1660820621584142e-05, "loss": 0.3483, "num_input_tokens_seen": 2744288, "step": 2860 }, { "epoch": 0.23370584876417327, "grad_norm": 5.5626912117004395, "learning_rate": 1.1681213802104576e-05, "loss": 0.3723, "num_input_tokens_seen": 2749072, "step": 2865 }, { "epoch": 0.23411371237458195, "grad_norm": 2.406972885131836, "learning_rate": 1.170160698262501e-05, "loss": 0.3984, "num_input_tokens_seen": 2754096, "step": 2870 }, { "epoch": 0.23452157598499063, "grad_norm": 6.0428972244262695, "learning_rate": 1.1722000163145446e-05, "loss": 0.275, "num_input_tokens_seen": 2758736, "step": 2875 }, { "epoch": 0.2349294395953993, "grad_norm": 3.2337419986724854, "learning_rate": 1.1742393343665879e-05, "loss": 0.2947, "num_input_tokens_seen": 2763808, "step": 2880 }, { "epoch": 0.23533730320580798, "grad_norm": 12.064266204833984, "learning_rate": 1.1762786524186313e-05, "loss": 0.3754, "num_input_tokens_seen": 2768800, "step": 2885 }, { "epoch": 0.23574516681621666, "grad_norm": 4.815568447113037, "learning_rate": 1.1783179704706747e-05, "loss": 0.3339, "num_input_tokens_seen": 2773152, "step": 2890 }, { "epoch": 0.23615303042662533, "grad_norm": 7.466865539550781, "learning_rate": 1.180357288522718e-05, "loss": 0.3438, "num_input_tokens_seen": 2777424, "step": 2895 }, { "epoch": 0.236560894037034, "grad_norm": 9.891520500183105, "learning_rate": 1.1823966065747615e-05, "loss": 0.2852, "num_input_tokens_seen": 2781952, "step": 2900 }, { "epoch": 0.23696875764744268, "grad_norm": 4.975196361541748, "learning_rate": 1.1844359246268049e-05, "loss": 0.3058, "num_input_tokens_seen": 2786352, "step": 2905 }, { "epoch": 0.23737662125785136, "grad_norm": 4.653197288513184, "learning_rate": 1.1864752426788483e-05, "loss": 0.3947, "num_input_tokens_seen": 2791280, "step": 2910 }, { "epoch": 0.23778448486826007, "grad_norm": 5.350923538208008, "learning_rate": 1.1885145607308915e-05, "loss": 0.3045, "num_input_tokens_seen": 2796912, "step": 2915 }, { "epoch": 0.23819234847866874, "grad_norm": 5.817941665649414, "learning_rate": 1.190553878782935e-05, "loss": 0.2161, "num_input_tokens_seen": 2801120, "step": 2920 }, { "epoch": 0.23860021208907742, "grad_norm": 7.864080905914307, "learning_rate": 1.1925931968349785e-05, "loss": 0.3335, "num_input_tokens_seen": 2805776, "step": 2925 }, { "epoch": 0.2390080756994861, "grad_norm": 14.979109764099121, "learning_rate": 1.194632514887022e-05, "loss": 0.3161, "num_input_tokens_seen": 2811472, "step": 2930 }, { "epoch": 0.23941593930989477, "grad_norm": 3.8604743480682373, "learning_rate": 1.1966718329390653e-05, "loss": 0.4039, "num_input_tokens_seen": 2817520, "step": 2935 }, { "epoch": 0.23982380292030345, "grad_norm": 3.3056986331939697, "learning_rate": 1.1987111509911086e-05, "loss": 0.4457, "num_input_tokens_seen": 2822752, "step": 2940 }, { "epoch": 0.24023166653071212, "grad_norm": 6.908957481384277, "learning_rate": 1.200750469043152e-05, "loss": 0.1698, "num_input_tokens_seen": 2828048, "step": 2945 }, { "epoch": 0.2406395301411208, "grad_norm": 24.880939483642578, "learning_rate": 1.2027897870951954e-05, "loss": 0.8136, "num_input_tokens_seen": 2832592, "step": 2950 }, { "epoch": 0.24104739375152948, "grad_norm": 18.64787483215332, "learning_rate": 1.2048291051472388e-05, "loss": 0.6719, "num_input_tokens_seen": 2836624, "step": 2955 }, { "epoch": 0.24145525736193818, "grad_norm": 3.4803717136383057, "learning_rate": 1.2068684231992822e-05, "loss": 0.3997, "num_input_tokens_seen": 2842272, "step": 2960 }, { "epoch": 0.24186312097234686, "grad_norm": 4.103664398193359, "learning_rate": 1.2089077412513256e-05, "loss": 0.4491, "num_input_tokens_seen": 2846784, "step": 2965 }, { "epoch": 0.24227098458275553, "grad_norm": 4.51846170425415, "learning_rate": 1.210947059303369e-05, "loss": 0.3454, "num_input_tokens_seen": 2850976, "step": 2970 }, { "epoch": 0.2426788481931642, "grad_norm": 8.465285301208496, "learning_rate": 1.2129863773554124e-05, "loss": 0.5829, "num_input_tokens_seen": 2854784, "step": 2975 }, { "epoch": 0.24308671180357289, "grad_norm": 11.88733196258545, "learning_rate": 1.2150256954074558e-05, "loss": 0.4945, "num_input_tokens_seen": 2859776, "step": 2980 }, { "epoch": 0.24349457541398156, "grad_norm": 11.775616645812988, "learning_rate": 1.2170650134594992e-05, "loss": 0.3474, "num_input_tokens_seen": 2865104, "step": 2985 }, { "epoch": 0.24390243902439024, "grad_norm": 8.287700653076172, "learning_rate": 1.2191043315115426e-05, "loss": 0.2728, "num_input_tokens_seen": 2870464, "step": 2990 }, { "epoch": 0.24431030263479891, "grad_norm": 6.732769012451172, "learning_rate": 1.221143649563586e-05, "loss": 0.2948, "num_input_tokens_seen": 2874784, "step": 2995 }, { "epoch": 0.2447181662452076, "grad_norm": 3.9908580780029297, "learning_rate": 1.2231829676156293e-05, "loss": 0.4026, "num_input_tokens_seen": 2879632, "step": 3000 }, { "epoch": 0.2451260298556163, "grad_norm": 3.2110822200775146, "learning_rate": 1.2252222856676727e-05, "loss": 0.1852, "num_input_tokens_seen": 2884608, "step": 3005 }, { "epoch": 0.24553389346602497, "grad_norm": 3.3261337280273438, "learning_rate": 1.2272616037197161e-05, "loss": 0.2663, "num_input_tokens_seen": 2889648, "step": 3010 }, { "epoch": 0.24594175707643365, "grad_norm": 1.3641865253448486, "learning_rate": 1.2293009217717597e-05, "loss": 0.2667, "num_input_tokens_seen": 2894224, "step": 3015 }, { "epoch": 0.24634962068684232, "grad_norm": 18.432279586791992, "learning_rate": 1.231340239823803e-05, "loss": 0.4692, "num_input_tokens_seen": 2898848, "step": 3020 }, { "epoch": 0.246757484297251, "grad_norm": 23.481103897094727, "learning_rate": 1.2333795578758465e-05, "loss": 0.7412, "num_input_tokens_seen": 2903712, "step": 3025 }, { "epoch": 0.24716534790765968, "grad_norm": 1.4855870008468628, "learning_rate": 1.2354188759278897e-05, "loss": 0.3443, "num_input_tokens_seen": 2908688, "step": 3030 }, { "epoch": 0.24757321151806835, "grad_norm": 37.59889221191406, "learning_rate": 1.2374581939799331e-05, "loss": 0.511, "num_input_tokens_seen": 2913952, "step": 3035 }, { "epoch": 0.24798107512847703, "grad_norm": 3.5266406536102295, "learning_rate": 1.2394975120319765e-05, "loss": 0.346, "num_input_tokens_seen": 2918736, "step": 3040 }, { "epoch": 0.2483889387388857, "grad_norm": 7.072018146514893, "learning_rate": 1.24153683008402e-05, "loss": 0.2885, "num_input_tokens_seen": 2923392, "step": 3045 }, { "epoch": 0.24879680234929438, "grad_norm": 6.873264312744141, "learning_rate": 1.2435761481360634e-05, "loss": 0.583, "num_input_tokens_seen": 2928400, "step": 3050 }, { "epoch": 0.2492046659597031, "grad_norm": 6.960788726806641, "learning_rate": 1.2456154661881068e-05, "loss": 0.4747, "num_input_tokens_seen": 2933200, "step": 3055 }, { "epoch": 0.24961252957011176, "grad_norm": 3.7237722873687744, "learning_rate": 1.2476547842401502e-05, "loss": 0.2627, "num_input_tokens_seen": 2937072, "step": 3060 }, { "epoch": 0.25002039318052044, "grad_norm": 12.459754943847656, "learning_rate": 1.2496941022921936e-05, "loss": 0.2942, "num_input_tokens_seen": 2942592, "step": 3065 }, { "epoch": 0.2504282567909291, "grad_norm": 13.195632934570312, "learning_rate": 1.2517334203442368e-05, "loss": 0.3988, "num_input_tokens_seen": 2947408, "step": 3070 }, { "epoch": 0.2508361204013378, "grad_norm": 4.636093616485596, "learning_rate": 1.2537727383962802e-05, "loss": 0.2467, "num_input_tokens_seen": 2952320, "step": 3075 }, { "epoch": 0.25124398401174647, "grad_norm": 6.036768913269043, "learning_rate": 1.2558120564483236e-05, "loss": 0.3637, "num_input_tokens_seen": 2957968, "step": 3080 }, { "epoch": 0.25165184762215514, "grad_norm": 2.6620936393737793, "learning_rate": 1.2578513745003672e-05, "loss": 0.3052, "num_input_tokens_seen": 2962016, "step": 3085 }, { "epoch": 0.2520597112325638, "grad_norm": 2.9857239723205566, "learning_rate": 1.2598906925524106e-05, "loss": 0.3716, "num_input_tokens_seen": 2967744, "step": 3090 }, { "epoch": 0.2524675748429725, "grad_norm": 4.576464653015137, "learning_rate": 1.261930010604454e-05, "loss": 0.353, "num_input_tokens_seen": 2971568, "step": 3095 }, { "epoch": 0.2528754384533812, "grad_norm": 2.6397039890289307, "learning_rate": 1.2639693286564974e-05, "loss": 0.2889, "num_input_tokens_seen": 2976736, "step": 3100 }, { "epoch": 0.25328330206378985, "grad_norm": 2.0175130367279053, "learning_rate": 1.2660086467085408e-05, "loss": 0.3438, "num_input_tokens_seen": 2981760, "step": 3105 }, { "epoch": 0.2536911656741985, "grad_norm": 5.821011066436768, "learning_rate": 1.2680479647605842e-05, "loss": 0.3384, "num_input_tokens_seen": 2986432, "step": 3110 }, { "epoch": 0.2540990292846072, "grad_norm": 0.12634634971618652, "learning_rate": 1.2700872828126276e-05, "loss": 0.249, "num_input_tokens_seen": 2991120, "step": 3115 }, { "epoch": 0.2545068928950159, "grad_norm": 5.538594722747803, "learning_rate": 1.2721266008646709e-05, "loss": 0.3371, "num_input_tokens_seen": 2995648, "step": 3120 }, { "epoch": 0.2549147565054246, "grad_norm": 11.904152870178223, "learning_rate": 1.2741659189167143e-05, "loss": 0.3591, "num_input_tokens_seen": 3001088, "step": 3125 }, { "epoch": 0.2553226201158333, "grad_norm": 6.072014808654785, "learning_rate": 1.2762052369687577e-05, "loss": 0.7533, "num_input_tokens_seen": 3005104, "step": 3130 }, { "epoch": 0.25573048372624196, "grad_norm": 5.62483024597168, "learning_rate": 1.2782445550208011e-05, "loss": 0.5567, "num_input_tokens_seen": 3010064, "step": 3135 }, { "epoch": 0.25613834733665064, "grad_norm": 3.3679616451263428, "learning_rate": 1.2802838730728445e-05, "loss": 0.2548, "num_input_tokens_seen": 3014688, "step": 3140 }, { "epoch": 0.2565462109470593, "grad_norm": 15.237527847290039, "learning_rate": 1.282323191124888e-05, "loss": 0.2718, "num_input_tokens_seen": 3018064, "step": 3145 }, { "epoch": 0.256954074557468, "grad_norm": 13.613211631774902, "learning_rate": 1.2843625091769312e-05, "loss": 0.3159, "num_input_tokens_seen": 3022848, "step": 3150 }, { "epoch": 0.25736193816787667, "grad_norm": 14.109664916992188, "learning_rate": 1.2864018272289746e-05, "loss": 0.3721, "num_input_tokens_seen": 3027040, "step": 3155 }, { "epoch": 0.25776980177828535, "grad_norm": 20.378976821899414, "learning_rate": 1.288441145281018e-05, "loss": 0.5539, "num_input_tokens_seen": 3030896, "step": 3160 }, { "epoch": 0.258177665388694, "grad_norm": 13.612994194030762, "learning_rate": 1.2904804633330614e-05, "loss": 0.3618, "num_input_tokens_seen": 3035984, "step": 3165 }, { "epoch": 0.2585855289991027, "grad_norm": 3.0462024211883545, "learning_rate": 1.2925197813851048e-05, "loss": 0.1999, "num_input_tokens_seen": 3041488, "step": 3170 }, { "epoch": 0.2589933926095114, "grad_norm": 9.289331436157227, "learning_rate": 1.2945590994371482e-05, "loss": 0.5098, "num_input_tokens_seen": 3046864, "step": 3175 }, { "epoch": 0.25940125621992005, "grad_norm": 9.412344932556152, "learning_rate": 1.2965984174891918e-05, "loss": 0.3006, "num_input_tokens_seen": 3052160, "step": 3180 }, { "epoch": 0.2598091198303287, "grad_norm": 4.858607769012451, "learning_rate": 1.2986377355412352e-05, "loss": 0.4739, "num_input_tokens_seen": 3057008, "step": 3185 }, { "epoch": 0.2602169834407374, "grad_norm": 2.665771007537842, "learning_rate": 1.3006770535932786e-05, "loss": 0.3345, "num_input_tokens_seen": 3062752, "step": 3190 }, { "epoch": 0.2606248470511461, "grad_norm": 2.3573079109191895, "learning_rate": 1.302716371645322e-05, "loss": 0.3531, "num_input_tokens_seen": 3066832, "step": 3195 }, { "epoch": 0.26103271066155476, "grad_norm": 2.5753791332244873, "learning_rate": 1.3047556896973654e-05, "loss": 0.223, "num_input_tokens_seen": 3071648, "step": 3200 }, { "epoch": 0.26144057427196343, "grad_norm": 1.2559345960617065, "learning_rate": 1.3067950077494088e-05, "loss": 0.31, "num_input_tokens_seen": 3076256, "step": 3205 }, { "epoch": 0.2618484378823721, "grad_norm": 1.239122986793518, "learning_rate": 1.308834325801452e-05, "loss": 0.2825, "num_input_tokens_seen": 3081504, "step": 3210 }, { "epoch": 0.26225630149278084, "grad_norm": 3.3630123138427734, "learning_rate": 1.3108736438534954e-05, "loss": 0.3974, "num_input_tokens_seen": 3086688, "step": 3215 }, { "epoch": 0.2626641651031895, "grad_norm": 5.685457229614258, "learning_rate": 1.3129129619055389e-05, "loss": 0.2753, "num_input_tokens_seen": 3092480, "step": 3220 }, { "epoch": 0.2630720287135982, "grad_norm": 4.820583343505859, "learning_rate": 1.3149522799575823e-05, "loss": 0.2054, "num_input_tokens_seen": 3097520, "step": 3225 }, { "epoch": 0.26347989232400687, "grad_norm": 9.781064987182617, "learning_rate": 1.3169915980096257e-05, "loss": 0.5342, "num_input_tokens_seen": 3102464, "step": 3230 }, { "epoch": 0.26388775593441555, "grad_norm": 5.013408184051514, "learning_rate": 1.319030916061669e-05, "loss": 0.6583, "num_input_tokens_seen": 3106528, "step": 3235 }, { "epoch": 0.2642956195448242, "grad_norm": 34.27593994140625, "learning_rate": 1.3210702341137123e-05, "loss": 1.2215, "num_input_tokens_seen": 3111552, "step": 3240 }, { "epoch": 0.2647034831552329, "grad_norm": 9.378642082214355, "learning_rate": 1.3231095521657557e-05, "loss": 0.3744, "num_input_tokens_seen": 3116752, "step": 3245 }, { "epoch": 0.2651113467656416, "grad_norm": 5.126638889312744, "learning_rate": 1.3251488702177991e-05, "loss": 0.3584, "num_input_tokens_seen": 3121728, "step": 3250 }, { "epoch": 0.26551921037605025, "grad_norm": 4.489051342010498, "learning_rate": 1.3271881882698425e-05, "loss": 0.288, "num_input_tokens_seen": 3127440, "step": 3255 }, { "epoch": 0.26592707398645893, "grad_norm": 5.450727939605713, "learning_rate": 1.329227506321886e-05, "loss": 0.3206, "num_input_tokens_seen": 3132016, "step": 3260 }, { "epoch": 0.2663349375968676, "grad_norm": 1.3334404230117798, "learning_rate": 1.3312668243739293e-05, "loss": 0.3647, "num_input_tokens_seen": 3136448, "step": 3265 }, { "epoch": 0.2667428012072763, "grad_norm": 3.920229911804199, "learning_rate": 1.3333061424259728e-05, "loss": 0.2851, "num_input_tokens_seen": 3140784, "step": 3270 }, { "epoch": 0.26715066481768496, "grad_norm": 7.7530388832092285, "learning_rate": 1.3353454604780163e-05, "loss": 0.3275, "num_input_tokens_seen": 3146048, "step": 3275 }, { "epoch": 0.26755852842809363, "grad_norm": 4.951605319976807, "learning_rate": 1.3373847785300597e-05, "loss": 0.3033, "num_input_tokens_seen": 3150880, "step": 3280 }, { "epoch": 0.2679663920385023, "grad_norm": 3.4966936111450195, "learning_rate": 1.3394240965821031e-05, "loss": 0.296, "num_input_tokens_seen": 3155920, "step": 3285 }, { "epoch": 0.268374255648911, "grad_norm": 3.4228708744049072, "learning_rate": 1.3414634146341466e-05, "loss": 0.4487, "num_input_tokens_seen": 3160752, "step": 3290 }, { "epoch": 0.26878211925931966, "grad_norm": 1.9587960243225098, "learning_rate": 1.3435027326861898e-05, "loss": 0.2161, "num_input_tokens_seen": 3165792, "step": 3295 }, { "epoch": 0.26918998286972834, "grad_norm": 2.6938271522521973, "learning_rate": 1.3455420507382332e-05, "loss": 0.3416, "num_input_tokens_seen": 3171296, "step": 3300 }, { "epoch": 0.269597846480137, "grad_norm": 3.3269169330596924, "learning_rate": 1.3475813687902766e-05, "loss": 0.512, "num_input_tokens_seen": 3175744, "step": 3305 }, { "epoch": 0.27000571009054575, "grad_norm": 1.4246478080749512, "learning_rate": 1.34962068684232e-05, "loss": 0.3249, "num_input_tokens_seen": 3180960, "step": 3310 }, { "epoch": 0.2704135737009544, "grad_norm": 1.9937900304794312, "learning_rate": 1.3516600048943634e-05, "loss": 0.2079, "num_input_tokens_seen": 3186016, "step": 3315 }, { "epoch": 0.2708214373113631, "grad_norm": 3.478790760040283, "learning_rate": 1.3536993229464068e-05, "loss": 0.5336, "num_input_tokens_seen": 3190768, "step": 3320 }, { "epoch": 0.2712293009217718, "grad_norm": 2.3681418895721436, "learning_rate": 1.3557386409984502e-05, "loss": 0.3379, "num_input_tokens_seen": 3195680, "step": 3325 }, { "epoch": 0.27163716453218045, "grad_norm": 9.03923225402832, "learning_rate": 1.3577779590504935e-05, "loss": 0.4377, "num_input_tokens_seen": 3200480, "step": 3330 }, { "epoch": 0.27204502814258913, "grad_norm": 9.265582084655762, "learning_rate": 1.3598172771025369e-05, "loss": 0.4971, "num_input_tokens_seen": 3204800, "step": 3335 }, { "epoch": 0.2724528917529978, "grad_norm": 4.41132926940918, "learning_rate": 1.3618565951545803e-05, "loss": 0.2297, "num_input_tokens_seen": 3209472, "step": 3340 }, { "epoch": 0.2728607553634065, "grad_norm": 1.7481598854064941, "learning_rate": 1.3638959132066237e-05, "loss": 0.3173, "num_input_tokens_seen": 3215088, "step": 3345 }, { "epoch": 0.27326861897381516, "grad_norm": 4.962473392486572, "learning_rate": 1.3659352312586671e-05, "loss": 0.6154, "num_input_tokens_seen": 3219600, "step": 3350 }, { "epoch": 0.27367648258422383, "grad_norm": 3.8375892639160156, "learning_rate": 1.3679745493107105e-05, "loss": 0.2918, "num_input_tokens_seen": 3225056, "step": 3355 }, { "epoch": 0.2740843461946325, "grad_norm": 6.100933074951172, "learning_rate": 1.3700138673627539e-05, "loss": 0.3482, "num_input_tokens_seen": 3229504, "step": 3360 }, { "epoch": 0.2744922098050412, "grad_norm": 10.383824348449707, "learning_rate": 1.3720531854147975e-05, "loss": 0.3607, "num_input_tokens_seen": 3234592, "step": 3365 }, { "epoch": 0.27490007341544986, "grad_norm": 6.496724605560303, "learning_rate": 1.3740925034668409e-05, "loss": 0.3672, "num_input_tokens_seen": 3239664, "step": 3370 }, { "epoch": 0.27530793702585854, "grad_norm": 3.405906915664673, "learning_rate": 1.3761318215188843e-05, "loss": 0.3436, "num_input_tokens_seen": 3244352, "step": 3375 }, { "epoch": 0.2757158006362672, "grad_norm": 2.1541647911071777, "learning_rate": 1.3781711395709277e-05, "loss": 0.4008, "num_input_tokens_seen": 3248704, "step": 3380 }, { "epoch": 0.2761236642466759, "grad_norm": 5.650137901306152, "learning_rate": 1.380210457622971e-05, "loss": 0.3489, "num_input_tokens_seen": 3253584, "step": 3385 }, { "epoch": 0.27653152785708457, "grad_norm": 3.1813340187072754, "learning_rate": 1.3822497756750144e-05, "loss": 0.3393, "num_input_tokens_seen": 3258800, "step": 3390 }, { "epoch": 0.27693939146749325, "grad_norm": 2.7914204597473145, "learning_rate": 1.3842890937270578e-05, "loss": 0.3598, "num_input_tokens_seen": 3263648, "step": 3395 }, { "epoch": 0.2773472550779019, "grad_norm": 1.692335844039917, "learning_rate": 1.3863284117791012e-05, "loss": 0.3791, "num_input_tokens_seen": 3268096, "step": 3400 }, { "epoch": 0.27775511868831065, "grad_norm": 3.514458179473877, "learning_rate": 1.3883677298311446e-05, "loss": 0.2921, "num_input_tokens_seen": 3272768, "step": 3405 }, { "epoch": 0.27816298229871933, "grad_norm": 2.052971363067627, "learning_rate": 1.390407047883188e-05, "loss": 0.3204, "num_input_tokens_seen": 3277424, "step": 3410 }, { "epoch": 0.278570845909128, "grad_norm": 5.905516147613525, "learning_rate": 1.3924463659352314e-05, "loss": 0.3449, "num_input_tokens_seen": 3282320, "step": 3415 }, { "epoch": 0.2789787095195367, "grad_norm": 6.294272422790527, "learning_rate": 1.3944856839872746e-05, "loss": 0.3434, "num_input_tokens_seen": 3287472, "step": 3420 }, { "epoch": 0.27938657312994536, "grad_norm": 2.681252956390381, "learning_rate": 1.396525002039318e-05, "loss": 0.35, "num_input_tokens_seen": 3292384, "step": 3425 }, { "epoch": 0.27979443674035404, "grad_norm": 3.1565873622894287, "learning_rate": 1.3985643200913614e-05, "loss": 0.3, "num_input_tokens_seen": 3297360, "step": 3430 }, { "epoch": 0.2802023003507627, "grad_norm": 4.931003570556641, "learning_rate": 1.4006036381434048e-05, "loss": 0.438, "num_input_tokens_seen": 3301712, "step": 3435 }, { "epoch": 0.2806101639611714, "grad_norm": 6.224372386932373, "learning_rate": 1.4026429561954483e-05, "loss": 0.4483, "num_input_tokens_seen": 3306560, "step": 3440 }, { "epoch": 0.28101802757158006, "grad_norm": 3.1446714401245117, "learning_rate": 1.4046822742474917e-05, "loss": 0.34, "num_input_tokens_seen": 3311376, "step": 3445 }, { "epoch": 0.28142589118198874, "grad_norm": 1.9764513969421387, "learning_rate": 1.406721592299535e-05, "loss": 0.26, "num_input_tokens_seen": 3316512, "step": 3450 }, { "epoch": 0.2818337547923974, "grad_norm": 4.814560890197754, "learning_rate": 1.4087609103515783e-05, "loss": 0.3636, "num_input_tokens_seen": 3321984, "step": 3455 }, { "epoch": 0.2822416184028061, "grad_norm": 2.410630226135254, "learning_rate": 1.410800228403622e-05, "loss": 0.294, "num_input_tokens_seen": 3326048, "step": 3460 }, { "epoch": 0.28264948201321477, "grad_norm": 4.771309852600098, "learning_rate": 1.4128395464556655e-05, "loss": 0.3848, "num_input_tokens_seen": 3330848, "step": 3465 }, { "epoch": 0.28305734562362345, "grad_norm": 1.071973443031311, "learning_rate": 1.4148788645077089e-05, "loss": 0.329, "num_input_tokens_seen": 3335232, "step": 3470 }, { "epoch": 0.2834652092340321, "grad_norm": 1.8640371561050415, "learning_rate": 1.4169181825597521e-05, "loss": 0.2581, "num_input_tokens_seen": 3338912, "step": 3475 }, { "epoch": 0.2838730728444408, "grad_norm": 2.2988240718841553, "learning_rate": 1.4189575006117955e-05, "loss": 0.3636, "num_input_tokens_seen": 3343648, "step": 3480 }, { "epoch": 0.2842809364548495, "grad_norm": 6.14235782623291, "learning_rate": 1.4209968186638389e-05, "loss": 0.318, "num_input_tokens_seen": 3347792, "step": 3485 }, { "epoch": 0.28468880006525815, "grad_norm": 5.959381580352783, "learning_rate": 1.4230361367158823e-05, "loss": 0.7481, "num_input_tokens_seen": 3352400, "step": 3490 }, { "epoch": 0.2850966636756669, "grad_norm": 6.519048690795898, "learning_rate": 1.4250754547679257e-05, "loss": 0.6187, "num_input_tokens_seen": 3357488, "step": 3495 }, { "epoch": 0.28550452728607556, "grad_norm": 2.3893611431121826, "learning_rate": 1.4271147728199691e-05, "loss": 0.2757, "num_input_tokens_seen": 3363056, "step": 3500 }, { "epoch": 0.28591239089648424, "grad_norm": 4.146803379058838, "learning_rate": 1.4291540908720125e-05, "loss": 0.2757, "num_input_tokens_seen": 3368064, "step": 3505 }, { "epoch": 0.2863202545068929, "grad_norm": 2.2372233867645264, "learning_rate": 1.4311934089240558e-05, "loss": 0.2117, "num_input_tokens_seen": 3372944, "step": 3510 }, { "epoch": 0.2867281181173016, "grad_norm": 13.073305130004883, "learning_rate": 1.4332327269760992e-05, "loss": 0.3117, "num_input_tokens_seen": 3376912, "step": 3515 }, { "epoch": 0.28713598172771027, "grad_norm": 1.8022863864898682, "learning_rate": 1.4352720450281426e-05, "loss": 0.5666, "num_input_tokens_seen": 3381696, "step": 3520 }, { "epoch": 0.28754384533811894, "grad_norm": 3.0859503746032715, "learning_rate": 1.437311363080186e-05, "loss": 0.5111, "num_input_tokens_seen": 3386144, "step": 3525 }, { "epoch": 0.2879517089485276, "grad_norm": 2.5610246658325195, "learning_rate": 1.4393506811322294e-05, "loss": 0.1613, "num_input_tokens_seen": 3391088, "step": 3530 }, { "epoch": 0.2883595725589363, "grad_norm": 6.325758934020996, "learning_rate": 1.4413899991842728e-05, "loss": 0.5576, "num_input_tokens_seen": 3396128, "step": 3535 }, { "epoch": 0.28876743616934497, "grad_norm": 7.15186882019043, "learning_rate": 1.443429317236316e-05, "loss": 0.3206, "num_input_tokens_seen": 3400384, "step": 3540 }, { "epoch": 0.28917529977975365, "grad_norm": 5.746334075927734, "learning_rate": 1.4454686352883595e-05, "loss": 0.3502, "num_input_tokens_seen": 3405040, "step": 3545 }, { "epoch": 0.2895831633901623, "grad_norm": 1.6820390224456787, "learning_rate": 1.4475079533404029e-05, "loss": 0.3327, "num_input_tokens_seen": 3410560, "step": 3550 }, { "epoch": 0.289991027000571, "grad_norm": 2.477328300476074, "learning_rate": 1.4495472713924466e-05, "loss": 0.3641, "num_input_tokens_seen": 3415712, "step": 3555 }, { "epoch": 0.2903988906109797, "grad_norm": 1.7299703359603882, "learning_rate": 1.45158658944449e-05, "loss": 0.3423, "num_input_tokens_seen": 3420720, "step": 3560 }, { "epoch": 0.29080675422138835, "grad_norm": 1.9441012144088745, "learning_rate": 1.4536259074965333e-05, "loss": 0.389, "num_input_tokens_seen": 3424816, "step": 3565 }, { "epoch": 0.29121461783179703, "grad_norm": 1.798440933227539, "learning_rate": 1.4556652255485767e-05, "loss": 0.3217, "num_input_tokens_seen": 3429648, "step": 3570 }, { "epoch": 0.2916224814422057, "grad_norm": 1.8627663850784302, "learning_rate": 1.45770454360062e-05, "loss": 0.361, "num_input_tokens_seen": 3434096, "step": 3575 }, { "epoch": 0.2920303450526144, "grad_norm": 2.5029711723327637, "learning_rate": 1.4597438616526635e-05, "loss": 0.3253, "num_input_tokens_seen": 3438512, "step": 3580 }, { "epoch": 0.29243820866302306, "grad_norm": 3.8541994094848633, "learning_rate": 1.4617831797047069e-05, "loss": 0.3102, "num_input_tokens_seen": 3443408, "step": 3585 }, { "epoch": 0.2928460722734318, "grad_norm": 1.5736491680145264, "learning_rate": 1.4638224977567503e-05, "loss": 0.388, "num_input_tokens_seen": 3447824, "step": 3590 }, { "epoch": 0.29325393588384047, "grad_norm": 0.3120940923690796, "learning_rate": 1.4658618158087935e-05, "loss": 0.129, "num_input_tokens_seen": 3452912, "step": 3595 }, { "epoch": 0.29366179949424914, "grad_norm": 62.18372344970703, "learning_rate": 1.467901133860837e-05, "loss": 1.052, "num_input_tokens_seen": 3457696, "step": 3600 }, { "epoch": 0.2940696631046578, "grad_norm": 4.463944435119629, "learning_rate": 1.4699404519128803e-05, "loss": 0.3679, "num_input_tokens_seen": 3462544, "step": 3605 }, { "epoch": 0.2944775267150665, "grad_norm": 1.8005589246749878, "learning_rate": 1.4719797699649238e-05, "loss": 0.4198, "num_input_tokens_seen": 3468128, "step": 3610 }, { "epoch": 0.29488539032547517, "grad_norm": 3.7428743839263916, "learning_rate": 1.4740190880169672e-05, "loss": 0.3612, "num_input_tokens_seen": 3472640, "step": 3615 }, { "epoch": 0.29529325393588385, "grad_norm": 1.334486722946167, "learning_rate": 1.4760584060690106e-05, "loss": 0.2946, "num_input_tokens_seen": 3477024, "step": 3620 }, { "epoch": 0.2957011175462925, "grad_norm": 2.656231641769409, "learning_rate": 1.478097724121054e-05, "loss": 0.3446, "num_input_tokens_seen": 3482352, "step": 3625 }, { "epoch": 0.2961089811567012, "grad_norm": 1.1537377834320068, "learning_rate": 1.4801370421730972e-05, "loss": 0.2928, "num_input_tokens_seen": 3487536, "step": 3630 }, { "epoch": 0.2965168447671099, "grad_norm": 1.3481837511062622, "learning_rate": 1.4821763602251406e-05, "loss": 0.3293, "num_input_tokens_seen": 3491616, "step": 3635 }, { "epoch": 0.29692470837751855, "grad_norm": 1.0623717308044434, "learning_rate": 1.484215678277184e-05, "loss": 0.3259, "num_input_tokens_seen": 3494992, "step": 3640 }, { "epoch": 0.29733257198792723, "grad_norm": 2.1481173038482666, "learning_rate": 1.4862549963292278e-05, "loss": 0.2549, "num_input_tokens_seen": 3500416, "step": 3645 }, { "epoch": 0.2977404355983359, "grad_norm": 2.803804874420166, "learning_rate": 1.4882943143812712e-05, "loss": 0.5139, "num_input_tokens_seen": 3505424, "step": 3650 }, { "epoch": 0.2981482992087446, "grad_norm": 1.6673537492752075, "learning_rate": 1.4903336324333144e-05, "loss": 0.6965, "num_input_tokens_seen": 3510528, "step": 3655 }, { "epoch": 0.29855616281915326, "grad_norm": 3.2535619735717773, "learning_rate": 1.4923729504853578e-05, "loss": 0.2577, "num_input_tokens_seen": 3516176, "step": 3660 }, { "epoch": 0.29896402642956194, "grad_norm": 2.4648208618164062, "learning_rate": 1.4944122685374012e-05, "loss": 0.3323, "num_input_tokens_seen": 3520864, "step": 3665 }, { "epoch": 0.2993718900399706, "grad_norm": 3.52551007270813, "learning_rate": 1.4964515865894446e-05, "loss": 0.1773, "num_input_tokens_seen": 3525280, "step": 3670 }, { "epoch": 0.2997797536503793, "grad_norm": 3.182382106781006, "learning_rate": 1.498490904641488e-05, "loss": 0.5402, "num_input_tokens_seen": 3530352, "step": 3675 }, { "epoch": 0.300187617260788, "grad_norm": 4.824429512023926, "learning_rate": 1.5005302226935314e-05, "loss": 0.3061, "num_input_tokens_seen": 3534896, "step": 3680 }, { "epoch": 0.3005954808711967, "grad_norm": 8.90404987335205, "learning_rate": 1.5025695407455747e-05, "loss": 0.233, "num_input_tokens_seen": 3539376, "step": 3685 }, { "epoch": 0.3010033444816054, "grad_norm": 5.928615570068359, "learning_rate": 1.5046088587976181e-05, "loss": 0.3243, "num_input_tokens_seen": 3543296, "step": 3690 }, { "epoch": 0.30141120809201405, "grad_norm": 8.689245223999023, "learning_rate": 1.5066481768496615e-05, "loss": 0.487, "num_input_tokens_seen": 3547920, "step": 3695 }, { "epoch": 0.3018190717024227, "grad_norm": 2.2888686656951904, "learning_rate": 1.5086874949017049e-05, "loss": 0.3765, "num_input_tokens_seen": 3552656, "step": 3700 }, { "epoch": 0.3022269353128314, "grad_norm": 1.2576717138290405, "learning_rate": 1.5107268129537483e-05, "loss": 0.3307, "num_input_tokens_seen": 3556992, "step": 3705 }, { "epoch": 0.3026347989232401, "grad_norm": 2.160254955291748, "learning_rate": 1.5127661310057917e-05, "loss": 0.3512, "num_input_tokens_seen": 3561728, "step": 3710 }, { "epoch": 0.30304266253364875, "grad_norm": 10.805049896240234, "learning_rate": 1.5148054490578351e-05, "loss": 0.5095, "num_input_tokens_seen": 3566448, "step": 3715 }, { "epoch": 0.30345052614405743, "grad_norm": 1.7827237844467163, "learning_rate": 1.5168447671098784e-05, "loss": 0.3241, "num_input_tokens_seen": 3570480, "step": 3720 }, { "epoch": 0.3038583897544661, "grad_norm": 1.990911602973938, "learning_rate": 1.5188840851619218e-05, "loss": 0.3209, "num_input_tokens_seen": 3575568, "step": 3725 }, { "epoch": 0.3042662533648748, "grad_norm": 2.214735984802246, "learning_rate": 1.5209234032139652e-05, "loss": 0.3132, "num_input_tokens_seen": 3579744, "step": 3730 }, { "epoch": 0.30467411697528346, "grad_norm": 3.5116066932678223, "learning_rate": 1.5229627212660086e-05, "loss": 0.4397, "num_input_tokens_seen": 3584544, "step": 3735 }, { "epoch": 0.30508198058569214, "grad_norm": 1.9852360486984253, "learning_rate": 1.5250020393180522e-05, "loss": 0.3134, "num_input_tokens_seen": 3589520, "step": 3740 }, { "epoch": 0.3054898441961008, "grad_norm": 1.7586373090744019, "learning_rate": 1.5270413573700956e-05, "loss": 0.3709, "num_input_tokens_seen": 3594288, "step": 3745 }, { "epoch": 0.3058977078065095, "grad_norm": 3.1114418506622314, "learning_rate": 1.529080675422139e-05, "loss": 0.2951, "num_input_tokens_seen": 3599424, "step": 3750 }, { "epoch": 0.30630557141691817, "grad_norm": 4.855413436889648, "learning_rate": 1.5311199934741824e-05, "loss": 0.4194, "num_input_tokens_seen": 3604720, "step": 3755 }, { "epoch": 0.30671343502732684, "grad_norm": 3.704258441925049, "learning_rate": 1.5331593115262256e-05, "loss": 0.2985, "num_input_tokens_seen": 3609408, "step": 3760 }, { "epoch": 0.3071212986377355, "grad_norm": 3.4103565216064453, "learning_rate": 1.5351986295782692e-05, "loss": 0.4012, "num_input_tokens_seen": 3614352, "step": 3765 }, { "epoch": 0.3075291622481442, "grad_norm": 1.1856778860092163, "learning_rate": 1.5372379476303124e-05, "loss": 0.4748, "num_input_tokens_seen": 3618672, "step": 3770 }, { "epoch": 0.3079370258585529, "grad_norm": 1.0663607120513916, "learning_rate": 1.539277265682356e-05, "loss": 0.3059, "num_input_tokens_seen": 3623088, "step": 3775 }, { "epoch": 0.3083448894689616, "grad_norm": 3.0441665649414062, "learning_rate": 1.5413165837343993e-05, "loss": 0.3147, "num_input_tokens_seen": 3628304, "step": 3780 }, { "epoch": 0.3087527530793703, "grad_norm": 1.4030078649520874, "learning_rate": 1.5433559017864428e-05, "loss": 0.2577, "num_input_tokens_seen": 3633088, "step": 3785 }, { "epoch": 0.30916061668977896, "grad_norm": 2.714218854904175, "learning_rate": 1.545395219838486e-05, "loss": 0.2085, "num_input_tokens_seen": 3637184, "step": 3790 }, { "epoch": 0.30956848030018763, "grad_norm": 2.1617159843444824, "learning_rate": 1.5474345378905293e-05, "loss": 0.5154, "num_input_tokens_seen": 3642544, "step": 3795 }, { "epoch": 0.3099763439105963, "grad_norm": 8.582765579223633, "learning_rate": 1.549473855942573e-05, "loss": 0.5602, "num_input_tokens_seen": 3646944, "step": 3800 }, { "epoch": 0.310384207521005, "grad_norm": 0.8794758319854736, "learning_rate": 1.551513173994616e-05, "loss": 0.5079, "num_input_tokens_seen": 3652384, "step": 3805 }, { "epoch": 0.31079207113141366, "grad_norm": 1.5211405754089355, "learning_rate": 1.5535524920466597e-05, "loss": 0.4112, "num_input_tokens_seen": 3657792, "step": 3810 }, { "epoch": 0.31119993474182234, "grad_norm": 4.503806114196777, "learning_rate": 1.555591810098703e-05, "loss": 0.3015, "num_input_tokens_seen": 3662896, "step": 3815 }, { "epoch": 0.311607798352231, "grad_norm": 6.5548624992370605, "learning_rate": 1.5576311281507465e-05, "loss": 0.3075, "num_input_tokens_seen": 3667984, "step": 3820 }, { "epoch": 0.3120156619626397, "grad_norm": 1.902435541152954, "learning_rate": 1.5596704462027897e-05, "loss": 0.2734, "num_input_tokens_seen": 3672672, "step": 3825 }, { "epoch": 0.31242352557304837, "grad_norm": 5.874241352081299, "learning_rate": 1.5617097642548333e-05, "loss": 0.3416, "num_input_tokens_seen": 3677488, "step": 3830 }, { "epoch": 0.31283138918345704, "grad_norm": 3.7720696926116943, "learning_rate": 1.563749082306877e-05, "loss": 0.3374, "num_input_tokens_seen": 3682080, "step": 3835 }, { "epoch": 0.3132392527938657, "grad_norm": 9.807635307312012, "learning_rate": 1.56578840035892e-05, "loss": 0.354, "num_input_tokens_seen": 3687312, "step": 3840 }, { "epoch": 0.3136471164042744, "grad_norm": 4.457699775695801, "learning_rate": 1.5678277184109637e-05, "loss": 0.348, "num_input_tokens_seen": 3692176, "step": 3845 }, { "epoch": 0.31405498001468307, "grad_norm": 4.4245524406433105, "learning_rate": 1.569867036463007e-05, "loss": 0.3146, "num_input_tokens_seen": 3697840, "step": 3850 }, { "epoch": 0.31446284362509175, "grad_norm": 6.30049467086792, "learning_rate": 1.5719063545150502e-05, "loss": 0.3564, "num_input_tokens_seen": 3702992, "step": 3855 }, { "epoch": 0.3148707072355004, "grad_norm": 7.9435648918151855, "learning_rate": 1.5739456725670938e-05, "loss": 0.3548, "num_input_tokens_seen": 3708000, "step": 3860 }, { "epoch": 0.3152785708459091, "grad_norm": 1.5926673412322998, "learning_rate": 1.575984990619137e-05, "loss": 0.2342, "num_input_tokens_seen": 3713280, "step": 3865 }, { "epoch": 0.31568643445631783, "grad_norm": 1.2339348793029785, "learning_rate": 1.5780243086711806e-05, "loss": 0.2896, "num_input_tokens_seen": 3718240, "step": 3870 }, { "epoch": 0.3160942980667265, "grad_norm": 6.746217250823975, "learning_rate": 1.5800636267232238e-05, "loss": 0.3293, "num_input_tokens_seen": 3724032, "step": 3875 }, { "epoch": 0.3165021616771352, "grad_norm": 2.2449004650115967, "learning_rate": 1.582102944775267e-05, "loss": 0.2521, "num_input_tokens_seen": 3728880, "step": 3880 }, { "epoch": 0.31691002528754386, "grad_norm": 4.615090370178223, "learning_rate": 1.5841422628273106e-05, "loss": 0.3481, "num_input_tokens_seen": 3732944, "step": 3885 }, { "epoch": 0.31731788889795254, "grad_norm": 1.3450978994369507, "learning_rate": 1.586181580879354e-05, "loss": 0.3121, "num_input_tokens_seen": 3737552, "step": 3890 }, { "epoch": 0.3177257525083612, "grad_norm": 1.1998528242111206, "learning_rate": 1.5882208989313974e-05, "loss": 0.3278, "num_input_tokens_seen": 3742624, "step": 3895 }, { "epoch": 0.3181336161187699, "grad_norm": 45.52830123901367, "learning_rate": 1.5902602169834407e-05, "loss": 0.62, "num_input_tokens_seen": 3747232, "step": 3900 }, { "epoch": 0.31854147972917857, "grad_norm": 1.266596794128418, "learning_rate": 1.5922995350354843e-05, "loss": 0.2074, "num_input_tokens_seen": 3752416, "step": 3905 }, { "epoch": 0.31894934333958724, "grad_norm": 2.8471922874450684, "learning_rate": 1.5943388530875275e-05, "loss": 0.4512, "num_input_tokens_seen": 3757600, "step": 3910 }, { "epoch": 0.3193572069499959, "grad_norm": 2.730801820755005, "learning_rate": 1.5963781711395707e-05, "loss": 0.3132, "num_input_tokens_seen": 3762032, "step": 3915 }, { "epoch": 0.3197650705604046, "grad_norm": 2.244605779647827, "learning_rate": 1.5984174891916143e-05, "loss": 0.6016, "num_input_tokens_seen": 3766688, "step": 3920 }, { "epoch": 0.3201729341708133, "grad_norm": 4.6610307693481445, "learning_rate": 1.600456807243658e-05, "loss": 0.4371, "num_input_tokens_seen": 3771520, "step": 3925 }, { "epoch": 0.32058079778122195, "grad_norm": 1.4383177757263184, "learning_rate": 1.6024961252957015e-05, "loss": 0.318, "num_input_tokens_seen": 3775712, "step": 3930 }, { "epoch": 0.3209886613916306, "grad_norm": 1.8611347675323486, "learning_rate": 1.6045354433477447e-05, "loss": 0.2785, "num_input_tokens_seen": 3780624, "step": 3935 }, { "epoch": 0.3213965250020393, "grad_norm": 2.6420156955718994, "learning_rate": 1.606574761399788e-05, "loss": 0.3564, "num_input_tokens_seen": 3785296, "step": 3940 }, { "epoch": 0.321804388612448, "grad_norm": 3.305281400680542, "learning_rate": 1.6086140794518315e-05, "loss": 0.3471, "num_input_tokens_seen": 3790192, "step": 3945 }, { "epoch": 0.32221225222285665, "grad_norm": 2.080251693725586, "learning_rate": 1.6106533975038747e-05, "loss": 0.2955, "num_input_tokens_seen": 3794944, "step": 3950 }, { "epoch": 0.32262011583326533, "grad_norm": 1.259503960609436, "learning_rate": 1.6126927155559183e-05, "loss": 0.3668, "num_input_tokens_seen": 3799520, "step": 3955 }, { "epoch": 0.32302797944367406, "grad_norm": 3.5805749893188477, "learning_rate": 1.6147320336079616e-05, "loss": 0.3224, "num_input_tokens_seen": 3804528, "step": 3960 }, { "epoch": 0.32343584305408274, "grad_norm": 1.170324444770813, "learning_rate": 1.616771351660005e-05, "loss": 0.3387, "num_input_tokens_seen": 3809072, "step": 3965 }, { "epoch": 0.3238437066644914, "grad_norm": 6.106842041015625, "learning_rate": 1.6188106697120484e-05, "loss": 0.2197, "num_input_tokens_seen": 3813904, "step": 3970 }, { "epoch": 0.3242515702749001, "grad_norm": 3.998260736465454, "learning_rate": 1.6208499877640916e-05, "loss": 0.4884, "num_input_tokens_seen": 3818352, "step": 3975 }, { "epoch": 0.32465943388530877, "grad_norm": 1.8101171255111694, "learning_rate": 1.6228893058161352e-05, "loss": 0.4888, "num_input_tokens_seen": 3823616, "step": 3980 }, { "epoch": 0.32506729749571744, "grad_norm": 1.4107913970947266, "learning_rate": 1.6249286238681784e-05, "loss": 0.4293, "num_input_tokens_seen": 3828736, "step": 3985 }, { "epoch": 0.3254751611061261, "grad_norm": 0.8848510980606079, "learning_rate": 1.626967941920222e-05, "loss": 0.2327, "num_input_tokens_seen": 3833920, "step": 3990 }, { "epoch": 0.3258830247165348, "grad_norm": 0.944189190864563, "learning_rate": 1.6290072599722652e-05, "loss": 0.4826, "num_input_tokens_seen": 3839904, "step": 3995 }, { "epoch": 0.3262908883269435, "grad_norm": 1.3593316078186035, "learning_rate": 1.6310465780243088e-05, "loss": 0.287, "num_input_tokens_seen": 3845088, "step": 4000 }, { "epoch": 0.32669875193735215, "grad_norm": 3.607387065887451, "learning_rate": 1.633085896076352e-05, "loss": 0.4772, "num_input_tokens_seen": 3849760, "step": 4005 }, { "epoch": 0.3271066155477608, "grad_norm": 4.4687581062316895, "learning_rate": 1.6351252141283953e-05, "loss": 0.2888, "num_input_tokens_seen": 3854976, "step": 4010 }, { "epoch": 0.3275144791581695, "grad_norm": 13.51845932006836, "learning_rate": 1.637164532180439e-05, "loss": 0.7544, "num_input_tokens_seen": 3859696, "step": 4015 }, { "epoch": 0.3279223427685782, "grad_norm": 1.2181544303894043, "learning_rate": 1.6392038502324824e-05, "loss": 0.4524, "num_input_tokens_seen": 3864704, "step": 4020 }, { "epoch": 0.32833020637898686, "grad_norm": 9.043682098388672, "learning_rate": 1.6412431682845257e-05, "loss": 0.493, "num_input_tokens_seen": 3870016, "step": 4025 }, { "epoch": 0.32873806998939553, "grad_norm": 5.887227535247803, "learning_rate": 1.6432824863365693e-05, "loss": 0.5954, "num_input_tokens_seen": 3874800, "step": 4030 }, { "epoch": 0.3291459335998042, "grad_norm": 1.80534827709198, "learning_rate": 1.6453218043886125e-05, "loss": 0.4273, "num_input_tokens_seen": 3880672, "step": 4035 }, { "epoch": 0.3295537972102129, "grad_norm": 1.521964192390442, "learning_rate": 1.647361122440656e-05, "loss": 0.282, "num_input_tokens_seen": 3885408, "step": 4040 }, { "epoch": 0.32996166082062156, "grad_norm": 2.0896215438842773, "learning_rate": 1.6494004404926993e-05, "loss": 0.3276, "num_input_tokens_seen": 3890016, "step": 4045 }, { "epoch": 0.33036952443103024, "grad_norm": 1.2256309986114502, "learning_rate": 1.651439758544743e-05, "loss": 0.3473, "num_input_tokens_seen": 3895168, "step": 4050 }, { "epoch": 0.33077738804143897, "grad_norm": 0.8734874725341797, "learning_rate": 1.653479076596786e-05, "loss": 0.3842, "num_input_tokens_seen": 3899888, "step": 4055 }, { "epoch": 0.33118525165184765, "grad_norm": 4.047298431396484, "learning_rate": 1.6555183946488294e-05, "loss": 0.3282, "num_input_tokens_seen": 3904768, "step": 4060 }, { "epoch": 0.3315931152622563, "grad_norm": 1.4683908224105835, "learning_rate": 1.657557712700873e-05, "loss": 0.2695, "num_input_tokens_seen": 3909312, "step": 4065 }, { "epoch": 0.332000978872665, "grad_norm": 1.9804717302322388, "learning_rate": 1.6595970307529162e-05, "loss": 0.4452, "num_input_tokens_seen": 3913648, "step": 4070 }, { "epoch": 0.3324088424830737, "grad_norm": 0.9142129421234131, "learning_rate": 1.6616363488049598e-05, "loss": 0.2992, "num_input_tokens_seen": 3918656, "step": 4075 }, { "epoch": 0.33281670609348235, "grad_norm": 5.494579792022705, "learning_rate": 1.663675666857003e-05, "loss": 0.3556, "num_input_tokens_seen": 3923280, "step": 4080 }, { "epoch": 0.333224569703891, "grad_norm": 4.214181900024414, "learning_rate": 1.6657149849090466e-05, "loss": 0.2803, "num_input_tokens_seen": 3928192, "step": 4085 }, { "epoch": 0.3336324333142997, "grad_norm": 1.6547352075576782, "learning_rate": 1.6677543029610898e-05, "loss": 0.2474, "num_input_tokens_seen": 3933232, "step": 4090 }, { "epoch": 0.3340402969247084, "grad_norm": 1.848538875579834, "learning_rate": 1.669793621013133e-05, "loss": 0.2847, "num_input_tokens_seen": 3937984, "step": 4095 }, { "epoch": 0.33444816053511706, "grad_norm": 1.3644230365753174, "learning_rate": 1.6718329390651766e-05, "loss": 0.4606, "num_input_tokens_seen": 3942288, "step": 4100 }, { "epoch": 0.33485602414552573, "grad_norm": 9.778334617614746, "learning_rate": 1.67387225711722e-05, "loss": 0.5098, "num_input_tokens_seen": 3947392, "step": 4105 }, { "epoch": 0.3352638877559344, "grad_norm": 1.4967807531356812, "learning_rate": 1.6759115751692638e-05, "loss": 0.3447, "num_input_tokens_seen": 3952032, "step": 4110 }, { "epoch": 0.3356717513663431, "grad_norm": 1.0344284772872925, "learning_rate": 1.677950893221307e-05, "loss": 0.2936, "num_input_tokens_seen": 3956128, "step": 4115 }, { "epoch": 0.33607961497675176, "grad_norm": 1.2212421894073486, "learning_rate": 1.6799902112733502e-05, "loss": 0.259, "num_input_tokens_seen": 3960944, "step": 4120 }, { "epoch": 0.33648747858716044, "grad_norm": 1.8159890174865723, "learning_rate": 1.6820295293253938e-05, "loss": 0.33, "num_input_tokens_seen": 3965680, "step": 4125 }, { "epoch": 0.3368953421975691, "grad_norm": 8.182273864746094, "learning_rate": 1.684068847377437e-05, "loss": 0.3841, "num_input_tokens_seen": 3970560, "step": 4130 }, { "epoch": 0.3373032058079778, "grad_norm": 1.3031134605407715, "learning_rate": 1.6861081654294806e-05, "loss": 0.3098, "num_input_tokens_seen": 3976128, "step": 4135 }, { "epoch": 0.33771106941838647, "grad_norm": 2.855546712875366, "learning_rate": 1.688147483481524e-05, "loss": 0.3275, "num_input_tokens_seen": 3980928, "step": 4140 }, { "epoch": 0.33811893302879514, "grad_norm": 2.510204553604126, "learning_rate": 1.6901868015335675e-05, "loss": 0.3782, "num_input_tokens_seen": 3986304, "step": 4145 }, { "epoch": 0.3385267966392039, "grad_norm": 19.185810089111328, "learning_rate": 1.6922261195856107e-05, "loss": 0.2504, "num_input_tokens_seen": 3990864, "step": 4150 }, { "epoch": 0.33893466024961255, "grad_norm": 5.237133979797363, "learning_rate": 1.694265437637654e-05, "loss": 0.5064, "num_input_tokens_seen": 3994896, "step": 4155 }, { "epoch": 0.33934252386002123, "grad_norm": 20.777055740356445, "learning_rate": 1.6963047556896975e-05, "loss": 0.5021, "num_input_tokens_seen": 3999296, "step": 4160 }, { "epoch": 0.3397503874704299, "grad_norm": 3.9135544300079346, "learning_rate": 1.6983440737417407e-05, "loss": 0.4716, "num_input_tokens_seen": 4004160, "step": 4165 }, { "epoch": 0.3401582510808386, "grad_norm": 1.7991034984588623, "learning_rate": 1.7003833917937843e-05, "loss": 0.4467, "num_input_tokens_seen": 4008464, "step": 4170 }, { "epoch": 0.34056611469124726, "grad_norm": 8.75973129272461, "learning_rate": 1.7024227098458276e-05, "loss": 0.4132, "num_input_tokens_seen": 4013552, "step": 4175 }, { "epoch": 0.34097397830165593, "grad_norm": 1.521417498588562, "learning_rate": 1.7044620278978708e-05, "loss": 0.3739, "num_input_tokens_seen": 4018992, "step": 4180 }, { "epoch": 0.3413818419120646, "grad_norm": 1.35158371925354, "learning_rate": 1.7065013459499144e-05, "loss": 0.2238, "num_input_tokens_seen": 4023712, "step": 4185 }, { "epoch": 0.3417897055224733, "grad_norm": 1.0394160747528076, "learning_rate": 1.7085406640019576e-05, "loss": 0.3018, "num_input_tokens_seen": 4028512, "step": 4190 }, { "epoch": 0.34219756913288196, "grad_norm": 1.3135985136032104, "learning_rate": 1.7105799820540012e-05, "loss": 0.3518, "num_input_tokens_seen": 4033344, "step": 4195 }, { "epoch": 0.34260543274329064, "grad_norm": 8.99705696105957, "learning_rate": 1.7126193001060444e-05, "loss": 0.5014, "num_input_tokens_seen": 4037632, "step": 4200 }, { "epoch": 0.3430132963536993, "grad_norm": 8.341306686401367, "learning_rate": 1.714658618158088e-05, "loss": 0.5671, "num_input_tokens_seen": 4042976, "step": 4205 }, { "epoch": 0.343421159964108, "grad_norm": 5.628380298614502, "learning_rate": 1.7166979362101316e-05, "loss": 0.4354, "num_input_tokens_seen": 4047696, "step": 4210 }, { "epoch": 0.34382902357451667, "grad_norm": 2.039355516433716, "learning_rate": 1.7187372542621748e-05, "loss": 0.7087, "num_input_tokens_seen": 4052384, "step": 4215 }, { "epoch": 0.34423688718492534, "grad_norm": 3.2048702239990234, "learning_rate": 1.7207765723142184e-05, "loss": 0.2208, "num_input_tokens_seen": 4056688, "step": 4220 }, { "epoch": 0.344644750795334, "grad_norm": 3.14206862449646, "learning_rate": 1.7228158903662616e-05, "loss": 0.2785, "num_input_tokens_seen": 4061840, "step": 4225 }, { "epoch": 0.3450526144057427, "grad_norm": 3.724424362182617, "learning_rate": 1.7248552084183052e-05, "loss": 0.4421, "num_input_tokens_seen": 4066352, "step": 4230 }, { "epoch": 0.3454604780161514, "grad_norm": 1.5709227323532104, "learning_rate": 1.7268945264703484e-05, "loss": 0.3202, "num_input_tokens_seen": 4071104, "step": 4235 }, { "epoch": 0.3458683416265601, "grad_norm": 8.714828491210938, "learning_rate": 1.7289338445223917e-05, "loss": 0.3439, "num_input_tokens_seen": 4076448, "step": 4240 }, { "epoch": 0.3462762052369688, "grad_norm": 3.753328323364258, "learning_rate": 1.7309731625744353e-05, "loss": 0.3354, "num_input_tokens_seen": 4081360, "step": 4245 }, { "epoch": 0.34668406884737746, "grad_norm": 1.955442190170288, "learning_rate": 1.7330124806264785e-05, "loss": 0.4269, "num_input_tokens_seen": 4086208, "step": 4250 }, { "epoch": 0.34709193245778613, "grad_norm": 5.247828960418701, "learning_rate": 1.735051798678522e-05, "loss": 0.2633, "num_input_tokens_seen": 4090912, "step": 4255 }, { "epoch": 0.3474997960681948, "grad_norm": 1.764268159866333, "learning_rate": 1.7370911167305653e-05, "loss": 0.3314, "num_input_tokens_seen": 4095472, "step": 4260 }, { "epoch": 0.3479076596786035, "grad_norm": 2.5634143352508545, "learning_rate": 1.739130434782609e-05, "loss": 0.4073, "num_input_tokens_seen": 4099328, "step": 4265 }, { "epoch": 0.34831552328901216, "grad_norm": 6.0657782554626465, "learning_rate": 1.741169752834652e-05, "loss": 0.5217, "num_input_tokens_seen": 4104672, "step": 4270 }, { "epoch": 0.34872338689942084, "grad_norm": 1.2647449970245361, "learning_rate": 1.7432090708866954e-05, "loss": 0.3864, "num_input_tokens_seen": 4109040, "step": 4275 }, { "epoch": 0.3491312505098295, "grad_norm": 1.193560004234314, "learning_rate": 1.745248388938739e-05, "loss": 0.3026, "num_input_tokens_seen": 4113296, "step": 4280 }, { "epoch": 0.3495391141202382, "grad_norm": 0.7527774572372437, "learning_rate": 1.7472877069907822e-05, "loss": 0.3917, "num_input_tokens_seen": 4118176, "step": 4285 }, { "epoch": 0.34994697773064687, "grad_norm": 2.1528594493865967, "learning_rate": 1.7493270250428257e-05, "loss": 0.3393, "num_input_tokens_seen": 4121888, "step": 4290 }, { "epoch": 0.35035484134105555, "grad_norm": 1.6387978792190552, "learning_rate": 1.751366343094869e-05, "loss": 0.2941, "num_input_tokens_seen": 4126944, "step": 4295 }, { "epoch": 0.3507627049514642, "grad_norm": 1.1277254819869995, "learning_rate": 1.7534056611469126e-05, "loss": 0.2878, "num_input_tokens_seen": 4131424, "step": 4300 }, { "epoch": 0.3511705685618729, "grad_norm": 0.7978527545928955, "learning_rate": 1.755444979198956e-05, "loss": 0.2469, "num_input_tokens_seen": 4136464, "step": 4305 }, { "epoch": 0.3515784321722816, "grad_norm": 2.8242077827453613, "learning_rate": 1.7574842972509994e-05, "loss": 0.4717, "num_input_tokens_seen": 4141536, "step": 4310 }, { "epoch": 0.35198629578269025, "grad_norm": 1.906509518623352, "learning_rate": 1.759523615303043e-05, "loss": 0.4813, "num_input_tokens_seen": 4145728, "step": 4315 }, { "epoch": 0.3523941593930989, "grad_norm": 3.2243950366973877, "learning_rate": 1.7615629333550862e-05, "loss": 0.6146, "num_input_tokens_seen": 4150400, "step": 4320 }, { "epoch": 0.3528020230035076, "grad_norm": 1.457192301750183, "learning_rate": 1.7636022514071294e-05, "loss": 0.3318, "num_input_tokens_seen": 4154560, "step": 4325 }, { "epoch": 0.3532098866139163, "grad_norm": 3.836621046066284, "learning_rate": 1.765641569459173e-05, "loss": 0.3326, "num_input_tokens_seen": 4159072, "step": 4330 }, { "epoch": 0.353617750224325, "grad_norm": 3.4650394916534424, "learning_rate": 1.7676808875112162e-05, "loss": 0.4083, "num_input_tokens_seen": 4163824, "step": 4335 }, { "epoch": 0.3540256138347337, "grad_norm": 4.130291938781738, "learning_rate": 1.7697202055632598e-05, "loss": 0.3329, "num_input_tokens_seen": 4169536, "step": 4340 }, { "epoch": 0.35443347744514236, "grad_norm": 6.982908248901367, "learning_rate": 1.771759523615303e-05, "loss": 0.3556, "num_input_tokens_seen": 4174368, "step": 4345 }, { "epoch": 0.35484134105555104, "grad_norm": 6.666118144989014, "learning_rate": 1.7737988416673466e-05, "loss": 0.3916, "num_input_tokens_seen": 4178608, "step": 4350 }, { "epoch": 0.3552492046659597, "grad_norm": 9.05910587310791, "learning_rate": 1.77583815971939e-05, "loss": 0.3348, "num_input_tokens_seen": 4182432, "step": 4355 }, { "epoch": 0.3556570682763684, "grad_norm": 4.857473850250244, "learning_rate": 1.777877477771433e-05, "loss": 0.3323, "num_input_tokens_seen": 4187456, "step": 4360 }, { "epoch": 0.35606493188677707, "grad_norm": 7.892941474914551, "learning_rate": 1.7799167958234767e-05, "loss": 0.34, "num_input_tokens_seen": 4191648, "step": 4365 }, { "epoch": 0.35647279549718575, "grad_norm": 6.54142427444458, "learning_rate": 1.78195611387552e-05, "loss": 0.3696, "num_input_tokens_seen": 4196288, "step": 4370 }, { "epoch": 0.3568806591075944, "grad_norm": 3.920725107192993, "learning_rate": 1.7839954319275635e-05, "loss": 0.3216, "num_input_tokens_seen": 4200384, "step": 4375 }, { "epoch": 0.3572885227180031, "grad_norm": 3.6445908546447754, "learning_rate": 1.7860347499796067e-05, "loss": 0.3753, "num_input_tokens_seen": 4204976, "step": 4380 }, { "epoch": 0.3576963863284118, "grad_norm": 4.053186893463135, "learning_rate": 1.7880740680316503e-05, "loss": 0.2767, "num_input_tokens_seen": 4209872, "step": 4385 }, { "epoch": 0.35810424993882045, "grad_norm": 3.3975865840911865, "learning_rate": 1.790113386083694e-05, "loss": 0.3096, "num_input_tokens_seen": 4214640, "step": 4390 }, { "epoch": 0.35851211354922913, "grad_norm": 3.517869234085083, "learning_rate": 1.792152704135737e-05, "loss": 0.3557, "num_input_tokens_seen": 4219648, "step": 4395 }, { "epoch": 0.3589199771596378, "grad_norm": 2.4094913005828857, "learning_rate": 1.7941920221877807e-05, "loss": 0.2453, "num_input_tokens_seen": 4223952, "step": 4400 }, { "epoch": 0.3593278407700465, "grad_norm": 3.634136438369751, "learning_rate": 1.796231340239824e-05, "loss": 0.2035, "num_input_tokens_seen": 4228576, "step": 4405 }, { "epoch": 0.35973570438045516, "grad_norm": 7.5028486251831055, "learning_rate": 1.7982706582918675e-05, "loss": 0.6056, "num_input_tokens_seen": 4232832, "step": 4410 }, { "epoch": 0.36014356799086383, "grad_norm": 5.060705661773682, "learning_rate": 1.8003099763439108e-05, "loss": 0.3661, "num_input_tokens_seen": 4238208, "step": 4415 }, { "epoch": 0.3605514316012725, "grad_norm": 12.52291202545166, "learning_rate": 1.802349294395954e-05, "loss": 0.4112, "num_input_tokens_seen": 4243184, "step": 4420 }, { "epoch": 0.36095929521168124, "grad_norm": 34.911766052246094, "learning_rate": 1.8043886124479976e-05, "loss": 0.5355, "num_input_tokens_seen": 4247680, "step": 4425 }, { "epoch": 0.3613671588220899, "grad_norm": 9.057777404785156, "learning_rate": 1.8064279305000408e-05, "loss": 0.4164, "num_input_tokens_seen": 4252672, "step": 4430 }, { "epoch": 0.3617750224324986, "grad_norm": 6.248121738433838, "learning_rate": 1.8084672485520844e-05, "loss": 0.3264, "num_input_tokens_seen": 4257408, "step": 4435 }, { "epoch": 0.36218288604290727, "grad_norm": 6.895608901977539, "learning_rate": 1.8105065666041276e-05, "loss": 0.2642, "num_input_tokens_seen": 4262848, "step": 4440 }, { "epoch": 0.36259074965331595, "grad_norm": 11.694469451904297, "learning_rate": 1.8125458846561712e-05, "loss": 0.4945, "num_input_tokens_seen": 4267424, "step": 4445 }, { "epoch": 0.3629986132637246, "grad_norm": 5.217210292816162, "learning_rate": 1.8145852027082144e-05, "loss": 0.3768, "num_input_tokens_seen": 4272528, "step": 4450 }, { "epoch": 0.3634064768741333, "grad_norm": 9.573820114135742, "learning_rate": 1.8166245207602577e-05, "loss": 0.4627, "num_input_tokens_seen": 4277280, "step": 4455 }, { "epoch": 0.363814340484542, "grad_norm": 5.058030128479004, "learning_rate": 1.8186638388123012e-05, "loss": 0.2835, "num_input_tokens_seen": 4282384, "step": 4460 }, { "epoch": 0.36422220409495065, "grad_norm": 7.26202392578125, "learning_rate": 1.8207031568643445e-05, "loss": 0.3675, "num_input_tokens_seen": 4287008, "step": 4465 }, { "epoch": 0.36463006770535933, "grad_norm": 6.945033550262451, "learning_rate": 1.822742474916388e-05, "loss": 0.2437, "num_input_tokens_seen": 4292560, "step": 4470 }, { "epoch": 0.365037931315768, "grad_norm": 9.391419410705566, "learning_rate": 1.8247817929684313e-05, "loss": 0.4326, "num_input_tokens_seen": 4297120, "step": 4475 }, { "epoch": 0.3654457949261767, "grad_norm": 2.4047915935516357, "learning_rate": 1.826821111020475e-05, "loss": 0.5008, "num_input_tokens_seen": 4301856, "step": 4480 }, { "epoch": 0.36585365853658536, "grad_norm": 4.241995811462402, "learning_rate": 1.8288604290725185e-05, "loss": 0.3024, "num_input_tokens_seen": 4306752, "step": 4485 }, { "epoch": 0.36626152214699403, "grad_norm": 1.8839695453643799, "learning_rate": 1.8308997471245617e-05, "loss": 0.3035, "num_input_tokens_seen": 4312128, "step": 4490 }, { "epoch": 0.3666693857574027, "grad_norm": 2.1159508228302, "learning_rate": 1.8329390651766053e-05, "loss": 0.3123, "num_input_tokens_seen": 4316464, "step": 4495 }, { "epoch": 0.3670772493678114, "grad_norm": 1.3885924816131592, "learning_rate": 1.8349783832286485e-05, "loss": 0.3689, "num_input_tokens_seen": 4321360, "step": 4500 }, { "epoch": 0.36748511297822006, "grad_norm": 2.088679552078247, "learning_rate": 1.8370177012806917e-05, "loss": 0.3796, "num_input_tokens_seen": 4325856, "step": 4505 }, { "epoch": 0.36789297658862874, "grad_norm": 1.6044180393218994, "learning_rate": 1.8390570193327353e-05, "loss": 0.2975, "num_input_tokens_seen": 4331456, "step": 4510 }, { "epoch": 0.3683008401990374, "grad_norm": 4.166197299957275, "learning_rate": 1.8410963373847786e-05, "loss": 0.362, "num_input_tokens_seen": 4334672, "step": 4515 }, { "epoch": 0.36870870380944615, "grad_norm": 5.473355770111084, "learning_rate": 1.843135655436822e-05, "loss": 0.261, "num_input_tokens_seen": 4338496, "step": 4520 }, { "epoch": 0.3691165674198548, "grad_norm": 9.667319297790527, "learning_rate": 1.8451749734888654e-05, "loss": 0.4726, "num_input_tokens_seen": 4343584, "step": 4525 }, { "epoch": 0.3695244310302635, "grad_norm": 3.06796932220459, "learning_rate": 1.847214291540909e-05, "loss": 0.3761, "num_input_tokens_seen": 4347920, "step": 4530 }, { "epoch": 0.3699322946406722, "grad_norm": 30.986122131347656, "learning_rate": 1.8492536095929522e-05, "loss": 0.2687, "num_input_tokens_seen": 4352368, "step": 4535 }, { "epoch": 0.37034015825108085, "grad_norm": 7.751478672027588, "learning_rate": 1.8512929276449954e-05, "loss": 0.316, "num_input_tokens_seen": 4356624, "step": 4540 }, { "epoch": 0.37074802186148953, "grad_norm": 21.542926788330078, "learning_rate": 1.853332245697039e-05, "loss": 0.5065, "num_input_tokens_seen": 4361584, "step": 4545 }, { "epoch": 0.3711558854718982, "grad_norm": 3.0395569801330566, "learning_rate": 1.8553715637490822e-05, "loss": 0.2867, "num_input_tokens_seen": 4366752, "step": 4550 }, { "epoch": 0.3715637490823069, "grad_norm": 3.7101871967315674, "learning_rate": 1.8574108818011258e-05, "loss": 0.405, "num_input_tokens_seen": 4371056, "step": 4555 }, { "epoch": 0.37197161269271556, "grad_norm": 3.986194372177124, "learning_rate": 1.859450199853169e-05, "loss": 0.3088, "num_input_tokens_seen": 4376304, "step": 4560 }, { "epoch": 0.37237947630312423, "grad_norm": 1.8714752197265625, "learning_rate": 1.8614895179052126e-05, "loss": 0.3192, "num_input_tokens_seen": 4380960, "step": 4565 }, { "epoch": 0.3727873399135329, "grad_norm": 4.984430313110352, "learning_rate": 1.863528835957256e-05, "loss": 0.204, "num_input_tokens_seen": 4386688, "step": 4570 }, { "epoch": 0.3731952035239416, "grad_norm": 9.278380393981934, "learning_rate": 1.8655681540092994e-05, "loss": 0.3614, "num_input_tokens_seen": 4390352, "step": 4575 }, { "epoch": 0.37360306713435026, "grad_norm": 3.5687851905822754, "learning_rate": 1.867607472061343e-05, "loss": 0.6226, "num_input_tokens_seen": 4395424, "step": 4580 }, { "epoch": 0.37401093074475894, "grad_norm": 2.685257911682129, "learning_rate": 1.8696467901133863e-05, "loss": 0.1846, "num_input_tokens_seen": 4399696, "step": 4585 }, { "epoch": 0.3744187943551676, "grad_norm": 7.78679084777832, "learning_rate": 1.8716861081654298e-05, "loss": 0.2311, "num_input_tokens_seen": 4403984, "step": 4590 }, { "epoch": 0.3748266579655763, "grad_norm": 1.4040943384170532, "learning_rate": 1.873725426217473e-05, "loss": 0.399, "num_input_tokens_seen": 4408528, "step": 4595 }, { "epoch": 0.37523452157598497, "grad_norm": 1.4674205780029297, "learning_rate": 1.8757647442695163e-05, "loss": 0.3643, "num_input_tokens_seen": 4413280, "step": 4600 }, { "epoch": 0.37564238518639365, "grad_norm": 11.304502487182617, "learning_rate": 1.87780406232156e-05, "loss": 0.651, "num_input_tokens_seen": 4417248, "step": 4605 }, { "epoch": 0.3760502487968023, "grad_norm": 1.3142902851104736, "learning_rate": 1.879843380373603e-05, "loss": 0.3334, "num_input_tokens_seen": 4422512, "step": 4610 }, { "epoch": 0.37645811240721105, "grad_norm": 15.356976509094238, "learning_rate": 1.8818826984256467e-05, "loss": 0.4725, "num_input_tokens_seen": 4428000, "step": 4615 }, { "epoch": 0.37686597601761973, "grad_norm": 15.065921783447266, "learning_rate": 1.88392201647769e-05, "loss": 0.6483, "num_input_tokens_seen": 4432400, "step": 4620 }, { "epoch": 0.3772738396280284, "grad_norm": 22.02928352355957, "learning_rate": 1.8859613345297332e-05, "loss": 0.7067, "num_input_tokens_seen": 4437248, "step": 4625 }, { "epoch": 0.3776817032384371, "grad_norm": 9.818131446838379, "learning_rate": 1.8880006525817767e-05, "loss": 0.52, "num_input_tokens_seen": 4442160, "step": 4630 }, { "epoch": 0.37808956684884576, "grad_norm": 6.624122142791748, "learning_rate": 1.89003997063382e-05, "loss": 0.7541, "num_input_tokens_seen": 4446816, "step": 4635 }, { "epoch": 0.37849743045925444, "grad_norm": 5.258331775665283, "learning_rate": 1.8920792886858636e-05, "loss": 0.2788, "num_input_tokens_seen": 4451712, "step": 4640 }, { "epoch": 0.3789052940696631, "grad_norm": 7.928080081939697, "learning_rate": 1.8941186067379068e-05, "loss": 0.4433, "num_input_tokens_seen": 4456272, "step": 4645 }, { "epoch": 0.3793131576800718, "grad_norm": 4.70053243637085, "learning_rate": 1.8961579247899504e-05, "loss": 0.2998, "num_input_tokens_seen": 4460832, "step": 4650 }, { "epoch": 0.37972102129048046, "grad_norm": 4.890965938568115, "learning_rate": 1.8981972428419936e-05, "loss": 0.3035, "num_input_tokens_seen": 4465536, "step": 4655 }, { "epoch": 0.38012888490088914, "grad_norm": 9.320484161376953, "learning_rate": 1.900236560894037e-05, "loss": 0.1838, "num_input_tokens_seen": 4470512, "step": 4660 }, { "epoch": 0.3805367485112978, "grad_norm": 5.056351184844971, "learning_rate": 1.9022758789460804e-05, "loss": 0.2763, "num_input_tokens_seen": 4475280, "step": 4665 }, { "epoch": 0.3809446121217065, "grad_norm": 13.858388900756836, "learning_rate": 1.904315196998124e-05, "loss": 0.4361, "num_input_tokens_seen": 4479952, "step": 4670 }, { "epoch": 0.38135247573211517, "grad_norm": 1.7239230871200562, "learning_rate": 1.9063545150501676e-05, "loss": 0.1372, "num_input_tokens_seen": 4486512, "step": 4675 }, { "epoch": 0.38176033934252385, "grad_norm": 1.2533833980560303, "learning_rate": 1.9083938331022108e-05, "loss": 0.456, "num_input_tokens_seen": 4491200, "step": 4680 }, { "epoch": 0.3821682029529325, "grad_norm": 9.835113525390625, "learning_rate": 1.910433151154254e-05, "loss": 0.6247, "num_input_tokens_seen": 4495200, "step": 4685 }, { "epoch": 0.3825760665633412, "grad_norm": 10.299059867858887, "learning_rate": 1.9124724692062976e-05, "loss": 0.6022, "num_input_tokens_seen": 4500864, "step": 4690 }, { "epoch": 0.3829839301737499, "grad_norm": 0.6767905354499817, "learning_rate": 1.914511787258341e-05, "loss": 0.2988, "num_input_tokens_seen": 4505696, "step": 4695 }, { "epoch": 0.38339179378415855, "grad_norm": 1.591931939125061, "learning_rate": 1.9165511053103844e-05, "loss": 0.4445, "num_input_tokens_seen": 4510224, "step": 4700 }, { "epoch": 0.3837996573945673, "grad_norm": 2.303255796432495, "learning_rate": 1.9185904233624277e-05, "loss": 0.2762, "num_input_tokens_seen": 4515120, "step": 4705 }, { "epoch": 0.38420752100497596, "grad_norm": 5.890844821929932, "learning_rate": 1.9206297414144713e-05, "loss": 0.4583, "num_input_tokens_seen": 4519248, "step": 4710 }, { "epoch": 0.38461538461538464, "grad_norm": 2.0480117797851562, "learning_rate": 1.9226690594665145e-05, "loss": 0.1663, "num_input_tokens_seen": 4523424, "step": 4715 }, { "epoch": 0.3850232482257933, "grad_norm": 12.05473518371582, "learning_rate": 1.9247083775185577e-05, "loss": 0.5156, "num_input_tokens_seen": 4528064, "step": 4720 }, { "epoch": 0.385431111836202, "grad_norm": 1.4582182168960571, "learning_rate": 1.9267476955706013e-05, "loss": 0.4194, "num_input_tokens_seen": 4532608, "step": 4725 }, { "epoch": 0.38583897544661067, "grad_norm": 6.111687183380127, "learning_rate": 1.9287870136226445e-05, "loss": 0.3635, "num_input_tokens_seen": 4536608, "step": 4730 }, { "epoch": 0.38624683905701934, "grad_norm": 4.427358150482178, "learning_rate": 1.930826331674688e-05, "loss": 0.4296, "num_input_tokens_seen": 4540880, "step": 4735 }, { "epoch": 0.386654702667428, "grad_norm": 2.8062539100646973, "learning_rate": 1.9328656497267314e-05, "loss": 0.2366, "num_input_tokens_seen": 4545984, "step": 4740 }, { "epoch": 0.3870625662778367, "grad_norm": 5.331012725830078, "learning_rate": 1.934904967778775e-05, "loss": 0.438, "num_input_tokens_seen": 4550240, "step": 4745 }, { "epoch": 0.38747042988824537, "grad_norm": 6.848670482635498, "learning_rate": 1.9369442858308182e-05, "loss": 0.098, "num_input_tokens_seen": 4555568, "step": 4750 }, { "epoch": 0.38787829349865405, "grad_norm": 0.7259514927864075, "learning_rate": 1.9389836038828614e-05, "loss": 0.1827, "num_input_tokens_seen": 4560624, "step": 4755 }, { "epoch": 0.3882861571090627, "grad_norm": 9.040507316589355, "learning_rate": 1.941022921934905e-05, "loss": 0.3481, "num_input_tokens_seen": 4565664, "step": 4760 }, { "epoch": 0.3886940207194714, "grad_norm": 12.807860374450684, "learning_rate": 1.9430622399869486e-05, "loss": 0.726, "num_input_tokens_seen": 4570944, "step": 4765 }, { "epoch": 0.3891018843298801, "grad_norm": 3.980137348175049, "learning_rate": 1.9451015580389918e-05, "loss": 0.3592, "num_input_tokens_seen": 4575360, "step": 4770 }, { "epoch": 0.38950974794028875, "grad_norm": 11.378644943237305, "learning_rate": 1.9471408760910354e-05, "loss": 0.3687, "num_input_tokens_seen": 4579088, "step": 4775 }, { "epoch": 0.38991761155069743, "grad_norm": 4.36524772644043, "learning_rate": 1.9491801941430786e-05, "loss": 0.2365, "num_input_tokens_seen": 4584896, "step": 4780 }, { "epoch": 0.3903254751611061, "grad_norm": 10.944759368896484, "learning_rate": 1.9512195121951222e-05, "loss": 0.4851, "num_input_tokens_seen": 4589984, "step": 4785 }, { "epoch": 0.3907333387715148, "grad_norm": 0.7543559670448303, "learning_rate": 1.9532588302471654e-05, "loss": 0.1494, "num_input_tokens_seen": 4594480, "step": 4790 }, { "epoch": 0.39114120238192346, "grad_norm": 12.33006477355957, "learning_rate": 1.955298148299209e-05, "loss": 0.3791, "num_input_tokens_seen": 4599232, "step": 4795 }, { "epoch": 0.3915490659923322, "grad_norm": 7.18052339553833, "learning_rate": 1.9573374663512522e-05, "loss": 0.2113, "num_input_tokens_seen": 4603872, "step": 4800 }, { "epoch": 0.39195692960274087, "grad_norm": 9.60571002960205, "learning_rate": 1.9593767844032955e-05, "loss": 0.6128, "num_input_tokens_seen": 4608560, "step": 4805 }, { "epoch": 0.39236479321314954, "grad_norm": 48.330997467041016, "learning_rate": 1.961416102455339e-05, "loss": 0.2949, "num_input_tokens_seen": 4613392, "step": 4810 }, { "epoch": 0.3927726568235582, "grad_norm": 5.591818809509277, "learning_rate": 1.9634554205073823e-05, "loss": 0.1876, "num_input_tokens_seen": 4618048, "step": 4815 }, { "epoch": 0.3931805204339669, "grad_norm": 3.0804193019866943, "learning_rate": 1.965494738559426e-05, "loss": 0.3436, "num_input_tokens_seen": 4622384, "step": 4820 }, { "epoch": 0.39358838404437557, "grad_norm": 7.749312400817871, "learning_rate": 1.967534056611469e-05, "loss": 0.4858, "num_input_tokens_seen": 4627792, "step": 4825 }, { "epoch": 0.39399624765478425, "grad_norm": 5.967507839202881, "learning_rate": 1.9695733746635127e-05, "loss": 0.3528, "num_input_tokens_seen": 4632464, "step": 4830 }, { "epoch": 0.3944041112651929, "grad_norm": 1.7257062196731567, "learning_rate": 1.971612692715556e-05, "loss": 0.5796, "num_input_tokens_seen": 4637344, "step": 4835 }, { "epoch": 0.3948119748756016, "grad_norm": 10.949023246765137, "learning_rate": 1.973652010767599e-05, "loss": 0.2791, "num_input_tokens_seen": 4642944, "step": 4840 }, { "epoch": 0.3952198384860103, "grad_norm": 2.393204927444458, "learning_rate": 1.9756913288196427e-05, "loss": 0.7411, "num_input_tokens_seen": 4647712, "step": 4845 }, { "epoch": 0.39562770209641895, "grad_norm": 8.837616920471191, "learning_rate": 1.977730646871686e-05, "loss": 0.3601, "num_input_tokens_seen": 4652256, "step": 4850 }, { "epoch": 0.39603556570682763, "grad_norm": 6.238463878631592, "learning_rate": 1.97976996492373e-05, "loss": 0.4231, "num_input_tokens_seen": 4657488, "step": 4855 }, { "epoch": 0.3964434293172363, "grad_norm": 6.295674800872803, "learning_rate": 1.981809282975773e-05, "loss": 0.4016, "num_input_tokens_seen": 4662192, "step": 4860 }, { "epoch": 0.396851292927645, "grad_norm": 7.2431793212890625, "learning_rate": 1.9838486010278164e-05, "loss": 0.2258, "num_input_tokens_seen": 4667216, "step": 4865 }, { "epoch": 0.39725915653805366, "grad_norm": 19.696693420410156, "learning_rate": 1.98588791907986e-05, "loss": 0.3254, "num_input_tokens_seen": 4671184, "step": 4870 }, { "epoch": 0.39766702014846234, "grad_norm": 11.866687774658203, "learning_rate": 1.9879272371319032e-05, "loss": 0.8824, "num_input_tokens_seen": 4676128, "step": 4875 }, { "epoch": 0.398074883758871, "grad_norm": 2.1691315174102783, "learning_rate": 1.9899665551839468e-05, "loss": 0.2472, "num_input_tokens_seen": 4681152, "step": 4880 }, { "epoch": 0.3984827473692797, "grad_norm": 11.28787899017334, "learning_rate": 1.99200587323599e-05, "loss": 0.4138, "num_input_tokens_seen": 4686208, "step": 4885 }, { "epoch": 0.39889061097968836, "grad_norm": 7.070778846740723, "learning_rate": 1.9940451912880336e-05, "loss": 0.1987, "num_input_tokens_seen": 4691360, "step": 4890 }, { "epoch": 0.3992984745900971, "grad_norm": 3.101466655731201, "learning_rate": 1.9960845093400768e-05, "loss": 0.0943, "num_input_tokens_seen": 4695904, "step": 4895 }, { "epoch": 0.3997063382005058, "grad_norm": 8.626077651977539, "learning_rate": 1.99812382739212e-05, "loss": 0.5352, "num_input_tokens_seen": 4701296, "step": 4900 }, { "epoch": 0.40011420181091445, "grad_norm": 11.416695594787598, "learning_rate": 2.0001631454441636e-05, "loss": 0.4861, "num_input_tokens_seen": 4706480, "step": 4905 }, { "epoch": 0.4005220654213231, "grad_norm": 12.49030590057373, "learning_rate": 2.002202463496207e-05, "loss": 0.6202, "num_input_tokens_seen": 4711216, "step": 4910 }, { "epoch": 0.4009299290317318, "grad_norm": 5.518862724304199, "learning_rate": 2.0042417815482504e-05, "loss": 0.4669, "num_input_tokens_seen": 4716560, "step": 4915 }, { "epoch": 0.4013377926421405, "grad_norm": 11.137595176696777, "learning_rate": 2.0062810996002937e-05, "loss": 0.5481, "num_input_tokens_seen": 4721488, "step": 4920 }, { "epoch": 0.40174565625254915, "grad_norm": 5.496927738189697, "learning_rate": 2.0083204176523373e-05, "loss": 0.2571, "num_input_tokens_seen": 4726000, "step": 4925 }, { "epoch": 0.40215351986295783, "grad_norm": 2.6845598220825195, "learning_rate": 2.0103597357043805e-05, "loss": 0.2392, "num_input_tokens_seen": 4732032, "step": 4930 }, { "epoch": 0.4025613834733665, "grad_norm": 4.630258083343506, "learning_rate": 2.0123990537564237e-05, "loss": 0.2249, "num_input_tokens_seen": 4737248, "step": 4935 }, { "epoch": 0.4029692470837752, "grad_norm": 1.5953636169433594, "learning_rate": 2.0144383718084673e-05, "loss": 0.2618, "num_input_tokens_seen": 4741408, "step": 4940 }, { "epoch": 0.40337711069418386, "grad_norm": 5.937971591949463, "learning_rate": 2.0164776898605105e-05, "loss": 0.4715, "num_input_tokens_seen": 4746160, "step": 4945 }, { "epoch": 0.40378497430459254, "grad_norm": 11.452228546142578, "learning_rate": 2.018517007912554e-05, "loss": 0.1413, "num_input_tokens_seen": 4752112, "step": 4950 }, { "epoch": 0.4041928379150012, "grad_norm": 12.411952018737793, "learning_rate": 2.0205563259645977e-05, "loss": 0.2251, "num_input_tokens_seen": 4757008, "step": 4955 }, { "epoch": 0.4046007015254099, "grad_norm": 14.518814086914062, "learning_rate": 2.022595644016641e-05, "loss": 0.2792, "num_input_tokens_seen": 4762144, "step": 4960 }, { "epoch": 0.40500856513581857, "grad_norm": 10.027960777282715, "learning_rate": 2.0246349620686845e-05, "loss": 0.2275, "num_input_tokens_seen": 4766128, "step": 4965 }, { "epoch": 0.40541642874622724, "grad_norm": 47.25763702392578, "learning_rate": 2.0266742801207277e-05, "loss": 0.3657, "num_input_tokens_seen": 4770176, "step": 4970 }, { "epoch": 0.4058242923566359, "grad_norm": 10.238906860351562, "learning_rate": 2.0287135981727713e-05, "loss": 0.7035, "num_input_tokens_seen": 4773808, "step": 4975 }, { "epoch": 0.4062321559670446, "grad_norm": 27.878406524658203, "learning_rate": 2.0307529162248146e-05, "loss": 0.3949, "num_input_tokens_seen": 4778528, "step": 4980 }, { "epoch": 0.4066400195774533, "grad_norm": 16.37567710876465, "learning_rate": 2.0327922342768578e-05, "loss": 0.6587, "num_input_tokens_seen": 4783216, "step": 4985 }, { "epoch": 0.407047883187862, "grad_norm": 21.77558135986328, "learning_rate": 2.0348315523289014e-05, "loss": 0.3669, "num_input_tokens_seen": 4788224, "step": 4990 }, { "epoch": 0.4074557467982707, "grad_norm": 4.97957181930542, "learning_rate": 2.0368708703809446e-05, "loss": 0.475, "num_input_tokens_seen": 4792096, "step": 4995 }, { "epoch": 0.40786361040867936, "grad_norm": 8.074036598205566, "learning_rate": 2.0389101884329882e-05, "loss": 0.3379, "num_input_tokens_seen": 4797104, "step": 5000 }, { "epoch": 0.40827147401908803, "grad_norm": 9.667261123657227, "learning_rate": 2.0409495064850314e-05, "loss": 0.3455, "num_input_tokens_seen": 4801616, "step": 5005 }, { "epoch": 0.4086793376294967, "grad_norm": 6.1500244140625, "learning_rate": 2.042988824537075e-05, "loss": 0.5817, "num_input_tokens_seen": 4806944, "step": 5010 }, { "epoch": 0.4090872012399054, "grad_norm": 9.035599708557129, "learning_rate": 2.0450281425891182e-05, "loss": 0.2003, "num_input_tokens_seen": 4811520, "step": 5015 }, { "epoch": 0.40949506485031406, "grad_norm": 7.161929607391357, "learning_rate": 2.0470674606411615e-05, "loss": 0.232, "num_input_tokens_seen": 4816144, "step": 5020 }, { "epoch": 0.40990292846072274, "grad_norm": 6.398303031921387, "learning_rate": 2.049106778693205e-05, "loss": 0.5242, "num_input_tokens_seen": 4820128, "step": 5025 }, { "epoch": 0.4103107920711314, "grad_norm": 1.9154317378997803, "learning_rate": 2.0511460967452483e-05, "loss": 0.5418, "num_input_tokens_seen": 4824560, "step": 5030 }, { "epoch": 0.4107186556815401, "grad_norm": 11.317159652709961, "learning_rate": 2.053185414797292e-05, "loss": 0.3456, "num_input_tokens_seen": 4829872, "step": 5035 }, { "epoch": 0.41112651929194877, "grad_norm": 13.287921905517578, "learning_rate": 2.055224732849335e-05, "loss": 0.3831, "num_input_tokens_seen": 4834112, "step": 5040 }, { "epoch": 0.41153438290235744, "grad_norm": 6.11420202255249, "learning_rate": 2.0572640509013787e-05, "loss": 0.4654, "num_input_tokens_seen": 4839248, "step": 5045 }, { "epoch": 0.4119422465127661, "grad_norm": 1.7638276815414429, "learning_rate": 2.0593033689534223e-05, "loss": 0.2, "num_input_tokens_seen": 4843344, "step": 5050 }, { "epoch": 0.4123501101231748, "grad_norm": 5.3524298667907715, "learning_rate": 2.0613426870054655e-05, "loss": 0.3663, "num_input_tokens_seen": 4848656, "step": 5055 }, { "epoch": 0.41275797373358347, "grad_norm": 2.621096611022949, "learning_rate": 2.063382005057509e-05, "loss": 0.2428, "num_input_tokens_seen": 4853232, "step": 5060 }, { "epoch": 0.41316583734399215, "grad_norm": 4.268825531005859, "learning_rate": 2.0654213231095523e-05, "loss": 0.4708, "num_input_tokens_seen": 4856608, "step": 5065 }, { "epoch": 0.4135737009544008, "grad_norm": 9.816685676574707, "learning_rate": 2.0674606411615955e-05, "loss": 0.2998, "num_input_tokens_seen": 4860704, "step": 5070 }, { "epoch": 0.4139815645648095, "grad_norm": 3.891491413116455, "learning_rate": 2.069499959213639e-05, "loss": 0.215, "num_input_tokens_seen": 4865440, "step": 5075 }, { "epoch": 0.41438942817521823, "grad_norm": 27.42755889892578, "learning_rate": 2.0715392772656824e-05, "loss": 0.2688, "num_input_tokens_seen": 4869648, "step": 5080 }, { "epoch": 0.4147972917856269, "grad_norm": 9.779366493225098, "learning_rate": 2.073578595317726e-05, "loss": 0.6293, "num_input_tokens_seen": 4874848, "step": 5085 }, { "epoch": 0.4152051553960356, "grad_norm": 15.3138427734375, "learning_rate": 2.0756179133697692e-05, "loss": 0.5402, "num_input_tokens_seen": 4879104, "step": 5090 }, { "epoch": 0.41561301900644426, "grad_norm": 11.82590389251709, "learning_rate": 2.0776572314218127e-05, "loss": 0.2676, "num_input_tokens_seen": 4884096, "step": 5095 }, { "epoch": 0.41602088261685294, "grad_norm": 3.6857218742370605, "learning_rate": 2.079696549473856e-05, "loss": 0.1982, "num_input_tokens_seen": 4888832, "step": 5100 }, { "epoch": 0.4164287462272616, "grad_norm": 6.431576251983643, "learning_rate": 2.0817358675258992e-05, "loss": 0.4832, "num_input_tokens_seen": 4893456, "step": 5105 }, { "epoch": 0.4168366098376703, "grad_norm": 5.641967296600342, "learning_rate": 2.0837751855779428e-05, "loss": 0.3651, "num_input_tokens_seen": 4898224, "step": 5110 }, { "epoch": 0.41724447344807897, "grad_norm": 5.580441474914551, "learning_rate": 2.085814503629986e-05, "loss": 0.3623, "num_input_tokens_seen": 4903056, "step": 5115 }, { "epoch": 0.41765233705848764, "grad_norm": 0.13677985966205597, "learning_rate": 2.0878538216820296e-05, "loss": 0.3281, "num_input_tokens_seen": 4908032, "step": 5120 }, { "epoch": 0.4180602006688963, "grad_norm": 11.4225492477417, "learning_rate": 2.089893139734073e-05, "loss": 0.2725, "num_input_tokens_seen": 4913360, "step": 5125 }, { "epoch": 0.418468064279305, "grad_norm": 64.42420196533203, "learning_rate": 2.0919324577861164e-05, "loss": 0.3081, "num_input_tokens_seen": 4918656, "step": 5130 }, { "epoch": 0.4188759278897137, "grad_norm": 1.697023868560791, "learning_rate": 2.09397177583816e-05, "loss": 0.7492, "num_input_tokens_seen": 4922416, "step": 5135 }, { "epoch": 0.41928379150012235, "grad_norm": 10.79765510559082, "learning_rate": 2.0960110938902032e-05, "loss": 0.2724, "num_input_tokens_seen": 4927136, "step": 5140 }, { "epoch": 0.419691655110531, "grad_norm": 15.496758460998535, "learning_rate": 2.0980504119422468e-05, "loss": 0.4402, "num_input_tokens_seen": 4932816, "step": 5145 }, { "epoch": 0.4200995187209397, "grad_norm": 0.21315602958202362, "learning_rate": 2.10008972999429e-05, "loss": 0.0627, "num_input_tokens_seen": 4937456, "step": 5150 }, { "epoch": 0.4205073823313484, "grad_norm": 43.360660552978516, "learning_rate": 2.1021290480463336e-05, "loss": 0.5514, "num_input_tokens_seen": 4941696, "step": 5155 }, { "epoch": 0.42091524594175705, "grad_norm": 13.145626068115234, "learning_rate": 2.104168366098377e-05, "loss": 0.6295, "num_input_tokens_seen": 4946368, "step": 5160 }, { "epoch": 0.42132310955216573, "grad_norm": 0.9841834902763367, "learning_rate": 2.10620768415042e-05, "loss": 0.3125, "num_input_tokens_seen": 4951136, "step": 5165 }, { "epoch": 0.4217309731625744, "grad_norm": 14.561131477355957, "learning_rate": 2.1082470022024637e-05, "loss": 0.2284, "num_input_tokens_seen": 4955344, "step": 5170 }, { "epoch": 0.42213883677298314, "grad_norm": 29.3531436920166, "learning_rate": 2.110286320254507e-05, "loss": 0.4546, "num_input_tokens_seen": 4959248, "step": 5175 }, { "epoch": 0.4225467003833918, "grad_norm": 19.525117874145508, "learning_rate": 2.1123256383065505e-05, "loss": 0.5391, "num_input_tokens_seen": 4964000, "step": 5180 }, { "epoch": 0.4229545639938005, "grad_norm": 0.11730855703353882, "learning_rate": 2.1143649563585937e-05, "loss": 0.2315, "num_input_tokens_seen": 4969344, "step": 5185 }, { "epoch": 0.42336242760420917, "grad_norm": 20.87998390197754, "learning_rate": 2.1164042744106373e-05, "loss": 0.459, "num_input_tokens_seen": 4974160, "step": 5190 }, { "epoch": 0.42377029121461784, "grad_norm": 14.013689994812012, "learning_rate": 2.1184435924626806e-05, "loss": 0.5537, "num_input_tokens_seen": 4979088, "step": 5195 }, { "epoch": 0.4241781548250265, "grad_norm": 9.606575012207031, "learning_rate": 2.1204829105147238e-05, "loss": 0.4115, "num_input_tokens_seen": 4983808, "step": 5200 }, { "epoch": 0.4245860184354352, "grad_norm": 2.660202980041504, "learning_rate": 2.1225222285667674e-05, "loss": 0.5235, "num_input_tokens_seen": 4989120, "step": 5205 }, { "epoch": 0.4249938820458439, "grad_norm": 3.4322452545166016, "learning_rate": 2.1245615466188106e-05, "loss": 0.3278, "num_input_tokens_seen": 4993760, "step": 5210 }, { "epoch": 0.42540174565625255, "grad_norm": 4.962357997894287, "learning_rate": 2.1266008646708542e-05, "loss": 0.4749, "num_input_tokens_seen": 4998576, "step": 5215 }, { "epoch": 0.4258096092666612, "grad_norm": 13.918377876281738, "learning_rate": 2.1286401827228974e-05, "loss": 0.2139, "num_input_tokens_seen": 5002768, "step": 5220 }, { "epoch": 0.4262174728770699, "grad_norm": 15.966291427612305, "learning_rate": 2.130679500774941e-05, "loss": 0.5254, "num_input_tokens_seen": 5007120, "step": 5225 }, { "epoch": 0.4266253364874786, "grad_norm": 3.8809075355529785, "learning_rate": 2.1327188188269846e-05, "loss": 0.5175, "num_input_tokens_seen": 5011712, "step": 5230 }, { "epoch": 0.42703320009788726, "grad_norm": 2.9373013973236084, "learning_rate": 2.1347581368790278e-05, "loss": 0.283, "num_input_tokens_seen": 5015936, "step": 5235 }, { "epoch": 0.42744106370829593, "grad_norm": 5.30304479598999, "learning_rate": 2.1367974549310714e-05, "loss": 0.3408, "num_input_tokens_seen": 5020784, "step": 5240 }, { "epoch": 0.4278489273187046, "grad_norm": 7.444758892059326, "learning_rate": 2.1388367729831146e-05, "loss": 0.3528, "num_input_tokens_seen": 5025472, "step": 5245 }, { "epoch": 0.4282567909291133, "grad_norm": 11.052680969238281, "learning_rate": 2.140876091035158e-05, "loss": 0.232, "num_input_tokens_seen": 5030640, "step": 5250 }, { "epoch": 0.42866465453952196, "grad_norm": 2.1848695278167725, "learning_rate": 2.1429154090872014e-05, "loss": 0.2643, "num_input_tokens_seen": 5035344, "step": 5255 }, { "epoch": 0.42907251814993064, "grad_norm": 3.7303876876831055, "learning_rate": 2.1449547271392447e-05, "loss": 0.2787, "num_input_tokens_seen": 5040384, "step": 5260 }, { "epoch": 0.42948038176033937, "grad_norm": 4.4188008308410645, "learning_rate": 2.1469940451912882e-05, "loss": 0.1924, "num_input_tokens_seen": 5044208, "step": 5265 }, { "epoch": 0.42988824537074805, "grad_norm": 1.6354707479476929, "learning_rate": 2.1490333632433315e-05, "loss": 0.3528, "num_input_tokens_seen": 5048976, "step": 5270 }, { "epoch": 0.4302961089811567, "grad_norm": 12.443038940429688, "learning_rate": 2.151072681295375e-05, "loss": 0.3758, "num_input_tokens_seen": 5053952, "step": 5275 }, { "epoch": 0.4307039725915654, "grad_norm": 3.406273603439331, "learning_rate": 2.1531119993474183e-05, "loss": 0.3076, "num_input_tokens_seen": 5059056, "step": 5280 }, { "epoch": 0.4311118362019741, "grad_norm": 8.444272994995117, "learning_rate": 2.1551513173994615e-05, "loss": 0.3109, "num_input_tokens_seen": 5064032, "step": 5285 }, { "epoch": 0.43151969981238275, "grad_norm": 10.630735397338867, "learning_rate": 2.157190635451505e-05, "loss": 0.4212, "num_input_tokens_seen": 5069296, "step": 5290 }, { "epoch": 0.4319275634227914, "grad_norm": 8.30797290802002, "learning_rate": 2.1592299535035484e-05, "loss": 0.6133, "num_input_tokens_seen": 5073952, "step": 5295 }, { "epoch": 0.4323354270332001, "grad_norm": 3.917954921722412, "learning_rate": 2.161269271555592e-05, "loss": 0.6009, "num_input_tokens_seen": 5078784, "step": 5300 }, { "epoch": 0.4327432906436088, "grad_norm": 1.864706039428711, "learning_rate": 2.163308589607635e-05, "loss": 0.198, "num_input_tokens_seen": 5083360, "step": 5305 }, { "epoch": 0.43315115425401746, "grad_norm": 5.00532341003418, "learning_rate": 2.1653479076596787e-05, "loss": 0.2716, "num_input_tokens_seen": 5088000, "step": 5310 }, { "epoch": 0.43355901786442613, "grad_norm": 11.449432373046875, "learning_rate": 2.167387225711722e-05, "loss": 0.421, "num_input_tokens_seen": 5092800, "step": 5315 }, { "epoch": 0.4339668814748348, "grad_norm": 3.771070718765259, "learning_rate": 2.1694265437637652e-05, "loss": 0.3235, "num_input_tokens_seen": 5097616, "step": 5320 }, { "epoch": 0.4343747450852435, "grad_norm": 1.8702746629714966, "learning_rate": 2.171465861815809e-05, "loss": 0.3041, "num_input_tokens_seen": 5101872, "step": 5325 }, { "epoch": 0.43478260869565216, "grad_norm": 2.7359087467193604, "learning_rate": 2.1735051798678524e-05, "loss": 0.2648, "num_input_tokens_seen": 5107072, "step": 5330 }, { "epoch": 0.43519047230606084, "grad_norm": 4.210171699523926, "learning_rate": 2.175544497919896e-05, "loss": 0.1821, "num_input_tokens_seen": 5112000, "step": 5335 }, { "epoch": 0.4355983359164695, "grad_norm": 5.320333480834961, "learning_rate": 2.1775838159719392e-05, "loss": 0.2116, "num_input_tokens_seen": 5117008, "step": 5340 }, { "epoch": 0.4360061995268782, "grad_norm": 10.754891395568848, "learning_rate": 2.1796231340239824e-05, "loss": 0.4946, "num_input_tokens_seen": 5122400, "step": 5345 }, { "epoch": 0.43641406313728687, "grad_norm": 0.24146758019924164, "learning_rate": 2.181662452076026e-05, "loss": 0.2775, "num_input_tokens_seen": 5126288, "step": 5350 }, { "epoch": 0.43682192674769554, "grad_norm": 0.8605926036834717, "learning_rate": 2.1837017701280692e-05, "loss": 0.3035, "num_input_tokens_seen": 5130672, "step": 5355 }, { "epoch": 0.4372297903581043, "grad_norm": 38.3165397644043, "learning_rate": 2.1857410881801128e-05, "loss": 0.2542, "num_input_tokens_seen": 5135600, "step": 5360 }, { "epoch": 0.43763765396851295, "grad_norm": 1.0886093378067017, "learning_rate": 2.187780406232156e-05, "loss": 0.2251, "num_input_tokens_seen": 5140864, "step": 5365 }, { "epoch": 0.43804551757892163, "grad_norm": 0.5166689157485962, "learning_rate": 2.1898197242841996e-05, "loss": 0.6778, "num_input_tokens_seen": 5145984, "step": 5370 }, { "epoch": 0.4384533811893303, "grad_norm": 0.9477138519287109, "learning_rate": 2.191859042336243e-05, "loss": 0.5114, "num_input_tokens_seen": 5151104, "step": 5375 }, { "epoch": 0.438861244799739, "grad_norm": 10.913915634155273, "learning_rate": 2.193898360388286e-05, "loss": 0.738, "num_input_tokens_seen": 5155152, "step": 5380 }, { "epoch": 0.43926910841014766, "grad_norm": 1.7864794731140137, "learning_rate": 2.1959376784403297e-05, "loss": 0.2896, "num_input_tokens_seen": 5159248, "step": 5385 }, { "epoch": 0.43967697202055633, "grad_norm": 0.13475356996059418, "learning_rate": 2.197976996492373e-05, "loss": 0.3333, "num_input_tokens_seen": 5163920, "step": 5390 }, { "epoch": 0.440084835630965, "grad_norm": 0.6487515568733215, "learning_rate": 2.2000163145444165e-05, "loss": 0.2896, "num_input_tokens_seen": 5167936, "step": 5395 }, { "epoch": 0.4404926992413737, "grad_norm": 5.061704158782959, "learning_rate": 2.2020556325964597e-05, "loss": 0.891, "num_input_tokens_seen": 5171968, "step": 5400 }, { "epoch": 0.44090056285178236, "grad_norm": 3.3358335494995117, "learning_rate": 2.204094950648503e-05, "loss": 0.3044, "num_input_tokens_seen": 5176528, "step": 5405 }, { "epoch": 0.44130842646219104, "grad_norm": 2.4471685886383057, "learning_rate": 2.2061342687005465e-05, "loss": 0.4434, "num_input_tokens_seen": 5182192, "step": 5410 }, { "epoch": 0.4417162900725997, "grad_norm": 1.486863136291504, "learning_rate": 2.20817358675259e-05, "loss": 0.3609, "num_input_tokens_seen": 5186448, "step": 5415 }, { "epoch": 0.4421241536830084, "grad_norm": 3.4088685512542725, "learning_rate": 2.2102129048046337e-05, "loss": 0.3538, "num_input_tokens_seen": 5190736, "step": 5420 }, { "epoch": 0.44253201729341707, "grad_norm": 2.2453091144561768, "learning_rate": 2.212252222856677e-05, "loss": 0.3962, "num_input_tokens_seen": 5195392, "step": 5425 }, { "epoch": 0.44293988090382574, "grad_norm": 2.0634748935699463, "learning_rate": 2.2142915409087202e-05, "loss": 0.3278, "num_input_tokens_seen": 5200240, "step": 5430 }, { "epoch": 0.4433477445142344, "grad_norm": 1.9137654304504395, "learning_rate": 2.2163308589607637e-05, "loss": 0.3912, "num_input_tokens_seen": 5205616, "step": 5435 }, { "epoch": 0.4437556081246431, "grad_norm": 2.669167995452881, "learning_rate": 2.218370177012807e-05, "loss": 0.3686, "num_input_tokens_seen": 5210848, "step": 5440 }, { "epoch": 0.4441634717350518, "grad_norm": 0.9570200443267822, "learning_rate": 2.2204094950648506e-05, "loss": 0.2756, "num_input_tokens_seen": 5215856, "step": 5445 }, { "epoch": 0.4445713353454605, "grad_norm": 3.502612590789795, "learning_rate": 2.2224488131168938e-05, "loss": 0.3653, "num_input_tokens_seen": 5220816, "step": 5450 }, { "epoch": 0.4449791989558692, "grad_norm": 7.568334102630615, "learning_rate": 2.2244881311689374e-05, "loss": 0.1754, "num_input_tokens_seen": 5225312, "step": 5455 }, { "epoch": 0.44538706256627786, "grad_norm": 4.042202949523926, "learning_rate": 2.2265274492209806e-05, "loss": 0.3697, "num_input_tokens_seen": 5229696, "step": 5460 }, { "epoch": 0.44579492617668653, "grad_norm": 17.64641571044922, "learning_rate": 2.228566767273024e-05, "loss": 0.5575, "num_input_tokens_seen": 5234736, "step": 5465 }, { "epoch": 0.4462027897870952, "grad_norm": 9.408732414245605, "learning_rate": 2.2306060853250674e-05, "loss": 0.3898, "num_input_tokens_seen": 5239920, "step": 5470 }, { "epoch": 0.4466106533975039, "grad_norm": 0.5325583219528198, "learning_rate": 2.2326454033771107e-05, "loss": 0.1887, "num_input_tokens_seen": 5245264, "step": 5475 }, { "epoch": 0.44701851700791256, "grad_norm": 8.053396224975586, "learning_rate": 2.2346847214291542e-05, "loss": 0.3688, "num_input_tokens_seen": 5249584, "step": 5480 }, { "epoch": 0.44742638061832124, "grad_norm": 1.862457275390625, "learning_rate": 2.2367240394811975e-05, "loss": 0.1168, "num_input_tokens_seen": 5253488, "step": 5485 }, { "epoch": 0.4478342442287299, "grad_norm": 5.653777599334717, "learning_rate": 2.238763357533241e-05, "loss": 0.0438, "num_input_tokens_seen": 5258160, "step": 5490 }, { "epoch": 0.4482421078391386, "grad_norm": 6.327877044677734, "learning_rate": 2.2408026755852843e-05, "loss": 0.556, "num_input_tokens_seen": 5263584, "step": 5495 }, { "epoch": 0.44864997144954727, "grad_norm": 4.359814643859863, "learning_rate": 2.2428419936373275e-05, "loss": 0.6515, "num_input_tokens_seen": 5268832, "step": 5500 }, { "epoch": 0.44905783505995595, "grad_norm": 24.40760612487793, "learning_rate": 2.244881311689371e-05, "loss": 0.5994, "num_input_tokens_seen": 5273664, "step": 5505 }, { "epoch": 0.4494656986703646, "grad_norm": 1.4713162183761597, "learning_rate": 2.2469206297414147e-05, "loss": 0.2668, "num_input_tokens_seen": 5278816, "step": 5510 }, { "epoch": 0.4498735622807733, "grad_norm": 27.963472366333008, "learning_rate": 2.248959947793458e-05, "loss": 1.0884, "num_input_tokens_seen": 5283840, "step": 5515 }, { "epoch": 0.450281425891182, "grad_norm": 9.650701522827148, "learning_rate": 2.2509992658455015e-05, "loss": 0.7319, "num_input_tokens_seen": 5289088, "step": 5520 }, { "epoch": 0.45068928950159065, "grad_norm": 2.689453363418579, "learning_rate": 2.2530385838975447e-05, "loss": 0.2005, "num_input_tokens_seen": 5294320, "step": 5525 }, { "epoch": 0.4510971531119993, "grad_norm": 5.977563858032227, "learning_rate": 2.2550779019495883e-05, "loss": 0.5346, "num_input_tokens_seen": 5299184, "step": 5530 }, { "epoch": 0.451505016722408, "grad_norm": 3.703963279724121, "learning_rate": 2.2571172200016315e-05, "loss": 0.1676, "num_input_tokens_seen": 5304160, "step": 5535 }, { "epoch": 0.4519128803328167, "grad_norm": 3.5764472484588623, "learning_rate": 2.259156538053675e-05, "loss": 0.2841, "num_input_tokens_seen": 5308640, "step": 5540 }, { "epoch": 0.4523207439432254, "grad_norm": 3.889777421951294, "learning_rate": 2.2611958561057184e-05, "loss": 0.3594, "num_input_tokens_seen": 5313728, "step": 5545 }, { "epoch": 0.4527286075536341, "grad_norm": 0.6061722040176392, "learning_rate": 2.2632351741577616e-05, "loss": 0.5079, "num_input_tokens_seen": 5318832, "step": 5550 }, { "epoch": 0.45313647116404276, "grad_norm": 3.424825668334961, "learning_rate": 2.2652744922098052e-05, "loss": 0.3977, "num_input_tokens_seen": 5324208, "step": 5555 }, { "epoch": 0.45354433477445144, "grad_norm": 4.000329971313477, "learning_rate": 2.2673138102618484e-05, "loss": 0.1887, "num_input_tokens_seen": 5329072, "step": 5560 }, { "epoch": 0.4539521983848601, "grad_norm": 1.6606074571609497, "learning_rate": 2.269353128313892e-05, "loss": 0.4759, "num_input_tokens_seen": 5333824, "step": 5565 }, { "epoch": 0.4543600619952688, "grad_norm": 7.508471965789795, "learning_rate": 2.2713924463659352e-05, "loss": 0.4206, "num_input_tokens_seen": 5339072, "step": 5570 }, { "epoch": 0.45476792560567747, "grad_norm": 8.713958740234375, "learning_rate": 2.2734317644179788e-05, "loss": 0.207, "num_input_tokens_seen": 5343328, "step": 5575 }, { "epoch": 0.45517578921608615, "grad_norm": 5.909074783325195, "learning_rate": 2.275471082470022e-05, "loss": 0.2564, "num_input_tokens_seen": 5347648, "step": 5580 }, { "epoch": 0.4555836528264948, "grad_norm": 3.443434953689575, "learning_rate": 2.2775104005220653e-05, "loss": 0.3973, "num_input_tokens_seen": 5352784, "step": 5585 }, { "epoch": 0.4559915164369035, "grad_norm": 4.340832233428955, "learning_rate": 2.279549718574109e-05, "loss": 0.3222, "num_input_tokens_seen": 5358528, "step": 5590 }, { "epoch": 0.4563993800473122, "grad_norm": 4.863219261169434, "learning_rate": 2.281589036626152e-05, "loss": 0.4127, "num_input_tokens_seen": 5363680, "step": 5595 }, { "epoch": 0.45680724365772085, "grad_norm": 9.438630104064941, "learning_rate": 2.283628354678196e-05, "loss": 0.656, "num_input_tokens_seen": 5367792, "step": 5600 }, { "epoch": 0.45721510726812953, "grad_norm": 1.9708524942398071, "learning_rate": 2.2856676727302392e-05, "loss": 0.3186, "num_input_tokens_seen": 5371456, "step": 5605 }, { "epoch": 0.4576229708785382, "grad_norm": 2.639040231704712, "learning_rate": 2.2877069907822825e-05, "loss": 0.3072, "num_input_tokens_seen": 5376704, "step": 5610 }, { "epoch": 0.4580308344889469, "grad_norm": 2.752511739730835, "learning_rate": 2.289746308834326e-05, "loss": 0.2311, "num_input_tokens_seen": 5381216, "step": 5615 }, { "epoch": 0.45843869809935556, "grad_norm": 2.8469181060791016, "learning_rate": 2.2917856268863693e-05, "loss": 0.2875, "num_input_tokens_seen": 5385472, "step": 5620 }, { "epoch": 0.45884656170976423, "grad_norm": 5.304035186767578, "learning_rate": 2.293824944938413e-05, "loss": 0.3786, "num_input_tokens_seen": 5389264, "step": 5625 }, { "epoch": 0.4592544253201729, "grad_norm": 2.0494203567504883, "learning_rate": 2.295864262990456e-05, "loss": 0.2596, "num_input_tokens_seen": 5394224, "step": 5630 }, { "epoch": 0.4596622889305816, "grad_norm": 2.2148778438568115, "learning_rate": 2.2979035810424997e-05, "loss": 0.2114, "num_input_tokens_seen": 5400128, "step": 5635 }, { "epoch": 0.4600701525409903, "grad_norm": 1.6099251508712769, "learning_rate": 2.299942899094543e-05, "loss": 0.1614, "num_input_tokens_seen": 5404496, "step": 5640 }, { "epoch": 0.460478016151399, "grad_norm": 1.3939539194107056, "learning_rate": 2.301982217146586e-05, "loss": 0.4029, "num_input_tokens_seen": 5409392, "step": 5645 }, { "epoch": 0.46088587976180767, "grad_norm": 6.952746868133545, "learning_rate": 2.3040215351986297e-05, "loss": 0.6626, "num_input_tokens_seen": 5414224, "step": 5650 }, { "epoch": 0.46129374337221635, "grad_norm": 1.4419080018997192, "learning_rate": 2.306060853250673e-05, "loss": 0.6391, "num_input_tokens_seen": 5419056, "step": 5655 }, { "epoch": 0.461701606982625, "grad_norm": 13.57243537902832, "learning_rate": 2.3081001713027166e-05, "loss": 0.3201, "num_input_tokens_seen": 5423808, "step": 5660 }, { "epoch": 0.4621094705930337, "grad_norm": 37.09145736694336, "learning_rate": 2.3101394893547598e-05, "loss": 0.2756, "num_input_tokens_seen": 5428656, "step": 5665 }, { "epoch": 0.4625173342034424, "grad_norm": 0.56190425157547, "learning_rate": 2.3121788074068034e-05, "loss": 0.2501, "num_input_tokens_seen": 5432768, "step": 5670 }, { "epoch": 0.46292519781385105, "grad_norm": 2.8313148021698, "learning_rate": 2.3142181254588466e-05, "loss": 0.4995, "num_input_tokens_seen": 5437456, "step": 5675 }, { "epoch": 0.46333306142425973, "grad_norm": 2.394782066345215, "learning_rate": 2.31625744351089e-05, "loss": 0.2252, "num_input_tokens_seen": 5442560, "step": 5680 }, { "epoch": 0.4637409250346684, "grad_norm": 3.6133203506469727, "learning_rate": 2.3182967615629334e-05, "loss": 0.1497, "num_input_tokens_seen": 5447520, "step": 5685 }, { "epoch": 0.4641487886450771, "grad_norm": 1.026145100593567, "learning_rate": 2.3203360796149767e-05, "loss": 0.0786, "num_input_tokens_seen": 5452224, "step": 5690 }, { "epoch": 0.46455665225548576, "grad_norm": 81.33798217773438, "learning_rate": 2.3223753976670202e-05, "loss": 0.4285, "num_input_tokens_seen": 5456944, "step": 5695 }, { "epoch": 0.46496451586589443, "grad_norm": 49.77914810180664, "learning_rate": 2.3244147157190638e-05, "loss": 0.6515, "num_input_tokens_seen": 5461872, "step": 5700 }, { "epoch": 0.4653723794763031, "grad_norm": 15.233407020568848, "learning_rate": 2.326454033771107e-05, "loss": 0.351, "num_input_tokens_seen": 5467008, "step": 5705 }, { "epoch": 0.4657802430867118, "grad_norm": 32.43766784667969, "learning_rate": 2.3284933518231506e-05, "loss": 0.6794, "num_input_tokens_seen": 5471728, "step": 5710 }, { "epoch": 0.46618810669712046, "grad_norm": 24.16095733642578, "learning_rate": 2.330532669875194e-05, "loss": 0.6037, "num_input_tokens_seen": 5476288, "step": 5715 }, { "epoch": 0.46659597030752914, "grad_norm": 17.167709350585938, "learning_rate": 2.3325719879272374e-05, "loss": 0.3428, "num_input_tokens_seen": 5481440, "step": 5720 }, { "epoch": 0.4670038339179378, "grad_norm": 4.037345886230469, "learning_rate": 2.3346113059792807e-05, "loss": 0.3285, "num_input_tokens_seen": 5485200, "step": 5725 }, { "epoch": 0.46741169752834655, "grad_norm": 5.654531002044678, "learning_rate": 2.336650624031324e-05, "loss": 0.311, "num_input_tokens_seen": 5490320, "step": 5730 }, { "epoch": 0.4678195611387552, "grad_norm": 7.030014514923096, "learning_rate": 2.3386899420833675e-05, "loss": 0.3529, "num_input_tokens_seen": 5495056, "step": 5735 }, { "epoch": 0.4682274247491639, "grad_norm": 0.7626911997795105, "learning_rate": 2.3407292601354107e-05, "loss": 0.3386, "num_input_tokens_seen": 5499440, "step": 5740 }, { "epoch": 0.4686352883595726, "grad_norm": 6.280337810516357, "learning_rate": 2.3427685781874543e-05, "loss": 0.2357, "num_input_tokens_seen": 5504496, "step": 5745 }, { "epoch": 0.46904315196998125, "grad_norm": 0.22494064271450043, "learning_rate": 2.3448078962394975e-05, "loss": 0.6337, "num_input_tokens_seen": 5508896, "step": 5750 }, { "epoch": 0.46945101558038993, "grad_norm": 0.8117440938949585, "learning_rate": 2.346847214291541e-05, "loss": 0.7003, "num_input_tokens_seen": 5513664, "step": 5755 }, { "epoch": 0.4698588791907986, "grad_norm": 6.668015480041504, "learning_rate": 2.3488865323435844e-05, "loss": 0.6181, "num_input_tokens_seen": 5518416, "step": 5760 }, { "epoch": 0.4702667428012073, "grad_norm": 15.063234329223633, "learning_rate": 2.3509258503956276e-05, "loss": 0.7233, "num_input_tokens_seen": 5522544, "step": 5765 }, { "epoch": 0.47067460641161596, "grad_norm": 7.305372714996338, "learning_rate": 2.3529651684476712e-05, "loss": 0.5994, "num_input_tokens_seen": 5527456, "step": 5770 }, { "epoch": 0.47108247002202464, "grad_norm": 6.123553276062012, "learning_rate": 2.3550044864997144e-05, "loss": 0.2545, "num_input_tokens_seen": 5532064, "step": 5775 }, { "epoch": 0.4714903336324333, "grad_norm": 1.0233286619186401, "learning_rate": 2.357043804551758e-05, "loss": 0.1348, "num_input_tokens_seen": 5536032, "step": 5780 }, { "epoch": 0.471898197242842, "grad_norm": 4.898958683013916, "learning_rate": 2.3590831226038012e-05, "loss": 0.4146, "num_input_tokens_seen": 5540656, "step": 5785 }, { "epoch": 0.47230606085325066, "grad_norm": 24.241456985473633, "learning_rate": 2.3611224406558448e-05, "loss": 0.2229, "num_input_tokens_seen": 5545504, "step": 5790 }, { "epoch": 0.47271392446365934, "grad_norm": 5.34653377532959, "learning_rate": 2.3631617587078884e-05, "loss": 0.3386, "num_input_tokens_seen": 5550352, "step": 5795 }, { "epoch": 0.473121788074068, "grad_norm": 2.2681424617767334, "learning_rate": 2.3652010767599316e-05, "loss": 0.5584, "num_input_tokens_seen": 5555392, "step": 5800 }, { "epoch": 0.4735296516844767, "grad_norm": 26.114810943603516, "learning_rate": 2.3672403948119752e-05, "loss": 0.3185, "num_input_tokens_seen": 5559728, "step": 5805 }, { "epoch": 0.47393751529488537, "grad_norm": 2.2482969760894775, "learning_rate": 2.3692797128640184e-05, "loss": 0.2772, "num_input_tokens_seen": 5564192, "step": 5810 }, { "epoch": 0.47434537890529405, "grad_norm": 2.515279531478882, "learning_rate": 2.371319030916062e-05, "loss": 0.4039, "num_input_tokens_seen": 5569216, "step": 5815 }, { "epoch": 0.4747532425157027, "grad_norm": 3.7693943977355957, "learning_rate": 2.3733583489681052e-05, "loss": 0.3236, "num_input_tokens_seen": 5574080, "step": 5820 }, { "epoch": 0.47516110612611145, "grad_norm": 1.2588919401168823, "learning_rate": 2.3753976670201485e-05, "loss": 0.3633, "num_input_tokens_seen": 5578864, "step": 5825 }, { "epoch": 0.47556896973652013, "grad_norm": 4.0698089599609375, "learning_rate": 2.377436985072192e-05, "loss": 0.2451, "num_input_tokens_seen": 5583216, "step": 5830 }, { "epoch": 0.4759768333469288, "grad_norm": 21.32686424255371, "learning_rate": 2.3794763031242353e-05, "loss": 0.3527, "num_input_tokens_seen": 5587984, "step": 5835 }, { "epoch": 0.4763846969573375, "grad_norm": 15.986854553222656, "learning_rate": 2.381515621176279e-05, "loss": 0.5473, "num_input_tokens_seen": 5592480, "step": 5840 }, { "epoch": 0.47679256056774616, "grad_norm": 4.697226047515869, "learning_rate": 2.383554939228322e-05, "loss": 0.3794, "num_input_tokens_seen": 5597856, "step": 5845 }, { "epoch": 0.47720042417815484, "grad_norm": 4.958000659942627, "learning_rate": 2.3855942572803653e-05, "loss": 0.3609, "num_input_tokens_seen": 5601968, "step": 5850 }, { "epoch": 0.4776082877885635, "grad_norm": 4.78529691696167, "learning_rate": 2.387633575332409e-05, "loss": 0.3516, "num_input_tokens_seen": 5605808, "step": 5855 }, { "epoch": 0.4780161513989722, "grad_norm": 1.3751335144042969, "learning_rate": 2.389672893384452e-05, "loss": 0.4345, "num_input_tokens_seen": 5610688, "step": 5860 }, { "epoch": 0.47842401500938087, "grad_norm": 1.9253219366073608, "learning_rate": 2.3917122114364957e-05, "loss": 0.46, "num_input_tokens_seen": 5615744, "step": 5865 }, { "epoch": 0.47883187861978954, "grad_norm": 2.9034292697906494, "learning_rate": 2.393751529488539e-05, "loss": 0.4216, "num_input_tokens_seen": 5620752, "step": 5870 }, { "epoch": 0.4792397422301982, "grad_norm": 13.694072723388672, "learning_rate": 2.3957908475405825e-05, "loss": 0.5506, "num_input_tokens_seen": 5625264, "step": 5875 }, { "epoch": 0.4796476058406069, "grad_norm": 2.7000064849853516, "learning_rate": 2.397830165592626e-05, "loss": 0.2652, "num_input_tokens_seen": 5629328, "step": 5880 }, { "epoch": 0.48005546945101557, "grad_norm": 6.407078266143799, "learning_rate": 2.3998694836446694e-05, "loss": 0.3875, "num_input_tokens_seen": 5633760, "step": 5885 }, { "epoch": 0.48046333306142425, "grad_norm": 4.770577907562256, "learning_rate": 2.401908801696713e-05, "loss": 0.4331, "num_input_tokens_seen": 5637504, "step": 5890 }, { "epoch": 0.4808711966718329, "grad_norm": 5.714108943939209, "learning_rate": 2.4039481197487562e-05, "loss": 0.3017, "num_input_tokens_seen": 5642016, "step": 5895 }, { "epoch": 0.4812790602822416, "grad_norm": 11.997713088989258, "learning_rate": 2.4059874378007998e-05, "loss": 0.4191, "num_input_tokens_seen": 5647136, "step": 5900 }, { "epoch": 0.4816869238926503, "grad_norm": 5.559933662414551, "learning_rate": 2.408026755852843e-05, "loss": 0.4231, "num_input_tokens_seen": 5652176, "step": 5905 }, { "epoch": 0.48209478750305895, "grad_norm": 1.700002670288086, "learning_rate": 2.4100660739048862e-05, "loss": 0.1758, "num_input_tokens_seen": 5657024, "step": 5910 }, { "epoch": 0.48250265111346763, "grad_norm": 1.008876085281372, "learning_rate": 2.4121053919569298e-05, "loss": 0.1698, "num_input_tokens_seen": 5661600, "step": 5915 }, { "epoch": 0.48291051472387636, "grad_norm": 2.5878708362579346, "learning_rate": 2.414144710008973e-05, "loss": 0.4635, "num_input_tokens_seen": 5666320, "step": 5920 }, { "epoch": 0.48331837833428504, "grad_norm": 11.496213912963867, "learning_rate": 2.4161840280610166e-05, "loss": 0.3045, "num_input_tokens_seen": 5671456, "step": 5925 }, { "epoch": 0.4837262419446937, "grad_norm": 2.7506930828094482, "learning_rate": 2.41822334611306e-05, "loss": 0.1575, "num_input_tokens_seen": 5676736, "step": 5930 }, { "epoch": 0.4841341055551024, "grad_norm": 0.79879230260849, "learning_rate": 2.4202626641651034e-05, "loss": 0.4101, "num_input_tokens_seen": 5681344, "step": 5935 }, { "epoch": 0.48454196916551107, "grad_norm": 0.18796606361865997, "learning_rate": 2.4223019822171467e-05, "loss": 0.4077, "num_input_tokens_seen": 5685984, "step": 5940 }, { "epoch": 0.48494983277591974, "grad_norm": 9.32632827758789, "learning_rate": 2.42434130026919e-05, "loss": 0.8223, "num_input_tokens_seen": 5691200, "step": 5945 }, { "epoch": 0.4853576963863284, "grad_norm": 10.045580863952637, "learning_rate": 2.4263806183212335e-05, "loss": 0.1006, "num_input_tokens_seen": 5695536, "step": 5950 }, { "epoch": 0.4857655599967371, "grad_norm": 5.4760284423828125, "learning_rate": 2.4284199363732767e-05, "loss": 0.9758, "num_input_tokens_seen": 5700640, "step": 5955 }, { "epoch": 0.48617342360714577, "grad_norm": 0.6702522039413452, "learning_rate": 2.4304592544253203e-05, "loss": 0.2134, "num_input_tokens_seen": 5706208, "step": 5960 }, { "epoch": 0.48658128721755445, "grad_norm": 11.6939115524292, "learning_rate": 2.4324985724773635e-05, "loss": 0.5367, "num_input_tokens_seen": 5710768, "step": 5965 }, { "epoch": 0.4869891508279631, "grad_norm": 4.108180999755859, "learning_rate": 2.434537890529407e-05, "loss": 0.281, "num_input_tokens_seen": 5715920, "step": 5970 }, { "epoch": 0.4873970144383718, "grad_norm": 0.9117482304573059, "learning_rate": 2.4365772085814507e-05, "loss": 0.2094, "num_input_tokens_seen": 5720528, "step": 5975 }, { "epoch": 0.4878048780487805, "grad_norm": 10.875059127807617, "learning_rate": 2.438616526633494e-05, "loss": 0.3682, "num_input_tokens_seen": 5724768, "step": 5980 }, { "epoch": 0.48821274165918915, "grad_norm": 2.636070489883423, "learning_rate": 2.4406558446855375e-05, "loss": 0.2168, "num_input_tokens_seen": 5729776, "step": 5985 }, { "epoch": 0.48862060526959783, "grad_norm": 2.8901963233947754, "learning_rate": 2.4426951627375807e-05, "loss": 0.1572, "num_input_tokens_seen": 5734752, "step": 5990 }, { "epoch": 0.4890284688800065, "grad_norm": 3.5265979766845703, "learning_rate": 2.444734480789624e-05, "loss": 0.3525, "num_input_tokens_seen": 5739120, "step": 5995 }, { "epoch": 0.4894363324904152, "grad_norm": 3.037003517150879, "learning_rate": 2.4467737988416676e-05, "loss": 0.6972, "num_input_tokens_seen": 5744448, "step": 6000 }, { "epoch": 0.48984419610082386, "grad_norm": 0.4170233905315399, "learning_rate": 2.4488131168937108e-05, "loss": 0.3048, "num_input_tokens_seen": 5749616, "step": 6005 }, { "epoch": 0.4902520597112326, "grad_norm": 2.8764963150024414, "learning_rate": 2.4508524349457544e-05, "loss": 0.481, "num_input_tokens_seen": 5754400, "step": 6010 }, { "epoch": 0.49065992332164127, "grad_norm": 4.6757283210754395, "learning_rate": 2.4528917529977976e-05, "loss": 0.4152, "num_input_tokens_seen": 5758848, "step": 6015 }, { "epoch": 0.49106778693204994, "grad_norm": 6.884565353393555, "learning_rate": 2.4549310710498412e-05, "loss": 0.264, "num_input_tokens_seen": 5764064, "step": 6020 }, { "epoch": 0.4914756505424586, "grad_norm": 1.7533948421478271, "learning_rate": 2.4569703891018844e-05, "loss": 0.3897, "num_input_tokens_seen": 5768960, "step": 6025 }, { "epoch": 0.4918835141528673, "grad_norm": 13.953547477722168, "learning_rate": 2.4590097071539277e-05, "loss": 0.378, "num_input_tokens_seen": 5772880, "step": 6030 }, { "epoch": 0.492291377763276, "grad_norm": 6.619807243347168, "learning_rate": 2.4610490252059712e-05, "loss": 0.5566, "num_input_tokens_seen": 5778272, "step": 6035 }, { "epoch": 0.49269924137368465, "grad_norm": 3.1795167922973633, "learning_rate": 2.4630883432580145e-05, "loss": 0.4909, "num_input_tokens_seen": 5781776, "step": 6040 }, { "epoch": 0.4931071049840933, "grad_norm": 1.9965319633483887, "learning_rate": 2.465127661310058e-05, "loss": 0.2634, "num_input_tokens_seen": 5786656, "step": 6045 }, { "epoch": 0.493514968594502, "grad_norm": 1.9944560527801514, "learning_rate": 2.4671669793621013e-05, "loss": 0.4496, "num_input_tokens_seen": 5790624, "step": 6050 }, { "epoch": 0.4939228322049107, "grad_norm": 0.7932521104812622, "learning_rate": 2.469206297414145e-05, "loss": 0.2314, "num_input_tokens_seen": 5795328, "step": 6055 }, { "epoch": 0.49433069581531935, "grad_norm": 6.912315368652344, "learning_rate": 2.471245615466188e-05, "loss": 0.3326, "num_input_tokens_seen": 5800016, "step": 6060 }, { "epoch": 0.49473855942572803, "grad_norm": 3.682835340499878, "learning_rate": 2.4732849335182313e-05, "loss": 0.4953, "num_input_tokens_seen": 5805264, "step": 6065 }, { "epoch": 0.4951464230361367, "grad_norm": 2.294795036315918, "learning_rate": 2.4753242515702753e-05, "loss": 0.1972, "num_input_tokens_seen": 5809552, "step": 6070 }, { "epoch": 0.4955542866465454, "grad_norm": 1.1691011190414429, "learning_rate": 2.4773635696223185e-05, "loss": 0.2753, "num_input_tokens_seen": 5814400, "step": 6075 }, { "epoch": 0.49596215025695406, "grad_norm": 3.909325122833252, "learning_rate": 2.479402887674362e-05, "loss": 0.4702, "num_input_tokens_seen": 5819568, "step": 6080 }, { "epoch": 0.49637001386736274, "grad_norm": 1.3038709163665771, "learning_rate": 2.4814422057264053e-05, "loss": 0.3895, "num_input_tokens_seen": 5824720, "step": 6085 }, { "epoch": 0.4967778774777714, "grad_norm": 6.855472087860107, "learning_rate": 2.4834815237784485e-05, "loss": 0.31, "num_input_tokens_seen": 5829888, "step": 6090 }, { "epoch": 0.4971857410881801, "grad_norm": 1.6314456462860107, "learning_rate": 2.485520841830492e-05, "loss": 0.3492, "num_input_tokens_seen": 5834944, "step": 6095 }, { "epoch": 0.49759360469858877, "grad_norm": 2.441068649291992, "learning_rate": 2.4875601598825354e-05, "loss": 0.2896, "num_input_tokens_seen": 5840160, "step": 6100 }, { "epoch": 0.4980014683089975, "grad_norm": 1.2939398288726807, "learning_rate": 2.489599477934579e-05, "loss": 0.2708, "num_input_tokens_seen": 5844928, "step": 6105 }, { "epoch": 0.4984093319194062, "grad_norm": 2.675081253051758, "learning_rate": 2.491638795986622e-05, "loss": 0.3495, "num_input_tokens_seen": 5850000, "step": 6110 }, { "epoch": 0.49881719552981485, "grad_norm": 3.6692895889282227, "learning_rate": 2.4936781140386657e-05, "loss": 0.3271, "num_input_tokens_seen": 5854032, "step": 6115 }, { "epoch": 0.4992250591402235, "grad_norm": 2.0282063484191895, "learning_rate": 2.495717432090709e-05, "loss": 0.2657, "num_input_tokens_seen": 5858800, "step": 6120 }, { "epoch": 0.4996329227506322, "grad_norm": 1.1339002847671509, "learning_rate": 2.4977567501427522e-05, "loss": 0.2132, "num_input_tokens_seen": 5863296, "step": 6125 }, { "epoch": 0.5000407863610409, "grad_norm": 3.8557658195495605, "learning_rate": 2.4997960681947958e-05, "loss": 0.3178, "num_input_tokens_seen": 5867968, "step": 6130 }, { "epoch": 0.5000407863610409, "eval_loss": 0.38023096323013306, "eval_runtime": 570.8704, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.388, "num_input_tokens_seen": 5867968, "step": 6130 }, { "epoch": 0.5004486499714496, "grad_norm": 0.105685293674469, "learning_rate": 2.501835386246839e-05, "loss": 0.6086, "num_input_tokens_seen": 5872528, "step": 6135 }, { "epoch": 0.5008565135818582, "grad_norm": 5.192967891693115, "learning_rate": 2.5038747042988826e-05, "loss": 0.2783, "num_input_tokens_seen": 5877408, "step": 6140 }, { "epoch": 0.5012643771922669, "grad_norm": 2.2954635620117188, "learning_rate": 2.505914022350926e-05, "loss": 0.3049, "num_input_tokens_seen": 5881712, "step": 6145 }, { "epoch": 0.5016722408026756, "grad_norm": 1.3314660787582397, "learning_rate": 2.507953340402969e-05, "loss": 0.1424, "num_input_tokens_seen": 5886352, "step": 6150 }, { "epoch": 0.5020801044130843, "grad_norm": 2.9779107570648193, "learning_rate": 2.5099926584550127e-05, "loss": 0.3488, "num_input_tokens_seen": 5890512, "step": 6155 }, { "epoch": 0.5024879680234929, "grad_norm": 2.2294702529907227, "learning_rate": 2.512031976507056e-05, "loss": 0.2597, "num_input_tokens_seen": 5895664, "step": 6160 }, { "epoch": 0.5028958316339016, "grad_norm": 2.4147324562072754, "learning_rate": 2.5140712945590995e-05, "loss": 0.3614, "num_input_tokens_seen": 5900080, "step": 6165 }, { "epoch": 0.5033036952443103, "grad_norm": 3.689218044281006, "learning_rate": 2.5161106126111427e-05, "loss": 0.1597, "num_input_tokens_seen": 5904864, "step": 6170 }, { "epoch": 0.503711558854719, "grad_norm": 15.854595184326172, "learning_rate": 2.5181499306631863e-05, "loss": 0.6089, "num_input_tokens_seen": 5909408, "step": 6175 }, { "epoch": 0.5041194224651276, "grad_norm": 1.1643186807632446, "learning_rate": 2.5201892487152295e-05, "loss": 0.4757, "num_input_tokens_seen": 5913632, "step": 6180 }, { "epoch": 0.5045272860755363, "grad_norm": 14.667288780212402, "learning_rate": 2.5222285667672728e-05, "loss": 0.4519, "num_input_tokens_seen": 5918896, "step": 6185 }, { "epoch": 0.504935149685945, "grad_norm": 3.9522438049316406, "learning_rate": 2.5242678848193163e-05, "loss": 0.3693, "num_input_tokens_seen": 5923344, "step": 6190 }, { "epoch": 0.5053430132963537, "grad_norm": 3.507676124572754, "learning_rate": 2.5263072028713596e-05, "loss": 0.4899, "num_input_tokens_seen": 5928704, "step": 6195 }, { "epoch": 0.5057508769067623, "grad_norm": 2.815263509750366, "learning_rate": 2.528346520923403e-05, "loss": 0.387, "num_input_tokens_seen": 5932672, "step": 6200 }, { "epoch": 0.506158740517171, "grad_norm": 3.7162201404571533, "learning_rate": 2.5303858389754464e-05, "loss": 0.4767, "num_input_tokens_seen": 5937360, "step": 6205 }, { "epoch": 0.5065666041275797, "grad_norm": 0.7983293533325195, "learning_rate": 2.5324251570274903e-05, "loss": 0.0858, "num_input_tokens_seen": 5942736, "step": 6210 }, { "epoch": 0.5069744677379884, "grad_norm": 3.958277702331543, "learning_rate": 2.534464475079534e-05, "loss": 0.5363, "num_input_tokens_seen": 5948160, "step": 6215 }, { "epoch": 0.507382331348397, "grad_norm": 0.8930565714836121, "learning_rate": 2.536503793131577e-05, "loss": 0.3085, "num_input_tokens_seen": 5953168, "step": 6220 }, { "epoch": 0.5077901949588057, "grad_norm": 5.282925128936768, "learning_rate": 2.5385431111836207e-05, "loss": 0.4277, "num_input_tokens_seen": 5957632, "step": 6225 }, { "epoch": 0.5081980585692144, "grad_norm": 1.4069974422454834, "learning_rate": 2.540582429235664e-05, "loss": 0.3062, "num_input_tokens_seen": 5962672, "step": 6230 }, { "epoch": 0.5086059221796231, "grad_norm": 2.920929193496704, "learning_rate": 2.5426217472877072e-05, "loss": 0.237, "num_input_tokens_seen": 5967296, "step": 6235 }, { "epoch": 0.5090137857900318, "grad_norm": 1.8842878341674805, "learning_rate": 2.5446610653397508e-05, "loss": 0.2682, "num_input_tokens_seen": 5972240, "step": 6240 }, { "epoch": 0.5094216494004405, "grad_norm": 1.4230767488479614, "learning_rate": 2.546700383391794e-05, "loss": 0.379, "num_input_tokens_seen": 5976928, "step": 6245 }, { "epoch": 0.5098295130108492, "grad_norm": 9.102432250976562, "learning_rate": 2.5487397014438376e-05, "loss": 0.3342, "num_input_tokens_seen": 5981968, "step": 6250 }, { "epoch": 0.5102373766212579, "grad_norm": 1.5445305109024048, "learning_rate": 2.5507790194958808e-05, "loss": 0.5633, "num_input_tokens_seen": 5985760, "step": 6255 }, { "epoch": 0.5106452402316666, "grad_norm": 3.2636454105377197, "learning_rate": 2.5528183375479244e-05, "loss": 0.369, "num_input_tokens_seen": 5990944, "step": 6260 }, { "epoch": 0.5110531038420753, "grad_norm": 1.3144220113754272, "learning_rate": 2.5548576555999676e-05, "loss": 0.2898, "num_input_tokens_seen": 5995280, "step": 6265 }, { "epoch": 0.5114609674524839, "grad_norm": 5.235739231109619, "learning_rate": 2.556896973652011e-05, "loss": 0.2122, "num_input_tokens_seen": 5999696, "step": 6270 }, { "epoch": 0.5118688310628926, "grad_norm": 5.66917085647583, "learning_rate": 2.5589362917040544e-05, "loss": 0.5914, "num_input_tokens_seen": 6004416, "step": 6275 }, { "epoch": 0.5122766946733013, "grad_norm": 1.7234466075897217, "learning_rate": 2.5609756097560977e-05, "loss": 0.2551, "num_input_tokens_seen": 6009328, "step": 6280 }, { "epoch": 0.51268455828371, "grad_norm": 3.1174395084381104, "learning_rate": 2.5630149278081412e-05, "loss": 0.3356, "num_input_tokens_seen": 6013968, "step": 6285 }, { "epoch": 0.5130924218941186, "grad_norm": 2.2660562992095947, "learning_rate": 2.5650542458601845e-05, "loss": 0.3034, "num_input_tokens_seen": 6018368, "step": 6290 }, { "epoch": 0.5135002855045273, "grad_norm": 5.453761100769043, "learning_rate": 2.5670935639122277e-05, "loss": 0.3211, "num_input_tokens_seen": 6022656, "step": 6295 }, { "epoch": 0.513908149114936, "grad_norm": 6.816925048828125, "learning_rate": 2.5691328819642713e-05, "loss": 0.3312, "num_input_tokens_seen": 6026816, "step": 6300 }, { "epoch": 0.5143160127253447, "grad_norm": 11.44483757019043, "learning_rate": 2.5711722000163145e-05, "loss": 0.2999, "num_input_tokens_seen": 6031152, "step": 6305 }, { "epoch": 0.5147238763357533, "grad_norm": 3.316868305206299, "learning_rate": 2.573211518068358e-05, "loss": 0.296, "num_input_tokens_seen": 6034976, "step": 6310 }, { "epoch": 0.515131739946162, "grad_norm": 2.870023250579834, "learning_rate": 2.5752508361204013e-05, "loss": 0.1915, "num_input_tokens_seen": 6039488, "step": 6315 }, { "epoch": 0.5155396035565707, "grad_norm": 3.3545632362365723, "learning_rate": 2.577290154172445e-05, "loss": 0.3945, "num_input_tokens_seen": 6043712, "step": 6320 }, { "epoch": 0.5159474671669794, "grad_norm": 6.141219139099121, "learning_rate": 2.579329472224488e-05, "loss": 0.2611, "num_input_tokens_seen": 6048832, "step": 6325 }, { "epoch": 0.516355330777388, "grad_norm": 13.928855895996094, "learning_rate": 2.5813687902765314e-05, "loss": 0.5337, "num_input_tokens_seen": 6053920, "step": 6330 }, { "epoch": 0.5167631943877967, "grad_norm": 0.6237022876739502, "learning_rate": 2.583408108328575e-05, "loss": 0.1243, "num_input_tokens_seen": 6058960, "step": 6335 }, { "epoch": 0.5171710579982054, "grad_norm": 7.575913429260254, "learning_rate": 2.5854474263806182e-05, "loss": 0.2398, "num_input_tokens_seen": 6064112, "step": 6340 }, { "epoch": 0.5175789216086141, "grad_norm": 2.8686037063598633, "learning_rate": 2.5874867444326618e-05, "loss": 0.444, "num_input_tokens_seen": 6068752, "step": 6345 }, { "epoch": 0.5179867852190227, "grad_norm": 15.832703590393066, "learning_rate": 2.589526062484705e-05, "loss": 0.5676, "num_input_tokens_seen": 6073408, "step": 6350 }, { "epoch": 0.5183946488294314, "grad_norm": 6.426987648010254, "learning_rate": 2.5915653805367486e-05, "loss": 0.4709, "num_input_tokens_seen": 6077600, "step": 6355 }, { "epoch": 0.5188025124398401, "grad_norm": 1.6481112241744995, "learning_rate": 2.593604698588792e-05, "loss": 0.5974, "num_input_tokens_seen": 6082752, "step": 6360 }, { "epoch": 0.5192103760502488, "grad_norm": 3.2212581634521484, "learning_rate": 2.595644016640835e-05, "loss": 0.4355, "num_input_tokens_seen": 6087888, "step": 6365 }, { "epoch": 0.5196182396606575, "grad_norm": 5.702973365783691, "learning_rate": 2.5976833346928787e-05, "loss": 0.3476, "num_input_tokens_seen": 6092832, "step": 6370 }, { "epoch": 0.5200261032710661, "grad_norm": 1.471932053565979, "learning_rate": 2.599722652744922e-05, "loss": 0.349, "num_input_tokens_seen": 6097952, "step": 6375 }, { "epoch": 0.5204339668814748, "grad_norm": 2.830340623855591, "learning_rate": 2.6017619707969655e-05, "loss": 0.3009, "num_input_tokens_seen": 6103664, "step": 6380 }, { "epoch": 0.5208418304918835, "grad_norm": 0.9147693514823914, "learning_rate": 2.6038012888490087e-05, "loss": 0.2152, "num_input_tokens_seen": 6109136, "step": 6385 }, { "epoch": 0.5212496941022922, "grad_norm": 2.3532543182373047, "learning_rate": 2.6058406069010523e-05, "loss": 0.2283, "num_input_tokens_seen": 6113376, "step": 6390 }, { "epoch": 0.5216575577127008, "grad_norm": 12.251986503601074, "learning_rate": 2.6078799249530962e-05, "loss": 0.3361, "num_input_tokens_seen": 6117648, "step": 6395 }, { "epoch": 0.5220654213231095, "grad_norm": 4.9906768798828125, "learning_rate": 2.6099192430051394e-05, "loss": 0.4045, "num_input_tokens_seen": 6123216, "step": 6400 }, { "epoch": 0.5224732849335182, "grad_norm": 5.945387840270996, "learning_rate": 2.6119585610571827e-05, "loss": 0.4343, "num_input_tokens_seen": 6128272, "step": 6405 }, { "epoch": 0.5228811485439269, "grad_norm": 4.6584014892578125, "learning_rate": 2.6139978791092262e-05, "loss": 0.3437, "num_input_tokens_seen": 6133168, "step": 6410 }, { "epoch": 0.5232890121543355, "grad_norm": 5.964834690093994, "learning_rate": 2.6160371971612695e-05, "loss": 0.3442, "num_input_tokens_seen": 6138240, "step": 6415 }, { "epoch": 0.5236968757647442, "grad_norm": 2.862839460372925, "learning_rate": 2.618076515213313e-05, "loss": 0.3172, "num_input_tokens_seen": 6142608, "step": 6420 }, { "epoch": 0.5241047393751529, "grad_norm": 0.7890955209732056, "learning_rate": 2.6201158332653563e-05, "loss": 0.4546, "num_input_tokens_seen": 6147232, "step": 6425 }, { "epoch": 0.5245126029855617, "grad_norm": 5.935354709625244, "learning_rate": 2.6221551513174e-05, "loss": 0.5168, "num_input_tokens_seen": 6152304, "step": 6430 }, { "epoch": 0.5249204665959704, "grad_norm": 5.708734035491943, "learning_rate": 2.624194469369443e-05, "loss": 0.4172, "num_input_tokens_seen": 6156528, "step": 6435 }, { "epoch": 0.525328330206379, "grad_norm": 3.3211207389831543, "learning_rate": 2.6262337874214864e-05, "loss": 0.2892, "num_input_tokens_seen": 6160912, "step": 6440 }, { "epoch": 0.5257361938167877, "grad_norm": 3.238603353500366, "learning_rate": 2.62827310547353e-05, "loss": 0.261, "num_input_tokens_seen": 6166016, "step": 6445 }, { "epoch": 0.5261440574271964, "grad_norm": 5.525774002075195, "learning_rate": 2.630312423525573e-05, "loss": 0.3782, "num_input_tokens_seen": 6170560, "step": 6450 }, { "epoch": 0.5265519210376051, "grad_norm": 2.345075845718384, "learning_rate": 2.6323517415776167e-05, "loss": 0.5469, "num_input_tokens_seen": 6174704, "step": 6455 }, { "epoch": 0.5269597846480137, "grad_norm": 2.4644482135772705, "learning_rate": 2.63439105962966e-05, "loss": 0.244, "num_input_tokens_seen": 6179792, "step": 6460 }, { "epoch": 0.5273676482584224, "grad_norm": 3.688465118408203, "learning_rate": 2.6364303776817036e-05, "loss": 0.2636, "num_input_tokens_seen": 6183888, "step": 6465 }, { "epoch": 0.5277755118688311, "grad_norm": 5.7085795402526855, "learning_rate": 2.6384696957337468e-05, "loss": 0.188, "num_input_tokens_seen": 6188752, "step": 6470 }, { "epoch": 0.5281833754792398, "grad_norm": 1.3085280656814575, "learning_rate": 2.64050901378579e-05, "loss": 0.2836, "num_input_tokens_seen": 6193648, "step": 6475 }, { "epoch": 0.5285912390896484, "grad_norm": 4.420768737792969, "learning_rate": 2.6425483318378336e-05, "loss": 0.8671, "num_input_tokens_seen": 6198736, "step": 6480 }, { "epoch": 0.5289991027000571, "grad_norm": 0.408110111951828, "learning_rate": 2.644587649889877e-05, "loss": 0.1713, "num_input_tokens_seen": 6202880, "step": 6485 }, { "epoch": 0.5294069663104658, "grad_norm": 0.9258120059967041, "learning_rate": 2.6466269679419204e-05, "loss": 0.3773, "num_input_tokens_seen": 6207760, "step": 6490 }, { "epoch": 0.5298148299208745, "grad_norm": 3.525092601776123, "learning_rate": 2.6486662859939637e-05, "loss": 0.376, "num_input_tokens_seen": 6212752, "step": 6495 }, { "epoch": 0.5302226935312832, "grad_norm": 0.8244789838790894, "learning_rate": 2.6507056040460072e-05, "loss": 0.2018, "num_input_tokens_seen": 6217232, "step": 6500 }, { "epoch": 0.5306305571416918, "grad_norm": 5.928687572479248, "learning_rate": 2.6527449220980505e-05, "loss": 0.7745, "num_input_tokens_seen": 6221664, "step": 6505 }, { "epoch": 0.5310384207521005, "grad_norm": 6.059571266174316, "learning_rate": 2.6547842401500937e-05, "loss": 0.4208, "num_input_tokens_seen": 6227248, "step": 6510 }, { "epoch": 0.5314462843625092, "grad_norm": 4.019392967224121, "learning_rate": 2.6568235582021373e-05, "loss": 0.3037, "num_input_tokens_seen": 6231712, "step": 6515 }, { "epoch": 0.5318541479729179, "grad_norm": 2.815485715866089, "learning_rate": 2.6588628762541805e-05, "loss": 0.2871, "num_input_tokens_seen": 6236400, "step": 6520 }, { "epoch": 0.5322620115833265, "grad_norm": 3.101485013961792, "learning_rate": 2.660902194306224e-05, "loss": 0.3067, "num_input_tokens_seen": 6241072, "step": 6525 }, { "epoch": 0.5326698751937352, "grad_norm": 4.148009777069092, "learning_rate": 2.6629415123582673e-05, "loss": 0.2532, "num_input_tokens_seen": 6246176, "step": 6530 }, { "epoch": 0.5330777388041439, "grad_norm": 2.0793356895446777, "learning_rate": 2.664980830410311e-05, "loss": 0.2278, "num_input_tokens_seen": 6251232, "step": 6535 }, { "epoch": 0.5334856024145526, "grad_norm": 3.1074578762054443, "learning_rate": 2.667020148462354e-05, "loss": 0.4693, "num_input_tokens_seen": 6255648, "step": 6540 }, { "epoch": 0.5338934660249612, "grad_norm": 2.6855502128601074, "learning_rate": 2.6690594665143974e-05, "loss": 0.2515, "num_input_tokens_seen": 6260704, "step": 6545 }, { "epoch": 0.5343013296353699, "grad_norm": 11.528626441955566, "learning_rate": 2.671098784566441e-05, "loss": 0.3678, "num_input_tokens_seen": 6265520, "step": 6550 }, { "epoch": 0.5347091932457786, "grad_norm": 8.313993453979492, "learning_rate": 2.6731381026184842e-05, "loss": 0.6623, "num_input_tokens_seen": 6270928, "step": 6555 }, { "epoch": 0.5351170568561873, "grad_norm": 3.2306251525878906, "learning_rate": 2.6751774206705278e-05, "loss": 0.3735, "num_input_tokens_seen": 6276032, "step": 6560 }, { "epoch": 0.5355249204665959, "grad_norm": 1.682097315788269, "learning_rate": 2.677216738722571e-05, "loss": 0.2111, "num_input_tokens_seen": 6281072, "step": 6565 }, { "epoch": 0.5359327840770046, "grad_norm": 3.046433687210083, "learning_rate": 2.6792560567746146e-05, "loss": 0.2242, "num_input_tokens_seen": 6286128, "step": 6570 }, { "epoch": 0.5363406476874133, "grad_norm": 7.547574996948242, "learning_rate": 2.681295374826658e-05, "loss": 0.5267, "num_input_tokens_seen": 6290720, "step": 6575 }, { "epoch": 0.536748511297822, "grad_norm": 1.0916041135787964, "learning_rate": 2.6833346928787017e-05, "loss": 0.2939, "num_input_tokens_seen": 6295216, "step": 6580 }, { "epoch": 0.5371563749082306, "grad_norm": 3.132174491882324, "learning_rate": 2.685374010930745e-05, "loss": 0.2642, "num_input_tokens_seen": 6299984, "step": 6585 }, { "epoch": 0.5375642385186393, "grad_norm": 1.582925796508789, "learning_rate": 2.6874133289827886e-05, "loss": 0.4417, "num_input_tokens_seen": 6304464, "step": 6590 }, { "epoch": 0.537972102129048, "grad_norm": 4.7001142501831055, "learning_rate": 2.6894526470348318e-05, "loss": 0.4255, "num_input_tokens_seen": 6309536, "step": 6595 }, { "epoch": 0.5383799657394567, "grad_norm": 2.9020681381225586, "learning_rate": 2.6914919650868754e-05, "loss": 0.2288, "num_input_tokens_seen": 6314208, "step": 6600 }, { "epoch": 0.5387878293498654, "grad_norm": 2.895322322845459, "learning_rate": 2.6935312831389186e-05, "loss": 0.2349, "num_input_tokens_seen": 6319328, "step": 6605 }, { "epoch": 0.539195692960274, "grad_norm": 2.6504807472229004, "learning_rate": 2.6955706011909622e-05, "loss": 0.2441, "num_input_tokens_seen": 6324496, "step": 6610 }, { "epoch": 0.5396035565706828, "grad_norm": 7.7918500900268555, "learning_rate": 2.6976099192430054e-05, "loss": 0.1784, "num_input_tokens_seen": 6329536, "step": 6615 }, { "epoch": 0.5400114201810915, "grad_norm": 1.9199694395065308, "learning_rate": 2.6996492372950487e-05, "loss": 0.092, "num_input_tokens_seen": 6334400, "step": 6620 }, { "epoch": 0.5404192837915002, "grad_norm": 7.974882125854492, "learning_rate": 2.7016885553470922e-05, "loss": 0.1969, "num_input_tokens_seen": 6339376, "step": 6625 }, { "epoch": 0.5408271474019088, "grad_norm": 0.1353074312210083, "learning_rate": 2.7037278733991355e-05, "loss": 0.2494, "num_input_tokens_seen": 6344224, "step": 6630 }, { "epoch": 0.5412350110123175, "grad_norm": 13.736860275268555, "learning_rate": 2.705767191451179e-05, "loss": 1.2003, "num_input_tokens_seen": 6348960, "step": 6635 }, { "epoch": 0.5416428746227262, "grad_norm": 5.006495475769043, "learning_rate": 2.7078065095032223e-05, "loss": 0.4202, "num_input_tokens_seen": 6353424, "step": 6640 }, { "epoch": 0.5420507382331349, "grad_norm": 4.987539291381836, "learning_rate": 2.709845827555266e-05, "loss": 0.4799, "num_input_tokens_seen": 6357616, "step": 6645 }, { "epoch": 0.5424586018435436, "grad_norm": 3.0130703449249268, "learning_rate": 2.711885145607309e-05, "loss": 0.4072, "num_input_tokens_seen": 6363008, "step": 6650 }, { "epoch": 0.5428664654539522, "grad_norm": 1.5786913633346558, "learning_rate": 2.7139244636593523e-05, "loss": 0.2248, "num_input_tokens_seen": 6368144, "step": 6655 }, { "epoch": 0.5432743290643609, "grad_norm": 2.3412246704101562, "learning_rate": 2.715963781711396e-05, "loss": 0.3295, "num_input_tokens_seen": 6372144, "step": 6660 }, { "epoch": 0.5436821926747696, "grad_norm": 2.3509066104888916, "learning_rate": 2.718003099763439e-05, "loss": 0.3423, "num_input_tokens_seen": 6376288, "step": 6665 }, { "epoch": 0.5440900562851783, "grad_norm": 1.1089973449707031, "learning_rate": 2.7200424178154827e-05, "loss": 0.3159, "num_input_tokens_seen": 6381424, "step": 6670 }, { "epoch": 0.5444979198955869, "grad_norm": 3.365797519683838, "learning_rate": 2.722081735867526e-05, "loss": 0.2333, "num_input_tokens_seen": 6385552, "step": 6675 }, { "epoch": 0.5449057835059956, "grad_norm": 3.726969003677368, "learning_rate": 2.7241210539195695e-05, "loss": 0.3565, "num_input_tokens_seen": 6390864, "step": 6680 }, { "epoch": 0.5453136471164043, "grad_norm": 1.6962571144104004, "learning_rate": 2.7261603719716128e-05, "loss": 0.4322, "num_input_tokens_seen": 6395296, "step": 6685 }, { "epoch": 0.545721510726813, "grad_norm": 1.278773307800293, "learning_rate": 2.728199690023656e-05, "loss": 0.2547, "num_input_tokens_seen": 6400560, "step": 6690 }, { "epoch": 0.5461293743372216, "grad_norm": 1.9812407493591309, "learning_rate": 2.7302390080756996e-05, "loss": 0.3899, "num_input_tokens_seen": 6404928, "step": 6695 }, { "epoch": 0.5465372379476303, "grad_norm": 4.454627990722656, "learning_rate": 2.732278326127743e-05, "loss": 0.2135, "num_input_tokens_seen": 6409632, "step": 6700 }, { "epoch": 0.546945101558039, "grad_norm": 0.7014477252960205, "learning_rate": 2.7343176441797864e-05, "loss": 0.2327, "num_input_tokens_seen": 6414192, "step": 6705 }, { "epoch": 0.5473529651684477, "grad_norm": 10.889202117919922, "learning_rate": 2.7363569622318297e-05, "loss": 0.3985, "num_input_tokens_seen": 6418448, "step": 6710 }, { "epoch": 0.5477608287788563, "grad_norm": 3.6543197631835938, "learning_rate": 2.7383962802838732e-05, "loss": 0.371, "num_input_tokens_seen": 6423792, "step": 6715 }, { "epoch": 0.548168692389265, "grad_norm": 3.347442388534546, "learning_rate": 2.7404355983359165e-05, "loss": 0.258, "num_input_tokens_seen": 6428048, "step": 6720 }, { "epoch": 0.5485765559996737, "grad_norm": 1.2470451593399048, "learning_rate": 2.7424749163879597e-05, "loss": 0.2962, "num_input_tokens_seen": 6432560, "step": 6725 }, { "epoch": 0.5489844196100824, "grad_norm": 3.4425463676452637, "learning_rate": 2.7445142344400033e-05, "loss": 0.4702, "num_input_tokens_seen": 6437680, "step": 6730 }, { "epoch": 0.549392283220491, "grad_norm": 5.209595680236816, "learning_rate": 2.7465535524920465e-05, "loss": 0.3035, "num_input_tokens_seen": 6442720, "step": 6735 }, { "epoch": 0.5498001468308997, "grad_norm": 5.086968898773193, "learning_rate": 2.74859287054409e-05, "loss": 0.3247, "num_input_tokens_seen": 6447024, "step": 6740 }, { "epoch": 0.5502080104413084, "grad_norm": 2.3772759437561035, "learning_rate": 2.7506321885961333e-05, "loss": 0.3392, "num_input_tokens_seen": 6452016, "step": 6745 }, { "epoch": 0.5506158740517171, "grad_norm": 2.437790632247925, "learning_rate": 2.752671506648177e-05, "loss": 0.369, "num_input_tokens_seen": 6456784, "step": 6750 }, { "epoch": 0.5510237376621258, "grad_norm": 1.1919560432434082, "learning_rate": 2.75471082470022e-05, "loss": 0.2496, "num_input_tokens_seen": 6462240, "step": 6755 }, { "epoch": 0.5514316012725344, "grad_norm": 4.242952346801758, "learning_rate": 2.7567501427522634e-05, "loss": 0.3346, "num_input_tokens_seen": 6466736, "step": 6760 }, { "epoch": 0.5518394648829431, "grad_norm": 1.9761724472045898, "learning_rate": 2.7587894608043073e-05, "loss": 0.1716, "num_input_tokens_seen": 6471888, "step": 6765 }, { "epoch": 0.5522473284933518, "grad_norm": 0.3860419690608978, "learning_rate": 2.760828778856351e-05, "loss": 0.2661, "num_input_tokens_seen": 6477600, "step": 6770 }, { "epoch": 0.5526551921037605, "grad_norm": 3.8708817958831787, "learning_rate": 2.762868096908394e-05, "loss": 0.4175, "num_input_tokens_seen": 6482208, "step": 6775 }, { "epoch": 0.5530630557141691, "grad_norm": 2.127704620361328, "learning_rate": 2.7649074149604377e-05, "loss": 0.2562, "num_input_tokens_seen": 6487648, "step": 6780 }, { "epoch": 0.5534709193245778, "grad_norm": 1.2846133708953857, "learning_rate": 2.766946733012481e-05, "loss": 0.2311, "num_input_tokens_seen": 6492512, "step": 6785 }, { "epoch": 0.5538787829349865, "grad_norm": 6.895698547363281, "learning_rate": 2.7689860510645245e-05, "loss": 0.205, "num_input_tokens_seen": 6497216, "step": 6790 }, { "epoch": 0.5542866465453952, "grad_norm": 0.9131535887718201, "learning_rate": 2.7710253691165677e-05, "loss": 0.2483, "num_input_tokens_seen": 6500752, "step": 6795 }, { "epoch": 0.5546945101558038, "grad_norm": 9.26795482635498, "learning_rate": 2.773064687168611e-05, "loss": 0.7512, "num_input_tokens_seen": 6506016, "step": 6800 }, { "epoch": 0.5551023737662126, "grad_norm": 4.7475104331970215, "learning_rate": 2.7751040052206546e-05, "loss": 0.6913, "num_input_tokens_seen": 6510992, "step": 6805 }, { "epoch": 0.5555102373766213, "grad_norm": 5.43059778213501, "learning_rate": 2.7771433232726978e-05, "loss": 0.3525, "num_input_tokens_seen": 6515120, "step": 6810 }, { "epoch": 0.55591810098703, "grad_norm": 3.113370180130005, "learning_rate": 2.7791826413247414e-05, "loss": 0.4557, "num_input_tokens_seen": 6519152, "step": 6815 }, { "epoch": 0.5563259645974387, "grad_norm": 2.6546967029571533, "learning_rate": 2.7812219593767846e-05, "loss": 0.3518, "num_input_tokens_seen": 6523552, "step": 6820 }, { "epoch": 0.5567338282078473, "grad_norm": 2.322180986404419, "learning_rate": 2.7832612774288282e-05, "loss": 0.167, "num_input_tokens_seen": 6528784, "step": 6825 }, { "epoch": 0.557141691818256, "grad_norm": 4.558835983276367, "learning_rate": 2.7853005954808714e-05, "loss": 0.2626, "num_input_tokens_seen": 6532912, "step": 6830 }, { "epoch": 0.5575495554286647, "grad_norm": 1.9393359422683716, "learning_rate": 2.7873399135329147e-05, "loss": 0.2892, "num_input_tokens_seen": 6537520, "step": 6835 }, { "epoch": 0.5579574190390734, "grad_norm": 0.5381116271018982, "learning_rate": 2.7893792315849582e-05, "loss": 0.3879, "num_input_tokens_seen": 6542864, "step": 6840 }, { "epoch": 0.558365282649482, "grad_norm": 3.753859758377075, "learning_rate": 2.7914185496370015e-05, "loss": 0.3402, "num_input_tokens_seen": 6547824, "step": 6845 }, { "epoch": 0.5587731462598907, "grad_norm": 10.82161808013916, "learning_rate": 2.793457867689045e-05, "loss": 0.4006, "num_input_tokens_seen": 6552208, "step": 6850 }, { "epoch": 0.5591810098702994, "grad_norm": 1.4526100158691406, "learning_rate": 2.7954971857410883e-05, "loss": 0.3878, "num_input_tokens_seen": 6556816, "step": 6855 }, { "epoch": 0.5595888734807081, "grad_norm": 1.7893797159194946, "learning_rate": 2.797536503793132e-05, "loss": 0.3364, "num_input_tokens_seen": 6561136, "step": 6860 }, { "epoch": 0.5599967370911167, "grad_norm": 2.302502393722534, "learning_rate": 2.799575821845175e-05, "loss": 0.2874, "num_input_tokens_seen": 6566368, "step": 6865 }, { "epoch": 0.5604046007015254, "grad_norm": 3.494844436645508, "learning_rate": 2.8016151398972183e-05, "loss": 0.352, "num_input_tokens_seen": 6571200, "step": 6870 }, { "epoch": 0.5608124643119341, "grad_norm": 4.143986225128174, "learning_rate": 2.803654457949262e-05, "loss": 0.3008, "num_input_tokens_seen": 6576016, "step": 6875 }, { "epoch": 0.5612203279223428, "grad_norm": 4.956055641174316, "learning_rate": 2.805693776001305e-05, "loss": 0.3038, "num_input_tokens_seen": 6580496, "step": 6880 }, { "epoch": 0.5616281915327515, "grad_norm": 1.78471839427948, "learning_rate": 2.8077330940533487e-05, "loss": 0.2547, "num_input_tokens_seen": 6584416, "step": 6885 }, { "epoch": 0.5620360551431601, "grad_norm": 0.1429557204246521, "learning_rate": 2.809772412105392e-05, "loss": 0.1301, "num_input_tokens_seen": 6588528, "step": 6890 }, { "epoch": 0.5624439187535688, "grad_norm": 5.036365032196045, "learning_rate": 2.8118117301574355e-05, "loss": 0.2111, "num_input_tokens_seen": 6593584, "step": 6895 }, { "epoch": 0.5628517823639775, "grad_norm": 4.361187934875488, "learning_rate": 2.8138510482094788e-05, "loss": 0.2457, "num_input_tokens_seen": 6598016, "step": 6900 }, { "epoch": 0.5632596459743862, "grad_norm": 1.4190952777862549, "learning_rate": 2.815890366261522e-05, "loss": 0.0547, "num_input_tokens_seen": 6603024, "step": 6905 }, { "epoch": 0.5636675095847948, "grad_norm": 8.64985466003418, "learning_rate": 2.8179296843135656e-05, "loss": 0.8163, "num_input_tokens_seen": 6608608, "step": 6910 }, { "epoch": 0.5640753731952035, "grad_norm": 8.702670097351074, "learning_rate": 2.819969002365609e-05, "loss": 0.6917, "num_input_tokens_seen": 6613360, "step": 6915 }, { "epoch": 0.5644832368056122, "grad_norm": 2.8088810443878174, "learning_rate": 2.8220083204176524e-05, "loss": 0.3346, "num_input_tokens_seen": 6618320, "step": 6920 }, { "epoch": 0.5648911004160209, "grad_norm": 6.051054000854492, "learning_rate": 2.8240476384696956e-05, "loss": 0.6037, "num_input_tokens_seen": 6623472, "step": 6925 }, { "epoch": 0.5652989640264295, "grad_norm": 5.723618507385254, "learning_rate": 2.826086956521739e-05, "loss": 0.3121, "num_input_tokens_seen": 6628368, "step": 6930 }, { "epoch": 0.5657068276368382, "grad_norm": 2.1837246417999268, "learning_rate": 2.8281262745737825e-05, "loss": 0.2915, "num_input_tokens_seen": 6632800, "step": 6935 }, { "epoch": 0.5661146912472469, "grad_norm": 1.0932780504226685, "learning_rate": 2.8301655926258257e-05, "loss": 0.313, "num_input_tokens_seen": 6637840, "step": 6940 }, { "epoch": 0.5665225548576556, "grad_norm": 24.726707458496094, "learning_rate": 2.8322049106778693e-05, "loss": 0.4275, "num_input_tokens_seen": 6642576, "step": 6945 }, { "epoch": 0.5669304184680642, "grad_norm": 5.077383518218994, "learning_rate": 2.8342442287299125e-05, "loss": 0.3726, "num_input_tokens_seen": 6647808, "step": 6950 }, { "epoch": 0.5673382820784729, "grad_norm": 1.0458394289016724, "learning_rate": 2.8362835467819564e-05, "loss": 0.4298, "num_input_tokens_seen": 6653120, "step": 6955 }, { "epoch": 0.5677461456888816, "grad_norm": 8.292800903320312, "learning_rate": 2.838322864834e-05, "loss": 0.2478, "num_input_tokens_seen": 6656624, "step": 6960 }, { "epoch": 0.5681540092992903, "grad_norm": 10.213837623596191, "learning_rate": 2.8403621828860432e-05, "loss": 0.3671, "num_input_tokens_seen": 6661584, "step": 6965 }, { "epoch": 0.568561872909699, "grad_norm": 3.2332077026367188, "learning_rate": 2.8424015009380868e-05, "loss": 0.2471, "num_input_tokens_seen": 6666320, "step": 6970 }, { "epoch": 0.5689697365201076, "grad_norm": 9.099130630493164, "learning_rate": 2.84444081899013e-05, "loss": 0.221, "num_input_tokens_seen": 6671168, "step": 6975 }, { "epoch": 0.5693776001305163, "grad_norm": 2.1573240756988525, "learning_rate": 2.8464801370421733e-05, "loss": 0.396, "num_input_tokens_seen": 6676144, "step": 6980 }, { "epoch": 0.569785463740925, "grad_norm": 6.313317775726318, "learning_rate": 2.848519455094217e-05, "loss": 0.5184, "num_input_tokens_seen": 6681056, "step": 6985 }, { "epoch": 0.5701933273513338, "grad_norm": 6.799045562744141, "learning_rate": 2.85055877314626e-05, "loss": 0.2501, "num_input_tokens_seen": 6686384, "step": 6990 }, { "epoch": 0.5706011909617424, "grad_norm": 5.172825813293457, "learning_rate": 2.8525980911983037e-05, "loss": 0.4623, "num_input_tokens_seen": 6691392, "step": 6995 }, { "epoch": 0.5710090545721511, "grad_norm": 3.1734094619750977, "learning_rate": 2.854637409250347e-05, "loss": 0.4831, "num_input_tokens_seen": 6695888, "step": 7000 }, { "epoch": 0.5714169181825598, "grad_norm": 5.221536636352539, "learning_rate": 2.8566767273023905e-05, "loss": 0.3125, "num_input_tokens_seen": 6699824, "step": 7005 }, { "epoch": 0.5718247817929685, "grad_norm": 6.727630615234375, "learning_rate": 2.8587160453544337e-05, "loss": 0.208, "num_input_tokens_seen": 6704112, "step": 7010 }, { "epoch": 0.5722326454033771, "grad_norm": 0.8758237957954407, "learning_rate": 2.860755363406477e-05, "loss": 0.1928, "num_input_tokens_seen": 6709040, "step": 7015 }, { "epoch": 0.5726405090137858, "grad_norm": 2.4124996662139893, "learning_rate": 2.8627946814585205e-05, "loss": 0.5122, "num_input_tokens_seen": 6713904, "step": 7020 }, { "epoch": 0.5730483726241945, "grad_norm": 2.960780382156372, "learning_rate": 2.8648339995105638e-05, "loss": 0.2572, "num_input_tokens_seen": 6719024, "step": 7025 }, { "epoch": 0.5734562362346032, "grad_norm": 6.733853340148926, "learning_rate": 2.8668733175626074e-05, "loss": 0.4954, "num_input_tokens_seen": 6724128, "step": 7030 }, { "epoch": 0.5738640998450119, "grad_norm": 1.4246244430541992, "learning_rate": 2.8689126356146506e-05, "loss": 0.3321, "num_input_tokens_seen": 6729120, "step": 7035 }, { "epoch": 0.5742719634554205, "grad_norm": 5.920324802398682, "learning_rate": 2.870951953666694e-05, "loss": 0.39, "num_input_tokens_seen": 6733328, "step": 7040 }, { "epoch": 0.5746798270658292, "grad_norm": 0.7640700936317444, "learning_rate": 2.8729912717187374e-05, "loss": 0.3175, "num_input_tokens_seen": 6737136, "step": 7045 }, { "epoch": 0.5750876906762379, "grad_norm": 2.145137071609497, "learning_rate": 2.8750305897707807e-05, "loss": 0.3707, "num_input_tokens_seen": 6742304, "step": 7050 }, { "epoch": 0.5754955542866466, "grad_norm": 2.9352645874023438, "learning_rate": 2.8770699078228242e-05, "loss": 0.383, "num_input_tokens_seen": 6747600, "step": 7055 }, { "epoch": 0.5759034178970552, "grad_norm": 1.8059602975845337, "learning_rate": 2.8791092258748675e-05, "loss": 0.3221, "num_input_tokens_seen": 6752336, "step": 7060 }, { "epoch": 0.5763112815074639, "grad_norm": 3.6458539962768555, "learning_rate": 2.881148543926911e-05, "loss": 0.3263, "num_input_tokens_seen": 6757072, "step": 7065 }, { "epoch": 0.5767191451178726, "grad_norm": 2.9218831062316895, "learning_rate": 2.8831878619789543e-05, "loss": 0.1917, "num_input_tokens_seen": 6762672, "step": 7070 }, { "epoch": 0.5771270087282813, "grad_norm": 2.042142152786255, "learning_rate": 2.8852271800309975e-05, "loss": 0.2421, "num_input_tokens_seen": 6767456, "step": 7075 }, { "epoch": 0.5775348723386899, "grad_norm": 3.4313507080078125, "learning_rate": 2.887266498083041e-05, "loss": 0.4116, "num_input_tokens_seen": 6771920, "step": 7080 }, { "epoch": 0.5779427359490986, "grad_norm": 3.66333270072937, "learning_rate": 2.8893058161350843e-05, "loss": 0.1994, "num_input_tokens_seen": 6776176, "step": 7085 }, { "epoch": 0.5783505995595073, "grad_norm": 1.2532035112380981, "learning_rate": 2.891345134187128e-05, "loss": 0.3181, "num_input_tokens_seen": 6781248, "step": 7090 }, { "epoch": 0.578758463169916, "grad_norm": 3.4842958450317383, "learning_rate": 2.893384452239171e-05, "loss": 0.3632, "num_input_tokens_seen": 6785984, "step": 7095 }, { "epoch": 0.5791663267803246, "grad_norm": 7.619007110595703, "learning_rate": 2.8954237702912147e-05, "loss": 0.4417, "num_input_tokens_seen": 6790896, "step": 7100 }, { "epoch": 0.5795741903907333, "grad_norm": 2.395275592803955, "learning_rate": 2.897463088343258e-05, "loss": 0.3023, "num_input_tokens_seen": 6795872, "step": 7105 }, { "epoch": 0.579982054001142, "grad_norm": 0.20877546072006226, "learning_rate": 2.8995024063953012e-05, "loss": 0.1877, "num_input_tokens_seen": 6800768, "step": 7110 }, { "epoch": 0.5803899176115507, "grad_norm": 2.3429362773895264, "learning_rate": 2.9015417244473448e-05, "loss": 0.2202, "num_input_tokens_seen": 6805904, "step": 7115 }, { "epoch": 0.5807977812219594, "grad_norm": 29.810958862304688, "learning_rate": 2.903581042499388e-05, "loss": 0.2906, "num_input_tokens_seen": 6811008, "step": 7120 }, { "epoch": 0.581205644832368, "grad_norm": 6.879574775695801, "learning_rate": 2.9056203605514316e-05, "loss": 0.3605, "num_input_tokens_seen": 6815424, "step": 7125 }, { "epoch": 0.5816135084427767, "grad_norm": 0.18366234004497528, "learning_rate": 2.9076596786034748e-05, "loss": 0.3001, "num_input_tokens_seen": 6820112, "step": 7130 }, { "epoch": 0.5820213720531854, "grad_norm": 8.14057731628418, "learning_rate": 2.9096989966555184e-05, "loss": 0.5588, "num_input_tokens_seen": 6825424, "step": 7135 }, { "epoch": 0.5824292356635941, "grad_norm": 1.5363167524337769, "learning_rate": 2.9117383147075623e-05, "loss": 0.3911, "num_input_tokens_seen": 6829248, "step": 7140 }, { "epoch": 0.5828370992740027, "grad_norm": 3.53981876373291, "learning_rate": 2.9137776327596056e-05, "loss": 0.3707, "num_input_tokens_seen": 6833808, "step": 7145 }, { "epoch": 0.5832449628844114, "grad_norm": 2.737457275390625, "learning_rate": 2.915816950811649e-05, "loss": 0.3004, "num_input_tokens_seen": 6839232, "step": 7150 }, { "epoch": 0.5836528264948201, "grad_norm": 0.6538375616073608, "learning_rate": 2.9178562688636924e-05, "loss": 0.2385, "num_input_tokens_seen": 6844048, "step": 7155 }, { "epoch": 0.5840606901052288, "grad_norm": 30.050968170166016, "learning_rate": 2.9198955869157356e-05, "loss": 0.2757, "num_input_tokens_seen": 6847936, "step": 7160 }, { "epoch": 0.5844685537156374, "grad_norm": 11.244187355041504, "learning_rate": 2.9219349049677792e-05, "loss": 0.2908, "num_input_tokens_seen": 6851632, "step": 7165 }, { "epoch": 0.5848764173260461, "grad_norm": 0.8025100827217102, "learning_rate": 2.9239742230198224e-05, "loss": 0.1047, "num_input_tokens_seen": 6856480, "step": 7170 }, { "epoch": 0.5852842809364549, "grad_norm": 3.3862993717193604, "learning_rate": 2.926013541071866e-05, "loss": 0.4716, "num_input_tokens_seen": 6861440, "step": 7175 }, { "epoch": 0.5856921445468636, "grad_norm": 11.586474418640137, "learning_rate": 2.9280528591239092e-05, "loss": 0.419, "num_input_tokens_seen": 6866272, "step": 7180 }, { "epoch": 0.5861000081572723, "grad_norm": 0.5541324615478516, "learning_rate": 2.9300921771759525e-05, "loss": 0.205, "num_input_tokens_seen": 6871472, "step": 7185 }, { "epoch": 0.5865078717676809, "grad_norm": 5.372840404510498, "learning_rate": 2.932131495227996e-05, "loss": 0.6268, "num_input_tokens_seen": 6876752, "step": 7190 }, { "epoch": 0.5869157353780896, "grad_norm": 3.218903064727783, "learning_rate": 2.9341708132800393e-05, "loss": 0.1714, "num_input_tokens_seen": 6881456, "step": 7195 }, { "epoch": 0.5873235989884983, "grad_norm": 2.859358310699463, "learning_rate": 2.936210131332083e-05, "loss": 0.3372, "num_input_tokens_seen": 6885824, "step": 7200 }, { "epoch": 0.587731462598907, "grad_norm": 39.10120391845703, "learning_rate": 2.938249449384126e-05, "loss": 0.4404, "num_input_tokens_seen": 6890160, "step": 7205 }, { "epoch": 0.5881393262093156, "grad_norm": 0.30696025490760803, "learning_rate": 2.9402887674361697e-05, "loss": 0.4903, "num_input_tokens_seen": 6895376, "step": 7210 }, { "epoch": 0.5885471898197243, "grad_norm": 6.1831159591674805, "learning_rate": 2.942328085488213e-05, "loss": 0.3645, "num_input_tokens_seen": 6900032, "step": 7215 }, { "epoch": 0.588955053430133, "grad_norm": 5.374356269836426, "learning_rate": 2.944367403540256e-05, "loss": 0.2823, "num_input_tokens_seen": 6904960, "step": 7220 }, { "epoch": 0.5893629170405417, "grad_norm": 1.9598950147628784, "learning_rate": 2.9464067215922997e-05, "loss": 0.6555, "num_input_tokens_seen": 6910080, "step": 7225 }, { "epoch": 0.5897707806509503, "grad_norm": 4.989898681640625, "learning_rate": 2.948446039644343e-05, "loss": 0.3445, "num_input_tokens_seen": 6914560, "step": 7230 }, { "epoch": 0.590178644261359, "grad_norm": 5.639434814453125, "learning_rate": 2.9504853576963865e-05, "loss": 0.3126, "num_input_tokens_seen": 6919168, "step": 7235 }, { "epoch": 0.5905865078717677, "grad_norm": 2.507488965988159, "learning_rate": 2.9525246757484298e-05, "loss": 0.3468, "num_input_tokens_seen": 6923744, "step": 7240 }, { "epoch": 0.5909943714821764, "grad_norm": 3.5592381954193115, "learning_rate": 2.9545639938004734e-05, "loss": 0.1882, "num_input_tokens_seen": 6929456, "step": 7245 }, { "epoch": 0.591402235092585, "grad_norm": 17.154335021972656, "learning_rate": 2.9566033118525166e-05, "loss": 0.4278, "num_input_tokens_seen": 6934720, "step": 7250 }, { "epoch": 0.5918100987029937, "grad_norm": 0.9376546740531921, "learning_rate": 2.9586426299045598e-05, "loss": 0.5054, "num_input_tokens_seen": 6938864, "step": 7255 }, { "epoch": 0.5922179623134024, "grad_norm": 1.4564120769500732, "learning_rate": 2.9606819479566034e-05, "loss": 0.4577, "num_input_tokens_seen": 6944080, "step": 7260 }, { "epoch": 0.5926258259238111, "grad_norm": 6.811331748962402, "learning_rate": 2.9627212660086466e-05, "loss": 0.3158, "num_input_tokens_seen": 6949104, "step": 7265 }, { "epoch": 0.5930336895342198, "grad_norm": 2.253514289855957, "learning_rate": 2.9647605840606902e-05, "loss": 0.2746, "num_input_tokens_seen": 6953472, "step": 7270 }, { "epoch": 0.5934415531446284, "grad_norm": 0.17942692339420319, "learning_rate": 2.9667999021127335e-05, "loss": 0.1291, "num_input_tokens_seen": 6957760, "step": 7275 }, { "epoch": 0.5938494167550371, "grad_norm": 18.111127853393555, "learning_rate": 2.968839220164777e-05, "loss": 0.4105, "num_input_tokens_seen": 6962112, "step": 7280 }, { "epoch": 0.5942572803654458, "grad_norm": 9.568790435791016, "learning_rate": 2.9708785382168203e-05, "loss": 0.3853, "num_input_tokens_seen": 6965968, "step": 7285 }, { "epoch": 0.5946651439758545, "grad_norm": 10.027703285217285, "learning_rate": 2.9729178562688635e-05, "loss": 0.377, "num_input_tokens_seen": 6971136, "step": 7290 }, { "epoch": 0.5950730075862631, "grad_norm": 1.1426961421966553, "learning_rate": 2.974957174320907e-05, "loss": 0.1857, "num_input_tokens_seen": 6975584, "step": 7295 }, { "epoch": 0.5954808711966718, "grad_norm": 6.098761558532715, "learning_rate": 2.9769964923729503e-05, "loss": 0.3348, "num_input_tokens_seen": 6980896, "step": 7300 }, { "epoch": 0.5958887348070805, "grad_norm": 8.408178329467773, "learning_rate": 2.979035810424994e-05, "loss": 0.7204, "num_input_tokens_seen": 6985872, "step": 7305 }, { "epoch": 0.5962965984174892, "grad_norm": 5.631744384765625, "learning_rate": 2.981075128477037e-05, "loss": 0.2843, "num_input_tokens_seen": 6991584, "step": 7310 }, { "epoch": 0.5967044620278978, "grad_norm": 5.162776470184326, "learning_rate": 2.9831144465290807e-05, "loss": 0.5592, "num_input_tokens_seen": 6997024, "step": 7315 }, { "epoch": 0.5971123256383065, "grad_norm": 0.8387722373008728, "learning_rate": 2.985153764581124e-05, "loss": 0.2795, "num_input_tokens_seen": 7002064, "step": 7320 }, { "epoch": 0.5975201892487152, "grad_norm": 3.2781825065612793, "learning_rate": 2.987193082633168e-05, "loss": 0.7186, "num_input_tokens_seen": 7006832, "step": 7325 }, { "epoch": 0.5979280528591239, "grad_norm": 2.6401705741882324, "learning_rate": 2.989232400685211e-05, "loss": 0.4281, "num_input_tokens_seen": 7011216, "step": 7330 }, { "epoch": 0.5983359164695325, "grad_norm": 2.922891855239868, "learning_rate": 2.9912717187372547e-05, "loss": 0.262, "num_input_tokens_seen": 7015456, "step": 7335 }, { "epoch": 0.5987437800799412, "grad_norm": 3.4402523040771484, "learning_rate": 2.993311036789298e-05, "loss": 0.3452, "num_input_tokens_seen": 7020112, "step": 7340 }, { "epoch": 0.5991516436903499, "grad_norm": 1.7788690328598022, "learning_rate": 2.9953503548413415e-05, "loss": 0.3807, "num_input_tokens_seen": 7025264, "step": 7345 }, { "epoch": 0.5995595073007586, "grad_norm": 2.755300760269165, "learning_rate": 2.9973896728933847e-05, "loss": 0.2542, "num_input_tokens_seen": 7030112, "step": 7350 }, { "epoch": 0.5999673709111673, "grad_norm": 6.38527774810791, "learning_rate": 2.9994289909454283e-05, "loss": 0.2301, "num_input_tokens_seen": 7035920, "step": 7355 }, { "epoch": 0.600375234521576, "grad_norm": 0.7142125368118286, "learning_rate": 3.0014683089974715e-05, "loss": 0.2469, "num_input_tokens_seen": 7041664, "step": 7360 }, { "epoch": 0.6007830981319847, "grad_norm": 6.245069980621338, "learning_rate": 3.0035076270495148e-05, "loss": 0.3331, "num_input_tokens_seen": 7045280, "step": 7365 }, { "epoch": 0.6011909617423934, "grad_norm": 7.531704902648926, "learning_rate": 3.0055469451015584e-05, "loss": 0.159, "num_input_tokens_seen": 7049392, "step": 7370 }, { "epoch": 0.6015988253528021, "grad_norm": 1.602664589881897, "learning_rate": 3.0075862631536016e-05, "loss": 0.2529, "num_input_tokens_seen": 7054304, "step": 7375 }, { "epoch": 0.6020066889632107, "grad_norm": 6.176246166229248, "learning_rate": 3.0096255812056452e-05, "loss": 0.3391, "num_input_tokens_seen": 7059712, "step": 7380 }, { "epoch": 0.6024145525736194, "grad_norm": 13.052739143371582, "learning_rate": 3.0116648992576884e-05, "loss": 0.3495, "num_input_tokens_seen": 7063696, "step": 7385 }, { "epoch": 0.6028224161840281, "grad_norm": 0.227992981672287, "learning_rate": 3.013704217309732e-05, "loss": 0.1661, "num_input_tokens_seen": 7067920, "step": 7390 }, { "epoch": 0.6032302797944368, "grad_norm": 7.050046920776367, "learning_rate": 3.0157435353617752e-05, "loss": 0.4371, "num_input_tokens_seen": 7072416, "step": 7395 }, { "epoch": 0.6036381434048455, "grad_norm": 0.39692994952201843, "learning_rate": 3.0177828534138185e-05, "loss": 0.0462, "num_input_tokens_seen": 7077040, "step": 7400 }, { "epoch": 0.6040460070152541, "grad_norm": 2.7338345050811768, "learning_rate": 3.019822171465862e-05, "loss": 0.2912, "num_input_tokens_seen": 7081872, "step": 7405 }, { "epoch": 0.6044538706256628, "grad_norm": 0.6861996054649353, "learning_rate": 3.0218614895179053e-05, "loss": 0.3879, "num_input_tokens_seen": 7087392, "step": 7410 }, { "epoch": 0.6048617342360715, "grad_norm": 12.094457626342773, "learning_rate": 3.023900807569949e-05, "loss": 0.3219, "num_input_tokens_seen": 7091584, "step": 7415 }, { "epoch": 0.6052695978464802, "grad_norm": 6.1064863204956055, "learning_rate": 3.025940125621992e-05, "loss": 0.1949, "num_input_tokens_seen": 7096176, "step": 7420 }, { "epoch": 0.6056774614568888, "grad_norm": 0.1928253024816513, "learning_rate": 3.0279794436740357e-05, "loss": 0.0862, "num_input_tokens_seen": 7100864, "step": 7425 }, { "epoch": 0.6060853250672975, "grad_norm": 22.6698055267334, "learning_rate": 3.030018761726079e-05, "loss": 0.699, "num_input_tokens_seen": 7105504, "step": 7430 }, { "epoch": 0.6064931886777062, "grad_norm": 7.055789947509766, "learning_rate": 3.032058079778122e-05, "loss": 0.7199, "num_input_tokens_seen": 7109520, "step": 7435 }, { "epoch": 0.6069010522881149, "grad_norm": 0.7963969111442566, "learning_rate": 3.0340973978301657e-05, "loss": 0.3675, "num_input_tokens_seen": 7114528, "step": 7440 }, { "epoch": 0.6073089158985235, "grad_norm": 130.32249450683594, "learning_rate": 3.036136715882209e-05, "loss": 0.8667, "num_input_tokens_seen": 7119088, "step": 7445 }, { "epoch": 0.6077167795089322, "grad_norm": 9.46718692779541, "learning_rate": 3.0381760339342525e-05, "loss": 0.7288, "num_input_tokens_seen": 7124368, "step": 7450 }, { "epoch": 0.6081246431193409, "grad_norm": 13.7568998336792, "learning_rate": 3.0402153519862958e-05, "loss": 0.3392, "num_input_tokens_seen": 7128432, "step": 7455 }, { "epoch": 0.6085325067297496, "grad_norm": 0.8056121468544006, "learning_rate": 3.0422546700383393e-05, "loss": 0.4331, "num_input_tokens_seen": 7133504, "step": 7460 }, { "epoch": 0.6089403703401582, "grad_norm": 7.370016098022461, "learning_rate": 3.0442939880903826e-05, "loss": 0.2944, "num_input_tokens_seen": 7138832, "step": 7465 }, { "epoch": 0.6093482339505669, "grad_norm": 5.182626247406006, "learning_rate": 3.0463333061424258e-05, "loss": 0.7022, "num_input_tokens_seen": 7144592, "step": 7470 }, { "epoch": 0.6097560975609756, "grad_norm": 4.02085018157959, "learning_rate": 3.0483726241944694e-05, "loss": 0.2317, "num_input_tokens_seen": 7149792, "step": 7475 }, { "epoch": 0.6101639611713843, "grad_norm": 6.311238765716553, "learning_rate": 3.0504119422465126e-05, "loss": 0.3561, "num_input_tokens_seen": 7154512, "step": 7480 }, { "epoch": 0.610571824781793, "grad_norm": 7.544373035430908, "learning_rate": 3.052451260298556e-05, "loss": 0.2709, "num_input_tokens_seen": 7159232, "step": 7485 }, { "epoch": 0.6109796883922016, "grad_norm": 10.0413236618042, "learning_rate": 3.0544905783506e-05, "loss": 0.3208, "num_input_tokens_seen": 7163840, "step": 7490 }, { "epoch": 0.6113875520026103, "grad_norm": 12.743760108947754, "learning_rate": 3.056529896402643e-05, "loss": 0.3041, "num_input_tokens_seen": 7169200, "step": 7495 }, { "epoch": 0.611795415613019, "grad_norm": 5.404170036315918, "learning_rate": 3.058569214454686e-05, "loss": 0.568, "num_input_tokens_seen": 7173488, "step": 7500 }, { "epoch": 0.6122032792234277, "grad_norm": 6.605124473571777, "learning_rate": 3.0606085325067295e-05, "loss": 0.1553, "num_input_tokens_seen": 7178176, "step": 7505 }, { "epoch": 0.6126111428338363, "grad_norm": 7.735110282897949, "learning_rate": 3.0626478505587734e-05, "loss": 0.3195, "num_input_tokens_seen": 7182896, "step": 7510 }, { "epoch": 0.613019006444245, "grad_norm": 19.17605972290039, "learning_rate": 3.0646871686108167e-05, "loss": 0.3642, "num_input_tokens_seen": 7187520, "step": 7515 }, { "epoch": 0.6134268700546537, "grad_norm": 17.176076889038086, "learning_rate": 3.0667264866628606e-05, "loss": 0.4807, "num_input_tokens_seen": 7191712, "step": 7520 }, { "epoch": 0.6138347336650624, "grad_norm": 10.484075546264648, "learning_rate": 3.068765804714904e-05, "loss": 0.4656, "num_input_tokens_seen": 7196640, "step": 7525 }, { "epoch": 0.614242597275471, "grad_norm": 6.832305431365967, "learning_rate": 3.070805122766947e-05, "loss": 0.3598, "num_input_tokens_seen": 7201696, "step": 7530 }, { "epoch": 0.6146504608858797, "grad_norm": 0.658348023891449, "learning_rate": 3.07284444081899e-05, "loss": 0.2883, "num_input_tokens_seen": 7206720, "step": 7535 }, { "epoch": 0.6150583244962884, "grad_norm": 2.2346527576446533, "learning_rate": 3.074883758871034e-05, "loss": 0.2951, "num_input_tokens_seen": 7212112, "step": 7540 }, { "epoch": 0.6154661881066971, "grad_norm": 5.711630344390869, "learning_rate": 3.0769230769230774e-05, "loss": 0.6077, "num_input_tokens_seen": 7216752, "step": 7545 }, { "epoch": 0.6158740517171059, "grad_norm": 15.968098640441895, "learning_rate": 3.078962394975121e-05, "loss": 0.5051, "num_input_tokens_seen": 7222112, "step": 7550 }, { "epoch": 0.6162819153275145, "grad_norm": 22.887256622314453, "learning_rate": 3.081001713027164e-05, "loss": 0.4754, "num_input_tokens_seen": 7226688, "step": 7555 }, { "epoch": 0.6166897789379232, "grad_norm": 9.342429161071777, "learning_rate": 3.083041031079207e-05, "loss": 0.3151, "num_input_tokens_seen": 7231472, "step": 7560 }, { "epoch": 0.6170976425483319, "grad_norm": 4.296843528747559, "learning_rate": 3.085080349131251e-05, "loss": 0.3514, "num_input_tokens_seen": 7236272, "step": 7565 }, { "epoch": 0.6175055061587406, "grad_norm": 4.998960971832275, "learning_rate": 3.087119667183294e-05, "loss": 0.3747, "num_input_tokens_seen": 7240256, "step": 7570 }, { "epoch": 0.6179133697691492, "grad_norm": 9.727500915527344, "learning_rate": 3.0891589852353375e-05, "loss": 0.4073, "num_input_tokens_seen": 7244768, "step": 7575 }, { "epoch": 0.6183212333795579, "grad_norm": 3.7887909412384033, "learning_rate": 3.091198303287381e-05, "loss": 0.3911, "num_input_tokens_seen": 7249728, "step": 7580 }, { "epoch": 0.6187290969899666, "grad_norm": 2.844801425933838, "learning_rate": 3.093237621339424e-05, "loss": 0.3102, "num_input_tokens_seen": 7254000, "step": 7585 }, { "epoch": 0.6191369606003753, "grad_norm": 7.1689910888671875, "learning_rate": 3.095276939391468e-05, "loss": 0.3318, "num_input_tokens_seen": 7259072, "step": 7590 }, { "epoch": 0.6195448242107839, "grad_norm": 6.977562427520752, "learning_rate": 3.097316257443511e-05, "loss": 0.3709, "num_input_tokens_seen": 7263344, "step": 7595 }, { "epoch": 0.6199526878211926, "grad_norm": 5.566565036773682, "learning_rate": 3.0993555754955544e-05, "loss": 0.2914, "num_input_tokens_seen": 7267536, "step": 7600 }, { "epoch": 0.6203605514316013, "grad_norm": 5.27448034286499, "learning_rate": 3.1013948935475976e-05, "loss": 0.4288, "num_input_tokens_seen": 7271760, "step": 7605 }, { "epoch": 0.62076841504201, "grad_norm": 1.7240599393844604, "learning_rate": 3.103434211599641e-05, "loss": 0.2631, "num_input_tokens_seen": 7276800, "step": 7610 }, { "epoch": 0.6211762786524186, "grad_norm": 6.798946857452393, "learning_rate": 3.105473529651685e-05, "loss": 0.31, "num_input_tokens_seen": 7281184, "step": 7615 }, { "epoch": 0.6215841422628273, "grad_norm": 10.43857479095459, "learning_rate": 3.107512847703728e-05, "loss": 0.3603, "num_input_tokens_seen": 7286112, "step": 7620 }, { "epoch": 0.621992005873236, "grad_norm": 4.073036193847656, "learning_rate": 3.109552165755771e-05, "loss": 0.3386, "num_input_tokens_seen": 7291984, "step": 7625 }, { "epoch": 0.6223998694836447, "grad_norm": 6.867862224578857, "learning_rate": 3.1115914838078145e-05, "loss": 0.3332, "num_input_tokens_seen": 7297296, "step": 7630 }, { "epoch": 0.6228077330940534, "grad_norm": 36.53971862792969, "learning_rate": 3.1136308018598584e-05, "loss": 0.5432, "num_input_tokens_seen": 7302592, "step": 7635 }, { "epoch": 0.623215596704462, "grad_norm": 1.6832307577133179, "learning_rate": 3.1156701199119017e-05, "loss": 0.274, "num_input_tokens_seen": 7307120, "step": 7640 }, { "epoch": 0.6236234603148707, "grad_norm": 7.946831226348877, "learning_rate": 3.117709437963945e-05, "loss": 0.3835, "num_input_tokens_seen": 7311968, "step": 7645 }, { "epoch": 0.6240313239252794, "grad_norm": 9.116416931152344, "learning_rate": 3.119748756015988e-05, "loss": 0.5693, "num_input_tokens_seen": 7316816, "step": 7650 }, { "epoch": 0.6244391875356881, "grad_norm": 0.6624404191970825, "learning_rate": 3.1217880740680314e-05, "loss": 0.3202, "num_input_tokens_seen": 7321728, "step": 7655 }, { "epoch": 0.6248470511460967, "grad_norm": 1.1887484788894653, "learning_rate": 3.123827392120075e-05, "loss": 0.3584, "num_input_tokens_seen": 7326560, "step": 7660 }, { "epoch": 0.6252549147565054, "grad_norm": 11.685220718383789, "learning_rate": 3.1258667101721185e-05, "loss": 0.5794, "num_input_tokens_seen": 7331184, "step": 7665 }, { "epoch": 0.6256627783669141, "grad_norm": 4.569261074066162, "learning_rate": 3.127906028224162e-05, "loss": 0.3474, "num_input_tokens_seen": 7335456, "step": 7670 }, { "epoch": 0.6260706419773228, "grad_norm": 6.731314182281494, "learning_rate": 3.129945346276205e-05, "loss": 0.3637, "num_input_tokens_seen": 7340608, "step": 7675 }, { "epoch": 0.6264785055877314, "grad_norm": 3.892083168029785, "learning_rate": 3.131984664328248e-05, "loss": 0.3225, "num_input_tokens_seen": 7345984, "step": 7680 }, { "epoch": 0.6268863691981401, "grad_norm": 7.278768062591553, "learning_rate": 3.134023982380292e-05, "loss": 0.2572, "num_input_tokens_seen": 7349504, "step": 7685 }, { "epoch": 0.6272942328085488, "grad_norm": 3.8065738677978516, "learning_rate": 3.1360633004323354e-05, "loss": 0.2154, "num_input_tokens_seen": 7354336, "step": 7690 }, { "epoch": 0.6277020964189575, "grad_norm": 20.103614807128906, "learning_rate": 3.1381026184843786e-05, "loss": 0.3004, "num_input_tokens_seen": 7358992, "step": 7695 }, { "epoch": 0.6281099600293661, "grad_norm": 1.7823132276535034, "learning_rate": 3.1401419365364225e-05, "loss": 0.5871, "num_input_tokens_seen": 7364736, "step": 7700 }, { "epoch": 0.6285178236397748, "grad_norm": 6.596823215484619, "learning_rate": 3.142181254588466e-05, "loss": 0.2388, "num_input_tokens_seen": 7369024, "step": 7705 }, { "epoch": 0.6289256872501835, "grad_norm": 5.075286388397217, "learning_rate": 3.14422057264051e-05, "loss": 0.3709, "num_input_tokens_seen": 7372528, "step": 7710 }, { "epoch": 0.6293335508605922, "grad_norm": 7.442237854003906, "learning_rate": 3.146259890692553e-05, "loss": 0.2385, "num_input_tokens_seen": 7377616, "step": 7715 }, { "epoch": 0.6297414144710008, "grad_norm": 0.31620150804519653, "learning_rate": 3.148299208744596e-05, "loss": 0.3365, "num_input_tokens_seen": 7382336, "step": 7720 }, { "epoch": 0.6301492780814095, "grad_norm": 7.815780162811279, "learning_rate": 3.1503385267966394e-05, "loss": 0.4959, "num_input_tokens_seen": 7387392, "step": 7725 }, { "epoch": 0.6305571416918182, "grad_norm": 0.43090903759002686, "learning_rate": 3.1523778448486826e-05, "loss": 0.1896, "num_input_tokens_seen": 7392352, "step": 7730 }, { "epoch": 0.630965005302227, "grad_norm": 4.5590410232543945, "learning_rate": 3.1544171629007266e-05, "loss": 0.4597, "num_input_tokens_seen": 7397264, "step": 7735 }, { "epoch": 0.6313728689126357, "grad_norm": 8.369887351989746, "learning_rate": 3.15645648095277e-05, "loss": 0.4322, "num_input_tokens_seen": 7402592, "step": 7740 }, { "epoch": 0.6317807325230443, "grad_norm": 2.942819118499756, "learning_rate": 3.158495799004813e-05, "loss": 0.3109, "num_input_tokens_seen": 7407648, "step": 7745 }, { "epoch": 0.632188596133453, "grad_norm": 4.3921918869018555, "learning_rate": 3.160535117056856e-05, "loss": 0.3245, "num_input_tokens_seen": 7412480, "step": 7750 }, { "epoch": 0.6325964597438617, "grad_norm": 5.468618869781494, "learning_rate": 3.1625744351088995e-05, "loss": 0.3146, "num_input_tokens_seen": 7416848, "step": 7755 }, { "epoch": 0.6330043233542704, "grad_norm": 6.365808963775635, "learning_rate": 3.1646137531609434e-05, "loss": 0.525, "num_input_tokens_seen": 7422128, "step": 7760 }, { "epoch": 0.633412186964679, "grad_norm": 3.184077501296997, "learning_rate": 3.166653071212987e-05, "loss": 0.451, "num_input_tokens_seen": 7426128, "step": 7765 }, { "epoch": 0.6338200505750877, "grad_norm": 3.0695879459381104, "learning_rate": 3.16869238926503e-05, "loss": 0.3742, "num_input_tokens_seen": 7430896, "step": 7770 }, { "epoch": 0.6342279141854964, "grad_norm": 1.5587066411972046, "learning_rate": 3.170731707317073e-05, "loss": 0.4303, "num_input_tokens_seen": 7435248, "step": 7775 }, { "epoch": 0.6346357777959051, "grad_norm": 2.489150047302246, "learning_rate": 3.172771025369117e-05, "loss": 0.3528, "num_input_tokens_seen": 7439920, "step": 7780 }, { "epoch": 0.6350436414063138, "grad_norm": 1.009086012840271, "learning_rate": 3.17481034342116e-05, "loss": 0.3454, "num_input_tokens_seen": 7443888, "step": 7785 }, { "epoch": 0.6354515050167224, "grad_norm": 1.848678708076477, "learning_rate": 3.1768496614732035e-05, "loss": 0.3457, "num_input_tokens_seen": 7449328, "step": 7790 }, { "epoch": 0.6358593686271311, "grad_norm": 1.4532862901687622, "learning_rate": 3.178888979525247e-05, "loss": 0.3504, "num_input_tokens_seen": 7454752, "step": 7795 }, { "epoch": 0.6362672322375398, "grad_norm": 3.1544532775878906, "learning_rate": 3.18092829757729e-05, "loss": 0.3643, "num_input_tokens_seen": 7460336, "step": 7800 }, { "epoch": 0.6366750958479485, "grad_norm": 3.1005494594573975, "learning_rate": 3.182967615629334e-05, "loss": 0.4354, "num_input_tokens_seen": 7464448, "step": 7805 }, { "epoch": 0.6370829594583571, "grad_norm": 3.716813802719116, "learning_rate": 3.185006933681377e-05, "loss": 0.3762, "num_input_tokens_seen": 7469600, "step": 7810 }, { "epoch": 0.6374908230687658, "grad_norm": 3.002713680267334, "learning_rate": 3.1870462517334204e-05, "loss": 0.3162, "num_input_tokens_seen": 7475488, "step": 7815 }, { "epoch": 0.6378986866791745, "grad_norm": 1.87212336063385, "learning_rate": 3.1890855697854636e-05, "loss": 0.1751, "num_input_tokens_seen": 7480192, "step": 7820 }, { "epoch": 0.6383065502895832, "grad_norm": 9.327101707458496, "learning_rate": 3.191124887837507e-05, "loss": 0.3768, "num_input_tokens_seen": 7485104, "step": 7825 }, { "epoch": 0.6387144138999918, "grad_norm": 3.355435371398926, "learning_rate": 3.193164205889551e-05, "loss": 0.2603, "num_input_tokens_seen": 7489280, "step": 7830 }, { "epoch": 0.6391222775104005, "grad_norm": 5.8582048416137695, "learning_rate": 3.195203523941594e-05, "loss": 0.2109, "num_input_tokens_seen": 7493760, "step": 7835 }, { "epoch": 0.6395301411208092, "grad_norm": 5.596894264221191, "learning_rate": 3.197242841993637e-05, "loss": 0.2327, "num_input_tokens_seen": 7498640, "step": 7840 }, { "epoch": 0.6399380047312179, "grad_norm": 3.810800075531006, "learning_rate": 3.1992821600456805e-05, "loss": 0.5978, "num_input_tokens_seen": 7504512, "step": 7845 }, { "epoch": 0.6403458683416265, "grad_norm": 7.91201639175415, "learning_rate": 3.2013214780977244e-05, "loss": 0.2968, "num_input_tokens_seen": 7508864, "step": 7850 }, { "epoch": 0.6407537319520352, "grad_norm": 2.642721652984619, "learning_rate": 3.2033607961497677e-05, "loss": 0.3306, "num_input_tokens_seen": 7513744, "step": 7855 }, { "epoch": 0.6411615955624439, "grad_norm": 0.9816467761993408, "learning_rate": 3.205400114201811e-05, "loss": 0.0862, "num_input_tokens_seen": 7519104, "step": 7860 }, { "epoch": 0.6415694591728526, "grad_norm": 5.690241813659668, "learning_rate": 3.207439432253854e-05, "loss": 0.4009, "num_input_tokens_seen": 7524512, "step": 7865 }, { "epoch": 0.6419773227832613, "grad_norm": 2.9890952110290527, "learning_rate": 3.2094787503058974e-05, "loss": 0.4552, "num_input_tokens_seen": 7528784, "step": 7870 }, { "epoch": 0.6423851863936699, "grad_norm": 0.7082421183586121, "learning_rate": 3.211518068357941e-05, "loss": 0.3367, "num_input_tokens_seen": 7533280, "step": 7875 }, { "epoch": 0.6427930500040786, "grad_norm": 1.116072416305542, "learning_rate": 3.2135573864099845e-05, "loss": 0.3719, "num_input_tokens_seen": 7538576, "step": 7880 }, { "epoch": 0.6432009136144873, "grad_norm": 0.3513072431087494, "learning_rate": 3.2155967044620284e-05, "loss": 0.1436, "num_input_tokens_seen": 7543840, "step": 7885 }, { "epoch": 0.643608777224896, "grad_norm": 3.3536622524261475, "learning_rate": 3.217636022514072e-05, "loss": 0.2796, "num_input_tokens_seen": 7548240, "step": 7890 }, { "epoch": 0.6440166408353046, "grad_norm": 19.236331939697266, "learning_rate": 3.219675340566115e-05, "loss": 0.7237, "num_input_tokens_seen": 7553072, "step": 7895 }, { "epoch": 0.6444245044457133, "grad_norm": 5.9326066970825195, "learning_rate": 3.221714658618158e-05, "loss": 0.5655, "num_input_tokens_seen": 7558480, "step": 7900 }, { "epoch": 0.644832368056122, "grad_norm": 1.32155442237854, "learning_rate": 3.223753976670202e-05, "loss": 0.2891, "num_input_tokens_seen": 7563440, "step": 7905 }, { "epoch": 0.6452402316665307, "grad_norm": 9.341522216796875, "learning_rate": 3.225793294722245e-05, "loss": 0.3558, "num_input_tokens_seen": 7568704, "step": 7910 }, { "epoch": 0.6456480952769393, "grad_norm": 0.32976773381233215, "learning_rate": 3.2278326127742885e-05, "loss": 0.2707, "num_input_tokens_seen": 7573824, "step": 7915 }, { "epoch": 0.6460559588873481, "grad_norm": 6.807096004486084, "learning_rate": 3.229871930826332e-05, "loss": 0.6034, "num_input_tokens_seen": 7578592, "step": 7920 }, { "epoch": 0.6464638224977568, "grad_norm": 0.6586646437644958, "learning_rate": 3.231911248878376e-05, "loss": 0.5246, "num_input_tokens_seen": 7582768, "step": 7925 }, { "epoch": 0.6468716861081655, "grad_norm": 2.967839241027832, "learning_rate": 3.233950566930419e-05, "loss": 0.5358, "num_input_tokens_seen": 7587888, "step": 7930 }, { "epoch": 0.6472795497185742, "grad_norm": 0.7995076179504395, "learning_rate": 3.235989884982462e-05, "loss": 0.2706, "num_input_tokens_seen": 7591968, "step": 7935 }, { "epoch": 0.6476874133289828, "grad_norm": 1.1475800275802612, "learning_rate": 3.2380292030345054e-05, "loss": 0.3293, "num_input_tokens_seen": 7596704, "step": 7940 }, { "epoch": 0.6480952769393915, "grad_norm": 0.9950160384178162, "learning_rate": 3.2400685210865486e-05, "loss": 0.2335, "num_input_tokens_seen": 7602288, "step": 7945 }, { "epoch": 0.6485031405498002, "grad_norm": 1.0067235231399536, "learning_rate": 3.2421078391385926e-05, "loss": 0.2631, "num_input_tokens_seen": 7606864, "step": 7950 }, { "epoch": 0.6489110041602089, "grad_norm": 3.911411762237549, "learning_rate": 3.244147157190636e-05, "loss": 0.3042, "num_input_tokens_seen": 7611040, "step": 7955 }, { "epoch": 0.6493188677706175, "grad_norm": 11.344582557678223, "learning_rate": 3.246186475242679e-05, "loss": 0.2213, "num_input_tokens_seen": 7616256, "step": 7960 }, { "epoch": 0.6497267313810262, "grad_norm": 0.8190839886665344, "learning_rate": 3.248225793294722e-05, "loss": 0.3774, "num_input_tokens_seen": 7620864, "step": 7965 }, { "epoch": 0.6501345949914349, "grad_norm": 3.466235637664795, "learning_rate": 3.2502651113467655e-05, "loss": 0.2485, "num_input_tokens_seen": 7625728, "step": 7970 }, { "epoch": 0.6505424586018436, "grad_norm": 4.1767683029174805, "learning_rate": 3.2523044293988094e-05, "loss": 0.2928, "num_input_tokens_seen": 7630992, "step": 7975 }, { "epoch": 0.6509503222122522, "grad_norm": 6.566626071929932, "learning_rate": 3.2543437474508527e-05, "loss": 0.3356, "num_input_tokens_seen": 7635392, "step": 7980 }, { "epoch": 0.6513581858226609, "grad_norm": 11.879233360290527, "learning_rate": 3.256383065502896e-05, "loss": 0.4872, "num_input_tokens_seen": 7639504, "step": 7985 }, { "epoch": 0.6517660494330696, "grad_norm": 5.699578285217285, "learning_rate": 3.258422383554939e-05, "loss": 0.6639, "num_input_tokens_seen": 7644384, "step": 7990 }, { "epoch": 0.6521739130434783, "grad_norm": 6.05440092086792, "learning_rate": 3.260461701606983e-05, "loss": 0.5349, "num_input_tokens_seen": 7648784, "step": 7995 }, { "epoch": 0.652581776653887, "grad_norm": 2.7249176502227783, "learning_rate": 3.262501019659026e-05, "loss": 0.2917, "num_input_tokens_seen": 7654224, "step": 8000 }, { "epoch": 0.6529896402642956, "grad_norm": 0.8105337023735046, "learning_rate": 3.2645403377110695e-05, "loss": 0.2206, "num_input_tokens_seen": 7658624, "step": 8005 }, { "epoch": 0.6533975038747043, "grad_norm": 2.999237298965454, "learning_rate": 3.266579655763113e-05, "loss": 0.248, "num_input_tokens_seen": 7663552, "step": 8010 }, { "epoch": 0.653805367485113, "grad_norm": 6.832826614379883, "learning_rate": 3.268618973815156e-05, "loss": 0.4904, "num_input_tokens_seen": 7668176, "step": 8015 }, { "epoch": 0.6542132310955217, "grad_norm": 5.1235032081604, "learning_rate": 3.2706582918672e-05, "loss": 0.319, "num_input_tokens_seen": 7673728, "step": 8020 }, { "epoch": 0.6546210947059303, "grad_norm": 2.2351603507995605, "learning_rate": 3.272697609919243e-05, "loss": 0.2078, "num_input_tokens_seen": 7678688, "step": 8025 }, { "epoch": 0.655028958316339, "grad_norm": 3.414428472518921, "learning_rate": 3.2747369279712864e-05, "loss": 0.4898, "num_input_tokens_seen": 7683408, "step": 8030 }, { "epoch": 0.6554368219267477, "grad_norm": 6.504569053649902, "learning_rate": 3.2767762460233296e-05, "loss": 0.2761, "num_input_tokens_seen": 7687872, "step": 8035 }, { "epoch": 0.6558446855371564, "grad_norm": 3.348435640335083, "learning_rate": 3.278815564075373e-05, "loss": 0.3019, "num_input_tokens_seen": 7692624, "step": 8040 }, { "epoch": 0.656252549147565, "grad_norm": 4.551955699920654, "learning_rate": 3.280854882127417e-05, "loss": 0.3046, "num_input_tokens_seen": 7698224, "step": 8045 }, { "epoch": 0.6566604127579737, "grad_norm": 4.75576639175415, "learning_rate": 3.28289420017946e-05, "loss": 0.298, "num_input_tokens_seen": 7702400, "step": 8050 }, { "epoch": 0.6570682763683824, "grad_norm": 5.686522960662842, "learning_rate": 3.284933518231503e-05, "loss": 0.133, "num_input_tokens_seen": 7707264, "step": 8055 }, { "epoch": 0.6574761399787911, "grad_norm": 4.232507228851318, "learning_rate": 3.2869728362835465e-05, "loss": 0.2894, "num_input_tokens_seen": 7712944, "step": 8060 }, { "epoch": 0.6578840035891997, "grad_norm": 0.7494542598724365, "learning_rate": 3.28901215433559e-05, "loss": 0.2651, "num_input_tokens_seen": 7718016, "step": 8065 }, { "epoch": 0.6582918671996084, "grad_norm": 5.8961029052734375, "learning_rate": 3.291051472387634e-05, "loss": 0.5899, "num_input_tokens_seen": 7723360, "step": 8070 }, { "epoch": 0.6586997308100171, "grad_norm": 4.862386703491211, "learning_rate": 3.2930907904396776e-05, "loss": 0.2595, "num_input_tokens_seen": 7728848, "step": 8075 }, { "epoch": 0.6591075944204258, "grad_norm": 6.828092098236084, "learning_rate": 3.295130108491721e-05, "loss": 0.4133, "num_input_tokens_seen": 7734480, "step": 8080 }, { "epoch": 0.6595154580308344, "grad_norm": 8.565481185913086, "learning_rate": 3.297169426543764e-05, "loss": 0.3831, "num_input_tokens_seen": 7738672, "step": 8085 }, { "epoch": 0.6599233216412431, "grad_norm": 12.924174308776855, "learning_rate": 3.299208744595807e-05, "loss": 0.5043, "num_input_tokens_seen": 7743616, "step": 8090 }, { "epoch": 0.6603311852516518, "grad_norm": 0.6474586725234985, "learning_rate": 3.301248062647851e-05, "loss": 0.222, "num_input_tokens_seen": 7748304, "step": 8095 }, { "epoch": 0.6607390488620605, "grad_norm": 2.6972665786743164, "learning_rate": 3.3032873806998944e-05, "loss": 0.3171, "num_input_tokens_seen": 7752640, "step": 8100 }, { "epoch": 0.6611469124724693, "grad_norm": 4.372838973999023, "learning_rate": 3.305326698751938e-05, "loss": 0.3406, "num_input_tokens_seen": 7757792, "step": 8105 }, { "epoch": 0.6615547760828779, "grad_norm": 6.767300128936768, "learning_rate": 3.307366016803981e-05, "loss": 0.1872, "num_input_tokens_seen": 7762576, "step": 8110 }, { "epoch": 0.6619626396932866, "grad_norm": 9.28697395324707, "learning_rate": 3.309405334856024e-05, "loss": 0.3896, "num_input_tokens_seen": 7767536, "step": 8115 }, { "epoch": 0.6623705033036953, "grad_norm": 5.66070032119751, "learning_rate": 3.311444652908068e-05, "loss": 0.365, "num_input_tokens_seen": 7772608, "step": 8120 }, { "epoch": 0.662778366914104, "grad_norm": 6.182912826538086, "learning_rate": 3.313483970960111e-05, "loss": 0.3662, "num_input_tokens_seen": 7778064, "step": 8125 }, { "epoch": 0.6631862305245126, "grad_norm": 6.693013668060303, "learning_rate": 3.3155232890121545e-05, "loss": 0.4798, "num_input_tokens_seen": 7781888, "step": 8130 }, { "epoch": 0.6635940941349213, "grad_norm": 6.031898021697998, "learning_rate": 3.317562607064198e-05, "loss": 0.3598, "num_input_tokens_seen": 7787328, "step": 8135 }, { "epoch": 0.66400195774533, "grad_norm": 5.188581466674805, "learning_rate": 3.319601925116242e-05, "loss": 0.1591, "num_input_tokens_seen": 7792128, "step": 8140 }, { "epoch": 0.6644098213557387, "grad_norm": 0.5008418560028076, "learning_rate": 3.321641243168285e-05, "loss": 0.1324, "num_input_tokens_seen": 7797536, "step": 8145 }, { "epoch": 0.6648176849661473, "grad_norm": 10.521478652954102, "learning_rate": 3.323680561220328e-05, "loss": 0.5599, "num_input_tokens_seen": 7803296, "step": 8150 }, { "epoch": 0.665225548576556, "grad_norm": 13.279610633850098, "learning_rate": 3.3257198792723714e-05, "loss": 0.531, "num_input_tokens_seen": 7807600, "step": 8155 }, { "epoch": 0.6656334121869647, "grad_norm": 14.753479957580566, "learning_rate": 3.3277591973244146e-05, "loss": 0.4932, "num_input_tokens_seen": 7811600, "step": 8160 }, { "epoch": 0.6660412757973734, "grad_norm": 4.093859672546387, "learning_rate": 3.3297985153764585e-05, "loss": 0.3546, "num_input_tokens_seen": 7815712, "step": 8165 }, { "epoch": 0.666449139407782, "grad_norm": 3.3173489570617676, "learning_rate": 3.331837833428502e-05, "loss": 0.2033, "num_input_tokens_seen": 7821136, "step": 8170 }, { "epoch": 0.6668570030181907, "grad_norm": 8.008049011230469, "learning_rate": 3.333877151480545e-05, "loss": 0.368, "num_input_tokens_seen": 7825584, "step": 8175 }, { "epoch": 0.6672648666285994, "grad_norm": 1.5571495294570923, "learning_rate": 3.335916469532588e-05, "loss": 0.2774, "num_input_tokens_seen": 7829488, "step": 8180 }, { "epoch": 0.6676727302390081, "grad_norm": 1.2416630983352661, "learning_rate": 3.3379557875846315e-05, "loss": 0.1491, "num_input_tokens_seen": 7834480, "step": 8185 }, { "epoch": 0.6680805938494168, "grad_norm": 1.5110069513320923, "learning_rate": 3.3399951056366754e-05, "loss": 0.1843, "num_input_tokens_seen": 7839760, "step": 8190 }, { "epoch": 0.6684884574598254, "grad_norm": 8.653284072875977, "learning_rate": 3.3420344236887187e-05, "loss": 0.3313, "num_input_tokens_seen": 7845216, "step": 8195 }, { "epoch": 0.6688963210702341, "grad_norm": 3.7786710262298584, "learning_rate": 3.344073741740762e-05, "loss": 0.3557, "num_input_tokens_seen": 7850048, "step": 8200 }, { "epoch": 0.6693041846806428, "grad_norm": 10.772663116455078, "learning_rate": 3.346113059792805e-05, "loss": 0.5141, "num_input_tokens_seen": 7854592, "step": 8205 }, { "epoch": 0.6697120482910515, "grad_norm": 7.487743377685547, "learning_rate": 3.3481523778448484e-05, "loss": 0.4533, "num_input_tokens_seen": 7859904, "step": 8210 }, { "epoch": 0.6701199119014601, "grad_norm": 4.657465934753418, "learning_rate": 3.350191695896892e-05, "loss": 0.4537, "num_input_tokens_seen": 7865056, "step": 8215 }, { "epoch": 0.6705277755118688, "grad_norm": 8.75829029083252, "learning_rate": 3.3522310139489355e-05, "loss": 0.3801, "num_input_tokens_seen": 7869840, "step": 8220 }, { "epoch": 0.6709356391222775, "grad_norm": 2.6656901836395264, "learning_rate": 3.354270332000979e-05, "loss": 0.2288, "num_input_tokens_seen": 7874208, "step": 8225 }, { "epoch": 0.6713435027326862, "grad_norm": 4.486512184143066, "learning_rate": 3.356309650053022e-05, "loss": 0.2217, "num_input_tokens_seen": 7878288, "step": 8230 }, { "epoch": 0.6717513663430948, "grad_norm": 0.9407968521118164, "learning_rate": 3.358348968105066e-05, "loss": 0.2196, "num_input_tokens_seen": 7882416, "step": 8235 }, { "epoch": 0.6721592299535035, "grad_norm": 0.8678784966468811, "learning_rate": 3.360388286157109e-05, "loss": 0.268, "num_input_tokens_seen": 7888000, "step": 8240 }, { "epoch": 0.6725670935639122, "grad_norm": 2.522124767303467, "learning_rate": 3.3624276042091524e-05, "loss": 0.3658, "num_input_tokens_seen": 7892880, "step": 8245 }, { "epoch": 0.6729749571743209, "grad_norm": 5.265491008758545, "learning_rate": 3.3644669222611956e-05, "loss": 0.5579, "num_input_tokens_seen": 7897760, "step": 8250 }, { "epoch": 0.6733828207847296, "grad_norm": 26.025108337402344, "learning_rate": 3.3665062403132395e-05, "loss": 0.2999, "num_input_tokens_seen": 7902672, "step": 8255 }, { "epoch": 0.6737906843951382, "grad_norm": 5.532973766326904, "learning_rate": 3.368545558365283e-05, "loss": 0.5145, "num_input_tokens_seen": 7907440, "step": 8260 }, { "epoch": 0.6741985480055469, "grad_norm": 7.402491092681885, "learning_rate": 3.370584876417327e-05, "loss": 0.1863, "num_input_tokens_seen": 7911824, "step": 8265 }, { "epoch": 0.6746064116159556, "grad_norm": 0.31432202458381653, "learning_rate": 3.37262419446937e-05, "loss": 0.1248, "num_input_tokens_seen": 7917184, "step": 8270 }, { "epoch": 0.6750142752263643, "grad_norm": 0.8672139048576355, "learning_rate": 3.374663512521413e-05, "loss": 0.2678, "num_input_tokens_seen": 7921760, "step": 8275 }, { "epoch": 0.6754221388367729, "grad_norm": 0.2627467215061188, "learning_rate": 3.3767028305734564e-05, "loss": 0.1586, "num_input_tokens_seen": 7926192, "step": 8280 }, { "epoch": 0.6758300024471816, "grad_norm": 12.315408706665039, "learning_rate": 3.3787421486255e-05, "loss": 0.8093, "num_input_tokens_seen": 7930976, "step": 8285 }, { "epoch": 0.6762378660575903, "grad_norm": 2.283923625946045, "learning_rate": 3.3807814666775436e-05, "loss": 0.1032, "num_input_tokens_seen": 7935168, "step": 8290 }, { "epoch": 0.6766457296679991, "grad_norm": 9.351886749267578, "learning_rate": 3.382820784729587e-05, "loss": 0.5345, "num_input_tokens_seen": 7939312, "step": 8295 }, { "epoch": 0.6770535932784078, "grad_norm": 3.0385303497314453, "learning_rate": 3.38486010278163e-05, "loss": 0.4873, "num_input_tokens_seen": 7944304, "step": 8300 }, { "epoch": 0.6774614568888164, "grad_norm": 5.744367599487305, "learning_rate": 3.386899420833673e-05, "loss": 0.293, "num_input_tokens_seen": 7948912, "step": 8305 }, { "epoch": 0.6778693204992251, "grad_norm": 1.8724517822265625, "learning_rate": 3.388938738885717e-05, "loss": 0.3768, "num_input_tokens_seen": 7953936, "step": 8310 }, { "epoch": 0.6782771841096338, "grad_norm": 1.776807188987732, "learning_rate": 3.3909780569377604e-05, "loss": 0.2138, "num_input_tokens_seen": 7959104, "step": 8315 }, { "epoch": 0.6786850477200425, "grad_norm": 2.353394031524658, "learning_rate": 3.3930173749898037e-05, "loss": 0.3367, "num_input_tokens_seen": 7963808, "step": 8320 }, { "epoch": 0.6790929113304511, "grad_norm": 2.3006834983825684, "learning_rate": 3.395056693041847e-05, "loss": 0.1726, "num_input_tokens_seen": 7968032, "step": 8325 }, { "epoch": 0.6795007749408598, "grad_norm": 7.661231517791748, "learning_rate": 3.39709601109389e-05, "loss": 0.1616, "num_input_tokens_seen": 7973024, "step": 8330 }, { "epoch": 0.6799086385512685, "grad_norm": 4.550341606140137, "learning_rate": 3.399135329145934e-05, "loss": 0.7135, "num_input_tokens_seen": 7977824, "step": 8335 }, { "epoch": 0.6803165021616772, "grad_norm": 0.2991190254688263, "learning_rate": 3.401174647197977e-05, "loss": 0.5507, "num_input_tokens_seen": 7982896, "step": 8340 }, { "epoch": 0.6807243657720858, "grad_norm": 9.665567398071289, "learning_rate": 3.4032139652500205e-05, "loss": 0.2949, "num_input_tokens_seen": 7987312, "step": 8345 }, { "epoch": 0.6811322293824945, "grad_norm": 2.9282052516937256, "learning_rate": 3.405253283302064e-05, "loss": 0.4259, "num_input_tokens_seen": 7992304, "step": 8350 }, { "epoch": 0.6815400929929032, "grad_norm": 4.5996246337890625, "learning_rate": 3.407292601354107e-05, "loss": 0.376, "num_input_tokens_seen": 7997952, "step": 8355 }, { "epoch": 0.6819479566033119, "grad_norm": 7.505980491638184, "learning_rate": 3.409331919406151e-05, "loss": 0.3489, "num_input_tokens_seen": 8002864, "step": 8360 }, { "epoch": 0.6823558202137205, "grad_norm": 295.44110107421875, "learning_rate": 3.411371237458194e-05, "loss": 0.4353, "num_input_tokens_seen": 8007552, "step": 8365 }, { "epoch": 0.6827636838241292, "grad_norm": 8.687475204467773, "learning_rate": 3.4134105555102374e-05, "loss": 0.5898, "num_input_tokens_seen": 8012896, "step": 8370 }, { "epoch": 0.6831715474345379, "grad_norm": 152.6177978515625, "learning_rate": 3.4154498735622806e-05, "loss": 0.9964, "num_input_tokens_seen": 8017376, "step": 8375 }, { "epoch": 0.6835794110449466, "grad_norm": 2.0049986839294434, "learning_rate": 3.4174891916143245e-05, "loss": 0.184, "num_input_tokens_seen": 8022464, "step": 8380 }, { "epoch": 0.6839872746553552, "grad_norm": 5.407594680786133, "learning_rate": 3.419528509666368e-05, "loss": 0.2401, "num_input_tokens_seen": 8026288, "step": 8385 }, { "epoch": 0.6843951382657639, "grad_norm": 5.2954325675964355, "learning_rate": 3.421567827718411e-05, "loss": 0.5556, "num_input_tokens_seen": 8031408, "step": 8390 }, { "epoch": 0.6848030018761726, "grad_norm": 5.517780780792236, "learning_rate": 3.423607145770454e-05, "loss": 0.355, "num_input_tokens_seen": 8036816, "step": 8395 }, { "epoch": 0.6852108654865813, "grad_norm": 2.9370272159576416, "learning_rate": 3.4256464638224975e-05, "loss": 0.145, "num_input_tokens_seen": 8041120, "step": 8400 }, { "epoch": 0.68561872909699, "grad_norm": 0.5159239768981934, "learning_rate": 3.4276857818745414e-05, "loss": 0.2736, "num_input_tokens_seen": 8046064, "step": 8405 }, { "epoch": 0.6860265927073986, "grad_norm": 5.2244086265563965, "learning_rate": 3.4297250999265846e-05, "loss": 0.3636, "num_input_tokens_seen": 8051200, "step": 8410 }, { "epoch": 0.6864344563178073, "grad_norm": 5.957608222961426, "learning_rate": 3.431764417978628e-05, "loss": 0.3347, "num_input_tokens_seen": 8055936, "step": 8415 }, { "epoch": 0.686842319928216, "grad_norm": 3.562805414199829, "learning_rate": 3.433803736030671e-05, "loss": 0.428, "num_input_tokens_seen": 8060848, "step": 8420 }, { "epoch": 0.6872501835386247, "grad_norm": 1.3540128469467163, "learning_rate": 3.4358430540827144e-05, "loss": 0.2083, "num_input_tokens_seen": 8066240, "step": 8425 }, { "epoch": 0.6876580471490333, "grad_norm": 3.3571507930755615, "learning_rate": 3.437882372134758e-05, "loss": 0.2607, "num_input_tokens_seen": 8070944, "step": 8430 }, { "epoch": 0.688065910759442, "grad_norm": 4.1814985275268555, "learning_rate": 3.4399216901868015e-05, "loss": 0.2763, "num_input_tokens_seen": 8076048, "step": 8435 }, { "epoch": 0.6884737743698507, "grad_norm": 1.6516278982162476, "learning_rate": 3.441961008238845e-05, "loss": 0.2689, "num_input_tokens_seen": 8080848, "step": 8440 }, { "epoch": 0.6888816379802594, "grad_norm": 7.961226940155029, "learning_rate": 3.444000326290889e-05, "loss": 0.295, "num_input_tokens_seen": 8086016, "step": 8445 }, { "epoch": 0.689289501590668, "grad_norm": 0.8317230939865112, "learning_rate": 3.446039644342932e-05, "loss": 0.1534, "num_input_tokens_seen": 8090416, "step": 8450 }, { "epoch": 0.6896973652010767, "grad_norm": 2.377271890640259, "learning_rate": 3.448078962394976e-05, "loss": 0.1067, "num_input_tokens_seen": 8096144, "step": 8455 }, { "epoch": 0.6901052288114854, "grad_norm": 3.3665273189544678, "learning_rate": 3.450118280447019e-05, "loss": 0.3509, "num_input_tokens_seen": 8101696, "step": 8460 }, { "epoch": 0.6905130924218941, "grad_norm": 8.741144180297852, "learning_rate": 3.452157598499062e-05, "loss": 0.5214, "num_input_tokens_seen": 8106416, "step": 8465 }, { "epoch": 0.6909209560323027, "grad_norm": 3.249805450439453, "learning_rate": 3.4541969165511055e-05, "loss": 0.3786, "num_input_tokens_seen": 8110912, "step": 8470 }, { "epoch": 0.6913288196427114, "grad_norm": 2.373617649078369, "learning_rate": 3.456236234603149e-05, "loss": 0.5655, "num_input_tokens_seen": 8115344, "step": 8475 }, { "epoch": 0.6917366832531202, "grad_norm": 6.342329025268555, "learning_rate": 3.458275552655193e-05, "loss": 0.1919, "num_input_tokens_seen": 8119552, "step": 8480 }, { "epoch": 0.6921445468635289, "grad_norm": 1.4734855890274048, "learning_rate": 3.460314870707236e-05, "loss": 0.2091, "num_input_tokens_seen": 8123520, "step": 8485 }, { "epoch": 0.6925524104739376, "grad_norm": 10.877909660339355, "learning_rate": 3.462354188759279e-05, "loss": 0.3076, "num_input_tokens_seen": 8128368, "step": 8490 }, { "epoch": 0.6929602740843462, "grad_norm": 6.304263114929199, "learning_rate": 3.4643935068113224e-05, "loss": 0.4107, "num_input_tokens_seen": 8133408, "step": 8495 }, { "epoch": 0.6933681376947549, "grad_norm": 7.3919806480407715, "learning_rate": 3.4664328248633656e-05, "loss": 0.3267, "num_input_tokens_seen": 8137888, "step": 8500 }, { "epoch": 0.6937760013051636, "grad_norm": 7.56325626373291, "learning_rate": 3.4684721429154095e-05, "loss": 0.5466, "num_input_tokens_seen": 8142624, "step": 8505 }, { "epoch": 0.6941838649155723, "grad_norm": 9.82279109954834, "learning_rate": 3.470511460967453e-05, "loss": 0.2968, "num_input_tokens_seen": 8146208, "step": 8510 }, { "epoch": 0.694591728525981, "grad_norm": 7.486039161682129, "learning_rate": 3.472550779019496e-05, "loss": 0.323, "num_input_tokens_seen": 8150912, "step": 8515 }, { "epoch": 0.6949995921363896, "grad_norm": 8.009119033813477, "learning_rate": 3.474590097071539e-05, "loss": 0.2767, "num_input_tokens_seen": 8155408, "step": 8520 }, { "epoch": 0.6954074557467983, "grad_norm": 12.319746017456055, "learning_rate": 3.476629415123583e-05, "loss": 0.2705, "num_input_tokens_seen": 8160896, "step": 8525 }, { "epoch": 0.695815319357207, "grad_norm": 6.099485397338867, "learning_rate": 3.4786687331756264e-05, "loss": 0.2129, "num_input_tokens_seen": 8165184, "step": 8530 }, { "epoch": 0.6962231829676157, "grad_norm": 5.409004211425781, "learning_rate": 3.4807080512276696e-05, "loss": 0.3489, "num_input_tokens_seen": 8169520, "step": 8535 }, { "epoch": 0.6966310465780243, "grad_norm": 3.413749933242798, "learning_rate": 3.482747369279713e-05, "loss": 0.3469, "num_input_tokens_seen": 8174240, "step": 8540 }, { "epoch": 0.697038910188433, "grad_norm": 14.192168235778809, "learning_rate": 3.484786687331756e-05, "loss": 0.2873, "num_input_tokens_seen": 8178080, "step": 8545 }, { "epoch": 0.6974467737988417, "grad_norm": 0.14293240010738373, "learning_rate": 3.4868260053838e-05, "loss": 0.4355, "num_input_tokens_seen": 8182896, "step": 8550 }, { "epoch": 0.6978546374092504, "grad_norm": 6.410829067230225, "learning_rate": 3.488865323435843e-05, "loss": 0.2302, "num_input_tokens_seen": 8187664, "step": 8555 }, { "epoch": 0.698262501019659, "grad_norm": 0.7954385876655579, "learning_rate": 3.4909046414878865e-05, "loss": 0.3484, "num_input_tokens_seen": 8192768, "step": 8560 }, { "epoch": 0.6986703646300677, "grad_norm": 8.315848350524902, "learning_rate": 3.49294395953993e-05, "loss": 0.1702, "num_input_tokens_seen": 8197088, "step": 8565 }, { "epoch": 0.6990782282404764, "grad_norm": 7.72318172454834, "learning_rate": 3.494983277591973e-05, "loss": 0.2702, "num_input_tokens_seen": 8202032, "step": 8570 }, { "epoch": 0.6994860918508851, "grad_norm": 7.39330530166626, "learning_rate": 3.497022595644017e-05, "loss": 0.2967, "num_input_tokens_seen": 8206192, "step": 8575 }, { "epoch": 0.6998939554612937, "grad_norm": 15.814242362976074, "learning_rate": 3.49906191369606e-05, "loss": 0.4867, "num_input_tokens_seen": 8211392, "step": 8580 }, { "epoch": 0.7003018190717024, "grad_norm": 1.476863145828247, "learning_rate": 3.5011012317481034e-05, "loss": 0.4714, "num_input_tokens_seen": 8215952, "step": 8585 }, { "epoch": 0.7007096826821111, "grad_norm": 2.04992413520813, "learning_rate": 3.5031405498001466e-05, "loss": 0.1761, "num_input_tokens_seen": 8219616, "step": 8590 }, { "epoch": 0.7011175462925198, "grad_norm": 2.3715200424194336, "learning_rate": 3.5051798678521905e-05, "loss": 0.086, "num_input_tokens_seen": 8224848, "step": 8595 }, { "epoch": 0.7015254099029284, "grad_norm": 0.4640612304210663, "learning_rate": 3.507219185904234e-05, "loss": 0.1077, "num_input_tokens_seen": 8228032, "step": 8600 }, { "epoch": 0.7019332735133371, "grad_norm": 0.6360623240470886, "learning_rate": 3.509258503956277e-05, "loss": 0.1113, "num_input_tokens_seen": 8232880, "step": 8605 }, { "epoch": 0.7023411371237458, "grad_norm": 0.10276800394058228, "learning_rate": 3.51129782200832e-05, "loss": 0.7752, "num_input_tokens_seen": 8237216, "step": 8610 }, { "epoch": 0.7027490007341545, "grad_norm": 14.131406784057617, "learning_rate": 3.5133371400603635e-05, "loss": 0.2653, "num_input_tokens_seen": 8242000, "step": 8615 }, { "epoch": 0.7031568643445631, "grad_norm": 0.08831587433815002, "learning_rate": 3.5153764581124074e-05, "loss": 0.5614, "num_input_tokens_seen": 8246784, "step": 8620 }, { "epoch": 0.7035647279549718, "grad_norm": 0.538556694984436, "learning_rate": 3.5174157761644506e-05, "loss": 0.3167, "num_input_tokens_seen": 8251424, "step": 8625 }, { "epoch": 0.7039725915653805, "grad_norm": 0.766830563545227, "learning_rate": 3.5194550942164946e-05, "loss": 0.2745, "num_input_tokens_seen": 8255904, "step": 8630 }, { "epoch": 0.7043804551757892, "grad_norm": 5.406044960021973, "learning_rate": 3.521494412268538e-05, "loss": 0.3786, "num_input_tokens_seen": 8261248, "step": 8635 }, { "epoch": 0.7047883187861979, "grad_norm": 16.872852325439453, "learning_rate": 3.523533730320581e-05, "loss": 0.4247, "num_input_tokens_seen": 8266000, "step": 8640 }, { "epoch": 0.7051961823966065, "grad_norm": 0.9178974032402039, "learning_rate": 3.525573048372624e-05, "loss": 0.4452, "num_input_tokens_seen": 8271648, "step": 8645 }, { "epoch": 0.7056040460070152, "grad_norm": 5.292439937591553, "learning_rate": 3.527612366424668e-05, "loss": 0.1248, "num_input_tokens_seen": 8275888, "step": 8650 }, { "epoch": 0.7060119096174239, "grad_norm": 3.136730670928955, "learning_rate": 3.5296516844767114e-05, "loss": 0.254, "num_input_tokens_seen": 8281280, "step": 8655 }, { "epoch": 0.7064197732278326, "grad_norm": 7.489554405212402, "learning_rate": 3.5316910025287547e-05, "loss": 0.3801, "num_input_tokens_seen": 8286656, "step": 8660 }, { "epoch": 0.7068276368382413, "grad_norm": 3.7171077728271484, "learning_rate": 3.533730320580798e-05, "loss": 0.3317, "num_input_tokens_seen": 8290960, "step": 8665 }, { "epoch": 0.70723550044865, "grad_norm": 2.7009856700897217, "learning_rate": 3.535769638632842e-05, "loss": 0.4093, "num_input_tokens_seen": 8295824, "step": 8670 }, { "epoch": 0.7076433640590587, "grad_norm": 2.1093661785125732, "learning_rate": 3.537808956684885e-05, "loss": 0.0837, "num_input_tokens_seen": 8301360, "step": 8675 }, { "epoch": 0.7080512276694674, "grad_norm": 6.1118316650390625, "learning_rate": 3.539848274736928e-05, "loss": 0.6836, "num_input_tokens_seen": 8306368, "step": 8680 }, { "epoch": 0.708459091279876, "grad_norm": 2.9812943935394287, "learning_rate": 3.5418875927889715e-05, "loss": 0.3531, "num_input_tokens_seen": 8311760, "step": 8685 }, { "epoch": 0.7088669548902847, "grad_norm": 6.685284614562988, "learning_rate": 3.543926910841015e-05, "loss": 0.3717, "num_input_tokens_seen": 8317040, "step": 8690 }, { "epoch": 0.7092748185006934, "grad_norm": 1.88273286819458, "learning_rate": 3.545966228893059e-05, "loss": 0.4345, "num_input_tokens_seen": 8321968, "step": 8695 }, { "epoch": 0.7096826821111021, "grad_norm": 0.2985645532608032, "learning_rate": 3.548005546945102e-05, "loss": 0.2739, "num_input_tokens_seen": 8327056, "step": 8700 }, { "epoch": 0.7100905457215108, "grad_norm": 0.8854671120643616, "learning_rate": 3.550044864997145e-05, "loss": 0.4657, "num_input_tokens_seen": 8332704, "step": 8705 }, { "epoch": 0.7104984093319194, "grad_norm": 1.032921314239502, "learning_rate": 3.5520841830491884e-05, "loss": 0.2557, "num_input_tokens_seen": 8337072, "step": 8710 }, { "epoch": 0.7109062729423281, "grad_norm": 5.020815849304199, "learning_rate": 3.5541235011012316e-05, "loss": 0.2178, "num_input_tokens_seen": 8341024, "step": 8715 }, { "epoch": 0.7113141365527368, "grad_norm": 0.4023021459579468, "learning_rate": 3.5561628191532755e-05, "loss": 0.3004, "num_input_tokens_seen": 8345856, "step": 8720 }, { "epoch": 0.7117220001631455, "grad_norm": 2.2788350582122803, "learning_rate": 3.558202137205319e-05, "loss": 0.2607, "num_input_tokens_seen": 8349520, "step": 8725 }, { "epoch": 0.7121298637735541, "grad_norm": 6.731573104858398, "learning_rate": 3.560241455257362e-05, "loss": 0.2793, "num_input_tokens_seen": 8354224, "step": 8730 }, { "epoch": 0.7125377273839628, "grad_norm": 2.796066999435425, "learning_rate": 3.562280773309405e-05, "loss": 0.7236, "num_input_tokens_seen": 8359904, "step": 8735 }, { "epoch": 0.7129455909943715, "grad_norm": 3.1361844539642334, "learning_rate": 3.564320091361449e-05, "loss": 0.193, "num_input_tokens_seen": 8364080, "step": 8740 }, { "epoch": 0.7133534546047802, "grad_norm": 5.879598617553711, "learning_rate": 3.5663594094134924e-05, "loss": 0.4648, "num_input_tokens_seen": 8368496, "step": 8745 }, { "epoch": 0.7137613182151888, "grad_norm": 1.7217129468917847, "learning_rate": 3.5683987274655356e-05, "loss": 0.2998, "num_input_tokens_seen": 8373232, "step": 8750 }, { "epoch": 0.7141691818255975, "grad_norm": 3.895474910736084, "learning_rate": 3.570438045517579e-05, "loss": 0.2005, "num_input_tokens_seen": 8378064, "step": 8755 }, { "epoch": 0.7145770454360062, "grad_norm": 0.41585880517959595, "learning_rate": 3.572477363569622e-05, "loss": 0.0922, "num_input_tokens_seen": 8382000, "step": 8760 }, { "epoch": 0.7149849090464149, "grad_norm": 15.806160926818848, "learning_rate": 3.574516681621666e-05, "loss": 0.287, "num_input_tokens_seen": 8386784, "step": 8765 }, { "epoch": 0.7153927726568236, "grad_norm": 0.36862191557884216, "learning_rate": 3.576555999673709e-05, "loss": 0.1667, "num_input_tokens_seen": 8391552, "step": 8770 }, { "epoch": 0.7158006362672322, "grad_norm": 3.6373777389526367, "learning_rate": 3.5785953177257525e-05, "loss": 0.5307, "num_input_tokens_seen": 8396304, "step": 8775 }, { "epoch": 0.7162084998776409, "grad_norm": 0.5252140760421753, "learning_rate": 3.580634635777796e-05, "loss": 0.2484, "num_input_tokens_seen": 8401536, "step": 8780 }, { "epoch": 0.7166163634880496, "grad_norm": 2.873396158218384, "learning_rate": 3.582673953829839e-05, "loss": 0.2922, "num_input_tokens_seen": 8406080, "step": 8785 }, { "epoch": 0.7170242270984583, "grad_norm": 0.32037657499313354, "learning_rate": 3.584713271881883e-05, "loss": 0.4308, "num_input_tokens_seen": 8410784, "step": 8790 }, { "epoch": 0.7174320907088669, "grad_norm": 12.554055213928223, "learning_rate": 3.586752589933926e-05, "loss": 0.2699, "num_input_tokens_seen": 8415664, "step": 8795 }, { "epoch": 0.7178399543192756, "grad_norm": 1.6073238849639893, "learning_rate": 3.5887919079859694e-05, "loss": 0.3827, "num_input_tokens_seen": 8420448, "step": 8800 }, { "epoch": 0.7182478179296843, "grad_norm": 10.127340316772461, "learning_rate": 3.5908312260380126e-05, "loss": 0.2533, "num_input_tokens_seen": 8424656, "step": 8805 }, { "epoch": 0.718655681540093, "grad_norm": 2.921640396118164, "learning_rate": 3.5928705440900565e-05, "loss": 0.2944, "num_input_tokens_seen": 8429296, "step": 8810 }, { "epoch": 0.7190635451505016, "grad_norm": 8.381497383117676, "learning_rate": 3.5949098621421004e-05, "loss": 0.3621, "num_input_tokens_seen": 8434528, "step": 8815 }, { "epoch": 0.7194714087609103, "grad_norm": 7.306320667266846, "learning_rate": 3.596949180194144e-05, "loss": 0.462, "num_input_tokens_seen": 8439264, "step": 8820 }, { "epoch": 0.719879272371319, "grad_norm": 7.494168281555176, "learning_rate": 3.598988498246187e-05, "loss": 0.5525, "num_input_tokens_seen": 8443760, "step": 8825 }, { "epoch": 0.7202871359817277, "grad_norm": 1.1048463582992554, "learning_rate": 3.60102781629823e-05, "loss": 0.2975, "num_input_tokens_seen": 8448112, "step": 8830 }, { "epoch": 0.7206949995921363, "grad_norm": 9.016098022460938, "learning_rate": 3.6030671343502734e-05, "loss": 0.3261, "num_input_tokens_seen": 8452448, "step": 8835 }, { "epoch": 0.721102863202545, "grad_norm": 2.1333000659942627, "learning_rate": 3.605106452402317e-05, "loss": 0.3673, "num_input_tokens_seen": 8457200, "step": 8840 }, { "epoch": 0.7215107268129537, "grad_norm": 4.047451019287109, "learning_rate": 3.6071457704543605e-05, "loss": 0.4501, "num_input_tokens_seen": 8461632, "step": 8845 }, { "epoch": 0.7219185904233625, "grad_norm": 3.4342710971832275, "learning_rate": 3.609185088506404e-05, "loss": 0.4154, "num_input_tokens_seen": 8466544, "step": 8850 }, { "epoch": 0.7223264540337712, "grad_norm": 1.2734802961349487, "learning_rate": 3.611224406558447e-05, "loss": 0.2517, "num_input_tokens_seen": 8470960, "step": 8855 }, { "epoch": 0.7227343176441798, "grad_norm": 1.5501455068588257, "learning_rate": 3.61326372461049e-05, "loss": 0.2525, "num_input_tokens_seen": 8475376, "step": 8860 }, { "epoch": 0.7231421812545885, "grad_norm": 0.4159806966781616, "learning_rate": 3.615303042662534e-05, "loss": 0.3355, "num_input_tokens_seen": 8480784, "step": 8865 }, { "epoch": 0.7235500448649972, "grad_norm": 1.291194200515747, "learning_rate": 3.6173423607145774e-05, "loss": 0.2021, "num_input_tokens_seen": 8485328, "step": 8870 }, { "epoch": 0.7239579084754059, "grad_norm": 0.5411566495895386, "learning_rate": 3.6193816787666206e-05, "loss": 0.3565, "num_input_tokens_seen": 8490496, "step": 8875 }, { "epoch": 0.7243657720858145, "grad_norm": 1.1751726865768433, "learning_rate": 3.621420996818664e-05, "loss": 0.218, "num_input_tokens_seen": 8495248, "step": 8880 }, { "epoch": 0.7247736356962232, "grad_norm": 0.3519246578216553, "learning_rate": 3.623460314870708e-05, "loss": 0.3621, "num_input_tokens_seen": 8500128, "step": 8885 }, { "epoch": 0.7251814993066319, "grad_norm": 3.1537842750549316, "learning_rate": 3.625499632922751e-05, "loss": 0.1361, "num_input_tokens_seen": 8504928, "step": 8890 }, { "epoch": 0.7255893629170406, "grad_norm": 9.221778869628906, "learning_rate": 3.627538950974794e-05, "loss": 0.3817, "num_input_tokens_seen": 8509856, "step": 8895 }, { "epoch": 0.7259972265274492, "grad_norm": 9.05040454864502, "learning_rate": 3.6295782690268375e-05, "loss": 0.3229, "num_input_tokens_seen": 8514096, "step": 8900 }, { "epoch": 0.7264050901378579, "grad_norm": 4.781082630157471, "learning_rate": 3.631617587078881e-05, "loss": 0.3379, "num_input_tokens_seen": 8519056, "step": 8905 }, { "epoch": 0.7268129537482666, "grad_norm": 5.397861003875732, "learning_rate": 3.633656905130925e-05, "loss": 0.5035, "num_input_tokens_seen": 8523072, "step": 8910 }, { "epoch": 0.7272208173586753, "grad_norm": 0.35046201944351196, "learning_rate": 3.635696223182968e-05, "loss": 0.5362, "num_input_tokens_seen": 8527104, "step": 8915 }, { "epoch": 0.727628680969084, "grad_norm": 0.7631012201309204, "learning_rate": 3.637735541235011e-05, "loss": 0.2267, "num_input_tokens_seen": 8532320, "step": 8920 }, { "epoch": 0.7280365445794926, "grad_norm": 1.666246771812439, "learning_rate": 3.6397748592870544e-05, "loss": 0.4369, "num_input_tokens_seen": 8537264, "step": 8925 }, { "epoch": 0.7284444081899013, "grad_norm": 0.37540486454963684, "learning_rate": 3.6418141773390976e-05, "loss": 0.3193, "num_input_tokens_seen": 8543168, "step": 8930 }, { "epoch": 0.72885227180031, "grad_norm": 6.954981327056885, "learning_rate": 3.6438534953911415e-05, "loss": 0.2073, "num_input_tokens_seen": 8548656, "step": 8935 }, { "epoch": 0.7292601354107187, "grad_norm": 4.849117755889893, "learning_rate": 3.645892813443185e-05, "loss": 0.4576, "num_input_tokens_seen": 8553952, "step": 8940 }, { "epoch": 0.7296679990211273, "grad_norm": 5.580561637878418, "learning_rate": 3.647932131495228e-05, "loss": 0.4914, "num_input_tokens_seen": 8558848, "step": 8945 }, { "epoch": 0.730075862631536, "grad_norm": 3.613363027572632, "learning_rate": 3.649971449547271e-05, "loss": 0.2297, "num_input_tokens_seen": 8563504, "step": 8950 }, { "epoch": 0.7304837262419447, "grad_norm": 0.4072559177875519, "learning_rate": 3.6520107675993145e-05, "loss": 0.6019, "num_input_tokens_seen": 8568080, "step": 8955 }, { "epoch": 0.7308915898523534, "grad_norm": 6.074283599853516, "learning_rate": 3.6540500856513584e-05, "loss": 0.2745, "num_input_tokens_seen": 8573616, "step": 8960 }, { "epoch": 0.731299453462762, "grad_norm": 4.104914665222168, "learning_rate": 3.6560894037034016e-05, "loss": 0.2808, "num_input_tokens_seen": 8577840, "step": 8965 }, { "epoch": 0.7317073170731707, "grad_norm": 4.697442531585693, "learning_rate": 3.658128721755445e-05, "loss": 0.259, "num_input_tokens_seen": 8583184, "step": 8970 }, { "epoch": 0.7321151806835794, "grad_norm": 2.4230399131774902, "learning_rate": 3.660168039807488e-05, "loss": 0.3488, "num_input_tokens_seen": 8588320, "step": 8975 }, { "epoch": 0.7325230442939881, "grad_norm": 3.4063491821289062, "learning_rate": 3.662207357859532e-05, "loss": 0.3098, "num_input_tokens_seen": 8592912, "step": 8980 }, { "epoch": 0.7329309079043967, "grad_norm": 1.7726211547851562, "learning_rate": 3.664246675911575e-05, "loss": 0.4698, "num_input_tokens_seen": 8597360, "step": 8985 }, { "epoch": 0.7333387715148054, "grad_norm": 0.8059538006782532, "learning_rate": 3.6662859939636185e-05, "loss": 0.1801, "num_input_tokens_seen": 8602416, "step": 8990 }, { "epoch": 0.7337466351252141, "grad_norm": 1.7397571802139282, "learning_rate": 3.668325312015662e-05, "loss": 0.3777, "num_input_tokens_seen": 8607088, "step": 8995 }, { "epoch": 0.7341544987356228, "grad_norm": 4.2567949295043945, "learning_rate": 3.6703646300677057e-05, "loss": 0.3981, "num_input_tokens_seen": 8611008, "step": 9000 }, { "epoch": 0.7345623623460314, "grad_norm": 0.8325409889221191, "learning_rate": 3.672403948119749e-05, "loss": 0.2359, "num_input_tokens_seen": 8615216, "step": 9005 }, { "epoch": 0.7349702259564401, "grad_norm": 3.029027223587036, "learning_rate": 3.674443266171793e-05, "loss": 0.4973, "num_input_tokens_seen": 8620432, "step": 9010 }, { "epoch": 0.7353780895668488, "grad_norm": 5.9850945472717285, "learning_rate": 3.676482584223836e-05, "loss": 0.3164, "num_input_tokens_seen": 8625248, "step": 9015 }, { "epoch": 0.7357859531772575, "grad_norm": 3.9372000694274902, "learning_rate": 3.678521902275879e-05, "loss": 0.374, "num_input_tokens_seen": 8628928, "step": 9020 }, { "epoch": 0.7361938167876662, "grad_norm": 1.2545346021652222, "learning_rate": 3.6805612203279225e-05, "loss": 0.3452, "num_input_tokens_seen": 8634336, "step": 9025 }, { "epoch": 0.7366016803980748, "grad_norm": 1.1987472772598267, "learning_rate": 3.6826005383799664e-05, "loss": 0.3612, "num_input_tokens_seen": 8638496, "step": 9030 }, { "epoch": 0.7370095440084835, "grad_norm": 1.0981847047805786, "learning_rate": 3.68463985643201e-05, "loss": 0.2513, "num_input_tokens_seen": 8643248, "step": 9035 }, { "epoch": 0.7374174076188923, "grad_norm": 1.9048891067504883, "learning_rate": 3.686679174484053e-05, "loss": 0.1714, "num_input_tokens_seen": 8646848, "step": 9040 }, { "epoch": 0.737825271229301, "grad_norm": 4.03552770614624, "learning_rate": 3.688718492536096e-05, "loss": 0.5035, "num_input_tokens_seen": 8652480, "step": 9045 }, { "epoch": 0.7382331348397096, "grad_norm": 9.744843482971191, "learning_rate": 3.6907578105881394e-05, "loss": 0.1255, "num_input_tokens_seen": 8657536, "step": 9050 }, { "epoch": 0.7386409984501183, "grad_norm": 1.9149549007415771, "learning_rate": 3.692797128640183e-05, "loss": 0.3788, "num_input_tokens_seen": 8662576, "step": 9055 }, { "epoch": 0.739048862060527, "grad_norm": 0.7604697346687317, "learning_rate": 3.6948364466922265e-05, "loss": 0.2423, "num_input_tokens_seen": 8667568, "step": 9060 }, { "epoch": 0.7394567256709357, "grad_norm": 1.508899450302124, "learning_rate": 3.69687576474427e-05, "loss": 0.1173, "num_input_tokens_seen": 8672960, "step": 9065 }, { "epoch": 0.7398645892813444, "grad_norm": 10.312590599060059, "learning_rate": 3.698915082796313e-05, "loss": 0.2555, "num_input_tokens_seen": 8678096, "step": 9070 }, { "epoch": 0.740272452891753, "grad_norm": 0.5712976455688477, "learning_rate": 3.700954400848356e-05, "loss": 0.133, "num_input_tokens_seen": 8683248, "step": 9075 }, { "epoch": 0.7406803165021617, "grad_norm": 0.22292229533195496, "learning_rate": 3.7029937189004e-05, "loss": 0.6857, "num_input_tokens_seen": 8687968, "step": 9080 }, { "epoch": 0.7410881801125704, "grad_norm": 15.262531280517578, "learning_rate": 3.7050330369524434e-05, "loss": 0.4021, "num_input_tokens_seen": 8692672, "step": 9085 }, { "epoch": 0.7414960437229791, "grad_norm": 2.411566972732544, "learning_rate": 3.7070723550044866e-05, "loss": 0.1136, "num_input_tokens_seen": 8696736, "step": 9090 }, { "epoch": 0.7419039073333877, "grad_norm": 0.2153564691543579, "learning_rate": 3.70911167305653e-05, "loss": 0.167, "num_input_tokens_seen": 8701504, "step": 9095 }, { "epoch": 0.7423117709437964, "grad_norm": 0.5517135262489319, "learning_rate": 3.711150991108573e-05, "loss": 0.4942, "num_input_tokens_seen": 8707136, "step": 9100 }, { "epoch": 0.7427196345542051, "grad_norm": 8.353586196899414, "learning_rate": 3.713190309160617e-05, "loss": 0.5309, "num_input_tokens_seen": 8711968, "step": 9105 }, { "epoch": 0.7431274981646138, "grad_norm": 11.312336921691895, "learning_rate": 3.71522962721266e-05, "loss": 0.1963, "num_input_tokens_seen": 8716192, "step": 9110 }, { "epoch": 0.7435353617750224, "grad_norm": 5.712327480316162, "learning_rate": 3.7172689452647035e-05, "loss": 0.3934, "num_input_tokens_seen": 8721344, "step": 9115 }, { "epoch": 0.7439432253854311, "grad_norm": 5.115220069885254, "learning_rate": 3.719308263316747e-05, "loss": 0.5198, "num_input_tokens_seen": 8726416, "step": 9120 }, { "epoch": 0.7443510889958398, "grad_norm": 0.41294190287590027, "learning_rate": 3.7213475813687907e-05, "loss": 0.2954, "num_input_tokens_seen": 8731104, "step": 9125 }, { "epoch": 0.7447589526062485, "grad_norm": 0.9321710467338562, "learning_rate": 3.723386899420834e-05, "loss": 0.2033, "num_input_tokens_seen": 8734960, "step": 9130 }, { "epoch": 0.7451668162166571, "grad_norm": 0.8668684959411621, "learning_rate": 3.725426217472877e-05, "loss": 0.4157, "num_input_tokens_seen": 8739152, "step": 9135 }, { "epoch": 0.7455746798270658, "grad_norm": 3.3458828926086426, "learning_rate": 3.7274655355249204e-05, "loss": 0.5061, "num_input_tokens_seen": 8743872, "step": 9140 }, { "epoch": 0.7459825434374745, "grad_norm": 0.8810086846351624, "learning_rate": 3.7295048535769636e-05, "loss": 0.4539, "num_input_tokens_seen": 8749104, "step": 9145 }, { "epoch": 0.7463904070478832, "grad_norm": 1.4395934343338013, "learning_rate": 3.7315441716290075e-05, "loss": 0.3407, "num_input_tokens_seen": 8753616, "step": 9150 }, { "epoch": 0.7467982706582919, "grad_norm": 0.6105338335037231, "learning_rate": 3.733583489681051e-05, "loss": 0.259, "num_input_tokens_seen": 8757680, "step": 9155 }, { "epoch": 0.7472061342687005, "grad_norm": 4.830454349517822, "learning_rate": 3.735622807733094e-05, "loss": 0.3151, "num_input_tokens_seen": 8763184, "step": 9160 }, { "epoch": 0.7476139978791092, "grad_norm": 7.883775234222412, "learning_rate": 3.737662125785137e-05, "loss": 0.3849, "num_input_tokens_seen": 8768416, "step": 9165 }, { "epoch": 0.7480218614895179, "grad_norm": 2.6770942211151123, "learning_rate": 3.7397014438371805e-05, "loss": 0.6285, "num_input_tokens_seen": 8772800, "step": 9170 }, { "epoch": 0.7484297250999266, "grad_norm": 3.8550949096679688, "learning_rate": 3.7417407618892244e-05, "loss": 0.2268, "num_input_tokens_seen": 8777024, "step": 9175 }, { "epoch": 0.7488375887103352, "grad_norm": 3.7241461277008057, "learning_rate": 3.7437800799412676e-05, "loss": 0.2132, "num_input_tokens_seen": 8781840, "step": 9180 }, { "epoch": 0.7492454523207439, "grad_norm": 0.8052108883857727, "learning_rate": 3.745819397993311e-05, "loss": 0.1653, "num_input_tokens_seen": 8787344, "step": 9185 }, { "epoch": 0.7496533159311526, "grad_norm": 3.6578476428985596, "learning_rate": 3.747858716045355e-05, "loss": 0.5065, "num_input_tokens_seen": 8793200, "step": 9190 }, { "epoch": 0.7500611795415613, "grad_norm": 1.8459763526916504, "learning_rate": 3.749898034097398e-05, "loss": 0.424, "num_input_tokens_seen": 8798096, "step": 9195 }, { "epoch": 0.7504690431519699, "grad_norm": 4.516590118408203, "learning_rate": 3.751937352149442e-05, "loss": 0.1665, "num_input_tokens_seen": 8803504, "step": 9200 }, { "epoch": 0.7508769067623786, "grad_norm": 4.950534820556641, "learning_rate": 3.753976670201485e-05, "loss": 0.1916, "num_input_tokens_seen": 8808000, "step": 9205 }, { "epoch": 0.7512847703727873, "grad_norm": 6.245016098022461, "learning_rate": 3.7560159882535284e-05, "loss": 0.1394, "num_input_tokens_seen": 8813344, "step": 9210 }, { "epoch": 0.751692633983196, "grad_norm": 4.968808174133301, "learning_rate": 3.7580553063055716e-05, "loss": 0.2226, "num_input_tokens_seen": 8817952, "step": 9215 }, { "epoch": 0.7521004975936046, "grad_norm": 1.4007521867752075, "learning_rate": 3.760094624357615e-05, "loss": 0.2569, "num_input_tokens_seen": 8823648, "step": 9220 }, { "epoch": 0.7525083612040134, "grad_norm": 5.271491050720215, "learning_rate": 3.762133942409659e-05, "loss": 0.2049, "num_input_tokens_seen": 8828672, "step": 9225 }, { "epoch": 0.7529162248144221, "grad_norm": 17.955766677856445, "learning_rate": 3.764173260461702e-05, "loss": 0.5021, "num_input_tokens_seen": 8832736, "step": 9230 }, { "epoch": 0.7533240884248308, "grad_norm": 0.7459152340888977, "learning_rate": 3.766212578513745e-05, "loss": 0.1507, "num_input_tokens_seen": 8837712, "step": 9235 }, { "epoch": 0.7537319520352395, "grad_norm": 12.481998443603516, "learning_rate": 3.7682518965657885e-05, "loss": 0.3315, "num_input_tokens_seen": 8841600, "step": 9240 }, { "epoch": 0.7541398156456481, "grad_norm": 8.2741060256958, "learning_rate": 3.770291214617832e-05, "loss": 0.6597, "num_input_tokens_seen": 8846480, "step": 9245 }, { "epoch": 0.7545476792560568, "grad_norm": 0.17079725861549377, "learning_rate": 3.772330532669876e-05, "loss": 0.5242, "num_input_tokens_seen": 8851120, "step": 9250 }, { "epoch": 0.7549555428664655, "grad_norm": 4.68863582611084, "learning_rate": 3.774369850721919e-05, "loss": 0.3374, "num_input_tokens_seen": 8855232, "step": 9255 }, { "epoch": 0.7553634064768742, "grad_norm": 13.163915634155273, "learning_rate": 3.776409168773962e-05, "loss": 0.4172, "num_input_tokens_seen": 8859584, "step": 9260 }, { "epoch": 0.7557712700872828, "grad_norm": 5.111806392669678, "learning_rate": 3.7784484868260054e-05, "loss": 0.2058, "num_input_tokens_seen": 8863728, "step": 9265 }, { "epoch": 0.7561791336976915, "grad_norm": 8.367329597473145, "learning_rate": 3.780487804878049e-05, "loss": 0.5133, "num_input_tokens_seen": 8868720, "step": 9270 }, { "epoch": 0.7565869973081002, "grad_norm": 14.906713485717773, "learning_rate": 3.7825271229300925e-05, "loss": 0.4264, "num_input_tokens_seen": 8873696, "step": 9275 }, { "epoch": 0.7569948609185089, "grad_norm": 12.203692436218262, "learning_rate": 3.784566440982136e-05, "loss": 0.3946, "num_input_tokens_seen": 8878480, "step": 9280 }, { "epoch": 0.7574027245289175, "grad_norm": 5.709881782531738, "learning_rate": 3.786605759034179e-05, "loss": 0.3429, "num_input_tokens_seen": 8883808, "step": 9285 }, { "epoch": 0.7578105881393262, "grad_norm": 0.3828015625476837, "learning_rate": 3.788645077086222e-05, "loss": 0.1677, "num_input_tokens_seen": 8888032, "step": 9290 }, { "epoch": 0.7582184517497349, "grad_norm": 7.878770351409912, "learning_rate": 3.790684395138266e-05, "loss": 0.4229, "num_input_tokens_seen": 8893776, "step": 9295 }, { "epoch": 0.7586263153601436, "grad_norm": 2.837658643722534, "learning_rate": 3.7927237131903094e-05, "loss": 0.2946, "num_input_tokens_seen": 8898352, "step": 9300 }, { "epoch": 0.7590341789705523, "grad_norm": 5.560368061065674, "learning_rate": 3.7947630312423526e-05, "loss": 0.3434, "num_input_tokens_seen": 8903008, "step": 9305 }, { "epoch": 0.7594420425809609, "grad_norm": 0.8416566252708435, "learning_rate": 3.796802349294396e-05, "loss": 0.173, "num_input_tokens_seen": 8907696, "step": 9310 }, { "epoch": 0.7598499061913696, "grad_norm": 4.854750633239746, "learning_rate": 3.798841667346439e-05, "loss": 0.3329, "num_input_tokens_seen": 8912416, "step": 9315 }, { "epoch": 0.7602577698017783, "grad_norm": 5.670470237731934, "learning_rate": 3.800880985398483e-05, "loss": 0.6474, "num_input_tokens_seen": 8917072, "step": 9320 }, { "epoch": 0.760665633412187, "grad_norm": 0.3713097870349884, "learning_rate": 3.802920303450526e-05, "loss": 0.1269, "num_input_tokens_seen": 8922416, "step": 9325 }, { "epoch": 0.7610734970225956, "grad_norm": 5.018829345703125, "learning_rate": 3.8049596215025695e-05, "loss": 0.4257, "num_input_tokens_seen": 8927136, "step": 9330 }, { "epoch": 0.7614813606330043, "grad_norm": 4.033553600311279, "learning_rate": 3.806998939554613e-05, "loss": 0.1182, "num_input_tokens_seen": 8931840, "step": 9335 }, { "epoch": 0.761889224243413, "grad_norm": 7.107000350952148, "learning_rate": 3.8090382576066567e-05, "loss": 0.2762, "num_input_tokens_seen": 8937536, "step": 9340 }, { "epoch": 0.7622970878538217, "grad_norm": 0.813744306564331, "learning_rate": 3.8110775756587e-05, "loss": 0.2056, "num_input_tokens_seen": 8942448, "step": 9345 }, { "epoch": 0.7627049514642303, "grad_norm": 0.9535072445869446, "learning_rate": 3.813116893710743e-05, "loss": 0.2405, "num_input_tokens_seen": 8946944, "step": 9350 }, { "epoch": 0.763112815074639, "grad_norm": 11.042513847351074, "learning_rate": 3.8151562117627864e-05, "loss": 0.3504, "num_input_tokens_seen": 8952000, "step": 9355 }, { "epoch": 0.7635206786850477, "grad_norm": 2.512115001678467, "learning_rate": 3.8171955298148296e-05, "loss": 0.3766, "num_input_tokens_seen": 8956128, "step": 9360 }, { "epoch": 0.7639285422954564, "grad_norm": 2.394397258758545, "learning_rate": 3.8192348478668735e-05, "loss": 0.3077, "num_input_tokens_seen": 8961344, "step": 9365 }, { "epoch": 0.764336405905865, "grad_norm": 2.949530601501465, "learning_rate": 3.821274165918917e-05, "loss": 0.6509, "num_input_tokens_seen": 8965264, "step": 9370 }, { "epoch": 0.7647442695162737, "grad_norm": 6.287330150604248, "learning_rate": 3.823313483970961e-05, "loss": 0.506, "num_input_tokens_seen": 8969248, "step": 9375 }, { "epoch": 0.7651521331266824, "grad_norm": 2.4488399028778076, "learning_rate": 3.825352802023004e-05, "loss": 0.4065, "num_input_tokens_seen": 8973840, "step": 9380 }, { "epoch": 0.7655599967370911, "grad_norm": 2.6105401515960693, "learning_rate": 3.827392120075047e-05, "loss": 0.2035, "num_input_tokens_seen": 8978560, "step": 9385 }, { "epoch": 0.7659678603474998, "grad_norm": 5.00554895401001, "learning_rate": 3.8294314381270904e-05, "loss": 0.2231, "num_input_tokens_seen": 8983904, "step": 9390 }, { "epoch": 0.7663757239579084, "grad_norm": 0.8865417242050171, "learning_rate": 3.831470756179134e-05, "loss": 0.4343, "num_input_tokens_seen": 8989424, "step": 9395 }, { "epoch": 0.7667835875683171, "grad_norm": 5.551746845245361, "learning_rate": 3.8335100742311775e-05, "loss": 0.2391, "num_input_tokens_seen": 8994832, "step": 9400 }, { "epoch": 0.7671914511787258, "grad_norm": 4.42197847366333, "learning_rate": 3.835549392283221e-05, "loss": 0.3303, "num_input_tokens_seen": 8999488, "step": 9405 }, { "epoch": 0.7675993147891346, "grad_norm": 6.349663257598877, "learning_rate": 3.837588710335264e-05, "loss": 0.1736, "num_input_tokens_seen": 9004560, "step": 9410 }, { "epoch": 0.7680071783995432, "grad_norm": 4.308825969696045, "learning_rate": 3.839628028387308e-05, "loss": 0.1731, "num_input_tokens_seen": 9009424, "step": 9415 }, { "epoch": 0.7684150420099519, "grad_norm": 4.314601898193359, "learning_rate": 3.841667346439351e-05, "loss": 0.2969, "num_input_tokens_seen": 9014096, "step": 9420 }, { "epoch": 0.7688229056203606, "grad_norm": 3.300293445587158, "learning_rate": 3.8437066644913944e-05, "loss": 0.1456, "num_input_tokens_seen": 9018208, "step": 9425 }, { "epoch": 0.7692307692307693, "grad_norm": 9.6439208984375, "learning_rate": 3.8457459825434376e-05, "loss": 0.166, "num_input_tokens_seen": 9023392, "step": 9430 }, { "epoch": 0.769638632841178, "grad_norm": 0.3352242410182953, "learning_rate": 3.847785300595481e-05, "loss": 0.1604, "num_input_tokens_seen": 9027712, "step": 9435 }, { "epoch": 0.7700464964515866, "grad_norm": 5.617564678192139, "learning_rate": 3.849824618647525e-05, "loss": 0.2331, "num_input_tokens_seen": 9032192, "step": 9440 }, { "epoch": 0.7704543600619953, "grad_norm": 13.136704444885254, "learning_rate": 3.851863936699568e-05, "loss": 0.4295, "num_input_tokens_seen": 9036976, "step": 9445 }, { "epoch": 0.770862223672404, "grad_norm": 3.5636003017425537, "learning_rate": 3.853903254751611e-05, "loss": 0.3919, "num_input_tokens_seen": 9041712, "step": 9450 }, { "epoch": 0.7712700872828127, "grad_norm": 0.36557233333587646, "learning_rate": 3.8559425728036545e-05, "loss": 0.158, "num_input_tokens_seen": 9046656, "step": 9455 }, { "epoch": 0.7716779508932213, "grad_norm": 10.887154579162598, "learning_rate": 3.857981890855698e-05, "loss": 0.5631, "num_input_tokens_seen": 9051632, "step": 9460 }, { "epoch": 0.77208581450363, "grad_norm": 5.697245121002197, "learning_rate": 3.8600212089077417e-05, "loss": 0.2337, "num_input_tokens_seen": 9056896, "step": 9465 }, { "epoch": 0.7724936781140387, "grad_norm": 0.1799808293581009, "learning_rate": 3.862060526959785e-05, "loss": 0.0646, "num_input_tokens_seen": 9062208, "step": 9470 }, { "epoch": 0.7729015417244474, "grad_norm": 0.7943159341812134, "learning_rate": 3.864099845011828e-05, "loss": 0.0749, "num_input_tokens_seen": 9066032, "step": 9475 }, { "epoch": 0.773309405334856, "grad_norm": 6.7921223640441895, "learning_rate": 3.8661391630638714e-05, "loss": 0.4434, "num_input_tokens_seen": 9070464, "step": 9480 }, { "epoch": 0.7737172689452647, "grad_norm": 6.0015106201171875, "learning_rate": 3.868178481115915e-05, "loss": 0.1554, "num_input_tokens_seen": 9075456, "step": 9485 }, { "epoch": 0.7741251325556734, "grad_norm": 9.40984058380127, "learning_rate": 3.8702177991679585e-05, "loss": 0.2551, "num_input_tokens_seen": 9080080, "step": 9490 }, { "epoch": 0.7745329961660821, "grad_norm": 5.3652448654174805, "learning_rate": 3.872257117220002e-05, "loss": 0.258, "num_input_tokens_seen": 9085056, "step": 9495 }, { "epoch": 0.7749408597764907, "grad_norm": 0.7279713749885559, "learning_rate": 3.874296435272045e-05, "loss": 0.0622, "num_input_tokens_seen": 9089248, "step": 9500 }, { "epoch": 0.7753487233868994, "grad_norm": 0.06282295286655426, "learning_rate": 3.876335753324088e-05, "loss": 0.41, "num_input_tokens_seen": 9094400, "step": 9505 }, { "epoch": 0.7757565869973081, "grad_norm": 0.08963970839977264, "learning_rate": 3.878375071376132e-05, "loss": 0.3719, "num_input_tokens_seen": 9099360, "step": 9510 }, { "epoch": 0.7761644506077168, "grad_norm": 11.525443077087402, "learning_rate": 3.8804143894281754e-05, "loss": 0.3924, "num_input_tokens_seen": 9103600, "step": 9515 }, { "epoch": 0.7765723142181254, "grad_norm": 2.770169496536255, "learning_rate": 3.8824537074802186e-05, "loss": 0.6419, "num_input_tokens_seen": 9108704, "step": 9520 }, { "epoch": 0.7769801778285341, "grad_norm": 0.929131031036377, "learning_rate": 3.884493025532262e-05, "loss": 0.1444, "num_input_tokens_seen": 9113776, "step": 9525 }, { "epoch": 0.7773880414389428, "grad_norm": 2.4669649600982666, "learning_rate": 3.886532343584305e-05, "loss": 0.1858, "num_input_tokens_seen": 9119136, "step": 9530 }, { "epoch": 0.7777959050493515, "grad_norm": 0.7616574168205261, "learning_rate": 3.888571661636349e-05, "loss": 0.4404, "num_input_tokens_seen": 9124416, "step": 9535 }, { "epoch": 0.7782037686597602, "grad_norm": 11.722362518310547, "learning_rate": 3.890610979688392e-05, "loss": 0.1641, "num_input_tokens_seen": 9129056, "step": 9540 }, { "epoch": 0.7786116322701688, "grad_norm": 1.5035568475723267, "learning_rate": 3.8926502977404355e-05, "loss": 0.1714, "num_input_tokens_seen": 9133552, "step": 9545 }, { "epoch": 0.7790194958805775, "grad_norm": 15.039284706115723, "learning_rate": 3.894689615792479e-05, "loss": 0.6592, "num_input_tokens_seen": 9137392, "step": 9550 }, { "epoch": 0.7794273594909862, "grad_norm": 12.083220481872559, "learning_rate": 3.8967289338445226e-05, "loss": 0.5489, "num_input_tokens_seen": 9141920, "step": 9555 }, { "epoch": 0.7798352231013949, "grad_norm": 0.2479516565799713, "learning_rate": 3.8987682518965666e-05, "loss": 0.5563, "num_input_tokens_seen": 9147152, "step": 9560 }, { "epoch": 0.7802430867118035, "grad_norm": 0.7516690492630005, "learning_rate": 3.90080756994861e-05, "loss": 0.187, "num_input_tokens_seen": 9152400, "step": 9565 }, { "epoch": 0.7806509503222122, "grad_norm": 0.6394683122634888, "learning_rate": 3.902846888000653e-05, "loss": 0.3811, "num_input_tokens_seen": 9157360, "step": 9570 }, { "epoch": 0.7810588139326209, "grad_norm": 0.6151845455169678, "learning_rate": 3.904886206052696e-05, "loss": 0.4427, "num_input_tokens_seen": 9161648, "step": 9575 }, { "epoch": 0.7814666775430296, "grad_norm": 5.785190105438232, "learning_rate": 3.9069255241047395e-05, "loss": 0.2209, "num_input_tokens_seen": 9167264, "step": 9580 }, { "epoch": 0.7818745411534382, "grad_norm": 11.321887016296387, "learning_rate": 3.9089648421567834e-05, "loss": 0.2917, "num_input_tokens_seen": 9171904, "step": 9585 }, { "epoch": 0.7822824047638469, "grad_norm": 4.572163105010986, "learning_rate": 3.911004160208827e-05, "loss": 0.2729, "num_input_tokens_seen": 9176976, "step": 9590 }, { "epoch": 0.7826902683742557, "grad_norm": 0.40897053480148315, "learning_rate": 3.91304347826087e-05, "loss": 0.4403, "num_input_tokens_seen": 9181616, "step": 9595 }, { "epoch": 0.7830981319846644, "grad_norm": 1.5749402046203613, "learning_rate": 3.915082796312913e-05, "loss": 0.377, "num_input_tokens_seen": 9186960, "step": 9600 }, { "epoch": 0.7835059955950731, "grad_norm": 3.3066492080688477, "learning_rate": 3.9171221143649564e-05, "loss": 0.6138, "num_input_tokens_seen": 9191728, "step": 9605 }, { "epoch": 0.7839138592054817, "grad_norm": 3.398756742477417, "learning_rate": 3.919161432417e-05, "loss": 0.3644, "num_input_tokens_seen": 9195552, "step": 9610 }, { "epoch": 0.7843217228158904, "grad_norm": 31.307405471801758, "learning_rate": 3.9212007504690435e-05, "loss": 0.5709, "num_input_tokens_seen": 9200720, "step": 9615 }, { "epoch": 0.7847295864262991, "grad_norm": 6.3591108322143555, "learning_rate": 3.923240068521087e-05, "loss": 0.374, "num_input_tokens_seen": 9205920, "step": 9620 }, { "epoch": 0.7851374500367078, "grad_norm": 3.461759567260742, "learning_rate": 3.92527938657313e-05, "loss": 0.1532, "num_input_tokens_seen": 9210384, "step": 9625 }, { "epoch": 0.7855453136471164, "grad_norm": 0.5074048638343811, "learning_rate": 3.927318704625174e-05, "loss": 0.2146, "num_input_tokens_seen": 9214720, "step": 9630 }, { "epoch": 0.7859531772575251, "grad_norm": 1.1362043619155884, "learning_rate": 3.929358022677217e-05, "loss": 0.3009, "num_input_tokens_seen": 9219200, "step": 9635 }, { "epoch": 0.7863610408679338, "grad_norm": 2.4000980854034424, "learning_rate": 3.9313973407292604e-05, "loss": 0.2012, "num_input_tokens_seen": 9223696, "step": 9640 }, { "epoch": 0.7867689044783425, "grad_norm": 3.422388792037964, "learning_rate": 3.9334366587813036e-05, "loss": 0.2422, "num_input_tokens_seen": 9228384, "step": 9645 }, { "epoch": 0.7871767680887511, "grad_norm": 0.8518636226654053, "learning_rate": 3.935475976833347e-05, "loss": 0.2169, "num_input_tokens_seen": 9233712, "step": 9650 }, { "epoch": 0.7875846316991598, "grad_norm": 0.2880246043205261, "learning_rate": 3.937515294885391e-05, "loss": 0.3012, "num_input_tokens_seen": 9238048, "step": 9655 }, { "epoch": 0.7879924953095685, "grad_norm": 6.0175862312316895, "learning_rate": 3.939554612937434e-05, "loss": 0.2179, "num_input_tokens_seen": 9242544, "step": 9660 }, { "epoch": 0.7884003589199772, "grad_norm": 0.8202518224716187, "learning_rate": 3.941593930989477e-05, "loss": 0.5404, "num_input_tokens_seen": 9247344, "step": 9665 }, { "epoch": 0.7888082225303858, "grad_norm": 33.825870513916016, "learning_rate": 3.9436332490415205e-05, "loss": 0.3312, "num_input_tokens_seen": 9252304, "step": 9670 }, { "epoch": 0.7892160861407945, "grad_norm": 1.6269829273223877, "learning_rate": 3.945672567093564e-05, "loss": 0.3355, "num_input_tokens_seen": 9257184, "step": 9675 }, { "epoch": 0.7896239497512032, "grad_norm": 31.866741180419922, "learning_rate": 3.9477118851456076e-05, "loss": 0.3579, "num_input_tokens_seen": 9261632, "step": 9680 }, { "epoch": 0.7900318133616119, "grad_norm": 0.16740287840366364, "learning_rate": 3.949751203197651e-05, "loss": 0.2602, "num_input_tokens_seen": 9266944, "step": 9685 }, { "epoch": 0.7904396769720206, "grad_norm": 2.921787738800049, "learning_rate": 3.951790521249694e-05, "loss": 0.154, "num_input_tokens_seen": 9270352, "step": 9690 }, { "epoch": 0.7908475405824292, "grad_norm": 5.5164923667907715, "learning_rate": 3.9538298393017374e-05, "loss": 0.515, "num_input_tokens_seen": 9275200, "step": 9695 }, { "epoch": 0.7912554041928379, "grad_norm": 2.865252733230591, "learning_rate": 3.955869157353781e-05, "loss": 0.2615, "num_input_tokens_seen": 9280528, "step": 9700 }, { "epoch": 0.7916632678032466, "grad_norm": 2.9685275554656982, "learning_rate": 3.9579084754058245e-05, "loss": 0.2385, "num_input_tokens_seen": 9285008, "step": 9705 }, { "epoch": 0.7920711314136553, "grad_norm": 0.26982513070106506, "learning_rate": 3.959947793457868e-05, "loss": 0.3687, "num_input_tokens_seen": 9288352, "step": 9710 }, { "epoch": 0.7924789950240639, "grad_norm": 1.6464945077896118, "learning_rate": 3.961987111509911e-05, "loss": 0.3676, "num_input_tokens_seen": 9292160, "step": 9715 }, { "epoch": 0.7928868586344726, "grad_norm": 0.6316019296646118, "learning_rate": 3.964026429561954e-05, "loss": 0.3144, "num_input_tokens_seen": 9296656, "step": 9720 }, { "epoch": 0.7932947222448813, "grad_norm": 5.0262579917907715, "learning_rate": 3.966065747613998e-05, "loss": 0.4808, "num_input_tokens_seen": 9301408, "step": 9725 }, { "epoch": 0.79370258585529, "grad_norm": 9.76951789855957, "learning_rate": 3.9681050656660414e-05, "loss": 0.5625, "num_input_tokens_seen": 9306704, "step": 9730 }, { "epoch": 0.7941104494656986, "grad_norm": 9.899516105651855, "learning_rate": 3.9701443837180846e-05, "loss": 0.2372, "num_input_tokens_seen": 9311696, "step": 9735 }, { "epoch": 0.7945183130761073, "grad_norm": 6.981930255889893, "learning_rate": 3.972183701770128e-05, "loss": 0.2465, "num_input_tokens_seen": 9315728, "step": 9740 }, { "epoch": 0.794926176686516, "grad_norm": 0.14204315841197968, "learning_rate": 3.974223019822172e-05, "loss": 0.0711, "num_input_tokens_seen": 9320880, "step": 9745 }, { "epoch": 0.7953340402969247, "grad_norm": 8.770439147949219, "learning_rate": 3.976262337874215e-05, "loss": 0.408, "num_input_tokens_seen": 9325280, "step": 9750 }, { "epoch": 0.7957419039073333, "grad_norm": 4.341379642486572, "learning_rate": 3.978301655926259e-05, "loss": 0.3586, "num_input_tokens_seen": 9328880, "step": 9755 }, { "epoch": 0.796149767517742, "grad_norm": 3.726335287094116, "learning_rate": 3.980340973978302e-05, "loss": 0.1174, "num_input_tokens_seen": 9334256, "step": 9760 }, { "epoch": 0.7965576311281507, "grad_norm": 8.586880683898926, "learning_rate": 3.9823802920303454e-05, "loss": 0.5412, "num_input_tokens_seen": 9338512, "step": 9765 }, { "epoch": 0.7969654947385594, "grad_norm": 13.041607856750488, "learning_rate": 3.9844196100823886e-05, "loss": 0.3498, "num_input_tokens_seen": 9344048, "step": 9770 }, { "epoch": 0.797373358348968, "grad_norm": 8.776028633117676, "learning_rate": 3.9864589281344326e-05, "loss": 0.1962, "num_input_tokens_seen": 9348720, "step": 9775 }, { "epoch": 0.7977812219593767, "grad_norm": 5.490469455718994, "learning_rate": 3.988498246186476e-05, "loss": 0.5778, "num_input_tokens_seen": 9353664, "step": 9780 }, { "epoch": 0.7981890855697855, "grad_norm": 2.847912549972534, "learning_rate": 3.990537564238519e-05, "loss": 0.387, "num_input_tokens_seen": 9358512, "step": 9785 }, { "epoch": 0.7985969491801942, "grad_norm": 5.253718376159668, "learning_rate": 3.992576882290562e-05, "loss": 0.3032, "num_input_tokens_seen": 9363520, "step": 9790 }, { "epoch": 0.7990048127906029, "grad_norm": 9.873558044433594, "learning_rate": 3.9946162003426055e-05, "loss": 0.2297, "num_input_tokens_seen": 9368304, "step": 9795 }, { "epoch": 0.7994126764010115, "grad_norm": 4.6480183601379395, "learning_rate": 3.9966555183946494e-05, "loss": 0.3643, "num_input_tokens_seen": 9372160, "step": 9800 }, { "epoch": 0.7998205400114202, "grad_norm": 6.611136436462402, "learning_rate": 3.9986948364466927e-05, "loss": 0.3903, "num_input_tokens_seen": 9377328, "step": 9805 }, { "epoch": 0.8002284036218289, "grad_norm": 13.850024223327637, "learning_rate": 4.000734154498736e-05, "loss": 0.3251, "num_input_tokens_seen": 9382544, "step": 9810 }, { "epoch": 0.8006362672322376, "grad_norm": 3.5177218914031982, "learning_rate": 4.002773472550779e-05, "loss": 0.331, "num_input_tokens_seen": 9386816, "step": 9815 }, { "epoch": 0.8010441308426463, "grad_norm": 1.9308900833129883, "learning_rate": 4.0048127906028224e-05, "loss": 0.3587, "num_input_tokens_seen": 9391200, "step": 9820 }, { "epoch": 0.8014519944530549, "grad_norm": 30.675922393798828, "learning_rate": 4.006852108654866e-05, "loss": 0.4046, "num_input_tokens_seen": 9396256, "step": 9825 }, { "epoch": 0.8018598580634636, "grad_norm": 3.445012092590332, "learning_rate": 4.0088914267069095e-05, "loss": 0.3688, "num_input_tokens_seen": 9400848, "step": 9830 }, { "epoch": 0.8022677216738723, "grad_norm": 3.8746211528778076, "learning_rate": 4.010930744758953e-05, "loss": 0.4371, "num_input_tokens_seen": 9404144, "step": 9835 }, { "epoch": 0.802675585284281, "grad_norm": 1.182396411895752, "learning_rate": 4.012970062810996e-05, "loss": 0.4258, "num_input_tokens_seen": 9408688, "step": 9840 }, { "epoch": 0.8030834488946896, "grad_norm": 1.2851287126541138, "learning_rate": 4.015009380863039e-05, "loss": 0.2053, "num_input_tokens_seen": 9413264, "step": 9845 }, { "epoch": 0.8034913125050983, "grad_norm": 2.6314430236816406, "learning_rate": 4.017048698915083e-05, "loss": 0.355, "num_input_tokens_seen": 9417616, "step": 9850 }, { "epoch": 0.803899176115507, "grad_norm": 0.614959716796875, "learning_rate": 4.0190880169671264e-05, "loss": 0.1865, "num_input_tokens_seen": 9422400, "step": 9855 }, { "epoch": 0.8043070397259157, "grad_norm": 0.6968668699264526, "learning_rate": 4.0211273350191696e-05, "loss": 0.304, "num_input_tokens_seen": 9427200, "step": 9860 }, { "epoch": 0.8047149033363243, "grad_norm": 1.0568454265594482, "learning_rate": 4.023166653071213e-05, "loss": 0.2316, "num_input_tokens_seen": 9431760, "step": 9865 }, { "epoch": 0.805122766946733, "grad_norm": 3.3513667583465576, "learning_rate": 4.025205971123257e-05, "loss": 0.1815, "num_input_tokens_seen": 9436304, "step": 9870 }, { "epoch": 0.8055306305571417, "grad_norm": 3.9959945678710938, "learning_rate": 4.0272452891753e-05, "loss": 0.3067, "num_input_tokens_seen": 9440992, "step": 9875 }, { "epoch": 0.8059384941675504, "grad_norm": 7.977418422698975, "learning_rate": 4.029284607227343e-05, "loss": 0.3358, "num_input_tokens_seen": 9446064, "step": 9880 }, { "epoch": 0.806346357777959, "grad_norm": 0.632849395275116, "learning_rate": 4.0313239252793865e-05, "loss": 0.4188, "num_input_tokens_seen": 9451888, "step": 9885 }, { "epoch": 0.8067542213883677, "grad_norm": 1.9110420942306519, "learning_rate": 4.03336324333143e-05, "loss": 0.3019, "num_input_tokens_seen": 9456464, "step": 9890 }, { "epoch": 0.8071620849987764, "grad_norm": 6.620742321014404, "learning_rate": 4.0354025613834736e-05, "loss": 0.4773, "num_input_tokens_seen": 9462112, "step": 9895 }, { "epoch": 0.8075699486091851, "grad_norm": 0.3755037486553192, "learning_rate": 4.037441879435517e-05, "loss": 0.3177, "num_input_tokens_seen": 9467008, "step": 9900 }, { "epoch": 0.8079778122195937, "grad_norm": 6.027318000793457, "learning_rate": 4.03948119748756e-05, "loss": 0.2875, "num_input_tokens_seen": 9471488, "step": 9905 }, { "epoch": 0.8083856758300024, "grad_norm": 0.22950828075408936, "learning_rate": 4.0415205155396034e-05, "loss": 0.2357, "num_input_tokens_seen": 9477088, "step": 9910 }, { "epoch": 0.8087935394404111, "grad_norm": 0.43724358081817627, "learning_rate": 4.0435598335916466e-05, "loss": 0.2202, "num_input_tokens_seen": 9480720, "step": 9915 }, { "epoch": 0.8092014030508198, "grad_norm": 5.800824165344238, "learning_rate": 4.0455991516436905e-05, "loss": 0.3633, "num_input_tokens_seen": 9484864, "step": 9920 }, { "epoch": 0.8096092666612285, "grad_norm": 4.088624000549316, "learning_rate": 4.047638469695734e-05, "loss": 0.3358, "num_input_tokens_seen": 9489824, "step": 9925 }, { "epoch": 0.8100171302716371, "grad_norm": 2.7866766452789307, "learning_rate": 4.049677787747777e-05, "loss": 0.3295, "num_input_tokens_seen": 9494608, "step": 9930 }, { "epoch": 0.8104249938820458, "grad_norm": 5.3766093254089355, "learning_rate": 4.051717105799821e-05, "loss": 0.3575, "num_input_tokens_seen": 9499184, "step": 9935 }, { "epoch": 0.8108328574924545, "grad_norm": 0.5188109278678894, "learning_rate": 4.053756423851864e-05, "loss": 0.3003, "num_input_tokens_seen": 9504880, "step": 9940 }, { "epoch": 0.8112407211028632, "grad_norm": 2.580455780029297, "learning_rate": 4.055795741903908e-05, "loss": 0.3229, "num_input_tokens_seen": 9510848, "step": 9945 }, { "epoch": 0.8116485847132718, "grad_norm": 0.6648761034011841, "learning_rate": 4.057835059955951e-05, "loss": 0.3712, "num_input_tokens_seen": 9515920, "step": 9950 }, { "epoch": 0.8120564483236805, "grad_norm": 1.7607985734939575, "learning_rate": 4.0598743780079945e-05, "loss": 0.2146, "num_input_tokens_seen": 9521456, "step": 9955 }, { "epoch": 0.8124643119340892, "grad_norm": 0.6770973205566406, "learning_rate": 4.061913696060038e-05, "loss": 0.2933, "num_input_tokens_seen": 9525536, "step": 9960 }, { "epoch": 0.8128721755444979, "grad_norm": 1.2944881916046143, "learning_rate": 4.063953014112081e-05, "loss": 0.2823, "num_input_tokens_seen": 9530144, "step": 9965 }, { "epoch": 0.8132800391549067, "grad_norm": 0.6408566832542419, "learning_rate": 4.065992332164125e-05, "loss": 0.2201, "num_input_tokens_seen": 9535312, "step": 9970 }, { "epoch": 0.8136879027653153, "grad_norm": 1.1258864402770996, "learning_rate": 4.068031650216168e-05, "loss": 0.2165, "num_input_tokens_seen": 9541040, "step": 9975 }, { "epoch": 0.814095766375724, "grad_norm": 6.330913543701172, "learning_rate": 4.0700709682682114e-05, "loss": 0.5665, "num_input_tokens_seen": 9545520, "step": 9980 }, { "epoch": 0.8145036299861327, "grad_norm": 5.196399688720703, "learning_rate": 4.0721102863202546e-05, "loss": 0.4188, "num_input_tokens_seen": 9549552, "step": 9985 }, { "epoch": 0.8149114935965414, "grad_norm": 0.19281233847141266, "learning_rate": 4.074149604372298e-05, "loss": 0.1028, "num_input_tokens_seen": 9554144, "step": 9990 }, { "epoch": 0.81531935720695, "grad_norm": 9.037720680236816, "learning_rate": 4.076188922424342e-05, "loss": 0.1232, "num_input_tokens_seen": 9559104, "step": 9995 }, { "epoch": 0.8157272208173587, "grad_norm": 13.753026008605957, "learning_rate": 4.078228240476385e-05, "loss": 0.2458, "num_input_tokens_seen": 9564160, "step": 10000 }, { "epoch": 0.8161350844277674, "grad_norm": 0.09212423115968704, "learning_rate": 4.080267558528428e-05, "loss": 0.3283, "num_input_tokens_seen": 9568688, "step": 10005 }, { "epoch": 0.8165429480381761, "grad_norm": 2.089782238006592, "learning_rate": 4.0823068765804715e-05, "loss": 0.3652, "num_input_tokens_seen": 9573840, "step": 10010 }, { "epoch": 0.8169508116485847, "grad_norm": 0.2991783022880554, "learning_rate": 4.0843461946325154e-05, "loss": 0.1144, "num_input_tokens_seen": 9578768, "step": 10015 }, { "epoch": 0.8173586752589934, "grad_norm": 0.7785455584526062, "learning_rate": 4.0863855126845586e-05, "loss": 0.4747, "num_input_tokens_seen": 9584112, "step": 10020 }, { "epoch": 0.8177665388694021, "grad_norm": 5.734153747558594, "learning_rate": 4.088424830736602e-05, "loss": 0.4784, "num_input_tokens_seen": 9588880, "step": 10025 }, { "epoch": 0.8181744024798108, "grad_norm": 12.741752624511719, "learning_rate": 4.090464148788645e-05, "loss": 0.2736, "num_input_tokens_seen": 9592720, "step": 10030 }, { "epoch": 0.8185822660902194, "grad_norm": 1.3854786157608032, "learning_rate": 4.0925034668406884e-05, "loss": 0.2844, "num_input_tokens_seen": 9597984, "step": 10035 }, { "epoch": 0.8189901297006281, "grad_norm": 2.789161205291748, "learning_rate": 4.094542784892732e-05, "loss": 0.3573, "num_input_tokens_seen": 9603120, "step": 10040 }, { "epoch": 0.8193979933110368, "grad_norm": 9.301288604736328, "learning_rate": 4.0965821029447755e-05, "loss": 0.1574, "num_input_tokens_seen": 9608288, "step": 10045 }, { "epoch": 0.8198058569214455, "grad_norm": 1.7148785591125488, "learning_rate": 4.098621420996819e-05, "loss": 0.5591, "num_input_tokens_seen": 9612560, "step": 10050 }, { "epoch": 0.8202137205318542, "grad_norm": 12.067951202392578, "learning_rate": 4.100660739048862e-05, "loss": 0.8819, "num_input_tokens_seen": 9617488, "step": 10055 }, { "epoch": 0.8206215841422628, "grad_norm": 2.7484025955200195, "learning_rate": 4.102700057100905e-05, "loss": 0.4096, "num_input_tokens_seen": 9622000, "step": 10060 }, { "epoch": 0.8210294477526715, "grad_norm": 0.5019771456718445, "learning_rate": 4.104739375152949e-05, "loss": 0.3188, "num_input_tokens_seen": 9627264, "step": 10065 }, { "epoch": 0.8214373113630802, "grad_norm": 0.3779551088809967, "learning_rate": 4.1067786932049924e-05, "loss": 0.3406, "num_input_tokens_seen": 9632224, "step": 10070 }, { "epoch": 0.8218451749734889, "grad_norm": 2.6865901947021484, "learning_rate": 4.1088180112570356e-05, "loss": 0.2446, "num_input_tokens_seen": 9636752, "step": 10075 }, { "epoch": 0.8222530385838975, "grad_norm": 5.557721138000488, "learning_rate": 4.110857329309079e-05, "loss": 0.497, "num_input_tokens_seen": 9641584, "step": 10080 }, { "epoch": 0.8226609021943062, "grad_norm": 0.9716848134994507, "learning_rate": 4.112896647361123e-05, "loss": 0.2089, "num_input_tokens_seen": 9646000, "step": 10085 }, { "epoch": 0.8230687658047149, "grad_norm": 2.9723715782165527, "learning_rate": 4.114935965413166e-05, "loss": 0.2721, "num_input_tokens_seen": 9650592, "step": 10090 }, { "epoch": 0.8234766294151236, "grad_norm": 8.006124496459961, "learning_rate": 4.116975283465209e-05, "loss": 0.3641, "num_input_tokens_seen": 9655312, "step": 10095 }, { "epoch": 0.8238844930255322, "grad_norm": 0.770324170589447, "learning_rate": 4.1190146015172525e-05, "loss": 0.1689, "num_input_tokens_seen": 9660016, "step": 10100 }, { "epoch": 0.8242923566359409, "grad_norm": 4.729194641113281, "learning_rate": 4.121053919569296e-05, "loss": 0.2991, "num_input_tokens_seen": 9664464, "step": 10105 }, { "epoch": 0.8247002202463496, "grad_norm": 0.8760412931442261, "learning_rate": 4.1230932376213396e-05, "loss": 0.2289, "num_input_tokens_seen": 9669808, "step": 10110 }, { "epoch": 0.8251080838567583, "grad_norm": 0.3962760269641876, "learning_rate": 4.125132555673383e-05, "loss": 0.3318, "num_input_tokens_seen": 9674688, "step": 10115 }, { "epoch": 0.8255159474671669, "grad_norm": 2.8708481788635254, "learning_rate": 4.127171873725427e-05, "loss": 0.6319, "num_input_tokens_seen": 9679072, "step": 10120 }, { "epoch": 0.8259238110775756, "grad_norm": 4.110567569732666, "learning_rate": 4.12921119177747e-05, "loss": 0.5928, "num_input_tokens_seen": 9683024, "step": 10125 }, { "epoch": 0.8263316746879843, "grad_norm": 6.616087913513184, "learning_rate": 4.131250509829513e-05, "loss": 0.1919, "num_input_tokens_seen": 9687296, "step": 10130 }, { "epoch": 0.826739538298393, "grad_norm": 1.3090394735336304, "learning_rate": 4.1332898278815565e-05, "loss": 0.1333, "num_input_tokens_seen": 9691920, "step": 10135 }, { "epoch": 0.8271474019088016, "grad_norm": 11.086771011352539, "learning_rate": 4.1353291459336004e-05, "loss": 0.4615, "num_input_tokens_seen": 9695968, "step": 10140 }, { "epoch": 0.8275552655192103, "grad_norm": 3.949611186981201, "learning_rate": 4.1373684639856437e-05, "loss": 0.2208, "num_input_tokens_seen": 9700800, "step": 10145 }, { "epoch": 0.827963129129619, "grad_norm": 10.532939910888672, "learning_rate": 4.139407782037687e-05, "loss": 0.286, "num_input_tokens_seen": 9705456, "step": 10150 }, { "epoch": 0.8283709927400278, "grad_norm": 7.02453088760376, "learning_rate": 4.14144710008973e-05, "loss": 0.2433, "num_input_tokens_seen": 9709632, "step": 10155 }, { "epoch": 0.8287788563504365, "grad_norm": 1.294332504272461, "learning_rate": 4.143486418141774e-05, "loss": 0.2455, "num_input_tokens_seen": 9714384, "step": 10160 }, { "epoch": 0.8291867199608451, "grad_norm": 2.6507441997528076, "learning_rate": 4.145525736193817e-05, "loss": 0.3098, "num_input_tokens_seen": 9719984, "step": 10165 }, { "epoch": 0.8295945835712538, "grad_norm": 3.3406505584716797, "learning_rate": 4.1475650542458605e-05, "loss": 0.5735, "num_input_tokens_seen": 9724464, "step": 10170 }, { "epoch": 0.8300024471816625, "grad_norm": 6.5790324211120605, "learning_rate": 4.149604372297904e-05, "loss": 0.1635, "num_input_tokens_seen": 9728960, "step": 10175 }, { "epoch": 0.8304103107920712, "grad_norm": 5.068627834320068, "learning_rate": 4.151643690349947e-05, "loss": 0.6581, "num_input_tokens_seen": 9732832, "step": 10180 }, { "epoch": 0.8308181744024798, "grad_norm": 1.5150991678237915, "learning_rate": 4.153683008401991e-05, "loss": 0.409, "num_input_tokens_seen": 9737712, "step": 10185 }, { "epoch": 0.8312260380128885, "grad_norm": 2.637836217880249, "learning_rate": 4.155722326454034e-05, "loss": 0.2745, "num_input_tokens_seen": 9742272, "step": 10190 }, { "epoch": 0.8316339016232972, "grad_norm": 5.0952911376953125, "learning_rate": 4.1577616445060774e-05, "loss": 0.2037, "num_input_tokens_seen": 9746752, "step": 10195 }, { "epoch": 0.8320417652337059, "grad_norm": 0.48128223419189453, "learning_rate": 4.1598009625581206e-05, "loss": 0.1871, "num_input_tokens_seen": 9751648, "step": 10200 }, { "epoch": 0.8324496288441146, "grad_norm": 6.709329605102539, "learning_rate": 4.161840280610164e-05, "loss": 0.3276, "num_input_tokens_seen": 9756480, "step": 10205 }, { "epoch": 0.8328574924545232, "grad_norm": 8.274636268615723, "learning_rate": 4.163879598662208e-05, "loss": 0.6641, "num_input_tokens_seen": 9761312, "step": 10210 }, { "epoch": 0.8332653560649319, "grad_norm": 5.6080241203308105, "learning_rate": 4.165918916714251e-05, "loss": 0.387, "num_input_tokens_seen": 9766592, "step": 10215 }, { "epoch": 0.8336732196753406, "grad_norm": 0.5148457288742065, "learning_rate": 4.167958234766294e-05, "loss": 0.2371, "num_input_tokens_seen": 9771360, "step": 10220 }, { "epoch": 0.8340810832857493, "grad_norm": 0.4165317118167877, "learning_rate": 4.1699975528183375e-05, "loss": 0.1551, "num_input_tokens_seen": 9775936, "step": 10225 }, { "epoch": 0.8344889468961579, "grad_norm": 3.205221652984619, "learning_rate": 4.1720368708703814e-05, "loss": 0.2842, "num_input_tokens_seen": 9781136, "step": 10230 }, { "epoch": 0.8348968105065666, "grad_norm": 11.088983535766602, "learning_rate": 4.1740761889224246e-05, "loss": 0.2126, "num_input_tokens_seen": 9785952, "step": 10235 }, { "epoch": 0.8353046741169753, "grad_norm": 8.239246368408203, "learning_rate": 4.176115506974468e-05, "loss": 0.4956, "num_input_tokens_seen": 9789872, "step": 10240 }, { "epoch": 0.835712537727384, "grad_norm": 0.16218701004981995, "learning_rate": 4.178154825026511e-05, "loss": 0.1234, "num_input_tokens_seen": 9795104, "step": 10245 }, { "epoch": 0.8361204013377926, "grad_norm": 5.517661094665527, "learning_rate": 4.1801941430785544e-05, "loss": 0.9713, "num_input_tokens_seen": 9799280, "step": 10250 }, { "epoch": 0.8365282649482013, "grad_norm": 6.488050937652588, "learning_rate": 4.182233461130598e-05, "loss": 0.2893, "num_input_tokens_seen": 9804400, "step": 10255 }, { "epoch": 0.83693612855861, "grad_norm": 0.24621941149234772, "learning_rate": 4.1842727791826415e-05, "loss": 0.3493, "num_input_tokens_seen": 9808912, "step": 10260 }, { "epoch": 0.8373439921690187, "grad_norm": 0.5293190479278564, "learning_rate": 4.186312097234685e-05, "loss": 0.1425, "num_input_tokens_seen": 9813008, "step": 10265 }, { "epoch": 0.8377518557794273, "grad_norm": 0.43993058800697327, "learning_rate": 4.188351415286728e-05, "loss": 0.2702, "num_input_tokens_seen": 9818528, "step": 10270 }, { "epoch": 0.838159719389836, "grad_norm": 3.462887763977051, "learning_rate": 4.190390733338771e-05, "loss": 0.2953, "num_input_tokens_seen": 9823712, "step": 10275 }, { "epoch": 0.8385675830002447, "grad_norm": 9.214715957641602, "learning_rate": 4.192430051390815e-05, "loss": 0.2076, "num_input_tokens_seen": 9828032, "step": 10280 }, { "epoch": 0.8389754466106534, "grad_norm": 4.468163013458252, "learning_rate": 4.1944693694428584e-05, "loss": 0.3916, "num_input_tokens_seen": 9831792, "step": 10285 }, { "epoch": 0.839383310221062, "grad_norm": 2.139904737472534, "learning_rate": 4.1965086874949016e-05, "loss": 0.2025, "num_input_tokens_seen": 9836240, "step": 10290 }, { "epoch": 0.8397911738314707, "grad_norm": 5.147182941436768, "learning_rate": 4.198548005546945e-05, "loss": 0.5018, "num_input_tokens_seen": 9839824, "step": 10295 }, { "epoch": 0.8401990374418794, "grad_norm": 12.573209762573242, "learning_rate": 4.200587323598989e-05, "loss": 0.3421, "num_input_tokens_seen": 9844592, "step": 10300 }, { "epoch": 0.8406069010522881, "grad_norm": 9.541277885437012, "learning_rate": 4.202626641651033e-05, "loss": 0.357, "num_input_tokens_seen": 9848512, "step": 10305 }, { "epoch": 0.8410147646626968, "grad_norm": 4.514345645904541, "learning_rate": 4.204665959703076e-05, "loss": 0.4404, "num_input_tokens_seen": 9852752, "step": 10310 }, { "epoch": 0.8414226282731054, "grad_norm": 1.078894853591919, "learning_rate": 4.206705277755119e-05, "loss": 0.1447, "num_input_tokens_seen": 9857792, "step": 10315 }, { "epoch": 0.8418304918835141, "grad_norm": 7.255938529968262, "learning_rate": 4.2087445958071624e-05, "loss": 0.5858, "num_input_tokens_seen": 9863216, "step": 10320 }, { "epoch": 0.8422383554939228, "grad_norm": 7.605249881744385, "learning_rate": 4.2107839138592056e-05, "loss": 0.3561, "num_input_tokens_seen": 9868160, "step": 10325 }, { "epoch": 0.8426462191043315, "grad_norm": 7.378081798553467, "learning_rate": 4.2128232319112495e-05, "loss": 0.3234, "num_input_tokens_seen": 9873056, "step": 10330 }, { "epoch": 0.8430540827147401, "grad_norm": 2.8923909664154053, "learning_rate": 4.214862549963293e-05, "loss": 0.2067, "num_input_tokens_seen": 9877584, "step": 10335 }, { "epoch": 0.8434619463251488, "grad_norm": 6.300473690032959, "learning_rate": 4.216901868015336e-05, "loss": 0.224, "num_input_tokens_seen": 9881728, "step": 10340 }, { "epoch": 0.8438698099355576, "grad_norm": 5.057811260223389, "learning_rate": 4.218941186067379e-05, "loss": 0.2491, "num_input_tokens_seen": 9887056, "step": 10345 }, { "epoch": 0.8442776735459663, "grad_norm": 5.231666088104248, "learning_rate": 4.2209805041194225e-05, "loss": 0.5906, "num_input_tokens_seen": 9891712, "step": 10350 }, { "epoch": 0.844685537156375, "grad_norm": 6.572986125946045, "learning_rate": 4.2230198221714664e-05, "loss": 0.1495, "num_input_tokens_seen": 9897344, "step": 10355 }, { "epoch": 0.8450934007667836, "grad_norm": 4.8360395431518555, "learning_rate": 4.2250591402235096e-05, "loss": 0.4698, "num_input_tokens_seen": 9902000, "step": 10360 }, { "epoch": 0.8455012643771923, "grad_norm": 11.030585289001465, "learning_rate": 4.227098458275553e-05, "loss": 0.5447, "num_input_tokens_seen": 9907104, "step": 10365 }, { "epoch": 0.845909127987601, "grad_norm": 2.9238924980163574, "learning_rate": 4.229137776327596e-05, "loss": 0.3039, "num_input_tokens_seen": 9911312, "step": 10370 }, { "epoch": 0.8463169915980097, "grad_norm": 3.8923909664154053, "learning_rate": 4.23117709437964e-05, "loss": 0.5295, "num_input_tokens_seen": 9916560, "step": 10375 }, { "epoch": 0.8467248552084183, "grad_norm": 2.0991697311401367, "learning_rate": 4.233216412431683e-05, "loss": 0.279, "num_input_tokens_seen": 9921600, "step": 10380 }, { "epoch": 0.847132718818827, "grad_norm": 0.9017382264137268, "learning_rate": 4.2352557304837265e-05, "loss": 0.2665, "num_input_tokens_seen": 9926560, "step": 10385 }, { "epoch": 0.8475405824292357, "grad_norm": 3.2499446868896484, "learning_rate": 4.23729504853577e-05, "loss": 0.3499, "num_input_tokens_seen": 9931792, "step": 10390 }, { "epoch": 0.8479484460396444, "grad_norm": 4.061107158660889, "learning_rate": 4.239334366587813e-05, "loss": 0.2562, "num_input_tokens_seen": 9936176, "step": 10395 }, { "epoch": 0.848356309650053, "grad_norm": 10.85608959197998, "learning_rate": 4.241373684639857e-05, "loss": 0.2853, "num_input_tokens_seen": 9940544, "step": 10400 }, { "epoch": 0.8487641732604617, "grad_norm": 4.3609938621521, "learning_rate": 4.2434130026919e-05, "loss": 0.3297, "num_input_tokens_seen": 9944352, "step": 10405 }, { "epoch": 0.8491720368708704, "grad_norm": 2.162370443344116, "learning_rate": 4.2454523207439434e-05, "loss": 0.2092, "num_input_tokens_seen": 9950160, "step": 10410 }, { "epoch": 0.8495799004812791, "grad_norm": 3.737347364425659, "learning_rate": 4.2474916387959866e-05, "loss": 0.276, "num_input_tokens_seen": 9955056, "step": 10415 }, { "epoch": 0.8499877640916877, "grad_norm": 0.4155261218547821, "learning_rate": 4.24953095684803e-05, "loss": 0.1863, "num_input_tokens_seen": 9958496, "step": 10420 }, { "epoch": 0.8503956277020964, "grad_norm": 3.628070592880249, "learning_rate": 4.251570274900074e-05, "loss": 0.3804, "num_input_tokens_seen": 9963360, "step": 10425 }, { "epoch": 0.8508034913125051, "grad_norm": 5.766629695892334, "learning_rate": 4.253609592952117e-05, "loss": 0.276, "num_input_tokens_seen": 9967584, "step": 10430 }, { "epoch": 0.8512113549229138, "grad_norm": 2.451897382736206, "learning_rate": 4.25564891100416e-05, "loss": 0.5068, "num_input_tokens_seen": 9972944, "step": 10435 }, { "epoch": 0.8516192185333225, "grad_norm": 6.952939510345459, "learning_rate": 4.2576882290562035e-05, "loss": 0.2463, "num_input_tokens_seen": 9978048, "step": 10440 }, { "epoch": 0.8520270821437311, "grad_norm": 3.5283048152923584, "learning_rate": 4.2597275471082474e-05, "loss": 0.3938, "num_input_tokens_seen": 9982784, "step": 10445 }, { "epoch": 0.8524349457541398, "grad_norm": 5.175345420837402, "learning_rate": 4.2617668651602906e-05, "loss": 0.1614, "num_input_tokens_seen": 9986752, "step": 10450 }, { "epoch": 0.8528428093645485, "grad_norm": 0.48498985171318054, "learning_rate": 4.263806183212334e-05, "loss": 0.1453, "num_input_tokens_seen": 9991568, "step": 10455 }, { "epoch": 0.8532506729749572, "grad_norm": 6.337662220001221, "learning_rate": 4.265845501264377e-05, "loss": 0.3832, "num_input_tokens_seen": 9996512, "step": 10460 }, { "epoch": 0.8536585365853658, "grad_norm": 1.0132957696914673, "learning_rate": 4.2678848193164203e-05, "loss": 0.182, "num_input_tokens_seen": 10000240, "step": 10465 }, { "epoch": 0.8540664001957745, "grad_norm": 31.865575790405273, "learning_rate": 4.269924137368464e-05, "loss": 0.2968, "num_input_tokens_seen": 10005488, "step": 10470 }, { "epoch": 0.8544742638061832, "grad_norm": 10.28657341003418, "learning_rate": 4.2719634554205075e-05, "loss": 0.6022, "num_input_tokens_seen": 10010016, "step": 10475 }, { "epoch": 0.8548821274165919, "grad_norm": 0.2979561686515808, "learning_rate": 4.274002773472551e-05, "loss": 0.286, "num_input_tokens_seen": 10014352, "step": 10480 }, { "epoch": 0.8552899910270005, "grad_norm": 0.10744739323854446, "learning_rate": 4.276042091524594e-05, "loss": 0.1271, "num_input_tokens_seen": 10019168, "step": 10485 }, { "epoch": 0.8556978546374092, "grad_norm": 0.2533685266971588, "learning_rate": 4.278081409576637e-05, "loss": 0.2703, "num_input_tokens_seen": 10023296, "step": 10490 }, { "epoch": 0.8561057182478179, "grad_norm": 6.9854888916015625, "learning_rate": 4.280120727628681e-05, "loss": 0.2804, "num_input_tokens_seen": 10028448, "step": 10495 }, { "epoch": 0.8565135818582266, "grad_norm": 1.915886640548706, "learning_rate": 4.282160045680725e-05, "loss": 0.3512, "num_input_tokens_seen": 10033760, "step": 10500 }, { "epoch": 0.8569214454686352, "grad_norm": 6.478882312774658, "learning_rate": 4.284199363732768e-05, "loss": 0.2866, "num_input_tokens_seen": 10037904, "step": 10505 }, { "epoch": 0.8573293090790439, "grad_norm": 8.596314430236816, "learning_rate": 4.2862386817848115e-05, "loss": 0.417, "num_input_tokens_seen": 10042176, "step": 10510 }, { "epoch": 0.8577371726894526, "grad_norm": 3.673179864883423, "learning_rate": 4.288277999836855e-05, "loss": 0.5812, "num_input_tokens_seen": 10047072, "step": 10515 }, { "epoch": 0.8581450362998613, "grad_norm": 3.433685779571533, "learning_rate": 4.290317317888899e-05, "loss": 0.49, "num_input_tokens_seen": 10051504, "step": 10520 }, { "epoch": 0.85855289991027, "grad_norm": 1.2492637634277344, "learning_rate": 4.292356635940942e-05, "loss": 0.0816, "num_input_tokens_seen": 10056496, "step": 10525 }, { "epoch": 0.8589607635206787, "grad_norm": 0.3361670970916748, "learning_rate": 4.294395953992985e-05, "loss": 0.1644, "num_input_tokens_seen": 10061376, "step": 10530 }, { "epoch": 0.8593686271310874, "grad_norm": 5.980679988861084, "learning_rate": 4.2964352720450284e-05, "loss": 0.399, "num_input_tokens_seen": 10065792, "step": 10535 }, { "epoch": 0.8597764907414961, "grad_norm": 4.935391902923584, "learning_rate": 4.2984745900970716e-05, "loss": 0.3923, "num_input_tokens_seen": 10070976, "step": 10540 }, { "epoch": 0.8601843543519048, "grad_norm": 0.7118882536888123, "learning_rate": 4.3005139081491155e-05, "loss": 0.2644, "num_input_tokens_seen": 10075040, "step": 10545 }, { "epoch": 0.8605922179623134, "grad_norm": 7.062863349914551, "learning_rate": 4.302553226201159e-05, "loss": 0.2583, "num_input_tokens_seen": 10079216, "step": 10550 }, { "epoch": 0.8610000815727221, "grad_norm": 3.465683937072754, "learning_rate": 4.304592544253202e-05, "loss": 0.4194, "num_input_tokens_seen": 10083904, "step": 10555 }, { "epoch": 0.8614079451831308, "grad_norm": 5.606765270233154, "learning_rate": 4.306631862305245e-05, "loss": 0.2003, "num_input_tokens_seen": 10089344, "step": 10560 }, { "epoch": 0.8618158087935395, "grad_norm": 2.1709227561950684, "learning_rate": 4.3086711803572885e-05, "loss": 0.4111, "num_input_tokens_seen": 10094688, "step": 10565 }, { "epoch": 0.8622236724039481, "grad_norm": 8.000990867614746, "learning_rate": 4.3107104984093324e-05, "loss": 0.3346, "num_input_tokens_seen": 10099056, "step": 10570 }, { "epoch": 0.8626315360143568, "grad_norm": 7.032660007476807, "learning_rate": 4.3127498164613756e-05, "loss": 0.2505, "num_input_tokens_seen": 10104464, "step": 10575 }, { "epoch": 0.8630393996247655, "grad_norm": 3.9732048511505127, "learning_rate": 4.314789134513419e-05, "loss": 0.1596, "num_input_tokens_seen": 10109280, "step": 10580 }, { "epoch": 0.8634472632351742, "grad_norm": 0.7616381645202637, "learning_rate": 4.316828452565462e-05, "loss": 0.2607, "num_input_tokens_seen": 10114064, "step": 10585 }, { "epoch": 0.8638551268455829, "grad_norm": 0.24594111740589142, "learning_rate": 4.318867770617506e-05, "loss": 0.3326, "num_input_tokens_seen": 10119456, "step": 10590 }, { "epoch": 0.8642629904559915, "grad_norm": 5.89882755279541, "learning_rate": 4.320907088669549e-05, "loss": 0.4345, "num_input_tokens_seen": 10122880, "step": 10595 }, { "epoch": 0.8646708540664002, "grad_norm": 8.256138801574707, "learning_rate": 4.3229464067215925e-05, "loss": 0.1715, "num_input_tokens_seen": 10127552, "step": 10600 }, { "epoch": 0.8650787176768089, "grad_norm": 8.075008392333984, "learning_rate": 4.324985724773636e-05, "loss": 0.7731, "num_input_tokens_seen": 10131568, "step": 10605 }, { "epoch": 0.8654865812872176, "grad_norm": 12.883519172668457, "learning_rate": 4.327025042825679e-05, "loss": 0.3888, "num_input_tokens_seen": 10136736, "step": 10610 }, { "epoch": 0.8658944448976262, "grad_norm": 6.983640670776367, "learning_rate": 4.329064360877723e-05, "loss": 0.4564, "num_input_tokens_seen": 10141008, "step": 10615 }, { "epoch": 0.8663023085080349, "grad_norm": 3.177949905395508, "learning_rate": 4.331103678929766e-05, "loss": 0.0894, "num_input_tokens_seen": 10146240, "step": 10620 }, { "epoch": 0.8667101721184436, "grad_norm": 13.277552604675293, "learning_rate": 4.3331429969818094e-05, "loss": 0.2281, "num_input_tokens_seen": 10151648, "step": 10625 }, { "epoch": 0.8671180357288523, "grad_norm": 7.529143333435059, "learning_rate": 4.3351823150338526e-05, "loss": 0.4877, "num_input_tokens_seen": 10156976, "step": 10630 }, { "epoch": 0.8675258993392609, "grad_norm": 19.913297653198242, "learning_rate": 4.337221633085896e-05, "loss": 0.37, "num_input_tokens_seen": 10161488, "step": 10635 }, { "epoch": 0.8679337629496696, "grad_norm": 4.026606559753418, "learning_rate": 4.33926095113794e-05, "loss": 0.4304, "num_input_tokens_seen": 10166928, "step": 10640 }, { "epoch": 0.8683416265600783, "grad_norm": 4.331300258636475, "learning_rate": 4.341300269189983e-05, "loss": 0.2254, "num_input_tokens_seen": 10172640, "step": 10645 }, { "epoch": 0.868749490170487, "grad_norm": 0.5818652510643005, "learning_rate": 4.343339587242026e-05, "loss": 0.1636, "num_input_tokens_seen": 10177600, "step": 10650 }, { "epoch": 0.8691573537808956, "grad_norm": 0.45338869094848633, "learning_rate": 4.3453789052940695e-05, "loss": 0.3436, "num_input_tokens_seen": 10182016, "step": 10655 }, { "epoch": 0.8695652173913043, "grad_norm": 0.6545053720474243, "learning_rate": 4.347418223346113e-05, "loss": 0.3605, "num_input_tokens_seen": 10186224, "step": 10660 }, { "epoch": 0.869973081001713, "grad_norm": 1.859480381011963, "learning_rate": 4.3494575413981566e-05, "loss": 0.1728, "num_input_tokens_seen": 10191504, "step": 10665 }, { "epoch": 0.8703809446121217, "grad_norm": 0.41473570466041565, "learning_rate": 4.3514968594502e-05, "loss": 0.7413, "num_input_tokens_seen": 10196064, "step": 10670 }, { "epoch": 0.8707888082225304, "grad_norm": 1.3186503648757935, "learning_rate": 4.353536177502243e-05, "loss": 0.3647, "num_input_tokens_seen": 10200624, "step": 10675 }, { "epoch": 0.871196671832939, "grad_norm": 6.87603759765625, "learning_rate": 4.355575495554287e-05, "loss": 0.2073, "num_input_tokens_seen": 10205776, "step": 10680 }, { "epoch": 0.8716045354433477, "grad_norm": 12.616522789001465, "learning_rate": 4.35761481360633e-05, "loss": 0.1779, "num_input_tokens_seen": 10211296, "step": 10685 }, { "epoch": 0.8720123990537564, "grad_norm": 8.492469787597656, "learning_rate": 4.359654131658374e-05, "loss": 0.3794, "num_input_tokens_seen": 10215808, "step": 10690 }, { "epoch": 0.8724202626641651, "grad_norm": 18.91407585144043, "learning_rate": 4.3616934497104174e-05, "loss": 0.2052, "num_input_tokens_seen": 10220896, "step": 10695 }, { "epoch": 0.8728281262745737, "grad_norm": 1.5183438062667847, "learning_rate": 4.3637327677624606e-05, "loss": 0.2737, "num_input_tokens_seen": 10225456, "step": 10700 }, { "epoch": 0.8732359898849824, "grad_norm": 1.8559178113937378, "learning_rate": 4.365772085814504e-05, "loss": 0.2065, "num_input_tokens_seen": 10230240, "step": 10705 }, { "epoch": 0.8736438534953911, "grad_norm": 4.331625938415527, "learning_rate": 4.367811403866547e-05, "loss": 0.4349, "num_input_tokens_seen": 10235008, "step": 10710 }, { "epoch": 0.8740517171057999, "grad_norm": 5.987566947937012, "learning_rate": 4.369850721918591e-05, "loss": 0.4643, "num_input_tokens_seen": 10240816, "step": 10715 }, { "epoch": 0.8744595807162086, "grad_norm": 0.2702908515930176, "learning_rate": 4.371890039970634e-05, "loss": 0.1793, "num_input_tokens_seen": 10245984, "step": 10720 }, { "epoch": 0.8748674443266172, "grad_norm": 2.4039134979248047, "learning_rate": 4.3739293580226775e-05, "loss": 0.1949, "num_input_tokens_seen": 10251008, "step": 10725 }, { "epoch": 0.8752753079370259, "grad_norm": 0.4348084628582001, "learning_rate": 4.375968676074721e-05, "loss": 0.1744, "num_input_tokens_seen": 10256224, "step": 10730 }, { "epoch": 0.8756831715474346, "grad_norm": 0.5527408719062805, "learning_rate": 4.378007994126764e-05, "loss": 0.0438, "num_input_tokens_seen": 10262256, "step": 10735 }, { "epoch": 0.8760910351578433, "grad_norm": 12.56520938873291, "learning_rate": 4.380047312178808e-05, "loss": 0.3123, "num_input_tokens_seen": 10266640, "step": 10740 }, { "epoch": 0.8764988987682519, "grad_norm": 0.46669113636016846, "learning_rate": 4.382086630230851e-05, "loss": 0.1838, "num_input_tokens_seen": 10271440, "step": 10745 }, { "epoch": 0.8769067623786606, "grad_norm": 18.4443302154541, "learning_rate": 4.3841259482828944e-05, "loss": 0.125, "num_input_tokens_seen": 10276144, "step": 10750 }, { "epoch": 0.8773146259890693, "grad_norm": 18.800521850585938, "learning_rate": 4.3861652663349376e-05, "loss": 0.6676, "num_input_tokens_seen": 10280096, "step": 10755 }, { "epoch": 0.877722489599478, "grad_norm": 26.126850128173828, "learning_rate": 4.3882045843869815e-05, "loss": 0.1934, "num_input_tokens_seen": 10285328, "step": 10760 }, { "epoch": 0.8781303532098866, "grad_norm": 0.12293879687786102, "learning_rate": 4.390243902439025e-05, "loss": 0.1469, "num_input_tokens_seen": 10290080, "step": 10765 }, { "epoch": 0.8785382168202953, "grad_norm": 32.03215789794922, "learning_rate": 4.392283220491068e-05, "loss": 0.3487, "num_input_tokens_seen": 10294000, "step": 10770 }, { "epoch": 0.878946080430704, "grad_norm": 90.72288513183594, "learning_rate": 4.394322538543111e-05, "loss": 0.2681, "num_input_tokens_seen": 10298544, "step": 10775 }, { "epoch": 0.8793539440411127, "grad_norm": 6.015121936798096, "learning_rate": 4.3963618565951545e-05, "loss": 0.6372, "num_input_tokens_seen": 10304000, "step": 10780 }, { "epoch": 0.8797618076515213, "grad_norm": 0.15195442736148834, "learning_rate": 4.3984011746471984e-05, "loss": 0.1347, "num_input_tokens_seen": 10308544, "step": 10785 }, { "epoch": 0.88016967126193, "grad_norm": 0.1091381162405014, "learning_rate": 4.4004404926992416e-05, "loss": 0.2303, "num_input_tokens_seen": 10312704, "step": 10790 }, { "epoch": 0.8805775348723387, "grad_norm": 1.1659924983978271, "learning_rate": 4.402479810751285e-05, "loss": 0.3662, "num_input_tokens_seen": 10317200, "step": 10795 }, { "epoch": 0.8809853984827474, "grad_norm": 10.167241096496582, "learning_rate": 4.404519128803328e-05, "loss": 0.4628, "num_input_tokens_seen": 10321472, "step": 10800 }, { "epoch": 0.881393262093156, "grad_norm": 48.82841873168945, "learning_rate": 4.4065584468553713e-05, "loss": 0.7376, "num_input_tokens_seen": 10326880, "step": 10805 }, { "epoch": 0.8818011257035647, "grad_norm": 0.5748950242996216, "learning_rate": 4.408597764907415e-05, "loss": 0.1937, "num_input_tokens_seen": 10331536, "step": 10810 }, { "epoch": 0.8822089893139734, "grad_norm": 0.20768699049949646, "learning_rate": 4.4106370829594585e-05, "loss": 0.0629, "num_input_tokens_seen": 10335472, "step": 10815 }, { "epoch": 0.8826168529243821, "grad_norm": 10.443316459655762, "learning_rate": 4.412676401011502e-05, "loss": 0.4789, "num_input_tokens_seen": 10339632, "step": 10820 }, { "epoch": 0.8830247165347908, "grad_norm": 8.30568790435791, "learning_rate": 4.414715719063545e-05, "loss": 0.2492, "num_input_tokens_seen": 10345152, "step": 10825 }, { "epoch": 0.8834325801451994, "grad_norm": 0.2910822629928589, "learning_rate": 4.416755037115589e-05, "loss": 0.0311, "num_input_tokens_seen": 10349648, "step": 10830 }, { "epoch": 0.8838404437556081, "grad_norm": 0.04452795535326004, "learning_rate": 4.418794355167632e-05, "loss": 0.3616, "num_input_tokens_seen": 10354320, "step": 10835 }, { "epoch": 0.8842483073660168, "grad_norm": 0.35297825932502747, "learning_rate": 4.4208336732196754e-05, "loss": 0.2985, "num_input_tokens_seen": 10358704, "step": 10840 }, { "epoch": 0.8846561709764255, "grad_norm": 59.19512176513672, "learning_rate": 4.4228729912717186e-05, "loss": 0.3768, "num_input_tokens_seen": 10363152, "step": 10845 }, { "epoch": 0.8850640345868341, "grad_norm": 9.733586311340332, "learning_rate": 4.424912309323762e-05, "loss": 0.5177, "num_input_tokens_seen": 10367792, "step": 10850 }, { "epoch": 0.8854718981972428, "grad_norm": 18.09516143798828, "learning_rate": 4.426951627375806e-05, "loss": 0.3102, "num_input_tokens_seen": 10372752, "step": 10855 }, { "epoch": 0.8858797618076515, "grad_norm": 0.5081624388694763, "learning_rate": 4.428990945427849e-05, "loss": 0.0426, "num_input_tokens_seen": 10378304, "step": 10860 }, { "epoch": 0.8862876254180602, "grad_norm": 1.090208888053894, "learning_rate": 4.431030263479893e-05, "loss": 0.0954, "num_input_tokens_seen": 10383264, "step": 10865 }, { "epoch": 0.8866954890284688, "grad_norm": 17.94464111328125, "learning_rate": 4.433069581531936e-05, "loss": 0.4576, "num_input_tokens_seen": 10387792, "step": 10870 }, { "epoch": 0.8871033526388775, "grad_norm": 1.3103634119033813, "learning_rate": 4.4351088995839794e-05, "loss": 0.5312, "num_input_tokens_seen": 10392272, "step": 10875 }, { "epoch": 0.8875112162492862, "grad_norm": 4.697896957397461, "learning_rate": 4.4371482176360226e-05, "loss": 0.4579, "num_input_tokens_seen": 10397488, "step": 10880 }, { "epoch": 0.8879190798596949, "grad_norm": 0.3734126389026642, "learning_rate": 4.4391875356880665e-05, "loss": 0.2134, "num_input_tokens_seen": 10402144, "step": 10885 }, { "epoch": 0.8883269434701035, "grad_norm": 0.3133879601955414, "learning_rate": 4.44122685374011e-05, "loss": 0.4161, "num_input_tokens_seen": 10407104, "step": 10890 }, { "epoch": 0.8887348070805122, "grad_norm": 6.888193130493164, "learning_rate": 4.443266171792153e-05, "loss": 0.5723, "num_input_tokens_seen": 10411744, "step": 10895 }, { "epoch": 0.889142670690921, "grad_norm": 5.10781717300415, "learning_rate": 4.445305489844196e-05, "loss": 0.3878, "num_input_tokens_seen": 10417104, "step": 10900 }, { "epoch": 0.8895505343013297, "grad_norm": 3.3455348014831543, "learning_rate": 4.44734480789624e-05, "loss": 0.867, "num_input_tokens_seen": 10421360, "step": 10905 }, { "epoch": 0.8899583979117384, "grad_norm": 6.745330333709717, "learning_rate": 4.4493841259482834e-05, "loss": 0.4552, "num_input_tokens_seen": 10426016, "step": 10910 }, { "epoch": 0.890366261522147, "grad_norm": 6.861766815185547, "learning_rate": 4.4514234440003266e-05, "loss": 0.4099, "num_input_tokens_seen": 10430752, "step": 10915 }, { "epoch": 0.8907741251325557, "grad_norm": 0.600766658782959, "learning_rate": 4.45346276205237e-05, "loss": 0.2736, "num_input_tokens_seen": 10435632, "step": 10920 }, { "epoch": 0.8911819887429644, "grad_norm": 0.823456346988678, "learning_rate": 4.455502080104413e-05, "loss": 0.2105, "num_input_tokens_seen": 10439808, "step": 10925 }, { "epoch": 0.8915898523533731, "grad_norm": 9.489004135131836, "learning_rate": 4.457541398156457e-05, "loss": 0.2442, "num_input_tokens_seen": 10443968, "step": 10930 }, { "epoch": 0.8919977159637817, "grad_norm": 1.9809068441390991, "learning_rate": 4.4595807162085e-05, "loss": 0.1689, "num_input_tokens_seen": 10448320, "step": 10935 }, { "epoch": 0.8924055795741904, "grad_norm": 2.006533622741699, "learning_rate": 4.4616200342605435e-05, "loss": 0.1195, "num_input_tokens_seen": 10453392, "step": 10940 }, { "epoch": 0.8928134431845991, "grad_norm": 0.7333968877792358, "learning_rate": 4.463659352312587e-05, "loss": 0.2544, "num_input_tokens_seen": 10458608, "step": 10945 }, { "epoch": 0.8932213067950078, "grad_norm": 8.226465225219727, "learning_rate": 4.46569867036463e-05, "loss": 0.3489, "num_input_tokens_seen": 10463456, "step": 10950 }, { "epoch": 0.8936291704054165, "grad_norm": 0.6087920069694519, "learning_rate": 4.467737988416674e-05, "loss": 0.3247, "num_input_tokens_seen": 10468432, "step": 10955 }, { "epoch": 0.8940370340158251, "grad_norm": 17.682905197143555, "learning_rate": 4.469777306468717e-05, "loss": 0.072, "num_input_tokens_seen": 10473488, "step": 10960 }, { "epoch": 0.8944448976262338, "grad_norm": 10.399805068969727, "learning_rate": 4.4718166245207604e-05, "loss": 0.2138, "num_input_tokens_seen": 10478304, "step": 10965 }, { "epoch": 0.8948527612366425, "grad_norm": 16.75421905517578, "learning_rate": 4.4738559425728036e-05, "loss": 0.2648, "num_input_tokens_seen": 10483040, "step": 10970 }, { "epoch": 0.8952606248470512, "grad_norm": 21.829439163208008, "learning_rate": 4.4758952606248475e-05, "loss": 0.341, "num_input_tokens_seen": 10487616, "step": 10975 }, { "epoch": 0.8956684884574598, "grad_norm": 0.17470744252204895, "learning_rate": 4.477934578676891e-05, "loss": 0.5494, "num_input_tokens_seen": 10492320, "step": 10980 }, { "epoch": 0.8960763520678685, "grad_norm": 0.17886540293693542, "learning_rate": 4.479973896728934e-05, "loss": 0.2619, "num_input_tokens_seen": 10497536, "step": 10985 }, { "epoch": 0.8964842156782772, "grad_norm": 0.05064539611339569, "learning_rate": 4.482013214780977e-05, "loss": 0.1779, "num_input_tokens_seen": 10503056, "step": 10990 }, { "epoch": 0.8968920792886859, "grad_norm": 6.951767921447754, "learning_rate": 4.4840525328330205e-05, "loss": 0.3747, "num_input_tokens_seen": 10508368, "step": 10995 }, { "epoch": 0.8972999428990945, "grad_norm": 4.149134635925293, "learning_rate": 4.4860918508850644e-05, "loss": 0.3751, "num_input_tokens_seen": 10513248, "step": 11000 }, { "epoch": 0.8977078065095032, "grad_norm": 0.8808862566947937, "learning_rate": 4.4881311689371076e-05, "loss": 0.3479, "num_input_tokens_seen": 10517840, "step": 11005 }, { "epoch": 0.8981156701199119, "grad_norm": 13.309587478637695, "learning_rate": 4.490170486989151e-05, "loss": 0.4366, "num_input_tokens_seen": 10522848, "step": 11010 }, { "epoch": 0.8985235337303206, "grad_norm": 0.05485556647181511, "learning_rate": 4.492209805041194e-05, "loss": 0.0983, "num_input_tokens_seen": 10527872, "step": 11015 }, { "epoch": 0.8989313973407292, "grad_norm": 1.2836500406265259, "learning_rate": 4.494249123093237e-05, "loss": 0.0985, "num_input_tokens_seen": 10532416, "step": 11020 }, { "epoch": 0.8993392609511379, "grad_norm": 5.737029552459717, "learning_rate": 4.496288441145281e-05, "loss": 0.1327, "num_input_tokens_seen": 10537696, "step": 11025 }, { "epoch": 0.8997471245615466, "grad_norm": 0.4621797502040863, "learning_rate": 4.4983277591973245e-05, "loss": 0.0231, "num_input_tokens_seen": 10541872, "step": 11030 }, { "epoch": 0.9001549881719553, "grad_norm": 7.783262729644775, "learning_rate": 4.500367077249368e-05, "loss": 0.2414, "num_input_tokens_seen": 10546848, "step": 11035 }, { "epoch": 0.900562851782364, "grad_norm": 0.1177314817905426, "learning_rate": 4.502406395301411e-05, "loss": 0.2041, "num_input_tokens_seen": 10550880, "step": 11040 }, { "epoch": 0.9009707153927726, "grad_norm": 41.46785354614258, "learning_rate": 4.504445713353455e-05, "loss": 0.7216, "num_input_tokens_seen": 10555264, "step": 11045 }, { "epoch": 0.9013785790031813, "grad_norm": 0.16755332052707672, "learning_rate": 4.506485031405499e-05, "loss": 0.0208, "num_input_tokens_seen": 10559952, "step": 11050 }, { "epoch": 0.90178644261359, "grad_norm": 0.11684009432792664, "learning_rate": 4.508524349457542e-05, "loss": 0.3053, "num_input_tokens_seen": 10564816, "step": 11055 }, { "epoch": 0.9021943062239987, "grad_norm": 0.2704034447669983, "learning_rate": 4.510563667509585e-05, "loss": 0.6452, "num_input_tokens_seen": 10569568, "step": 11060 }, { "epoch": 0.9026021698344073, "grad_norm": 0.1339755654335022, "learning_rate": 4.5126029855616285e-05, "loss": 0.3126, "num_input_tokens_seen": 10573424, "step": 11065 }, { "epoch": 0.903010033444816, "grad_norm": 5.996142387390137, "learning_rate": 4.514642303613672e-05, "loss": 0.2311, "num_input_tokens_seen": 10578064, "step": 11070 }, { "epoch": 0.9034178970552247, "grad_norm": 0.11235406249761581, "learning_rate": 4.5166816216657157e-05, "loss": 0.1096, "num_input_tokens_seen": 10583584, "step": 11075 }, { "epoch": 0.9038257606656334, "grad_norm": 0.11090181767940521, "learning_rate": 4.518720939717759e-05, "loss": 0.1163, "num_input_tokens_seen": 10588064, "step": 11080 }, { "epoch": 0.904233624276042, "grad_norm": 0.20077800750732422, "learning_rate": 4.520760257769802e-05, "loss": 0.4149, "num_input_tokens_seen": 10592592, "step": 11085 }, { "epoch": 0.9046414878864508, "grad_norm": 0.07598971575498581, "learning_rate": 4.5227995758218454e-05, "loss": 0.5285, "num_input_tokens_seen": 10597824, "step": 11090 }, { "epoch": 0.9050493514968595, "grad_norm": 0.9887428879737854, "learning_rate": 4.5248388938738886e-05, "loss": 0.3474, "num_input_tokens_seen": 10602896, "step": 11095 }, { "epoch": 0.9054572151072682, "grad_norm": 10.456665992736816, "learning_rate": 4.5268782119259325e-05, "loss": 0.6422, "num_input_tokens_seen": 10607984, "step": 11100 }, { "epoch": 0.9058650787176769, "grad_norm": 1.6704562902450562, "learning_rate": 4.528917529977976e-05, "loss": 0.0235, "num_input_tokens_seen": 10613472, "step": 11105 }, { "epoch": 0.9062729423280855, "grad_norm": 7.8154449462890625, "learning_rate": 4.530956848030019e-05, "loss": 0.6545, "num_input_tokens_seen": 10617648, "step": 11110 }, { "epoch": 0.9066808059384942, "grad_norm": 11.828213691711426, "learning_rate": 4.532996166082062e-05, "loss": 0.1729, "num_input_tokens_seen": 10622928, "step": 11115 }, { "epoch": 0.9070886695489029, "grad_norm": 4.4976019859313965, "learning_rate": 4.535035484134106e-05, "loss": 0.2701, "num_input_tokens_seen": 10628240, "step": 11120 }, { "epoch": 0.9074965331593116, "grad_norm": 10.590241432189941, "learning_rate": 4.5370748021861494e-05, "loss": 0.3767, "num_input_tokens_seen": 10632736, "step": 11125 }, { "epoch": 0.9079043967697202, "grad_norm": 4.577548980712891, "learning_rate": 4.5391141202381926e-05, "loss": 0.1443, "num_input_tokens_seen": 10636560, "step": 11130 }, { "epoch": 0.9083122603801289, "grad_norm": 0.7149935960769653, "learning_rate": 4.541153438290236e-05, "loss": 0.2992, "num_input_tokens_seen": 10641296, "step": 11135 }, { "epoch": 0.9087201239905376, "grad_norm": 10.358735084533691, "learning_rate": 4.543192756342279e-05, "loss": 0.4145, "num_input_tokens_seen": 10646448, "step": 11140 }, { "epoch": 0.9091279876009463, "grad_norm": 13.43858814239502, "learning_rate": 4.545232074394323e-05, "loss": 0.3803, "num_input_tokens_seen": 10651376, "step": 11145 }, { "epoch": 0.9095358512113549, "grad_norm": 3.756927728652954, "learning_rate": 4.547271392446366e-05, "loss": 0.1123, "num_input_tokens_seen": 10656816, "step": 11150 }, { "epoch": 0.9099437148217636, "grad_norm": 0.23091304302215576, "learning_rate": 4.5493107104984095e-05, "loss": 0.0601, "num_input_tokens_seen": 10661952, "step": 11155 }, { "epoch": 0.9103515784321723, "grad_norm": 7.373678207397461, "learning_rate": 4.551350028550453e-05, "loss": 0.5538, "num_input_tokens_seen": 10666832, "step": 11160 }, { "epoch": 0.910759442042581, "grad_norm": 0.0365392342209816, "learning_rate": 4.553389346602496e-05, "loss": 0.5113, "num_input_tokens_seen": 10671552, "step": 11165 }, { "epoch": 0.9111673056529896, "grad_norm": 6.435271739959717, "learning_rate": 4.55542866465454e-05, "loss": 0.3669, "num_input_tokens_seen": 10675808, "step": 11170 }, { "epoch": 0.9115751692633983, "grad_norm": 0.6388537883758545, "learning_rate": 4.557467982706583e-05, "loss": 0.0509, "num_input_tokens_seen": 10680048, "step": 11175 }, { "epoch": 0.911983032873807, "grad_norm": 9.341167449951172, "learning_rate": 4.5595073007586264e-05, "loss": 0.6644, "num_input_tokens_seen": 10685344, "step": 11180 }, { "epoch": 0.9123908964842157, "grad_norm": 7.524158477783203, "learning_rate": 4.5615466188106696e-05, "loss": 0.3816, "num_input_tokens_seen": 10690144, "step": 11185 }, { "epoch": 0.9127987600946244, "grad_norm": 0.16340099275112152, "learning_rate": 4.5635859368627135e-05, "loss": 0.3973, "num_input_tokens_seen": 10694960, "step": 11190 }, { "epoch": 0.913206623705033, "grad_norm": 0.37241461873054504, "learning_rate": 4.565625254914757e-05, "loss": 0.3782, "num_input_tokens_seen": 10699952, "step": 11195 }, { "epoch": 0.9136144873154417, "grad_norm": 35.23428726196289, "learning_rate": 4.5676645729668e-05, "loss": 0.164, "num_input_tokens_seen": 10705088, "step": 11200 }, { "epoch": 0.9140223509258504, "grad_norm": 0.7644211053848267, "learning_rate": 4.569703891018843e-05, "loss": 0.3114, "num_input_tokens_seen": 10709440, "step": 11205 }, { "epoch": 0.9144302145362591, "grad_norm": 5.528002738952637, "learning_rate": 4.5717432090708865e-05, "loss": 0.2236, "num_input_tokens_seen": 10714624, "step": 11210 }, { "epoch": 0.9148380781466677, "grad_norm": 22.45814323425293, "learning_rate": 4.5737825271229304e-05, "loss": 0.166, "num_input_tokens_seen": 10719760, "step": 11215 }, { "epoch": 0.9152459417570764, "grad_norm": 0.6256879568099976, "learning_rate": 4.5758218451749736e-05, "loss": 0.3788, "num_input_tokens_seen": 10724496, "step": 11220 }, { "epoch": 0.9156538053674851, "grad_norm": 0.6433069109916687, "learning_rate": 4.577861163227017e-05, "loss": 0.2291, "num_input_tokens_seen": 10728592, "step": 11225 }, { "epoch": 0.9160616689778938, "grad_norm": 21.084278106689453, "learning_rate": 4.57990048127906e-05, "loss": 0.3036, "num_input_tokens_seen": 10733040, "step": 11230 }, { "epoch": 0.9164695325883024, "grad_norm": 4.926137924194336, "learning_rate": 4.581939799331103e-05, "loss": 0.2682, "num_input_tokens_seen": 10737376, "step": 11235 }, { "epoch": 0.9168773961987111, "grad_norm": 10.168501853942871, "learning_rate": 4.583979117383147e-05, "loss": 0.0286, "num_input_tokens_seen": 10741808, "step": 11240 }, { "epoch": 0.9172852598091198, "grad_norm": 0.038962945342063904, "learning_rate": 4.586018435435191e-05, "loss": 0.0089, "num_input_tokens_seen": 10746368, "step": 11245 }, { "epoch": 0.9176931234195285, "grad_norm": 0.6949910521507263, "learning_rate": 4.5880577534872344e-05, "loss": 0.2138, "num_input_tokens_seen": 10750928, "step": 11250 }, { "epoch": 0.9181009870299371, "grad_norm": 0.16676642000675201, "learning_rate": 4.5900970715392776e-05, "loss": 0.4131, "num_input_tokens_seen": 10755456, "step": 11255 }, { "epoch": 0.9185088506403458, "grad_norm": 4.393820762634277, "learning_rate": 4.592136389591321e-05, "loss": 0.288, "num_input_tokens_seen": 10760928, "step": 11260 }, { "epoch": 0.9189167142507545, "grad_norm": 4.492329120635986, "learning_rate": 4.594175707643365e-05, "loss": 0.1615, "num_input_tokens_seen": 10765760, "step": 11265 }, { "epoch": 0.9193245778611632, "grad_norm": 26.21645736694336, "learning_rate": 4.596215025695408e-05, "loss": 0.3329, "num_input_tokens_seen": 10769728, "step": 11270 }, { "epoch": 0.919732441471572, "grad_norm": 22.32183265686035, "learning_rate": 4.598254343747451e-05, "loss": 0.4665, "num_input_tokens_seen": 10774832, "step": 11275 }, { "epoch": 0.9201403050819806, "grad_norm": 0.06443332135677338, "learning_rate": 4.6002936617994945e-05, "loss": 0.0015, "num_input_tokens_seen": 10779408, "step": 11280 }, { "epoch": 0.9205481686923893, "grad_norm": 10.086633682250977, "learning_rate": 4.602332979851538e-05, "loss": 0.628, "num_input_tokens_seen": 10783680, "step": 11285 }, { "epoch": 0.920956032302798, "grad_norm": 46.34685516357422, "learning_rate": 4.6043722979035817e-05, "loss": 0.101, "num_input_tokens_seen": 10788512, "step": 11290 }, { "epoch": 0.9213638959132067, "grad_norm": 45.579933166503906, "learning_rate": 4.606411615955625e-05, "loss": 0.2662, "num_input_tokens_seen": 10792240, "step": 11295 }, { "epoch": 0.9217717595236153, "grad_norm": 0.6752848029136658, "learning_rate": 4.608450934007668e-05, "loss": 0.0043, "num_input_tokens_seen": 10797440, "step": 11300 }, { "epoch": 0.922179623134024, "grad_norm": 0.14550279080867767, "learning_rate": 4.6104902520597114e-05, "loss": 0.3269, "num_input_tokens_seen": 10802256, "step": 11305 }, { "epoch": 0.9225874867444327, "grad_norm": 0.05205995962023735, "learning_rate": 4.6125295701117546e-05, "loss": 0.0123, "num_input_tokens_seen": 10807008, "step": 11310 }, { "epoch": 0.9229953503548414, "grad_norm": 0.08715806156396866, "learning_rate": 4.6145688881637985e-05, "loss": 0.51, "num_input_tokens_seen": 10811120, "step": 11315 }, { "epoch": 0.92340321396525, "grad_norm": 11.69830322265625, "learning_rate": 4.616608206215842e-05, "loss": 0.5351, "num_input_tokens_seen": 10816000, "step": 11320 }, { "epoch": 0.9238110775756587, "grad_norm": 18.653772354125977, "learning_rate": 4.618647524267885e-05, "loss": 0.2641, "num_input_tokens_seen": 10821472, "step": 11325 }, { "epoch": 0.9242189411860674, "grad_norm": 6.06147575378418, "learning_rate": 4.620686842319928e-05, "loss": 0.4433, "num_input_tokens_seen": 10826192, "step": 11330 }, { "epoch": 0.9246268047964761, "grad_norm": 0.18194860219955444, "learning_rate": 4.622726160371972e-05, "loss": 0.458, "num_input_tokens_seen": 10830416, "step": 11335 }, { "epoch": 0.9250346684068848, "grad_norm": 5.115262508392334, "learning_rate": 4.6247654784240154e-05, "loss": 0.0227, "num_input_tokens_seen": 10835248, "step": 11340 }, { "epoch": 0.9254425320172934, "grad_norm": 1.1085338592529297, "learning_rate": 4.6268047964760586e-05, "loss": 0.4398, "num_input_tokens_seen": 10839328, "step": 11345 }, { "epoch": 0.9258503956277021, "grad_norm": 0.22343122959136963, "learning_rate": 4.628844114528102e-05, "loss": 0.0311, "num_input_tokens_seen": 10844320, "step": 11350 }, { "epoch": 0.9262582592381108, "grad_norm": 0.14413043856620789, "learning_rate": 4.630883432580145e-05, "loss": 0.6974, "num_input_tokens_seen": 10849232, "step": 11355 }, { "epoch": 0.9266661228485195, "grad_norm": 0.22695709764957428, "learning_rate": 4.632922750632189e-05, "loss": 0.4153, "num_input_tokens_seen": 10854320, "step": 11360 }, { "epoch": 0.9270739864589281, "grad_norm": 23.740516662597656, "learning_rate": 4.634962068684232e-05, "loss": 0.0503, "num_input_tokens_seen": 10859216, "step": 11365 }, { "epoch": 0.9274818500693368, "grad_norm": 0.3075149953365326, "learning_rate": 4.6370013867362755e-05, "loss": 0.1088, "num_input_tokens_seen": 10863808, "step": 11370 }, { "epoch": 0.9278897136797455, "grad_norm": 0.7857294678688049, "learning_rate": 4.639040704788319e-05, "loss": 0.6563, "num_input_tokens_seen": 10868576, "step": 11375 }, { "epoch": 0.9282975772901542, "grad_norm": 4.6529998779296875, "learning_rate": 4.641080022840362e-05, "loss": 0.76, "num_input_tokens_seen": 10873808, "step": 11380 }, { "epoch": 0.9287054409005628, "grad_norm": 3.2943363189697266, "learning_rate": 4.643119340892406e-05, "loss": 0.1817, "num_input_tokens_seen": 10878848, "step": 11385 }, { "epoch": 0.9291133045109715, "grad_norm": 9.38017463684082, "learning_rate": 4.645158658944449e-05, "loss": 0.4365, "num_input_tokens_seen": 10883696, "step": 11390 }, { "epoch": 0.9295211681213802, "grad_norm": 2.3090686798095703, "learning_rate": 4.6471979769964924e-05, "loss": 0.3825, "num_input_tokens_seen": 10888592, "step": 11395 }, { "epoch": 0.9299290317317889, "grad_norm": 6.548527717590332, "learning_rate": 4.6492372950485356e-05, "loss": 0.263, "num_input_tokens_seen": 10892992, "step": 11400 }, { "epoch": 0.9303368953421975, "grad_norm": 5.153987407684326, "learning_rate": 4.651276613100579e-05, "loss": 0.3944, "num_input_tokens_seen": 10898272, "step": 11405 }, { "epoch": 0.9307447589526062, "grad_norm": 0.600858211517334, "learning_rate": 4.653315931152623e-05, "loss": 0.0981, "num_input_tokens_seen": 10902512, "step": 11410 }, { "epoch": 0.9311526225630149, "grad_norm": 2.0091605186462402, "learning_rate": 4.655355249204666e-05, "loss": 0.2444, "num_input_tokens_seen": 10906608, "step": 11415 }, { "epoch": 0.9315604861734236, "grad_norm": 0.12911361455917358, "learning_rate": 4.657394567256709e-05, "loss": 0.2117, "num_input_tokens_seen": 10912528, "step": 11420 }, { "epoch": 0.9319683497838323, "grad_norm": 0.1884322464466095, "learning_rate": 4.659433885308753e-05, "loss": 0.1074, "num_input_tokens_seen": 10917616, "step": 11425 }, { "epoch": 0.9323762133942409, "grad_norm": 10.689774513244629, "learning_rate": 4.6614732033607964e-05, "loss": 0.096, "num_input_tokens_seen": 10923152, "step": 11430 }, { "epoch": 0.9327840770046496, "grad_norm": 24.68108558654785, "learning_rate": 4.66351252141284e-05, "loss": 0.2287, "num_input_tokens_seen": 10927504, "step": 11435 }, { "epoch": 0.9331919406150583, "grad_norm": 13.132997512817383, "learning_rate": 4.6655518394648835e-05, "loss": 0.497, "num_input_tokens_seen": 10931952, "step": 11440 }, { "epoch": 0.933599804225467, "grad_norm": 1.7298191785812378, "learning_rate": 4.667591157516927e-05, "loss": 0.2217, "num_input_tokens_seen": 10936992, "step": 11445 }, { "epoch": 0.9340076678358756, "grad_norm": 22.920297622680664, "learning_rate": 4.66963047556897e-05, "loss": 0.5099, "num_input_tokens_seen": 10941536, "step": 11450 }, { "epoch": 0.9344155314462843, "grad_norm": 0.10568545013666153, "learning_rate": 4.671669793621013e-05, "loss": 0.3013, "num_input_tokens_seen": 10945872, "step": 11455 }, { "epoch": 0.9348233950566931, "grad_norm": 2.31876540184021, "learning_rate": 4.673709111673057e-05, "loss": 0.5951, "num_input_tokens_seen": 10950144, "step": 11460 }, { "epoch": 0.9352312586671018, "grad_norm": 9.701691627502441, "learning_rate": 4.6757484297251004e-05, "loss": 0.0911, "num_input_tokens_seen": 10954784, "step": 11465 }, { "epoch": 0.9356391222775104, "grad_norm": 0.6757541298866272, "learning_rate": 4.6777877477771436e-05, "loss": 0.1517, "num_input_tokens_seen": 10959824, "step": 11470 }, { "epoch": 0.9360469858879191, "grad_norm": 0.8289041519165039, "learning_rate": 4.679827065829187e-05, "loss": 0.0489, "num_input_tokens_seen": 10964192, "step": 11475 }, { "epoch": 0.9364548494983278, "grad_norm": 9.767748832702637, "learning_rate": 4.681866383881231e-05, "loss": 0.2593, "num_input_tokens_seen": 10969360, "step": 11480 }, { "epoch": 0.9368627131087365, "grad_norm": 3.505681276321411, "learning_rate": 4.683905701933274e-05, "loss": 0.3, "num_input_tokens_seen": 10974032, "step": 11485 }, { "epoch": 0.9372705767191452, "grad_norm": 0.5731717944145203, "learning_rate": 4.685945019985317e-05, "loss": 0.3348, "num_input_tokens_seen": 10979072, "step": 11490 }, { "epoch": 0.9376784403295538, "grad_norm": 0.30667731165885925, "learning_rate": 4.6879843380373605e-05, "loss": 0.2511, "num_input_tokens_seen": 10984160, "step": 11495 }, { "epoch": 0.9380863039399625, "grad_norm": 0.5197260975837708, "learning_rate": 4.690023656089404e-05, "loss": 0.4118, "num_input_tokens_seen": 10989184, "step": 11500 }, { "epoch": 0.9384941675503712, "grad_norm": 14.089096069335938, "learning_rate": 4.6920629741414476e-05, "loss": 0.2684, "num_input_tokens_seen": 10993888, "step": 11505 }, { "epoch": 0.9389020311607799, "grad_norm": 0.25088363885879517, "learning_rate": 4.694102292193491e-05, "loss": 0.0328, "num_input_tokens_seen": 10998256, "step": 11510 }, { "epoch": 0.9393098947711885, "grad_norm": 0.057817574590444565, "learning_rate": 4.696141610245534e-05, "loss": 0.0109, "num_input_tokens_seen": 11003280, "step": 11515 }, { "epoch": 0.9397177583815972, "grad_norm": 0.41645151376724243, "learning_rate": 4.6981809282975774e-05, "loss": 0.0112, "num_input_tokens_seen": 11009008, "step": 11520 }, { "epoch": 0.9401256219920059, "grad_norm": 5.874898910522461, "learning_rate": 4.7002202463496206e-05, "loss": 0.7192, "num_input_tokens_seen": 11013872, "step": 11525 }, { "epoch": 0.9405334856024146, "grad_norm": 0.19022516906261444, "learning_rate": 4.7022595644016645e-05, "loss": 0.0066, "num_input_tokens_seen": 11018720, "step": 11530 }, { "epoch": 0.9409413492128232, "grad_norm": 26.1656494140625, "learning_rate": 4.704298882453708e-05, "loss": 0.2607, "num_input_tokens_seen": 11023360, "step": 11535 }, { "epoch": 0.9413492128232319, "grad_norm": 8.77208137512207, "learning_rate": 4.706338200505751e-05, "loss": 1.1676, "num_input_tokens_seen": 11028464, "step": 11540 }, { "epoch": 0.9417570764336406, "grad_norm": 4.634047031402588, "learning_rate": 4.708377518557794e-05, "loss": 0.4579, "num_input_tokens_seen": 11033456, "step": 11545 }, { "epoch": 0.9421649400440493, "grad_norm": 11.269021034240723, "learning_rate": 4.7104168366098375e-05, "loss": 0.2258, "num_input_tokens_seen": 11038176, "step": 11550 }, { "epoch": 0.942572803654458, "grad_norm": 0.6643468141555786, "learning_rate": 4.7124561546618814e-05, "loss": 0.0205, "num_input_tokens_seen": 11042384, "step": 11555 }, { "epoch": 0.9429806672648666, "grad_norm": 0.21649736166000366, "learning_rate": 4.7144954727139246e-05, "loss": 0.0195, "num_input_tokens_seen": 11046528, "step": 11560 }, { "epoch": 0.9433885308752753, "grad_norm": 2.990654706954956, "learning_rate": 4.716534790765968e-05, "loss": 0.1075, "num_input_tokens_seen": 11050528, "step": 11565 }, { "epoch": 0.943796394485684, "grad_norm": 0.0945005789399147, "learning_rate": 4.718574108818011e-05, "loss": 0.0162, "num_input_tokens_seen": 11055056, "step": 11570 }, { "epoch": 0.9442042580960927, "grad_norm": 0.02841286174952984, "learning_rate": 4.720613426870055e-05, "loss": 0.1208, "num_input_tokens_seen": 11059856, "step": 11575 }, { "epoch": 0.9446121217065013, "grad_norm": 5.384934902191162, "learning_rate": 4.722652744922098e-05, "loss": 0.4777, "num_input_tokens_seen": 11064464, "step": 11580 }, { "epoch": 0.94501998531691, "grad_norm": 2.580565929412842, "learning_rate": 4.7246920629741415e-05, "loss": 0.5484, "num_input_tokens_seen": 11069280, "step": 11585 }, { "epoch": 0.9454278489273187, "grad_norm": 0.4580410122871399, "learning_rate": 4.726731381026185e-05, "loss": 0.3095, "num_input_tokens_seen": 11073424, "step": 11590 }, { "epoch": 0.9458357125377274, "grad_norm": 1.017743468284607, "learning_rate": 4.728770699078228e-05, "loss": 0.2184, "num_input_tokens_seen": 11078864, "step": 11595 }, { "epoch": 0.946243576148136, "grad_norm": 19.095134735107422, "learning_rate": 4.730810017130272e-05, "loss": 0.1669, "num_input_tokens_seen": 11082912, "step": 11600 }, { "epoch": 0.9466514397585447, "grad_norm": 9.041131019592285, "learning_rate": 4.732849335182315e-05, "loss": 0.3268, "num_input_tokens_seen": 11087696, "step": 11605 }, { "epoch": 0.9470593033689534, "grad_norm": 0.28772419691085815, "learning_rate": 4.734888653234359e-05, "loss": 0.0933, "num_input_tokens_seen": 11092736, "step": 11610 }, { "epoch": 0.9474671669793621, "grad_norm": 0.13105346262454987, "learning_rate": 4.736927971286402e-05, "loss": 0.1551, "num_input_tokens_seen": 11097232, "step": 11615 }, { "epoch": 0.9478750305897707, "grad_norm": 0.1088060811161995, "learning_rate": 4.7389672893384455e-05, "loss": 0.1378, "num_input_tokens_seen": 11102288, "step": 11620 }, { "epoch": 0.9482828942001794, "grad_norm": 73.06128692626953, "learning_rate": 4.741006607390489e-05, "loss": 0.4029, "num_input_tokens_seen": 11107008, "step": 11625 }, { "epoch": 0.9486907578105881, "grad_norm": 13.73388957977295, "learning_rate": 4.7430459254425327e-05, "loss": 0.4138, "num_input_tokens_seen": 11112048, "step": 11630 }, { "epoch": 0.9490986214209968, "grad_norm": 15.610745429992676, "learning_rate": 4.745085243494576e-05, "loss": 0.1306, "num_input_tokens_seen": 11117232, "step": 11635 }, { "epoch": 0.9495064850314054, "grad_norm": 10.143689155578613, "learning_rate": 4.747124561546619e-05, "loss": 0.5429, "num_input_tokens_seen": 11122320, "step": 11640 }, { "epoch": 0.9499143486418142, "grad_norm": 3.535456657409668, "learning_rate": 4.7491638795986624e-05, "loss": 0.2135, "num_input_tokens_seen": 11127696, "step": 11645 }, { "epoch": 0.9503222122522229, "grad_norm": 0.3803652822971344, "learning_rate": 4.751203197650706e-05, "loss": 0.1527, "num_input_tokens_seen": 11132848, "step": 11650 }, { "epoch": 0.9507300758626316, "grad_norm": 0.22129888832569122, "learning_rate": 4.7532425157027495e-05, "loss": 0.0161, "num_input_tokens_seen": 11137984, "step": 11655 }, { "epoch": 0.9511379394730403, "grad_norm": 0.40673351287841797, "learning_rate": 4.755281833754793e-05, "loss": 0.0127, "num_input_tokens_seen": 11142960, "step": 11660 }, { "epoch": 0.9515458030834489, "grad_norm": 8.297196388244629, "learning_rate": 4.757321151806836e-05, "loss": 0.3394, "num_input_tokens_seen": 11147872, "step": 11665 }, { "epoch": 0.9519536666938576, "grad_norm": 0.13850520551204681, "learning_rate": 4.759360469858879e-05, "loss": 0.4001, "num_input_tokens_seen": 11152432, "step": 11670 }, { "epoch": 0.9523615303042663, "grad_norm": 8.178245544433594, "learning_rate": 4.761399787910923e-05, "loss": 0.203, "num_input_tokens_seen": 11157488, "step": 11675 }, { "epoch": 0.952769393914675, "grad_norm": 4.338950157165527, "learning_rate": 4.7634391059629664e-05, "loss": 0.1503, "num_input_tokens_seen": 11161456, "step": 11680 }, { "epoch": 0.9531772575250836, "grad_norm": 0.31207725405693054, "learning_rate": 4.7654784240150096e-05, "loss": 0.2372, "num_input_tokens_seen": 11166352, "step": 11685 }, { "epoch": 0.9535851211354923, "grad_norm": 0.09340458363294601, "learning_rate": 4.767517742067053e-05, "loss": 0.2052, "num_input_tokens_seen": 11171200, "step": 11690 }, { "epoch": 0.953992984745901, "grad_norm": 0.1150798425078392, "learning_rate": 4.769557060119096e-05, "loss": 0.2115, "num_input_tokens_seen": 11175488, "step": 11695 }, { "epoch": 0.9544008483563097, "grad_norm": 7.131397724151611, "learning_rate": 4.77159637817114e-05, "loss": 0.3107, "num_input_tokens_seen": 11180144, "step": 11700 }, { "epoch": 0.9548087119667183, "grad_norm": 0.10244901478290558, "learning_rate": 4.773635696223183e-05, "loss": 0.0116, "num_input_tokens_seen": 11184752, "step": 11705 }, { "epoch": 0.955216575577127, "grad_norm": 1.1073163747787476, "learning_rate": 4.7756750142752265e-05, "loss": 0.0142, "num_input_tokens_seen": 11189264, "step": 11710 }, { "epoch": 0.9556244391875357, "grad_norm": 0.13585439324378967, "learning_rate": 4.77771433232727e-05, "loss": 0.1724, "num_input_tokens_seen": 11194416, "step": 11715 }, { "epoch": 0.9560323027979444, "grad_norm": 8.957136154174805, "learning_rate": 4.7797536503793136e-05, "loss": 0.6047, "num_input_tokens_seen": 11199392, "step": 11720 }, { "epoch": 0.956440166408353, "grad_norm": 10.856334686279297, "learning_rate": 4.781792968431357e-05, "loss": 0.5255, "num_input_tokens_seen": 11203728, "step": 11725 }, { "epoch": 0.9568480300187617, "grad_norm": 1.1766026020050049, "learning_rate": 4.7838322864834e-05, "loss": 0.3891, "num_input_tokens_seen": 11208368, "step": 11730 }, { "epoch": 0.9572558936291704, "grad_norm": 0.1462777554988861, "learning_rate": 4.7858716045354434e-05, "loss": 0.196, "num_input_tokens_seen": 11212944, "step": 11735 }, { "epoch": 0.9576637572395791, "grad_norm": 0.3922816514968872, "learning_rate": 4.7879109225874866e-05, "loss": 0.0198, "num_input_tokens_seen": 11217984, "step": 11740 }, { "epoch": 0.9580716208499878, "grad_norm": 0.16336360573768616, "learning_rate": 4.7899502406395305e-05, "loss": 0.3909, "num_input_tokens_seen": 11222560, "step": 11745 }, { "epoch": 0.9584794844603964, "grad_norm": 0.05004219710826874, "learning_rate": 4.791989558691574e-05, "loss": 0.3242, "num_input_tokens_seen": 11226960, "step": 11750 }, { "epoch": 0.9588873480708051, "grad_norm": 0.16761261224746704, "learning_rate": 4.794028876743617e-05, "loss": 0.2373, "num_input_tokens_seen": 11231616, "step": 11755 }, { "epoch": 0.9592952116812138, "grad_norm": 0.15726162493228912, "learning_rate": 4.79606819479566e-05, "loss": 0.2247, "num_input_tokens_seen": 11236528, "step": 11760 }, { "epoch": 0.9597030752916225, "grad_norm": 19.133726119995117, "learning_rate": 4.7981075128477035e-05, "loss": 0.6386, "num_input_tokens_seen": 11241296, "step": 11765 }, { "epoch": 0.9601109389020311, "grad_norm": 26.64348030090332, "learning_rate": 4.8001468308997474e-05, "loss": 0.1556, "num_input_tokens_seen": 11246288, "step": 11770 }, { "epoch": 0.9605188025124398, "grad_norm": 47.7135124206543, "learning_rate": 4.8021861489517906e-05, "loss": 0.147, "num_input_tokens_seen": 11250752, "step": 11775 }, { "epoch": 0.9609266661228485, "grad_norm": 0.08666137605905533, "learning_rate": 4.804225467003834e-05, "loss": 0.6074, "num_input_tokens_seen": 11255312, "step": 11780 }, { "epoch": 0.9613345297332572, "grad_norm": 43.48322677612305, "learning_rate": 4.806264785055877e-05, "loss": 0.4295, "num_input_tokens_seen": 11260240, "step": 11785 }, { "epoch": 0.9617423933436658, "grad_norm": 0.42710626125335693, "learning_rate": 4.808304103107921e-05, "loss": 0.0161, "num_input_tokens_seen": 11264592, "step": 11790 }, { "epoch": 0.9621502569540745, "grad_norm": 0.5542681813240051, "learning_rate": 4.810343421159965e-05, "loss": 0.205, "num_input_tokens_seen": 11269216, "step": 11795 }, { "epoch": 0.9625581205644832, "grad_norm": 9.048221588134766, "learning_rate": 4.812382739212008e-05, "loss": 0.2701, "num_input_tokens_seen": 11273440, "step": 11800 }, { "epoch": 0.9629659841748919, "grad_norm": 0.12397903203964233, "learning_rate": 4.8144220572640514e-05, "loss": 0.0125, "num_input_tokens_seen": 11278560, "step": 11805 }, { "epoch": 0.9633738477853006, "grad_norm": 10.17318344116211, "learning_rate": 4.8164613753160946e-05, "loss": 0.3965, "num_input_tokens_seen": 11282992, "step": 11810 }, { "epoch": 0.9637817113957092, "grad_norm": 15.295110702514648, "learning_rate": 4.818500693368138e-05, "loss": 0.0659, "num_input_tokens_seen": 11287760, "step": 11815 }, { "epoch": 0.9641895750061179, "grad_norm": 9.387687683105469, "learning_rate": 4.820540011420182e-05, "loss": 0.5899, "num_input_tokens_seen": 11292320, "step": 11820 }, { "epoch": 0.9645974386165266, "grad_norm": 5.329090595245361, "learning_rate": 4.822579329472225e-05, "loss": 0.5022, "num_input_tokens_seen": 11297792, "step": 11825 }, { "epoch": 0.9650053022269353, "grad_norm": 5.277543067932129, "learning_rate": 4.824618647524268e-05, "loss": 0.3295, "num_input_tokens_seen": 11302496, "step": 11830 }, { "epoch": 0.965413165837344, "grad_norm": 3.2344136238098145, "learning_rate": 4.8266579655763115e-05, "loss": 0.0814, "num_input_tokens_seen": 11307376, "step": 11835 }, { "epoch": 0.9658210294477527, "grad_norm": 0.041227590292692184, "learning_rate": 4.828697283628355e-05, "loss": 0.083, "num_input_tokens_seen": 11312432, "step": 11840 }, { "epoch": 0.9662288930581614, "grad_norm": 11.334041595458984, "learning_rate": 4.8307366016803986e-05, "loss": 0.1946, "num_input_tokens_seen": 11316656, "step": 11845 }, { "epoch": 0.9666367566685701, "grad_norm": 0.028597520664334297, "learning_rate": 4.832775919732442e-05, "loss": 0.3416, "num_input_tokens_seen": 11322144, "step": 11850 }, { "epoch": 0.9670446202789788, "grad_norm": 0.10204839706420898, "learning_rate": 4.834815237784485e-05, "loss": 0.3209, "num_input_tokens_seen": 11327056, "step": 11855 }, { "epoch": 0.9674524838893874, "grad_norm": 1.2305339574813843, "learning_rate": 4.8368545558365284e-05, "loss": 0.142, "num_input_tokens_seen": 11331216, "step": 11860 }, { "epoch": 0.9678603474997961, "grad_norm": 0.34261634945869446, "learning_rate": 4.838893873888572e-05, "loss": 0.0138, "num_input_tokens_seen": 11337088, "step": 11865 }, { "epoch": 0.9682682111102048, "grad_norm": 23.74901008605957, "learning_rate": 4.8409331919406155e-05, "loss": 0.0414, "num_input_tokens_seen": 11341664, "step": 11870 }, { "epoch": 0.9686760747206135, "grad_norm": 0.06491328775882721, "learning_rate": 4.842972509992659e-05, "loss": 0.1856, "num_input_tokens_seen": 11346560, "step": 11875 }, { "epoch": 0.9690839383310221, "grad_norm": 0.033605266362428665, "learning_rate": 4.845011828044702e-05, "loss": 0.041, "num_input_tokens_seen": 11352416, "step": 11880 }, { "epoch": 0.9694918019414308, "grad_norm": 2.853681802749634, "learning_rate": 4.847051146096745e-05, "loss": 0.2737, "num_input_tokens_seen": 11357408, "step": 11885 }, { "epoch": 0.9698996655518395, "grad_norm": 0.009291372261941433, "learning_rate": 4.849090464148789e-05, "loss": 0.0035, "num_input_tokens_seen": 11362512, "step": 11890 }, { "epoch": 0.9703075291622482, "grad_norm": 7.958995819091797, "learning_rate": 4.8511297822008324e-05, "loss": 0.2115, "num_input_tokens_seen": 11367856, "step": 11895 }, { "epoch": 0.9707153927726568, "grad_norm": 17.97307777404785, "learning_rate": 4.8531691002528756e-05, "loss": 0.2606, "num_input_tokens_seen": 11372224, "step": 11900 }, { "epoch": 0.9711232563830655, "grad_norm": 10.777276039123535, "learning_rate": 4.855208418304919e-05, "loss": 0.3207, "num_input_tokens_seen": 11376800, "step": 11905 }, { "epoch": 0.9715311199934742, "grad_norm": 0.028835413977503777, "learning_rate": 4.857247736356962e-05, "loss": 0.4872, "num_input_tokens_seen": 11381584, "step": 11910 }, { "epoch": 0.9719389836038829, "grad_norm": 38.81364822387695, "learning_rate": 4.859287054409006e-05, "loss": 0.4604, "num_input_tokens_seen": 11386480, "step": 11915 }, { "epoch": 0.9723468472142915, "grad_norm": 0.02349124848842621, "learning_rate": 4.861326372461049e-05, "loss": 0.3818, "num_input_tokens_seen": 11391792, "step": 11920 }, { "epoch": 0.9727547108247002, "grad_norm": 3.4854321479797363, "learning_rate": 4.8633656905130925e-05, "loss": 0.4392, "num_input_tokens_seen": 11397264, "step": 11925 }, { "epoch": 0.9731625744351089, "grad_norm": 3.54852032661438, "learning_rate": 4.865405008565136e-05, "loss": 0.1802, "num_input_tokens_seen": 11402368, "step": 11930 }, { "epoch": 0.9735704380455176, "grad_norm": 14.324861526489258, "learning_rate": 4.8674443266171796e-05, "loss": 0.2658, "num_input_tokens_seen": 11407296, "step": 11935 }, { "epoch": 0.9739783016559262, "grad_norm": 0.3345623016357422, "learning_rate": 4.869483644669223e-05, "loss": 0.0552, "num_input_tokens_seen": 11411856, "step": 11940 }, { "epoch": 0.9743861652663349, "grad_norm": 0.09218847006559372, "learning_rate": 4.871522962721266e-05, "loss": 0.2426, "num_input_tokens_seen": 11417008, "step": 11945 }, { "epoch": 0.9747940288767436, "grad_norm": 9.823990821838379, "learning_rate": 4.8735622807733093e-05, "loss": 0.8832, "num_input_tokens_seen": 11420736, "step": 11950 }, { "epoch": 0.9752018924871523, "grad_norm": 5.172850608825684, "learning_rate": 4.8756015988253526e-05, "loss": 0.928, "num_input_tokens_seen": 11425360, "step": 11955 }, { "epoch": 0.975609756097561, "grad_norm": 6.2478766441345215, "learning_rate": 4.8776409168773965e-05, "loss": 0.377, "num_input_tokens_seen": 11430752, "step": 11960 }, { "epoch": 0.9760176197079696, "grad_norm": 5.155200958251953, "learning_rate": 4.87968023492944e-05, "loss": 0.2164, "num_input_tokens_seen": 11435264, "step": 11965 }, { "epoch": 0.9764254833183783, "grad_norm": 0.1037234365940094, "learning_rate": 4.881719552981483e-05, "loss": 0.0585, "num_input_tokens_seen": 11439408, "step": 11970 }, { "epoch": 0.976833346928787, "grad_norm": 5.4518141746521, "learning_rate": 4.883758871033526e-05, "loss": 0.713, "num_input_tokens_seen": 11443776, "step": 11975 }, { "epoch": 0.9772412105391957, "grad_norm": 6.971922874450684, "learning_rate": 4.8857981890855694e-05, "loss": 0.1908, "num_input_tokens_seen": 11449232, "step": 11980 }, { "epoch": 0.9776490741496043, "grad_norm": 3.5778262615203857, "learning_rate": 4.8878375071376134e-05, "loss": 0.3423, "num_input_tokens_seen": 11454656, "step": 11985 }, { "epoch": 0.978056937760013, "grad_norm": 2.15216326713562, "learning_rate": 4.889876825189657e-05, "loss": 0.1217, "num_input_tokens_seen": 11459536, "step": 11990 }, { "epoch": 0.9784648013704217, "grad_norm": 11.949115753173828, "learning_rate": 4.8919161432417005e-05, "loss": 0.1133, "num_input_tokens_seen": 11464256, "step": 11995 }, { "epoch": 0.9788726649808304, "grad_norm": 0.15251417458057404, "learning_rate": 4.893955461293744e-05, "loss": 0.2258, "num_input_tokens_seen": 11467936, "step": 12000 }, { "epoch": 0.979280528591239, "grad_norm": 0.6090974807739258, "learning_rate": 4.895994779345787e-05, "loss": 0.1486, "num_input_tokens_seen": 11472448, "step": 12005 }, { "epoch": 0.9796883922016477, "grad_norm": 0.07339674234390259, "learning_rate": 4.898034097397831e-05, "loss": 0.0127, "num_input_tokens_seen": 11477744, "step": 12010 }, { "epoch": 0.9800962558120564, "grad_norm": 21.53631591796875, "learning_rate": 4.900073415449874e-05, "loss": 0.2698, "num_input_tokens_seen": 11482640, "step": 12015 }, { "epoch": 0.9805041194224652, "grad_norm": 0.35358378291130066, "learning_rate": 4.9021127335019174e-05, "loss": 0.2399, "num_input_tokens_seen": 11486736, "step": 12020 }, { "epoch": 0.9809119830328739, "grad_norm": 0.04956285282969475, "learning_rate": 4.9041520515539606e-05, "loss": 0.0058, "num_input_tokens_seen": 11491168, "step": 12025 }, { "epoch": 0.9813198466432825, "grad_norm": 0.0388394258916378, "learning_rate": 4.906191369606004e-05, "loss": 0.2008, "num_input_tokens_seen": 11495232, "step": 12030 }, { "epoch": 0.9817277102536912, "grad_norm": 12.774979591369629, "learning_rate": 4.908230687658048e-05, "loss": 0.3534, "num_input_tokens_seen": 11499952, "step": 12035 }, { "epoch": 0.9821355738640999, "grad_norm": 0.7582738995552063, "learning_rate": 4.910270005710091e-05, "loss": 0.0072, "num_input_tokens_seen": 11505200, "step": 12040 }, { "epoch": 0.9825434374745086, "grad_norm": 42.99439239501953, "learning_rate": 4.912309323762134e-05, "loss": 0.0446, "num_input_tokens_seen": 11510448, "step": 12045 }, { "epoch": 0.9829513010849172, "grad_norm": 0.03902691230177879, "learning_rate": 4.9143486418141775e-05, "loss": 0.3568, "num_input_tokens_seen": 11515280, "step": 12050 }, { "epoch": 0.9833591646953259, "grad_norm": 0.14777371287345886, "learning_rate": 4.916387959866221e-05, "loss": 0.1941, "num_input_tokens_seen": 11520688, "step": 12055 }, { "epoch": 0.9837670283057346, "grad_norm": 0.15235759317874908, "learning_rate": 4.9184272779182646e-05, "loss": 0.1899, "num_input_tokens_seen": 11525648, "step": 12060 }, { "epoch": 0.9841748919161433, "grad_norm": 0.36303284764289856, "learning_rate": 4.920466595970308e-05, "loss": 0.7433, "num_input_tokens_seen": 11530416, "step": 12065 }, { "epoch": 0.984582755526552, "grad_norm": 9.637406349182129, "learning_rate": 4.922505914022351e-05, "loss": 0.0354, "num_input_tokens_seen": 11535344, "step": 12070 }, { "epoch": 0.9849906191369606, "grad_norm": 0.17090365290641785, "learning_rate": 4.9245452320743944e-05, "loss": 0.2355, "num_input_tokens_seen": 11540544, "step": 12075 }, { "epoch": 0.9853984827473693, "grad_norm": 0.19019490480422974, "learning_rate": 4.926584550126438e-05, "loss": 0.3725, "num_input_tokens_seen": 11544896, "step": 12080 }, { "epoch": 0.985806346357778, "grad_norm": 0.056170754134655, "learning_rate": 4.9286238681784815e-05, "loss": 0.2358, "num_input_tokens_seen": 11550320, "step": 12085 }, { "epoch": 0.9862142099681867, "grad_norm": 0.03300270438194275, "learning_rate": 4.930663186230525e-05, "loss": 0.065, "num_input_tokens_seen": 11553952, "step": 12090 }, { "epoch": 0.9866220735785953, "grad_norm": 20.792404174804688, "learning_rate": 4.932702504282568e-05, "loss": 0.0578, "num_input_tokens_seen": 11558752, "step": 12095 }, { "epoch": 0.987029937189004, "grad_norm": 0.06413212418556213, "learning_rate": 4.934741822334611e-05, "loss": 0.5001, "num_input_tokens_seen": 11562336, "step": 12100 }, { "epoch": 0.9874378007994127, "grad_norm": 0.16116884350776672, "learning_rate": 4.936781140386655e-05, "loss": 0.2373, "num_input_tokens_seen": 11567872, "step": 12105 }, { "epoch": 0.9878456644098214, "grad_norm": 0.09011838585138321, "learning_rate": 4.9388204584386984e-05, "loss": 0.0057, "num_input_tokens_seen": 11572928, "step": 12110 }, { "epoch": 0.98825352802023, "grad_norm": 0.06896699219942093, "learning_rate": 4.9408597764907416e-05, "loss": 0.2142, "num_input_tokens_seen": 11578096, "step": 12115 }, { "epoch": 0.9886613916306387, "grad_norm": 0.20620687305927277, "learning_rate": 4.942899094542785e-05, "loss": 0.5434, "num_input_tokens_seen": 11582688, "step": 12120 }, { "epoch": 0.9890692552410474, "grad_norm": 7.410512924194336, "learning_rate": 4.944938412594828e-05, "loss": 0.4176, "num_input_tokens_seen": 11587472, "step": 12125 }, { "epoch": 0.9894771188514561, "grad_norm": 0.1342029571533203, "learning_rate": 4.946977730646872e-05, "loss": 0.0863, "num_input_tokens_seen": 11592672, "step": 12130 }, { "epoch": 0.9898849824618647, "grad_norm": 0.19602492451667786, "learning_rate": 4.949017048698915e-05, "loss": 0.2021, "num_input_tokens_seen": 11597760, "step": 12135 }, { "epoch": 0.9902928460722734, "grad_norm": 0.14061564207077026, "learning_rate": 4.9510563667509585e-05, "loss": 0.0128, "num_input_tokens_seen": 11602608, "step": 12140 }, { "epoch": 0.9907007096826821, "grad_norm": 56.51357650756836, "learning_rate": 4.953095684803002e-05, "loss": 0.3446, "num_input_tokens_seen": 11607584, "step": 12145 }, { "epoch": 0.9911085732930908, "grad_norm": 5.639443874359131, "learning_rate": 4.955135002855045e-05, "loss": 0.3568, "num_input_tokens_seen": 11612208, "step": 12150 }, { "epoch": 0.9915164369034994, "grad_norm": 31.900175094604492, "learning_rate": 4.957174320907089e-05, "loss": 0.7783, "num_input_tokens_seen": 11616704, "step": 12155 }, { "epoch": 0.9919243005139081, "grad_norm": 25.882511138916016, "learning_rate": 4.959213638959132e-05, "loss": 0.2125, "num_input_tokens_seen": 11621984, "step": 12160 }, { "epoch": 0.9923321641243168, "grad_norm": 17.693958282470703, "learning_rate": 4.961252957011175e-05, "loss": 0.2518, "num_input_tokens_seen": 11625824, "step": 12165 }, { "epoch": 0.9927400277347255, "grad_norm": 9.95188045501709, "learning_rate": 4.963292275063219e-05, "loss": 0.2238, "num_input_tokens_seen": 11630240, "step": 12170 }, { "epoch": 0.9931478913451341, "grad_norm": 0.2048361897468567, "learning_rate": 4.9653315931152625e-05, "loss": 0.0696, "num_input_tokens_seen": 11635424, "step": 12175 }, { "epoch": 0.9935557549555428, "grad_norm": 2.0622689723968506, "learning_rate": 4.9673709111673064e-05, "loss": 0.1565, "num_input_tokens_seen": 11639520, "step": 12180 }, { "epoch": 0.9939636185659515, "grad_norm": 0.20558618009090424, "learning_rate": 4.9694102292193496e-05, "loss": 0.1913, "num_input_tokens_seen": 11645248, "step": 12185 }, { "epoch": 0.9943714821763602, "grad_norm": 15.287420272827148, "learning_rate": 4.971449547271393e-05, "loss": 0.5897, "num_input_tokens_seen": 11650240, "step": 12190 }, { "epoch": 0.9947793457867689, "grad_norm": 45.8211669921875, "learning_rate": 4.973488865323436e-05, "loss": 0.8684, "num_input_tokens_seen": 11654944, "step": 12195 }, { "epoch": 0.9951872093971775, "grad_norm": 0.08487266302108765, "learning_rate": 4.9755281833754794e-05, "loss": 0.184, "num_input_tokens_seen": 11659392, "step": 12200 }, { "epoch": 0.9955950730075863, "grad_norm": 6.501053810119629, "learning_rate": 4.977567501427523e-05, "loss": 0.3006, "num_input_tokens_seen": 11663104, "step": 12205 }, { "epoch": 0.996002936617995, "grad_norm": 0.11327753961086273, "learning_rate": 4.9796068194795665e-05, "loss": 0.1443, "num_input_tokens_seen": 11668032, "step": 12210 }, { "epoch": 0.9964108002284037, "grad_norm": 0.44473955035209656, "learning_rate": 4.98164613753161e-05, "loss": 0.0541, "num_input_tokens_seen": 11672176, "step": 12215 }, { "epoch": 0.9968186638388123, "grad_norm": 0.06928205490112305, "learning_rate": 4.983685455583653e-05, "loss": 0.0599, "num_input_tokens_seen": 11677152, "step": 12220 }, { "epoch": 0.997226527449221, "grad_norm": 31.636322021484375, "learning_rate": 4.985724773635697e-05, "loss": 0.2295, "num_input_tokens_seen": 11682048, "step": 12225 }, { "epoch": 0.9976343910596297, "grad_norm": 0.11482347548007965, "learning_rate": 4.98776409168774e-05, "loss": 0.3591, "num_input_tokens_seen": 11686880, "step": 12230 }, { "epoch": 0.9980422546700384, "grad_norm": 14.265369415283203, "learning_rate": 4.9898034097397834e-05, "loss": 0.2099, "num_input_tokens_seen": 11692064, "step": 12235 }, { "epoch": 0.998450118280447, "grad_norm": 7.245553970336914, "learning_rate": 4.9918427277918266e-05, "loss": 0.0289, "num_input_tokens_seen": 11697008, "step": 12240 }, { "epoch": 0.9988579818908557, "grad_norm": 10.050851821899414, "learning_rate": 4.99388204584387e-05, "loss": 0.3159, "num_input_tokens_seen": 11702400, "step": 12245 }, { "epoch": 0.9992658455012644, "grad_norm": 3.2631278038024902, "learning_rate": 4.995921363895914e-05, "loss": 0.0152, "num_input_tokens_seen": 11707376, "step": 12250 }, { "epoch": 0.9996737091116731, "grad_norm": 46.547203063964844, "learning_rate": 4.997960681947957e-05, "loss": 0.2335, "num_input_tokens_seen": 11712176, "step": 12255 }, { "epoch": 1.0000815727220818, "grad_norm": 0.1375686228275299, "learning_rate": 5e-05, "loss": 0.5557, "num_input_tokens_seen": 11717296, "step": 12260 }, { "epoch": 1.0000815727220818, "eval_loss": 0.34934306144714355, "eval_runtime": 570.9314, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 11717296, "step": 12260 }, { "epoch": 1.0004894363324903, "grad_norm": 0.10003790259361267, "learning_rate": 4.999999974663031e-05, "loss": 0.4652, "num_input_tokens_seen": 11722304, "step": 12265 }, { "epoch": 1.000897299942899, "grad_norm": 0.03914268687367439, "learning_rate": 4.999999898652126e-05, "loss": 0.7746, "num_input_tokens_seen": 11727536, "step": 12270 }, { "epoch": 1.0013051635533077, "grad_norm": 18.305130004882812, "learning_rate": 4.999999771967284e-05, "loss": 0.6609, "num_input_tokens_seen": 11732368, "step": 12275 }, { "epoch": 1.0017130271637165, "grad_norm": 0.027848295867443085, "learning_rate": 4.999999594608509e-05, "loss": 0.0438, "num_input_tokens_seen": 11737760, "step": 12280 }, { "epoch": 1.002120890774125, "grad_norm": 77.85916900634766, "learning_rate": 4.999999366575805e-05, "loss": 0.4792, "num_input_tokens_seen": 11742416, "step": 12285 }, { "epoch": 1.0025287543845338, "grad_norm": 0.03447384387254715, "learning_rate": 4.999999087869176e-05, "loss": 0.6389, "num_input_tokens_seen": 11747264, "step": 12290 }, { "epoch": 1.0029366179949424, "grad_norm": 0.2230788916349411, "learning_rate": 4.999998758488628e-05, "loss": 0.2168, "num_input_tokens_seen": 11752768, "step": 12295 }, { "epoch": 1.0033444816053512, "grad_norm": 0.4635215401649475, "learning_rate": 4.999998378434167e-05, "loss": 0.3667, "num_input_tokens_seen": 11757504, "step": 12300 }, { "epoch": 1.00375234521576, "grad_norm": 0.19108155369758606, "learning_rate": 4.9999979477058014e-05, "loss": 0.4592, "num_input_tokens_seen": 11762384, "step": 12305 }, { "epoch": 1.0041602088261685, "grad_norm": 0.36948707699775696, "learning_rate": 4.99999746630354e-05, "loss": 0.2303, "num_input_tokens_seen": 11767152, "step": 12310 }, { "epoch": 1.0045680724365773, "grad_norm": 2.273078680038452, "learning_rate": 4.9999969342273916e-05, "loss": 0.047, "num_input_tokens_seen": 11771136, "step": 12315 }, { "epoch": 1.0049759360469859, "grad_norm": 1.5547564029693604, "learning_rate": 4.9999963514773684e-05, "loss": 0.094, "num_input_tokens_seen": 11776048, "step": 12320 }, { "epoch": 1.0053837996573947, "grad_norm": 0.32085928320884705, "learning_rate": 4.9999957180534805e-05, "loss": 0.2962, "num_input_tokens_seen": 11780992, "step": 12325 }, { "epoch": 1.0057916632678032, "grad_norm": 66.34131622314453, "learning_rate": 4.999995033955743e-05, "loss": 0.072, "num_input_tokens_seen": 11786448, "step": 12330 }, { "epoch": 1.006199526878212, "grad_norm": 0.16640621423721313, "learning_rate": 4.9999942991841677e-05, "loss": 0.0201, "num_input_tokens_seen": 11791392, "step": 12335 }, { "epoch": 1.0066073904886206, "grad_norm": 0.46509110927581787, "learning_rate": 4.99999351373877e-05, "loss": 0.2194, "num_input_tokens_seen": 11796448, "step": 12340 }, { "epoch": 1.0070152540990294, "grad_norm": 0.02994319051504135, "learning_rate": 4.999992677619567e-05, "loss": 0.2003, "num_input_tokens_seen": 11801648, "step": 12345 }, { "epoch": 1.007423117709438, "grad_norm": 0.4432227611541748, "learning_rate": 4.9999917908265734e-05, "loss": 0.4946, "num_input_tokens_seen": 11805456, "step": 12350 }, { "epoch": 1.0078309813198467, "grad_norm": 0.0854010283946991, "learning_rate": 4.99999085335981e-05, "loss": 0.3414, "num_input_tokens_seen": 11810560, "step": 12355 }, { "epoch": 1.0082388449302553, "grad_norm": 0.26030871272087097, "learning_rate": 4.9999898652192937e-05, "loss": 0.169, "num_input_tokens_seen": 11816080, "step": 12360 }, { "epoch": 1.008646708540664, "grad_norm": 0.21515880525112152, "learning_rate": 4.999988826405045e-05, "loss": 0.1211, "num_input_tokens_seen": 11821376, "step": 12365 }, { "epoch": 1.0090545721510726, "grad_norm": 0.4977397918701172, "learning_rate": 4.999987736917085e-05, "loss": 0.1134, "num_input_tokens_seen": 11825936, "step": 12370 }, { "epoch": 1.0094624357614814, "grad_norm": 0.05330958217382431, "learning_rate": 4.999986596755436e-05, "loss": 0.1351, "num_input_tokens_seen": 11830736, "step": 12375 }, { "epoch": 1.00987029937189, "grad_norm": 0.7769466042518616, "learning_rate": 4.999985405920121e-05, "loss": 0.0211, "num_input_tokens_seen": 11835312, "step": 12380 }, { "epoch": 1.0102781629822988, "grad_norm": 0.06359097361564636, "learning_rate": 4.999984164411164e-05, "loss": 0.2905, "num_input_tokens_seen": 11839536, "step": 12385 }, { "epoch": 1.0106860265927073, "grad_norm": 23.930862426757812, "learning_rate": 4.999982872228591e-05, "loss": 0.2591, "num_input_tokens_seen": 11843728, "step": 12390 }, { "epoch": 1.0110938902031161, "grad_norm": 9.868656158447266, "learning_rate": 4.999981529372427e-05, "loss": 0.5068, "num_input_tokens_seen": 11848272, "step": 12395 }, { "epoch": 1.0115017538135247, "grad_norm": 8.231616020202637, "learning_rate": 4.9999801358427e-05, "loss": 0.2666, "num_input_tokens_seen": 11853536, "step": 12400 }, { "epoch": 1.0119096174239335, "grad_norm": 0.035799067467451096, "learning_rate": 4.999978691639438e-05, "loss": 0.4478, "num_input_tokens_seen": 11859408, "step": 12405 }, { "epoch": 1.012317481034342, "grad_norm": 0.05428246781229973, "learning_rate": 4.999977196762669e-05, "loss": 0.0601, "num_input_tokens_seen": 11864384, "step": 12410 }, { "epoch": 1.0127253446447508, "grad_norm": 0.09975548088550568, "learning_rate": 4.999975651212425e-05, "loss": 0.0992, "num_input_tokens_seen": 11869296, "step": 12415 }, { "epoch": 1.0131332082551594, "grad_norm": 0.3034943640232086, "learning_rate": 4.999974054988738e-05, "loss": 0.1384, "num_input_tokens_seen": 11873920, "step": 12420 }, { "epoch": 1.0135410718655682, "grad_norm": 0.058273423463106155, "learning_rate": 4.999972408091638e-05, "loss": 0.1501, "num_input_tokens_seen": 11879264, "step": 12425 }, { "epoch": 1.0139489354759768, "grad_norm": 11.857726097106934, "learning_rate": 4.99997071052116e-05, "loss": 0.1838, "num_input_tokens_seen": 11884416, "step": 12430 }, { "epoch": 1.0143567990863855, "grad_norm": 0.20150844752788544, "learning_rate": 4.999968962277338e-05, "loss": 0.1865, "num_input_tokens_seen": 11889808, "step": 12435 }, { "epoch": 1.014764662696794, "grad_norm": 0.10052966326475143, "learning_rate": 4.9999671633602074e-05, "loss": 0.3505, "num_input_tokens_seen": 11894912, "step": 12440 }, { "epoch": 1.015172526307203, "grad_norm": 0.04855881631374359, "learning_rate": 4.999965313769805e-05, "loss": 0.0035, "num_input_tokens_seen": 11898672, "step": 12445 }, { "epoch": 1.0155803899176115, "grad_norm": 12.989995002746582, "learning_rate": 4.999963413506168e-05, "loss": 0.7411, "num_input_tokens_seen": 11902896, "step": 12450 }, { "epoch": 1.0159882535280202, "grad_norm": 0.026348214596509933, "learning_rate": 4.9999614625693345e-05, "loss": 0.0046, "num_input_tokens_seen": 11907152, "step": 12455 }, { "epoch": 1.0163961171384288, "grad_norm": 0.07301168888807297, "learning_rate": 4.999959460959345e-05, "loss": 0.3156, "num_input_tokens_seen": 11912256, "step": 12460 }, { "epoch": 1.0168039807488376, "grad_norm": 0.017088009044528008, "learning_rate": 4.999957408676239e-05, "loss": 0.0354, "num_input_tokens_seen": 11917072, "step": 12465 }, { "epoch": 1.0172118443592462, "grad_norm": 0.17663933336734772, "learning_rate": 4.999955305720059e-05, "loss": 0.2131, "num_input_tokens_seen": 11921792, "step": 12470 }, { "epoch": 1.017619707969655, "grad_norm": 12.195871353149414, "learning_rate": 4.9999531520908464e-05, "loss": 0.3423, "num_input_tokens_seen": 11926512, "step": 12475 }, { "epoch": 1.0180275715800637, "grad_norm": 6.648066997528076, "learning_rate": 4.999950947788646e-05, "loss": 0.5995, "num_input_tokens_seen": 11931184, "step": 12480 }, { "epoch": 1.0184354351904723, "grad_norm": 0.04880300536751747, "learning_rate": 4.9999486928135034e-05, "loss": 0.1816, "num_input_tokens_seen": 11936272, "step": 12485 }, { "epoch": 1.018843298800881, "grad_norm": 6.1225199699401855, "learning_rate": 4.9999463871654616e-05, "loss": 0.3445, "num_input_tokens_seen": 11941312, "step": 12490 }, { "epoch": 1.0192511624112897, "grad_norm": 0.052976999431848526, "learning_rate": 4.9999440308445696e-05, "loss": 0.1725, "num_input_tokens_seen": 11946176, "step": 12495 }, { "epoch": 1.0196590260216984, "grad_norm": 0.6398360729217529, "learning_rate": 4.999941623850875e-05, "loss": 0.2417, "num_input_tokens_seen": 11951408, "step": 12500 }, { "epoch": 1.020066889632107, "grad_norm": 0.054385848343372345, "learning_rate": 4.999939166184424e-05, "loss": 0.0536, "num_input_tokens_seen": 11956160, "step": 12505 }, { "epoch": 1.0204747532425158, "grad_norm": 0.03592368960380554, "learning_rate": 4.99993665784527e-05, "loss": 0.0126, "num_input_tokens_seen": 11960512, "step": 12510 }, { "epoch": 1.0208826168529244, "grad_norm": 0.0813213363289833, "learning_rate": 4.999934098833462e-05, "loss": 0.3872, "num_input_tokens_seen": 11965200, "step": 12515 }, { "epoch": 1.0212904804633332, "grad_norm": 0.08811228722333908, "learning_rate": 4.9999314891490514e-05, "loss": 0.4432, "num_input_tokens_seen": 11969952, "step": 12520 }, { "epoch": 1.0216983440737417, "grad_norm": 1.3684368133544922, "learning_rate": 4.999928828792093e-05, "loss": 0.5839, "num_input_tokens_seen": 11975024, "step": 12525 }, { "epoch": 1.0221062076841505, "grad_norm": 0.15933601558208466, "learning_rate": 4.999926117762638e-05, "loss": 0.0145, "num_input_tokens_seen": 11979152, "step": 12530 }, { "epoch": 1.022514071294559, "grad_norm": 1.6804466247558594, "learning_rate": 4.999923356060744e-05, "loss": 0.3268, "num_input_tokens_seen": 11983280, "step": 12535 }, { "epoch": 1.0229219349049679, "grad_norm": 0.042325299233198166, "learning_rate": 4.999920543686466e-05, "loss": 0.0351, "num_input_tokens_seen": 11987888, "step": 12540 }, { "epoch": 1.0233297985153764, "grad_norm": 0.0323045551776886, "learning_rate": 4.9999176806398596e-05, "loss": 0.0115, "num_input_tokens_seen": 11993072, "step": 12545 }, { "epoch": 1.0237376621257852, "grad_norm": 0.08081375807523727, "learning_rate": 4.9999147669209855e-05, "loss": 0.1767, "num_input_tokens_seen": 11996816, "step": 12550 }, { "epoch": 1.0241455257361938, "grad_norm": 0.34276270866394043, "learning_rate": 4.999911802529901e-05, "loss": 0.3203, "num_input_tokens_seen": 12001664, "step": 12555 }, { "epoch": 1.0245533893466026, "grad_norm": 0.14647752046585083, "learning_rate": 4.9999087874666663e-05, "loss": 0.0062, "num_input_tokens_seen": 12007120, "step": 12560 }, { "epoch": 1.0249612529570111, "grad_norm": 0.04051945358514786, "learning_rate": 4.999905721731342e-05, "loss": 0.2111, "num_input_tokens_seen": 12012640, "step": 12565 }, { "epoch": 1.02536911656742, "grad_norm": 15.836780548095703, "learning_rate": 4.9999026053239915e-05, "loss": 0.0936, "num_input_tokens_seen": 12017920, "step": 12570 }, { "epoch": 1.0257769801778285, "grad_norm": 0.03783288225531578, "learning_rate": 4.9998994382446784e-05, "loss": 0.0024, "num_input_tokens_seen": 12022160, "step": 12575 }, { "epoch": 1.0261848437882373, "grad_norm": 0.081730917096138, "learning_rate": 4.999896220493465e-05, "loss": 0.0663, "num_input_tokens_seen": 12027248, "step": 12580 }, { "epoch": 1.0265927073986458, "grad_norm": 0.06123528629541397, "learning_rate": 4.999892952070417e-05, "loss": 0.2413, "num_input_tokens_seen": 12032192, "step": 12585 }, { "epoch": 1.0270005710090546, "grad_norm": 0.07767833024263382, "learning_rate": 4.999889632975603e-05, "loss": 0.0049, "num_input_tokens_seen": 12037520, "step": 12590 }, { "epoch": 1.0274084346194632, "grad_norm": 0.017612148076295853, "learning_rate": 4.999886263209087e-05, "loss": 0.0105, "num_input_tokens_seen": 12042240, "step": 12595 }, { "epoch": 1.027816298229872, "grad_norm": 0.03180863708257675, "learning_rate": 4.9998828427709385e-05, "loss": 0.5823, "num_input_tokens_seen": 12046368, "step": 12600 }, { "epoch": 1.0282241618402805, "grad_norm": 0.016357842832803726, "learning_rate": 4.999879371661228e-05, "loss": 0.1845, "num_input_tokens_seen": 12051568, "step": 12605 }, { "epoch": 1.0286320254506893, "grad_norm": 24.35929298400879, "learning_rate": 4.9998758498800245e-05, "loss": 0.5711, "num_input_tokens_seen": 12056144, "step": 12610 }, { "epoch": 1.029039889061098, "grad_norm": 0.05271343141794205, "learning_rate": 4.9998722774273996e-05, "loss": 0.0048, "num_input_tokens_seen": 12060656, "step": 12615 }, { "epoch": 1.0294477526715067, "grad_norm": 0.047577790915966034, "learning_rate": 4.9998686543034254e-05, "loss": 0.2037, "num_input_tokens_seen": 12065296, "step": 12620 }, { "epoch": 1.0298556162819152, "grad_norm": 0.02848881669342518, "learning_rate": 4.999864980508177e-05, "loss": 0.3809, "num_input_tokens_seen": 12070848, "step": 12625 }, { "epoch": 1.030263479892324, "grad_norm": 16.968952178955078, "learning_rate": 4.999861256041727e-05, "loss": 0.3715, "num_input_tokens_seen": 12075968, "step": 12630 }, { "epoch": 1.0306713435027326, "grad_norm": 6.011484622955322, "learning_rate": 4.999857480904152e-05, "loss": 0.3684, "num_input_tokens_seen": 12080528, "step": 12635 }, { "epoch": 1.0310792071131414, "grad_norm": 7.216614723205566, "learning_rate": 4.9998536550955275e-05, "loss": 0.5047, "num_input_tokens_seen": 12085984, "step": 12640 }, { "epoch": 1.03148707072355, "grad_norm": 14.876664161682129, "learning_rate": 4.999849778615932e-05, "loss": 0.244, "num_input_tokens_seen": 12090512, "step": 12645 }, { "epoch": 1.0318949343339587, "grad_norm": 0.43236464262008667, "learning_rate": 4.9998458514654445e-05, "loss": 0.131, "num_input_tokens_seen": 12095616, "step": 12650 }, { "epoch": 1.0323027979443673, "grad_norm": 0.055538423359394073, "learning_rate": 4.999841873644143e-05, "loss": 0.1921, "num_input_tokens_seen": 12100464, "step": 12655 }, { "epoch": 1.032710661554776, "grad_norm": 19.224258422851562, "learning_rate": 4.999837845152109e-05, "loss": 0.1154, "num_input_tokens_seen": 12105232, "step": 12660 }, { "epoch": 1.0331185251651847, "grad_norm": 10.382377624511719, "learning_rate": 4.999833765989424e-05, "loss": 0.4489, "num_input_tokens_seen": 12109936, "step": 12665 }, { "epoch": 1.0335263887755934, "grad_norm": 0.38395339250564575, "learning_rate": 4.9998296361561713e-05, "loss": 0.0229, "num_input_tokens_seen": 12114944, "step": 12670 }, { "epoch": 1.0339342523860022, "grad_norm": 0.059656232595443726, "learning_rate": 4.9998254556524336e-05, "loss": 0.2635, "num_input_tokens_seen": 12119968, "step": 12675 }, { "epoch": 1.0343421159964108, "grad_norm": 5.644641876220703, "learning_rate": 4.999821224478296e-05, "loss": 0.2944, "num_input_tokens_seen": 12124480, "step": 12680 }, { "epoch": 1.0347499796068196, "grad_norm": 0.17252564430236816, "learning_rate": 4.999816942633845e-05, "loss": 0.0048, "num_input_tokens_seen": 12129392, "step": 12685 }, { "epoch": 1.0351578432172281, "grad_norm": 0.044055573642253876, "learning_rate": 4.999812610119167e-05, "loss": 0.3306, "num_input_tokens_seen": 12135104, "step": 12690 }, { "epoch": 1.035565706827637, "grad_norm": 0.2344522625207901, "learning_rate": 4.9998082269343496e-05, "loss": 0.0069, "num_input_tokens_seen": 12140272, "step": 12695 }, { "epoch": 1.0359735704380455, "grad_norm": 1.2216590642929077, "learning_rate": 4.999803793079481e-05, "loss": 0.0816, "num_input_tokens_seen": 12145152, "step": 12700 }, { "epoch": 1.0363814340484543, "grad_norm": 0.17012397944927216, "learning_rate": 4.999799308554652e-05, "loss": 0.1411, "num_input_tokens_seen": 12150720, "step": 12705 }, { "epoch": 1.0367892976588629, "grad_norm": 0.010896924883127213, "learning_rate": 4.999794773359953e-05, "loss": 0.0031, "num_input_tokens_seen": 12155376, "step": 12710 }, { "epoch": 1.0371971612692716, "grad_norm": 0.24288922548294067, "learning_rate": 4.9997901874954766e-05, "loss": 0.3044, "num_input_tokens_seen": 12159952, "step": 12715 }, { "epoch": 1.0376050248796802, "grad_norm": 0.020987162366509438, "learning_rate": 4.999785550961315e-05, "loss": 0.7133, "num_input_tokens_seen": 12164720, "step": 12720 }, { "epoch": 1.038012888490089, "grad_norm": 0.04542376101016998, "learning_rate": 4.9997808637575624e-05, "loss": 0.0227, "num_input_tokens_seen": 12169728, "step": 12725 }, { "epoch": 1.0384207521004976, "grad_norm": 22.015058517456055, "learning_rate": 4.999776125884314e-05, "loss": 0.721, "num_input_tokens_seen": 12175152, "step": 12730 }, { "epoch": 1.0388286157109063, "grad_norm": 0.2951592803001404, "learning_rate": 4.999771337341666e-05, "loss": 0.2121, "num_input_tokens_seen": 12179936, "step": 12735 }, { "epoch": 1.039236479321315, "grad_norm": 5.692570209503174, "learning_rate": 4.9997664981297155e-05, "loss": 0.3137, "num_input_tokens_seen": 12184672, "step": 12740 }, { "epoch": 1.0396443429317237, "grad_norm": 5.60897970199585, "learning_rate": 4.99976160824856e-05, "loss": 0.2189, "num_input_tokens_seen": 12189232, "step": 12745 }, { "epoch": 1.0400522065421323, "grad_norm": 9.500894546508789, "learning_rate": 4.999756667698299e-05, "loss": 0.2711, "num_input_tokens_seen": 12193760, "step": 12750 }, { "epoch": 1.040460070152541, "grad_norm": 0.11120691895484924, "learning_rate": 4.9997516764790325e-05, "loss": 0.2035, "num_input_tokens_seen": 12198624, "step": 12755 }, { "epoch": 1.0408679337629496, "grad_norm": 0.0693674311041832, "learning_rate": 4.9997466345908615e-05, "loss": 0.2466, "num_input_tokens_seen": 12203024, "step": 12760 }, { "epoch": 1.0412757973733584, "grad_norm": 3.7804510593414307, "learning_rate": 4.999741542033889e-05, "loss": 0.3153, "num_input_tokens_seen": 12207712, "step": 12765 }, { "epoch": 1.041683660983767, "grad_norm": 0.5614417195320129, "learning_rate": 4.999736398808217e-05, "loss": 0.0093, "num_input_tokens_seen": 12212528, "step": 12770 }, { "epoch": 1.0420915245941758, "grad_norm": 0.05905545502901077, "learning_rate": 4.999731204913951e-05, "loss": 0.1792, "num_input_tokens_seen": 12217376, "step": 12775 }, { "epoch": 1.0424993882045843, "grad_norm": 0.1189575344324112, "learning_rate": 4.999725960351195e-05, "loss": 0.3297, "num_input_tokens_seen": 12222400, "step": 12780 }, { "epoch": 1.042907251814993, "grad_norm": 9.668766975402832, "learning_rate": 4.999720665120056e-05, "loss": 0.7785, "num_input_tokens_seen": 12228016, "step": 12785 }, { "epoch": 1.0433151154254017, "grad_norm": 3.78550124168396, "learning_rate": 4.9997153192206414e-05, "loss": 0.2354, "num_input_tokens_seen": 12233200, "step": 12790 }, { "epoch": 1.0437229790358105, "grad_norm": 9.184205055236816, "learning_rate": 4.99970992265306e-05, "loss": 0.2346, "num_input_tokens_seen": 12237456, "step": 12795 }, { "epoch": 1.044130842646219, "grad_norm": 0.2354298084974289, "learning_rate": 4.9997044754174205e-05, "loss": 0.0778, "num_input_tokens_seen": 12242112, "step": 12800 }, { "epoch": 1.0445387062566278, "grad_norm": 0.16384939849376678, "learning_rate": 4.999698977513833e-05, "loss": 0.0528, "num_input_tokens_seen": 12246752, "step": 12805 }, { "epoch": 1.0449465698670364, "grad_norm": 0.0348404161632061, "learning_rate": 4.99969342894241e-05, "loss": 0.3231, "num_input_tokens_seen": 12251728, "step": 12810 }, { "epoch": 1.0453544334774452, "grad_norm": 0.6159699559211731, "learning_rate": 4.999687829703263e-05, "loss": 0.2807, "num_input_tokens_seen": 12257056, "step": 12815 }, { "epoch": 1.0457622970878537, "grad_norm": 0.0817289650440216, "learning_rate": 4.999682179796506e-05, "loss": 0.0164, "num_input_tokens_seen": 12261984, "step": 12820 }, { "epoch": 1.0461701606982625, "grad_norm": 0.42501771450042725, "learning_rate": 4.999676479222253e-05, "loss": 0.0063, "num_input_tokens_seen": 12265744, "step": 12825 }, { "epoch": 1.046578024308671, "grad_norm": 7.412408351898193, "learning_rate": 4.999670727980621e-05, "loss": 0.2564, "num_input_tokens_seen": 12270224, "step": 12830 }, { "epoch": 1.0469858879190799, "grad_norm": 21.467859268188477, "learning_rate": 4.9996649260717245e-05, "loss": 0.505, "num_input_tokens_seen": 12275792, "step": 12835 }, { "epoch": 1.0473937515294884, "grad_norm": 0.14298459887504578, "learning_rate": 4.999659073495683e-05, "loss": 0.3858, "num_input_tokens_seen": 12280432, "step": 12840 }, { "epoch": 1.0478016151398972, "grad_norm": 0.03931167349219322, "learning_rate": 4.999653170252613e-05, "loss": 0.0496, "num_input_tokens_seen": 12285136, "step": 12845 }, { "epoch": 1.048209478750306, "grad_norm": 0.9091126918792725, "learning_rate": 4.999647216342636e-05, "loss": 0.0078, "num_input_tokens_seen": 12290544, "step": 12850 }, { "epoch": 1.0486173423607146, "grad_norm": 9.747698783874512, "learning_rate": 4.9996412117658725e-05, "loss": 0.7554, "num_input_tokens_seen": 12294944, "step": 12855 }, { "epoch": 1.0490252059711231, "grad_norm": 8.451412200927734, "learning_rate": 4.999635156522443e-05, "loss": 0.5027, "num_input_tokens_seen": 12299952, "step": 12860 }, { "epoch": 1.049433069581532, "grad_norm": 0.41272616386413574, "learning_rate": 4.9996290506124724e-05, "loss": 0.0735, "num_input_tokens_seen": 12304336, "step": 12865 }, { "epoch": 1.0498409331919407, "grad_norm": 12.41528606414795, "learning_rate": 4.999622894036082e-05, "loss": 0.0364, "num_input_tokens_seen": 12309344, "step": 12870 }, { "epoch": 1.0502487968023493, "grad_norm": 0.10895663499832153, "learning_rate": 4.999616686793398e-05, "loss": 0.1639, "num_input_tokens_seen": 12313808, "step": 12875 }, { "epoch": 1.050656660412758, "grad_norm": 0.09016191959381104, "learning_rate": 4.999610428884546e-05, "loss": 0.0866, "num_input_tokens_seen": 12318320, "step": 12880 }, { "epoch": 1.0510645240231666, "grad_norm": 0.04054704308509827, "learning_rate": 4.9996041203096525e-05, "loss": 0.0284, "num_input_tokens_seen": 12323552, "step": 12885 }, { "epoch": 1.0514723876335754, "grad_norm": 0.08927333354949951, "learning_rate": 4.999597761068845e-05, "loss": 0.2043, "num_input_tokens_seen": 12328784, "step": 12890 }, { "epoch": 1.051880251243984, "grad_norm": 0.296951562166214, "learning_rate": 4.999591351162254e-05, "loss": 0.0785, "num_input_tokens_seen": 12333616, "step": 12895 }, { "epoch": 1.0522881148543928, "grad_norm": 0.030441517010331154, "learning_rate": 4.999584890590008e-05, "loss": 0.2193, "num_input_tokens_seen": 12338272, "step": 12900 }, { "epoch": 1.0526959784648013, "grad_norm": 0.5870345830917358, "learning_rate": 4.9995783793522386e-05, "loss": 0.0921, "num_input_tokens_seen": 12343264, "step": 12905 }, { "epoch": 1.0531038420752101, "grad_norm": 0.011564969085156918, "learning_rate": 4.999571817449078e-05, "loss": 0.0104, "num_input_tokens_seen": 12346960, "step": 12910 }, { "epoch": 1.0535117056856187, "grad_norm": 0.020221339538693428, "learning_rate": 4.999565204880658e-05, "loss": 0.054, "num_input_tokens_seen": 12352880, "step": 12915 }, { "epoch": 1.0539195692960275, "grad_norm": 0.012739509344100952, "learning_rate": 4.999558541647114e-05, "loss": 0.0007, "num_input_tokens_seen": 12358112, "step": 12920 }, { "epoch": 1.054327432906436, "grad_norm": 0.03510044142603874, "learning_rate": 4.99955182774858e-05, "loss": 0.4291, "num_input_tokens_seen": 12362880, "step": 12925 }, { "epoch": 1.0547352965168448, "grad_norm": 0.020379355177283287, "learning_rate": 4.9995450631851934e-05, "loss": 0.1136, "num_input_tokens_seen": 12366976, "step": 12930 }, { "epoch": 1.0551431601272534, "grad_norm": 4.897333145141602, "learning_rate": 4.9995382479570895e-05, "loss": 0.0243, "num_input_tokens_seen": 12371968, "step": 12935 }, { "epoch": 1.0555510237376622, "grad_norm": 0.05690969526767731, "learning_rate": 4.999531382064408e-05, "loss": 0.4623, "num_input_tokens_seen": 12376288, "step": 12940 }, { "epoch": 1.0559588873480708, "grad_norm": 0.02850969508290291, "learning_rate": 4.999524465507288e-05, "loss": 0.024, "num_input_tokens_seen": 12381616, "step": 12945 }, { "epoch": 1.0563667509584795, "grad_norm": 0.033581167459487915, "learning_rate": 4.9995174982858686e-05, "loss": 0.0062, "num_input_tokens_seen": 12386880, "step": 12950 }, { "epoch": 1.056774614568888, "grad_norm": 5.931169033050537, "learning_rate": 4.999510480400291e-05, "loss": 0.6142, "num_input_tokens_seen": 12391440, "step": 12955 }, { "epoch": 1.057182478179297, "grad_norm": 18.958698272705078, "learning_rate": 4.999503411850699e-05, "loss": 0.0848, "num_input_tokens_seen": 12396352, "step": 12960 }, { "epoch": 1.0575903417897055, "grad_norm": 18.88854217529297, "learning_rate": 4.999496292637235e-05, "loss": 0.5418, "num_input_tokens_seen": 12401552, "step": 12965 }, { "epoch": 1.0579982054001142, "grad_norm": 0.015282864682376385, "learning_rate": 4.999489122760042e-05, "loss": 0.0082, "num_input_tokens_seen": 12406400, "step": 12970 }, { "epoch": 1.0584060690105228, "grad_norm": 0.4542066752910614, "learning_rate": 4.999481902219267e-05, "loss": 0.3463, "num_input_tokens_seen": 12411472, "step": 12975 }, { "epoch": 1.0588139326209316, "grad_norm": 0.08624686300754547, "learning_rate": 4.999474631015057e-05, "loss": 0.1176, "num_input_tokens_seen": 12416912, "step": 12980 }, { "epoch": 1.0592217962313402, "grad_norm": 0.25596505403518677, "learning_rate": 4.999467309147558e-05, "loss": 0.0081, "num_input_tokens_seen": 12421968, "step": 12985 }, { "epoch": 1.059629659841749, "grad_norm": 0.26430970430374146, "learning_rate": 4.999459936616918e-05, "loss": 0.0063, "num_input_tokens_seen": 12425792, "step": 12990 }, { "epoch": 1.0600375234521575, "grad_norm": 0.022144103422760963, "learning_rate": 4.999452513423288e-05, "loss": 0.0089, "num_input_tokens_seen": 12429968, "step": 12995 }, { "epoch": 1.0604453870625663, "grad_norm": 0.04216131940484047, "learning_rate": 4.999445039566817e-05, "loss": 0.0029, "num_input_tokens_seen": 12434464, "step": 13000 }, { "epoch": 1.0608532506729749, "grad_norm": 0.05380505695939064, "learning_rate": 4.999437515047658e-05, "loss": 0.3172, "num_input_tokens_seen": 12439360, "step": 13005 }, { "epoch": 1.0612611142833837, "grad_norm": 1.1141126155853271, "learning_rate": 4.9994299398659615e-05, "loss": 0.3781, "num_input_tokens_seen": 12444480, "step": 13010 }, { "epoch": 1.0616689778937922, "grad_norm": 0.06008315458893776, "learning_rate": 4.999422314021883e-05, "loss": 0.3147, "num_input_tokens_seen": 12449856, "step": 13015 }, { "epoch": 1.062076841504201, "grad_norm": 0.04435333237051964, "learning_rate": 4.999414637515576e-05, "loss": 0.1959, "num_input_tokens_seen": 12455088, "step": 13020 }, { "epoch": 1.0624847051146096, "grad_norm": 0.006925804540514946, "learning_rate": 4.999406910347196e-05, "loss": 0.1497, "num_input_tokens_seen": 12460048, "step": 13025 }, { "epoch": 1.0628925687250184, "grad_norm": 5.814479827880859, "learning_rate": 4.9993991325169e-05, "loss": 0.4414, "num_input_tokens_seen": 12465072, "step": 13030 }, { "epoch": 1.063300432335427, "grad_norm": 0.605272650718689, "learning_rate": 4.999391304024846e-05, "loss": 0.0218, "num_input_tokens_seen": 12469264, "step": 13035 }, { "epoch": 1.0637082959458357, "grad_norm": 0.05714871361851692, "learning_rate": 4.999383424871192e-05, "loss": 0.0391, "num_input_tokens_seen": 12473936, "step": 13040 }, { "epoch": 1.0641161595562445, "grad_norm": 0.04033493250608444, "learning_rate": 4.999375495056099e-05, "loss": 0.0053, "num_input_tokens_seen": 12477984, "step": 13045 }, { "epoch": 1.064524023166653, "grad_norm": 0.01646164245903492, "learning_rate": 4.999367514579726e-05, "loss": 0.2938, "num_input_tokens_seen": 12483472, "step": 13050 }, { "epoch": 1.0649318867770619, "grad_norm": 0.01473039947450161, "learning_rate": 4.9993594834422353e-05, "loss": 0.1975, "num_input_tokens_seen": 12489200, "step": 13055 }, { "epoch": 1.0653397503874704, "grad_norm": 26.366539001464844, "learning_rate": 4.99935140164379e-05, "loss": 0.2696, "num_input_tokens_seen": 12493920, "step": 13060 }, { "epoch": 1.0657476139978792, "grad_norm": 0.020303282886743546, "learning_rate": 4.999343269184554e-05, "loss": 0.0083, "num_input_tokens_seen": 12499728, "step": 13065 }, { "epoch": 1.0661554776082878, "grad_norm": 0.0589006170630455, "learning_rate": 4.9993350860646926e-05, "loss": 0.3133, "num_input_tokens_seen": 12504144, "step": 13070 }, { "epoch": 1.0665633412186966, "grad_norm": 0.05885757878422737, "learning_rate": 4.9993268522843704e-05, "loss": 0.1511, "num_input_tokens_seen": 12508576, "step": 13075 }, { "epoch": 1.0669712048291051, "grad_norm": 0.02527770586311817, "learning_rate": 4.999318567843755e-05, "loss": 0.006, "num_input_tokens_seen": 12513440, "step": 13080 }, { "epoch": 1.067379068439514, "grad_norm": 0.13278204202651978, "learning_rate": 4.999310232743014e-05, "loss": 0.0041, "num_input_tokens_seen": 12519056, "step": 13085 }, { "epoch": 1.0677869320499225, "grad_norm": 6.923699855804443, "learning_rate": 4.999301846982317e-05, "loss": 0.194, "num_input_tokens_seen": 12523920, "step": 13090 }, { "epoch": 1.0681947956603313, "grad_norm": 0.048835352063179016, "learning_rate": 4.9992934105618334e-05, "loss": 0.0971, "num_input_tokens_seen": 12528848, "step": 13095 }, { "epoch": 1.0686026592707398, "grad_norm": 0.030277157202363014, "learning_rate": 4.999284923481734e-05, "loss": 0.1171, "num_input_tokens_seen": 12533760, "step": 13100 }, { "epoch": 1.0690105228811486, "grad_norm": 31.42963218688965, "learning_rate": 4.999276385742192e-05, "loss": 0.3948, "num_input_tokens_seen": 12538448, "step": 13105 }, { "epoch": 1.0694183864915572, "grad_norm": 0.04376225918531418, "learning_rate": 4.9992677973433794e-05, "loss": 0.5469, "num_input_tokens_seen": 12543264, "step": 13110 }, { "epoch": 1.069826250101966, "grad_norm": 0.039778146892786026, "learning_rate": 4.99925915828547e-05, "loss": 0.0063, "num_input_tokens_seen": 12548288, "step": 13115 }, { "epoch": 1.0702341137123745, "grad_norm": 0.08822818845510483, "learning_rate": 4.99925046856864e-05, "loss": 0.3568, "num_input_tokens_seen": 12553328, "step": 13120 }, { "epoch": 1.0706419773227833, "grad_norm": 20.629179000854492, "learning_rate": 4.9992417281930647e-05, "loss": 0.1503, "num_input_tokens_seen": 12558656, "step": 13125 }, { "epoch": 1.0710498409331919, "grad_norm": 0.020048443228006363, "learning_rate": 4.999232937158922e-05, "loss": 0.0086, "num_input_tokens_seen": 12564032, "step": 13130 }, { "epoch": 1.0714577045436007, "grad_norm": 0.05907859653234482, "learning_rate": 4.9992240954663894e-05, "loss": 0.0048, "num_input_tokens_seen": 12568416, "step": 13135 }, { "epoch": 1.0718655681540092, "grad_norm": 9.511675834655762, "learning_rate": 4.999215203115646e-05, "loss": 0.5638, "num_input_tokens_seen": 12573488, "step": 13140 }, { "epoch": 1.072273431764418, "grad_norm": 0.0417257659137249, "learning_rate": 4.9992062601068736e-05, "loss": 0.016, "num_input_tokens_seen": 12578896, "step": 13145 }, { "epoch": 1.0726812953748266, "grad_norm": 0.2431352585554123, "learning_rate": 4.999197266440251e-05, "loss": 0.247, "num_input_tokens_seen": 12583936, "step": 13150 }, { "epoch": 1.0730891589852354, "grad_norm": 0.04609784856438637, "learning_rate": 4.999188222115963e-05, "loss": 0.4808, "num_input_tokens_seen": 12589568, "step": 13155 }, { "epoch": 1.073497022595644, "grad_norm": 0.11224979907274246, "learning_rate": 4.99917912713419e-05, "loss": 0.4003, "num_input_tokens_seen": 12594752, "step": 13160 }, { "epoch": 1.0739048862060527, "grad_norm": 7.592658519744873, "learning_rate": 4.999169981495119e-05, "loss": 0.2259, "num_input_tokens_seen": 12599024, "step": 13165 }, { "epoch": 1.0743127498164613, "grad_norm": 0.10348115116357803, "learning_rate": 4.999160785198935e-05, "loss": 0.2077, "num_input_tokens_seen": 12603776, "step": 13170 }, { "epoch": 1.07472061342687, "grad_norm": 0.5906289219856262, "learning_rate": 4.999151538245823e-05, "loss": 0.0766, "num_input_tokens_seen": 12608800, "step": 13175 }, { "epoch": 1.0751284770372787, "grad_norm": 8.12436294555664, "learning_rate": 4.999142240635972e-05, "loss": 0.2655, "num_input_tokens_seen": 12613280, "step": 13180 }, { "epoch": 1.0755363406476874, "grad_norm": 8.899682998657227, "learning_rate": 4.999132892369569e-05, "loss": 0.2415, "num_input_tokens_seen": 12618064, "step": 13185 }, { "epoch": 1.075944204258096, "grad_norm": 29.981828689575195, "learning_rate": 4.999123493446805e-05, "loss": 0.139, "num_input_tokens_seen": 12623456, "step": 13190 }, { "epoch": 1.0763520678685048, "grad_norm": 0.03483587130904198, "learning_rate": 4.999114043867869e-05, "loss": 0.0692, "num_input_tokens_seen": 12628016, "step": 13195 }, { "epoch": 1.0767599314789134, "grad_norm": 0.06034398823976517, "learning_rate": 4.9991045436329534e-05, "loss": 0.0089, "num_input_tokens_seen": 12632864, "step": 13200 }, { "epoch": 1.0771677950893221, "grad_norm": 0.5479111671447754, "learning_rate": 4.99909499274225e-05, "loss": 0.0069, "num_input_tokens_seen": 12637840, "step": 13205 }, { "epoch": 1.0775756586997307, "grad_norm": 0.5122626423835754, "learning_rate": 4.999085391195955e-05, "loss": 0.2565, "num_input_tokens_seen": 12642768, "step": 13210 }, { "epoch": 1.0779835223101395, "grad_norm": 0.015385851263999939, "learning_rate": 4.9990757389942586e-05, "loss": 0.3091, "num_input_tokens_seen": 12647584, "step": 13215 }, { "epoch": 1.0783913859205483, "grad_norm": 12.018577575683594, "learning_rate": 4.999066036137361e-05, "loss": 0.2859, "num_input_tokens_seen": 12652144, "step": 13220 }, { "epoch": 1.0787992495309568, "grad_norm": 1.0863410234451294, "learning_rate": 4.9990562826254556e-05, "loss": 0.1008, "num_input_tokens_seen": 12656880, "step": 13225 }, { "epoch": 1.0792071131413654, "grad_norm": 0.06114378571510315, "learning_rate": 4.999046478458742e-05, "loss": 0.0218, "num_input_tokens_seen": 12662480, "step": 13230 }, { "epoch": 1.0796149767517742, "grad_norm": 7.766071796417236, "learning_rate": 4.9990366236374175e-05, "loss": 0.2831, "num_input_tokens_seen": 12667632, "step": 13235 }, { "epoch": 1.080022840362183, "grad_norm": 4.481287956237793, "learning_rate": 4.9990267181616825e-05, "loss": 0.6084, "num_input_tokens_seen": 12671216, "step": 13240 }, { "epoch": 1.0804307039725916, "grad_norm": 0.05692850425839424, "learning_rate": 4.999016762031739e-05, "loss": 0.1302, "num_input_tokens_seen": 12675392, "step": 13245 }, { "epoch": 1.0808385675830003, "grad_norm": 27.488922119140625, "learning_rate": 4.999006755247786e-05, "loss": 0.0617, "num_input_tokens_seen": 12679632, "step": 13250 }, { "epoch": 1.081246431193409, "grad_norm": 27.43821907043457, "learning_rate": 4.9989966978100286e-05, "loss": 0.2294, "num_input_tokens_seen": 12684816, "step": 13255 }, { "epoch": 1.0816542948038177, "grad_norm": 0.7997534871101379, "learning_rate": 4.998986589718671e-05, "loss": 0.0153, "num_input_tokens_seen": 12690096, "step": 13260 }, { "epoch": 1.0820621584142263, "grad_norm": 0.15513788163661957, "learning_rate": 4.9989764309739154e-05, "loss": 0.0791, "num_input_tokens_seen": 12694624, "step": 13265 }, { "epoch": 1.082470022024635, "grad_norm": 0.0161104928702116, "learning_rate": 4.9989662215759705e-05, "loss": 0.2901, "num_input_tokens_seen": 12699152, "step": 13270 }, { "epoch": 1.0828778856350436, "grad_norm": 0.027070827782154083, "learning_rate": 4.998955961525041e-05, "loss": 0.4288, "num_input_tokens_seen": 12703520, "step": 13275 }, { "epoch": 1.0832857492454524, "grad_norm": 4.36246919631958, "learning_rate": 4.998945650821338e-05, "loss": 0.4358, "num_input_tokens_seen": 12707872, "step": 13280 }, { "epoch": 1.083693612855861, "grad_norm": 0.10045444965362549, "learning_rate": 4.998935289465067e-05, "loss": 0.0062, "num_input_tokens_seen": 12712304, "step": 13285 }, { "epoch": 1.0841014764662698, "grad_norm": 0.12168335914611816, "learning_rate": 4.9989248774564403e-05, "loss": 0.1995, "num_input_tokens_seen": 12717712, "step": 13290 }, { "epoch": 1.0845093400766783, "grad_norm": 21.806724548339844, "learning_rate": 4.998914414795668e-05, "loss": 0.289, "num_input_tokens_seen": 12722144, "step": 13295 }, { "epoch": 1.084917203687087, "grad_norm": 4.95402193069458, "learning_rate": 4.998903901482962e-05, "loss": 0.1685, "num_input_tokens_seen": 12727248, "step": 13300 }, { "epoch": 1.0853250672974957, "grad_norm": 0.21254907548427582, "learning_rate": 4.9988933375185365e-05, "loss": 0.3384, "num_input_tokens_seen": 12731584, "step": 13305 }, { "epoch": 1.0857329309079045, "grad_norm": 0.24035397171974182, "learning_rate": 4.998882722902604e-05, "loss": 0.0766, "num_input_tokens_seen": 12736672, "step": 13310 }, { "epoch": 1.086140794518313, "grad_norm": 3.484797954559326, "learning_rate": 4.998872057635381e-05, "loss": 0.6113, "num_input_tokens_seen": 12741872, "step": 13315 }, { "epoch": 1.0865486581287218, "grad_norm": 0.14288084208965302, "learning_rate": 4.9988613417170835e-05, "loss": 0.3388, "num_input_tokens_seen": 12747248, "step": 13320 }, { "epoch": 1.0869565217391304, "grad_norm": 1.7710188627243042, "learning_rate": 4.9988505751479274e-05, "loss": 0.2179, "num_input_tokens_seen": 12752976, "step": 13325 }, { "epoch": 1.0873643853495392, "grad_norm": 10.275592803955078, "learning_rate": 4.998839757928133e-05, "loss": 0.2861, "num_input_tokens_seen": 12757904, "step": 13330 }, { "epoch": 1.0877722489599477, "grad_norm": 5.944726943969727, "learning_rate": 4.998828890057918e-05, "loss": 0.1679, "num_input_tokens_seen": 12762384, "step": 13335 }, { "epoch": 1.0881801125703565, "grad_norm": 0.7708321213722229, "learning_rate": 4.998817971537504e-05, "loss": 0.1319, "num_input_tokens_seen": 12767072, "step": 13340 }, { "epoch": 1.088587976180765, "grad_norm": 3.2978644371032715, "learning_rate": 4.998807002367111e-05, "loss": 0.379, "num_input_tokens_seen": 12771760, "step": 13345 }, { "epoch": 1.0889958397911739, "grad_norm": 11.665943145751953, "learning_rate": 4.998795982546962e-05, "loss": 0.1765, "num_input_tokens_seen": 12777056, "step": 13350 }, { "epoch": 1.0894037034015824, "grad_norm": 0.09393071383237839, "learning_rate": 4.99878491207728e-05, "loss": 0.2723, "num_input_tokens_seen": 12781312, "step": 13355 }, { "epoch": 1.0898115670119912, "grad_norm": 0.15858161449432373, "learning_rate": 4.99877379095829e-05, "loss": 0.1246, "num_input_tokens_seen": 12786848, "step": 13360 }, { "epoch": 1.0902194306223998, "grad_norm": 0.2991058826446533, "learning_rate": 4.998762619190216e-05, "loss": 0.0211, "num_input_tokens_seen": 12791120, "step": 13365 }, { "epoch": 1.0906272942328086, "grad_norm": 0.05935792997479439, "learning_rate": 4.998751396773287e-05, "loss": 0.0147, "num_input_tokens_seen": 12795680, "step": 13370 }, { "epoch": 1.0910351578432171, "grad_norm": 0.017982816323637962, "learning_rate": 4.998740123707728e-05, "loss": 0.2403, "num_input_tokens_seen": 12800432, "step": 13375 }, { "epoch": 1.091443021453626, "grad_norm": 0.28919482231140137, "learning_rate": 4.998728799993768e-05, "loss": 0.325, "num_input_tokens_seen": 12805664, "step": 13380 }, { "epoch": 1.0918508850640345, "grad_norm": 0.15855343639850616, "learning_rate": 4.998717425631638e-05, "loss": 0.584, "num_input_tokens_seen": 12810000, "step": 13385 }, { "epoch": 1.0922587486744433, "grad_norm": 7.079113006591797, "learning_rate": 4.9987060006215675e-05, "loss": 0.0258, "num_input_tokens_seen": 12813952, "step": 13390 }, { "epoch": 1.0926666122848518, "grad_norm": 4.016931533813477, "learning_rate": 4.998694524963788e-05, "loss": 0.4138, "num_input_tokens_seen": 12819328, "step": 13395 }, { "epoch": 1.0930744758952606, "grad_norm": 0.1176735907793045, "learning_rate": 4.998682998658532e-05, "loss": 0.3726, "num_input_tokens_seen": 12824320, "step": 13400 }, { "epoch": 1.0934823395056692, "grad_norm": 0.5336257815361023, "learning_rate": 4.9986714217060336e-05, "loss": 0.0582, "num_input_tokens_seen": 12829504, "step": 13405 }, { "epoch": 1.093890203116078, "grad_norm": 0.13162578642368317, "learning_rate": 4.998659794106527e-05, "loss": 0.1805, "num_input_tokens_seen": 12834448, "step": 13410 }, { "epoch": 1.0942980667264868, "grad_norm": 0.11760853976011276, "learning_rate": 4.998648115860248e-05, "loss": 0.1544, "num_input_tokens_seen": 12839232, "step": 13415 }, { "epoch": 1.0947059303368953, "grad_norm": 5.05778169631958, "learning_rate": 4.998636386967434e-05, "loss": 0.3463, "num_input_tokens_seen": 12844304, "step": 13420 }, { "epoch": 1.095113793947304, "grad_norm": 0.09244248270988464, "learning_rate": 4.9986246074283216e-05, "loss": 0.2235, "num_input_tokens_seen": 12848368, "step": 13425 }, { "epoch": 1.0955216575577127, "grad_norm": 2.1009304523468018, "learning_rate": 4.998612777243151e-05, "loss": 0.2621, "num_input_tokens_seen": 12853552, "step": 13430 }, { "epoch": 1.0959295211681215, "grad_norm": 4.9376654624938965, "learning_rate": 4.99860089641216e-05, "loss": 0.1123, "num_input_tokens_seen": 12857984, "step": 13435 }, { "epoch": 1.09633738477853, "grad_norm": 0.1379403918981552, "learning_rate": 4.998588964935591e-05, "loss": 0.0193, "num_input_tokens_seen": 12862816, "step": 13440 }, { "epoch": 1.0967452483889388, "grad_norm": 3.523966073989868, "learning_rate": 4.9985769828136855e-05, "loss": 0.3304, "num_input_tokens_seen": 12867056, "step": 13445 }, { "epoch": 1.0971531119993474, "grad_norm": 0.3622463643550873, "learning_rate": 4.998564950046686e-05, "loss": 0.0221, "num_input_tokens_seen": 12871584, "step": 13450 }, { "epoch": 1.0975609756097562, "grad_norm": 0.041727881878614426, "learning_rate": 4.9985528666348366e-05, "loss": 0.0251, "num_input_tokens_seen": 12875760, "step": 13455 }, { "epoch": 1.0979688392201647, "grad_norm": 17.73961067199707, "learning_rate": 4.998540732578384e-05, "loss": 0.1945, "num_input_tokens_seen": 12880944, "step": 13460 }, { "epoch": 1.0983767028305735, "grad_norm": 0.0769025906920433, "learning_rate": 4.99852854787757e-05, "loss": 0.0189, "num_input_tokens_seen": 12884832, "step": 13465 }, { "epoch": 1.098784566440982, "grad_norm": 0.0687074288725853, "learning_rate": 4.998516312532645e-05, "loss": 0.1723, "num_input_tokens_seen": 12889440, "step": 13470 }, { "epoch": 1.099192430051391, "grad_norm": 0.03792019188404083, "learning_rate": 4.998504026543856e-05, "loss": 0.247, "num_input_tokens_seen": 12894848, "step": 13475 }, { "epoch": 1.0996002936617995, "grad_norm": 0.03167542815208435, "learning_rate": 4.998491689911452e-05, "loss": 0.0506, "num_input_tokens_seen": 12899920, "step": 13480 }, { "epoch": 1.1000081572722082, "grad_norm": 0.30190935730934143, "learning_rate": 4.998479302635684e-05, "loss": 0.0725, "num_input_tokens_seen": 12903584, "step": 13485 }, { "epoch": 1.1004160208826168, "grad_norm": 6.532119274139404, "learning_rate": 4.998466864716802e-05, "loss": 0.2387, "num_input_tokens_seen": 12906944, "step": 13490 }, { "epoch": 1.1008238844930256, "grad_norm": 0.05364503338932991, "learning_rate": 4.998454376155057e-05, "loss": 0.1734, "num_input_tokens_seen": 12911392, "step": 13495 }, { "epoch": 1.1012317481034342, "grad_norm": 0.016584089025855064, "learning_rate": 4.998441836950705e-05, "loss": 1.0714, "num_input_tokens_seen": 12916016, "step": 13500 }, { "epoch": 1.101639611713843, "grad_norm": 0.14565755426883698, "learning_rate": 4.9984292471039975e-05, "loss": 0.6429, "num_input_tokens_seen": 12921360, "step": 13505 }, { "epoch": 1.1020474753242515, "grad_norm": 23.86941146850586, "learning_rate": 4.998416606615191e-05, "loss": 0.3565, "num_input_tokens_seen": 12926784, "step": 13510 }, { "epoch": 1.1024553389346603, "grad_norm": 0.03536234796047211, "learning_rate": 4.998403915484542e-05, "loss": 0.8731, "num_input_tokens_seen": 12931408, "step": 13515 }, { "epoch": 1.1028632025450689, "grad_norm": 0.10022617876529694, "learning_rate": 4.998391173712307e-05, "loss": 0.0194, "num_input_tokens_seen": 12936528, "step": 13520 }, { "epoch": 1.1032710661554777, "grad_norm": 5.308334827423096, "learning_rate": 4.998378381298744e-05, "loss": 0.6483, "num_input_tokens_seen": 12940224, "step": 13525 }, { "epoch": 1.1036789297658862, "grad_norm": 0.7111561298370361, "learning_rate": 4.998365538244113e-05, "loss": 0.7251, "num_input_tokens_seen": 12944688, "step": 13530 }, { "epoch": 1.104086793376295, "grad_norm": 0.348105251789093, "learning_rate": 4.998352644548675e-05, "loss": 2.8966, "num_input_tokens_seen": 12948848, "step": 13535 }, { "epoch": 1.1044946569867036, "grad_norm": 1.406754493713379, "learning_rate": 4.99833970021269e-05, "loss": 3.5005, "num_input_tokens_seen": 12953760, "step": 13540 }, { "epoch": 1.1049025205971124, "grad_norm": 429.25, "learning_rate": 4.998326705236421e-05, "loss": 4.4969, "num_input_tokens_seen": 12957536, "step": 13545 }, { "epoch": 1.105310384207521, "grad_norm": 0.7613445520401001, "learning_rate": 4.998313659620131e-05, "loss": 1.8784, "num_input_tokens_seen": 12961952, "step": 13550 }, { "epoch": 1.1057182478179297, "grad_norm": 3.4953410625457764, "learning_rate": 4.998300563364085e-05, "loss": 0.9597, "num_input_tokens_seen": 12965808, "step": 13555 }, { "epoch": 1.1061261114283383, "grad_norm": 17.549591064453125, "learning_rate": 4.998287416468548e-05, "loss": 2.0429, "num_input_tokens_seen": 12970592, "step": 13560 }, { "epoch": 1.106533975038747, "grad_norm": 1.4625712633132935, "learning_rate": 4.9982742189337864e-05, "loss": 0.895, "num_input_tokens_seen": 12975344, "step": 13565 }, { "epoch": 1.1069418386491556, "grad_norm": 0.07800161838531494, "learning_rate": 4.998260970760069e-05, "loss": 0.7464, "num_input_tokens_seen": 12979888, "step": 13570 }, { "epoch": 1.1073497022595644, "grad_norm": 0.34745895862579346, "learning_rate": 4.998247671947662e-05, "loss": 0.2655, "num_input_tokens_seen": 12984768, "step": 13575 }, { "epoch": 1.107757565869973, "grad_norm": 21.022844314575195, "learning_rate": 4.998234322496837e-05, "loss": 0.1009, "num_input_tokens_seen": 12989872, "step": 13580 }, { "epoch": 1.1081654294803818, "grad_norm": 53.19095993041992, "learning_rate": 4.998220922407864e-05, "loss": 0.7968, "num_input_tokens_seen": 12994352, "step": 13585 }, { "epoch": 1.1085732930907906, "grad_norm": 50.29641342163086, "learning_rate": 4.998207471681014e-05, "loss": 2.1519, "num_input_tokens_seen": 12999552, "step": 13590 }, { "epoch": 1.1089811567011991, "grad_norm": 15.41148853302002, "learning_rate": 4.9981939703165604e-05, "loss": 3.3722, "num_input_tokens_seen": 13004336, "step": 13595 }, { "epoch": 1.1093890203116077, "grad_norm": 21.114030838012695, "learning_rate": 4.998180418314776e-05, "loss": 5.0727, "num_input_tokens_seen": 13009248, "step": 13600 }, { "epoch": 1.1097968839220165, "grad_norm": 16.240028381347656, "learning_rate": 4.9981668156759366e-05, "loss": 7.1437, "num_input_tokens_seen": 13013744, "step": 13605 }, { "epoch": 1.1102047475324253, "grad_norm": 11.80872631072998, "learning_rate": 4.9981531624003176e-05, "loss": 5.287, "num_input_tokens_seen": 13017840, "step": 13610 }, { "epoch": 1.1106126111428338, "grad_norm": 8.145484924316406, "learning_rate": 4.998139458488195e-05, "loss": 3.5302, "num_input_tokens_seen": 13023152, "step": 13615 }, { "epoch": 1.1110204747532426, "grad_norm": 7.741354942321777, "learning_rate": 4.998125703939847e-05, "loss": 3.1603, "num_input_tokens_seen": 13028528, "step": 13620 }, { "epoch": 1.1114283383636512, "grad_norm": 77.40349578857422, "learning_rate": 4.998111898755553e-05, "loss": 2.2595, "num_input_tokens_seen": 13033840, "step": 13625 }, { "epoch": 1.11183620197406, "grad_norm": 39.30592727661133, "learning_rate": 4.9980980429355926e-05, "loss": 1.8892, "num_input_tokens_seen": 13037952, "step": 13630 }, { "epoch": 1.1122440655844685, "grad_norm": 11.103538513183594, "learning_rate": 4.9980841364802453e-05, "loss": 0.934, "num_input_tokens_seen": 13043248, "step": 13635 }, { "epoch": 1.1126519291948773, "grad_norm": 18.93400001525879, "learning_rate": 4.998070179389795e-05, "loss": 0.988, "num_input_tokens_seen": 13047312, "step": 13640 }, { "epoch": 1.1130597928052859, "grad_norm": 17.69611358642578, "learning_rate": 4.998056171664523e-05, "loss": 0.6847, "num_input_tokens_seen": 13051504, "step": 13645 }, { "epoch": 1.1134676564156947, "grad_norm": 28.846256256103516, "learning_rate": 4.998042113304714e-05, "loss": 1.2918, "num_input_tokens_seen": 13056128, "step": 13650 }, { "epoch": 1.1138755200261032, "grad_norm": 16.986875534057617, "learning_rate": 4.998028004310653e-05, "loss": 0.4843, "num_input_tokens_seen": 13060656, "step": 13655 }, { "epoch": 1.114283383636512, "grad_norm": 9.481204986572266, "learning_rate": 4.998013844682626e-05, "loss": 0.5819, "num_input_tokens_seen": 13065088, "step": 13660 }, { "epoch": 1.1146912472469206, "grad_norm": 8.197066307067871, "learning_rate": 4.99799963442092e-05, "loss": 0.3798, "num_input_tokens_seen": 13069744, "step": 13665 }, { "epoch": 1.1150991108573294, "grad_norm": 8.847759246826172, "learning_rate": 4.9979853735258226e-05, "loss": 0.7842, "num_input_tokens_seen": 13073504, "step": 13670 }, { "epoch": 1.115506974467738, "grad_norm": 5.44000244140625, "learning_rate": 4.997971061997623e-05, "loss": 1.2404, "num_input_tokens_seen": 13078544, "step": 13675 }, { "epoch": 1.1159148380781467, "grad_norm": 7.705081462860107, "learning_rate": 4.997956699836611e-05, "loss": 0.5013, "num_input_tokens_seen": 13083232, "step": 13680 }, { "epoch": 1.1163227016885553, "grad_norm": 8.525578498840332, "learning_rate": 4.9979422870430794e-05, "loss": 0.3236, "num_input_tokens_seen": 13087760, "step": 13685 }, { "epoch": 1.116730565298964, "grad_norm": 6.363124847412109, "learning_rate": 4.997927823617318e-05, "loss": 0.3259, "num_input_tokens_seen": 13092176, "step": 13690 }, { "epoch": 1.1171384289093726, "grad_norm": 17.6774845123291, "learning_rate": 4.997913309559622e-05, "loss": 0.5035, "num_input_tokens_seen": 13096688, "step": 13695 }, { "epoch": 1.1175462925197814, "grad_norm": 1.7169562578201294, "learning_rate": 4.9978987448702844e-05, "loss": 0.2808, "num_input_tokens_seen": 13102272, "step": 13700 }, { "epoch": 1.11795415613019, "grad_norm": 1.9376976490020752, "learning_rate": 4.997884129549599e-05, "loss": 0.4366, "num_input_tokens_seen": 13106048, "step": 13705 }, { "epoch": 1.1183620197405988, "grad_norm": 2.842550277709961, "learning_rate": 4.9978694635978656e-05, "loss": 0.7808, "num_input_tokens_seen": 13111440, "step": 13710 }, { "epoch": 1.1187698833510074, "grad_norm": 1.9946750402450562, "learning_rate": 4.9978547470153794e-05, "loss": 0.3423, "num_input_tokens_seen": 13116576, "step": 13715 }, { "epoch": 1.1191777469614161, "grad_norm": 44.2409782409668, "learning_rate": 4.997839979802438e-05, "loss": 0.409, "num_input_tokens_seen": 13121136, "step": 13720 }, { "epoch": 1.1195856105718247, "grad_norm": 24.7237548828125, "learning_rate": 4.997825161959342e-05, "loss": 0.4654, "num_input_tokens_seen": 13125456, "step": 13725 }, { "epoch": 1.1199934741822335, "grad_norm": 28.102405548095703, "learning_rate": 4.9978102934863915e-05, "loss": 0.3531, "num_input_tokens_seen": 13130880, "step": 13730 }, { "epoch": 1.120401337792642, "grad_norm": 7.500150680541992, "learning_rate": 4.997795374383888e-05, "loss": 0.3174, "num_input_tokens_seen": 13136032, "step": 13735 }, { "epoch": 1.1208092014030508, "grad_norm": 4.836019515991211, "learning_rate": 4.997780404652134e-05, "loss": 0.5967, "num_input_tokens_seen": 13140896, "step": 13740 }, { "epoch": 1.1212170650134594, "grad_norm": 1.7595703601837158, "learning_rate": 4.997765384291431e-05, "loss": 0.4237, "num_input_tokens_seen": 13145280, "step": 13745 }, { "epoch": 1.1216249286238682, "grad_norm": 0.5538645386695862, "learning_rate": 4.997750313302086e-05, "loss": 0.3555, "num_input_tokens_seen": 13149968, "step": 13750 }, { "epoch": 1.1220327922342768, "grad_norm": 47.016780853271484, "learning_rate": 4.997735191684404e-05, "loss": 0.2665, "num_input_tokens_seen": 13154144, "step": 13755 }, { "epoch": 1.1224406558446856, "grad_norm": 0.8404544591903687, "learning_rate": 4.99772001943869e-05, "loss": 0.464, "num_input_tokens_seen": 13158864, "step": 13760 }, { "epoch": 1.1228485194550941, "grad_norm": 1.8170171976089478, "learning_rate": 4.997704796565253e-05, "loss": 0.3707, "num_input_tokens_seen": 13163632, "step": 13765 }, { "epoch": 1.123256383065503, "grad_norm": 28.5623779296875, "learning_rate": 4.997689523064401e-05, "loss": 0.3378, "num_input_tokens_seen": 13167696, "step": 13770 }, { "epoch": 1.1236642466759115, "grad_norm": 13.689993858337402, "learning_rate": 4.997674198936444e-05, "loss": 0.6586, "num_input_tokens_seen": 13172448, "step": 13775 }, { "epoch": 1.1240721102863203, "grad_norm": 187.82244873046875, "learning_rate": 4.997658824181692e-05, "loss": 0.731, "num_input_tokens_seen": 13176800, "step": 13780 }, { "epoch": 1.124479973896729, "grad_norm": 21.203378677368164, "learning_rate": 4.9976433988004575e-05, "loss": 0.3709, "num_input_tokens_seen": 13182288, "step": 13785 }, { "epoch": 1.1248878375071376, "grad_norm": 2.79111647605896, "learning_rate": 4.997627922793052e-05, "loss": 0.4039, "num_input_tokens_seen": 13186736, "step": 13790 }, { "epoch": 1.1252957011175462, "grad_norm": 43.55433654785156, "learning_rate": 4.99761239615979e-05, "loss": 0.2939, "num_input_tokens_seen": 13192096, "step": 13795 }, { "epoch": 1.125703564727955, "grad_norm": 7.922434329986572, "learning_rate": 4.9975968189009856e-05, "loss": 0.2222, "num_input_tokens_seen": 13197024, "step": 13800 }, { "epoch": 1.1261114283383638, "grad_norm": 4.516755104064941, "learning_rate": 4.997581191016956e-05, "loss": 0.2751, "num_input_tokens_seen": 13201712, "step": 13805 }, { "epoch": 1.1265192919487723, "grad_norm": 0.6204772591590881, "learning_rate": 4.997565512508016e-05, "loss": 0.2627, "num_input_tokens_seen": 13207120, "step": 13810 }, { "epoch": 1.126927155559181, "grad_norm": 0.5114254355430603, "learning_rate": 4.9975497833744845e-05, "loss": 0.1233, "num_input_tokens_seen": 13211632, "step": 13815 }, { "epoch": 1.1273350191695897, "grad_norm": 0.8991081118583679, "learning_rate": 4.997534003616681e-05, "loss": 0.2019, "num_input_tokens_seen": 13217488, "step": 13820 }, { "epoch": 1.1277428827799985, "grad_norm": 5.120956897735596, "learning_rate": 4.997518173234924e-05, "loss": 0.1642, "num_input_tokens_seen": 13221936, "step": 13825 }, { "epoch": 1.128150746390407, "grad_norm": 24.52604866027832, "learning_rate": 4.9975022922295343e-05, "loss": 0.1904, "num_input_tokens_seen": 13226736, "step": 13830 }, { "epoch": 1.1285586100008158, "grad_norm": 16.837430953979492, "learning_rate": 4.997486360600835e-05, "loss": 0.3373, "num_input_tokens_seen": 13231456, "step": 13835 }, { "epoch": 1.1289664736112244, "grad_norm": 0.19124290347099304, "learning_rate": 4.9974703783491486e-05, "loss": 0.2513, "num_input_tokens_seen": 13235344, "step": 13840 }, { "epoch": 1.1293743372216332, "grad_norm": 2.322234630584717, "learning_rate": 4.997454345474798e-05, "loss": 0.1718, "num_input_tokens_seen": 13240160, "step": 13845 }, { "epoch": 1.1297822008320417, "grad_norm": 0.1587606817483902, "learning_rate": 4.997438261978109e-05, "loss": 0.1209, "num_input_tokens_seen": 13244400, "step": 13850 }, { "epoch": 1.1301900644424505, "grad_norm": 0.8889793157577515, "learning_rate": 4.9974221278594094e-05, "loss": 0.4987, "num_input_tokens_seen": 13248656, "step": 13855 }, { "epoch": 1.130597928052859, "grad_norm": 14.675108909606934, "learning_rate": 4.997405943119023e-05, "loss": 0.5678, "num_input_tokens_seen": 13253456, "step": 13860 }, { "epoch": 1.1310057916632679, "grad_norm": 9.966734886169434, "learning_rate": 4.997389707757279e-05, "loss": 0.9703, "num_input_tokens_seen": 13258896, "step": 13865 }, { "epoch": 1.1314136552736764, "grad_norm": 19.875730514526367, "learning_rate": 4.9973734217745085e-05, "loss": 0.5244, "num_input_tokens_seen": 13263696, "step": 13870 }, { "epoch": 1.1318215188840852, "grad_norm": 4.040135860443115, "learning_rate": 4.997357085171038e-05, "loss": 0.3272, "num_input_tokens_seen": 13267760, "step": 13875 }, { "epoch": 1.1322293824944938, "grad_norm": 1.6510446071624756, "learning_rate": 4.997340697947202e-05, "loss": 0.491, "num_input_tokens_seen": 13272288, "step": 13880 }, { "epoch": 1.1326372461049026, "grad_norm": 2.1221401691436768, "learning_rate": 4.997324260103331e-05, "loss": 0.1622, "num_input_tokens_seen": 13276736, "step": 13885 }, { "epoch": 1.1330451097153111, "grad_norm": 0.7969181537628174, "learning_rate": 4.997307771639758e-05, "loss": 0.2838, "num_input_tokens_seen": 13280464, "step": 13890 }, { "epoch": 1.13345297332572, "grad_norm": 0.8476366996765137, "learning_rate": 4.9972912325568176e-05, "loss": 0.2638, "num_input_tokens_seen": 13285872, "step": 13895 }, { "epoch": 1.1338608369361285, "grad_norm": 0.78012615442276, "learning_rate": 4.9972746428548455e-05, "loss": 0.0404, "num_input_tokens_seen": 13290464, "step": 13900 }, { "epoch": 1.1342687005465373, "grad_norm": 10.33905029296875, "learning_rate": 4.997258002534178e-05, "loss": 0.18, "num_input_tokens_seen": 13295120, "step": 13905 }, { "epoch": 1.1346765641569458, "grad_norm": 5.4694929122924805, "learning_rate": 4.9972413115951504e-05, "loss": 0.0892, "num_input_tokens_seen": 13299808, "step": 13910 }, { "epoch": 1.1350844277673546, "grad_norm": 0.29092368483543396, "learning_rate": 4.9972245700381036e-05, "loss": 0.0472, "num_input_tokens_seen": 13305680, "step": 13915 }, { "epoch": 1.1354922913777632, "grad_norm": 0.14095255732536316, "learning_rate": 4.9972077778633755e-05, "loss": 0.174, "num_input_tokens_seen": 13309984, "step": 13920 }, { "epoch": 1.135900154988172, "grad_norm": 0.3212795555591583, "learning_rate": 4.997190935071308e-05, "loss": 0.1298, "num_input_tokens_seen": 13314688, "step": 13925 }, { "epoch": 1.1363080185985805, "grad_norm": 0.09126206487417221, "learning_rate": 4.99717404166224e-05, "loss": 0.1393, "num_input_tokens_seen": 13320064, "step": 13930 }, { "epoch": 1.1367158822089893, "grad_norm": 0.2795400321483612, "learning_rate": 4.997157097636515e-05, "loss": 0.0024, "num_input_tokens_seen": 13324432, "step": 13935 }, { "epoch": 1.137123745819398, "grad_norm": 0.46102002263069153, "learning_rate": 4.997140102994478e-05, "loss": 0.5857, "num_input_tokens_seen": 13328768, "step": 13940 }, { "epoch": 1.1375316094298067, "grad_norm": 0.12808196246623993, "learning_rate": 4.997123057736472e-05, "loss": 0.0784, "num_input_tokens_seen": 13332544, "step": 13945 }, { "epoch": 1.1379394730402153, "grad_norm": 0.09912820160388947, "learning_rate": 4.997105961862843e-05, "loss": 0.0718, "num_input_tokens_seen": 13336560, "step": 13950 }, { "epoch": 1.138347336650624, "grad_norm": 0.01553028728812933, "learning_rate": 4.997088815373936e-05, "loss": 0.0169, "num_input_tokens_seen": 13341856, "step": 13955 }, { "epoch": 1.1387552002610328, "grad_norm": 0.1008152887225151, "learning_rate": 4.9970716182701005e-05, "loss": 0.2844, "num_input_tokens_seen": 13346432, "step": 13960 }, { "epoch": 1.1391630638714414, "grad_norm": 0.020362412557005882, "learning_rate": 4.9970543705516845e-05, "loss": 0.2951, "num_input_tokens_seen": 13351152, "step": 13965 }, { "epoch": 1.13957092748185, "grad_norm": 80.7247314453125, "learning_rate": 4.997037072219038e-05, "loss": 0.3085, "num_input_tokens_seen": 13356240, "step": 13970 }, { "epoch": 1.1399787910922587, "grad_norm": 0.06595879793167114, "learning_rate": 4.997019723272509e-05, "loss": 0.2327, "num_input_tokens_seen": 13361184, "step": 13975 }, { "epoch": 1.1403866547026675, "grad_norm": 39.66258239746094, "learning_rate": 4.997002323712453e-05, "loss": 0.2321, "num_input_tokens_seen": 13365600, "step": 13980 }, { "epoch": 1.140794518313076, "grad_norm": 30.288421630859375, "learning_rate": 4.996984873539221e-05, "loss": 0.1361, "num_input_tokens_seen": 13370928, "step": 13985 }, { "epoch": 1.1412023819234847, "grad_norm": 0.2102632373571396, "learning_rate": 4.9969673727531665e-05, "loss": 0.2235, "num_input_tokens_seen": 13375040, "step": 13990 }, { "epoch": 1.1416102455338935, "grad_norm": 13.931524276733398, "learning_rate": 4.9969498213546436e-05, "loss": 0.3062, "num_input_tokens_seen": 13380480, "step": 13995 }, { "epoch": 1.1420181091443022, "grad_norm": 0.08100192993879318, "learning_rate": 4.9969322193440095e-05, "loss": 0.5272, "num_input_tokens_seen": 13385360, "step": 14000 }, { "epoch": 1.1424259727547108, "grad_norm": 0.06316155195236206, "learning_rate": 4.99691456672162e-05, "loss": 0.0325, "num_input_tokens_seen": 13390032, "step": 14005 }, { "epoch": 1.1428338363651196, "grad_norm": 0.17542727291584015, "learning_rate": 4.9968968634878334e-05, "loss": 0.0103, "num_input_tokens_seen": 13394496, "step": 14010 }, { "epoch": 1.1432416999755282, "grad_norm": 4.266188144683838, "learning_rate": 4.9968791096430083e-05, "loss": 0.3819, "num_input_tokens_seen": 13398720, "step": 14015 }, { "epoch": 1.143649563585937, "grad_norm": 26.17937660217285, "learning_rate": 4.9968613051875046e-05, "loss": 0.0652, "num_input_tokens_seen": 13403392, "step": 14020 }, { "epoch": 1.1440574271963455, "grad_norm": 0.11915508657693863, "learning_rate": 4.996843450121684e-05, "loss": 0.1864, "num_input_tokens_seen": 13408272, "step": 14025 }, { "epoch": 1.1444652908067543, "grad_norm": 45.08562088012695, "learning_rate": 4.996825544445907e-05, "loss": 0.1976, "num_input_tokens_seen": 13412736, "step": 14030 }, { "epoch": 1.1448731544171629, "grad_norm": 0.13361787796020508, "learning_rate": 4.996807588160537e-05, "loss": 0.4488, "num_input_tokens_seen": 13417440, "step": 14035 }, { "epoch": 1.1452810180275717, "grad_norm": 12.467068672180176, "learning_rate": 4.996789581265938e-05, "loss": 0.3497, "num_input_tokens_seen": 13422752, "step": 14040 }, { "epoch": 1.1456888816379802, "grad_norm": 0.05211026594042778, "learning_rate": 4.996771523762476e-05, "loss": 0.0102, "num_input_tokens_seen": 13427328, "step": 14045 }, { "epoch": 1.146096745248389, "grad_norm": 0.1983393281698227, "learning_rate": 4.996753415650515e-05, "loss": 0.0075, "num_input_tokens_seen": 13431696, "step": 14050 }, { "epoch": 1.1465046088587976, "grad_norm": 23.856048583984375, "learning_rate": 4.9967352569304234e-05, "loss": 0.8348, "num_input_tokens_seen": 13436256, "step": 14055 }, { "epoch": 1.1469124724692064, "grad_norm": 16.113971710205078, "learning_rate": 4.99671704760257e-05, "loss": 0.3806, "num_input_tokens_seen": 13441280, "step": 14060 }, { "epoch": 1.147320336079615, "grad_norm": 0.02043810859322548, "learning_rate": 4.996698787667323e-05, "loss": 0.0074, "num_input_tokens_seen": 13446704, "step": 14065 }, { "epoch": 1.1477281996900237, "grad_norm": 6.9537577629089355, "learning_rate": 4.996680477125052e-05, "loss": 0.2982, "num_input_tokens_seen": 13452000, "step": 14070 }, { "epoch": 1.1481360633004323, "grad_norm": 0.2674700915813446, "learning_rate": 4.9966621159761284e-05, "loss": 0.4629, "num_input_tokens_seen": 13456736, "step": 14075 }, { "epoch": 1.148543926910841, "grad_norm": 0.05705780163407326, "learning_rate": 4.996643704220925e-05, "loss": 0.5121, "num_input_tokens_seen": 13461088, "step": 14080 }, { "epoch": 1.1489517905212496, "grad_norm": 15.413098335266113, "learning_rate": 4.996625241859814e-05, "loss": 0.2397, "num_input_tokens_seen": 13465648, "step": 14085 }, { "epoch": 1.1493596541316584, "grad_norm": 0.5193037986755371, "learning_rate": 4.9966067288931706e-05, "loss": 0.1698, "num_input_tokens_seen": 13470128, "step": 14090 }, { "epoch": 1.149767517742067, "grad_norm": 0.6581798195838928, "learning_rate": 4.99658816532137e-05, "loss": 0.046, "num_input_tokens_seen": 13474528, "step": 14095 }, { "epoch": 1.1501753813524758, "grad_norm": 0.08562710136175156, "learning_rate": 4.996569551144787e-05, "loss": 0.2091, "num_input_tokens_seen": 13479424, "step": 14100 }, { "epoch": 1.1505832449628843, "grad_norm": 3.4156546592712402, "learning_rate": 4.996550886363801e-05, "loss": 0.2133, "num_input_tokens_seen": 13483600, "step": 14105 }, { "epoch": 1.1509911085732931, "grad_norm": 0.5098080635070801, "learning_rate": 4.996532170978789e-05, "loss": 0.0186, "num_input_tokens_seen": 13488544, "step": 14110 }, { "epoch": 1.1513989721837017, "grad_norm": 2.455460786819458, "learning_rate": 4.996513404990132e-05, "loss": 0.382, "num_input_tokens_seen": 13492944, "step": 14115 }, { "epoch": 1.1518068357941105, "grad_norm": 3.940356731414795, "learning_rate": 4.996494588398207e-05, "loss": 0.4018, "num_input_tokens_seen": 13497680, "step": 14120 }, { "epoch": 1.152214699404519, "grad_norm": 0.19248321652412415, "learning_rate": 4.996475721203399e-05, "loss": 0.0131, "num_input_tokens_seen": 13502672, "step": 14125 }, { "epoch": 1.1526225630149278, "grad_norm": 0.19622647762298584, "learning_rate": 4.996456803406089e-05, "loss": 0.0347, "num_input_tokens_seen": 13508288, "step": 14130 }, { "epoch": 1.1530304266253364, "grad_norm": 0.09670379757881165, "learning_rate": 4.99643783500666e-05, "loss": 0.1385, "num_input_tokens_seen": 13513088, "step": 14135 }, { "epoch": 1.1534382902357452, "grad_norm": 0.5709365606307983, "learning_rate": 4.996418816005497e-05, "loss": 0.2178, "num_input_tokens_seen": 13517680, "step": 14140 }, { "epoch": 1.1538461538461537, "grad_norm": 0.0738556757569313, "learning_rate": 4.9963997464029846e-05, "loss": 0.006, "num_input_tokens_seen": 13521952, "step": 14145 }, { "epoch": 1.1542540174565625, "grad_norm": 0.06804068386554718, "learning_rate": 4.996380626199511e-05, "loss": 0.0104, "num_input_tokens_seen": 13526384, "step": 14150 }, { "epoch": 1.1546618810669713, "grad_norm": 0.33420032262802124, "learning_rate": 4.996361455395463e-05, "loss": 0.1682, "num_input_tokens_seen": 13530848, "step": 14155 }, { "epoch": 1.1550697446773799, "grad_norm": 6.230397701263428, "learning_rate": 4.9963422339912286e-05, "loss": 0.3891, "num_input_tokens_seen": 13535376, "step": 14160 }, { "epoch": 1.1554776082877884, "grad_norm": 0.01440747082233429, "learning_rate": 4.9963229619871974e-05, "loss": 0.0041, "num_input_tokens_seen": 13539920, "step": 14165 }, { "epoch": 1.1558854718981972, "grad_norm": 12.50994873046875, "learning_rate": 4.996303639383761e-05, "loss": 0.2243, "num_input_tokens_seen": 13545104, "step": 14170 }, { "epoch": 1.156293335508606, "grad_norm": 0.018209850415587425, "learning_rate": 4.996284266181312e-05, "loss": 0.2412, "num_input_tokens_seen": 13549712, "step": 14175 }, { "epoch": 1.1567011991190146, "grad_norm": 0.09255140274763107, "learning_rate": 4.996264842380239e-05, "loss": 0.1997, "num_input_tokens_seen": 13554992, "step": 14180 }, { "epoch": 1.1571090627294234, "grad_norm": 0.1653909832239151, "learning_rate": 4.9962453679809405e-05, "loss": 0.2098, "num_input_tokens_seen": 13559616, "step": 14185 }, { "epoch": 1.157516926339832, "grad_norm": 4.3560662269592285, "learning_rate": 4.996225842983808e-05, "loss": 0.438, "num_input_tokens_seen": 13564560, "step": 14190 }, { "epoch": 1.1579247899502407, "grad_norm": 10.057804107666016, "learning_rate": 4.996206267389239e-05, "loss": 0.1842, "num_input_tokens_seen": 13569344, "step": 14195 }, { "epoch": 1.1583326535606493, "grad_norm": 1.094535231590271, "learning_rate": 4.9961866411976294e-05, "loss": 0.4163, "num_input_tokens_seen": 13573152, "step": 14200 }, { "epoch": 1.158740517171058, "grad_norm": 10.110101699829102, "learning_rate": 4.9961669644093775e-05, "loss": 0.1677, "num_input_tokens_seen": 13577776, "step": 14205 }, { "epoch": 1.1591483807814666, "grad_norm": 0.06138652190566063, "learning_rate": 4.996147237024882e-05, "loss": 0.2893, "num_input_tokens_seen": 13581792, "step": 14210 }, { "epoch": 1.1595562443918754, "grad_norm": 0.12020204961299896, "learning_rate": 4.996127459044542e-05, "loss": 0.085, "num_input_tokens_seen": 13586560, "step": 14215 }, { "epoch": 1.159964108002284, "grad_norm": 0.11339642852544785, "learning_rate": 4.996107630468759e-05, "loss": 0.0117, "num_input_tokens_seen": 13590480, "step": 14220 }, { "epoch": 1.1603719716126928, "grad_norm": 32.54682540893555, "learning_rate": 4.996087751297936e-05, "loss": 0.0444, "num_input_tokens_seen": 13595616, "step": 14225 }, { "epoch": 1.1607798352231014, "grad_norm": 8.276582717895508, "learning_rate": 4.9960678215324744e-05, "loss": 0.3062, "num_input_tokens_seen": 13599904, "step": 14230 }, { "epoch": 1.1611876988335101, "grad_norm": 0.09393111616373062, "learning_rate": 4.9960478411727784e-05, "loss": 0.0318, "num_input_tokens_seen": 13604640, "step": 14235 }, { "epoch": 1.1615955624439187, "grad_norm": 16.399600982666016, "learning_rate": 4.9960278102192536e-05, "loss": 0.2887, "num_input_tokens_seen": 13609376, "step": 14240 }, { "epoch": 1.1620034260543275, "grad_norm": 5.236959457397461, "learning_rate": 4.996007728672306e-05, "loss": 0.36, "num_input_tokens_seen": 13614368, "step": 14245 }, { "epoch": 1.162411289664736, "grad_norm": 38.097164154052734, "learning_rate": 4.995987596532342e-05, "loss": 0.2657, "num_input_tokens_seen": 13618640, "step": 14250 }, { "epoch": 1.1628191532751448, "grad_norm": 23.396984100341797, "learning_rate": 4.995967413799769e-05, "loss": 0.0957, "num_input_tokens_seen": 13624208, "step": 14255 }, { "epoch": 1.1632270168855534, "grad_norm": 31.21438980102539, "learning_rate": 4.995947180474999e-05, "loss": 0.1846, "num_input_tokens_seen": 13629248, "step": 14260 }, { "epoch": 1.1636348804959622, "grad_norm": 15.190410614013672, "learning_rate": 4.995926896558439e-05, "loss": 0.3894, "num_input_tokens_seen": 13634096, "step": 14265 }, { "epoch": 1.1640427441063708, "grad_norm": 15.388240814208984, "learning_rate": 4.9959065620505016e-05, "loss": 0.1557, "num_input_tokens_seen": 13639680, "step": 14270 }, { "epoch": 1.1644506077167796, "grad_norm": 0.8718141317367554, "learning_rate": 4.9958861769515996e-05, "loss": 0.3544, "num_input_tokens_seen": 13644192, "step": 14275 }, { "epoch": 1.1648584713271881, "grad_norm": 1.3710825443267822, "learning_rate": 4.9958657412621444e-05, "loss": 0.0391, "num_input_tokens_seen": 13648512, "step": 14280 }, { "epoch": 1.165266334937597, "grad_norm": 0.4206438362598419, "learning_rate": 4.995845254982552e-05, "loss": 0.2076, "num_input_tokens_seen": 13653696, "step": 14285 }, { "epoch": 1.1656741985480055, "grad_norm": 3.649754524230957, "learning_rate": 4.995824718113236e-05, "loss": 0.0381, "num_input_tokens_seen": 13658352, "step": 14290 }, { "epoch": 1.1660820621584143, "grad_norm": 1.448935627937317, "learning_rate": 4.995804130654614e-05, "loss": 0.3441, "num_input_tokens_seen": 13663552, "step": 14295 }, { "epoch": 1.1664899257688228, "grad_norm": 0.6919191479682922, "learning_rate": 4.9957834926071026e-05, "loss": 0.0906, "num_input_tokens_seen": 13668416, "step": 14300 }, { "epoch": 1.1668977893792316, "grad_norm": 0.3283978998661041, "learning_rate": 4.99576280397112e-05, "loss": 0.0041, "num_input_tokens_seen": 13672256, "step": 14305 }, { "epoch": 1.1673056529896402, "grad_norm": 34.74053955078125, "learning_rate": 4.9957420647470865e-05, "loss": 0.5502, "num_input_tokens_seen": 13676416, "step": 14310 }, { "epoch": 1.167713516600049, "grad_norm": 40.526065826416016, "learning_rate": 4.9957212749354215e-05, "loss": 0.3213, "num_input_tokens_seen": 13680160, "step": 14315 }, { "epoch": 1.1681213802104575, "grad_norm": 0.07077562808990479, "learning_rate": 4.995700434536547e-05, "loss": 0.2171, "num_input_tokens_seen": 13684608, "step": 14320 }, { "epoch": 1.1685292438208663, "grad_norm": 0.07886248081922531, "learning_rate": 4.995679543550885e-05, "loss": 0.0063, "num_input_tokens_seen": 13689360, "step": 14325 }, { "epoch": 1.168937107431275, "grad_norm": 0.06534077972173691, "learning_rate": 4.9956586019788584e-05, "loss": 0.1728, "num_input_tokens_seen": 13694688, "step": 14330 }, { "epoch": 1.1693449710416837, "grad_norm": 0.05592981353402138, "learning_rate": 4.995637609820894e-05, "loss": 0.0051, "num_input_tokens_seen": 13699296, "step": 14335 }, { "epoch": 1.1697528346520922, "grad_norm": 0.03585513308644295, "learning_rate": 4.9956165670774145e-05, "loss": 0.0085, "num_input_tokens_seen": 13703712, "step": 14340 }, { "epoch": 1.170160698262501, "grad_norm": 0.0397690050303936, "learning_rate": 4.995595473748849e-05, "loss": 0.2935, "num_input_tokens_seen": 13707760, "step": 14345 }, { "epoch": 1.1705685618729098, "grad_norm": 1.2430086135864258, "learning_rate": 4.995574329835622e-05, "loss": 0.4875, "num_input_tokens_seen": 13713728, "step": 14350 }, { "epoch": 1.1709764254833184, "grad_norm": 4.7861738204956055, "learning_rate": 4.995553135338165e-05, "loss": 0.3935, "num_input_tokens_seen": 13718944, "step": 14355 }, { "epoch": 1.171384289093727, "grad_norm": 0.1779196411371231, "learning_rate": 4.995531890256905e-05, "loss": 0.2858, "num_input_tokens_seen": 13723392, "step": 14360 }, { "epoch": 1.1717921527041357, "grad_norm": 12.100712776184082, "learning_rate": 4.9955105945922756e-05, "loss": 0.1639, "num_input_tokens_seen": 13727680, "step": 14365 }, { "epoch": 1.1722000163145445, "grad_norm": 0.06908956915140152, "learning_rate": 4.995489248344706e-05, "loss": 0.0106, "num_input_tokens_seen": 13732624, "step": 14370 }, { "epoch": 1.172607879924953, "grad_norm": 0.06215524673461914, "learning_rate": 4.995467851514629e-05, "loss": 0.2599, "num_input_tokens_seen": 13737328, "step": 14375 }, { "epoch": 1.1730157435353619, "grad_norm": 0.06380024552345276, "learning_rate": 4.9954464041024807e-05, "loss": 0.2094, "num_input_tokens_seen": 13742768, "step": 14380 }, { "epoch": 1.1734236071457704, "grad_norm": 0.4337489902973175, "learning_rate": 4.9954249061086936e-05, "loss": 0.1679, "num_input_tokens_seen": 13746880, "step": 14385 }, { "epoch": 1.1738314707561792, "grad_norm": 0.04588663578033447, "learning_rate": 4.995403357533703e-05, "loss": 0.0556, "num_input_tokens_seen": 13752112, "step": 14390 }, { "epoch": 1.1742393343665878, "grad_norm": 0.09931381791830063, "learning_rate": 4.995381758377948e-05, "loss": 0.2415, "num_input_tokens_seen": 13757056, "step": 14395 }, { "epoch": 1.1746471979769966, "grad_norm": 7.6478071212768555, "learning_rate": 4.995360108641864e-05, "loss": 0.2563, "num_input_tokens_seen": 13761776, "step": 14400 }, { "epoch": 1.1750550615874051, "grad_norm": 8.944559097290039, "learning_rate": 4.995338408325892e-05, "loss": 0.0404, "num_input_tokens_seen": 13766592, "step": 14405 }, { "epoch": 1.175462925197814, "grad_norm": 0.041455648839473724, "learning_rate": 4.99531665743047e-05, "loss": 0.1908, "num_input_tokens_seen": 13771856, "step": 14410 }, { "epoch": 1.1758707888082225, "grad_norm": 0.2667725682258606, "learning_rate": 4.99529485595604e-05, "loss": 0.4035, "num_input_tokens_seen": 13776752, "step": 14415 }, { "epoch": 1.1762786524186313, "grad_norm": 0.08757155388593674, "learning_rate": 4.995273003903044e-05, "loss": 0.0113, "num_input_tokens_seen": 13781616, "step": 14420 }, { "epoch": 1.1766865160290398, "grad_norm": 3.396066427230835, "learning_rate": 4.995251101271924e-05, "loss": 0.2383, "num_input_tokens_seen": 13785920, "step": 14425 }, { "epoch": 1.1770943796394486, "grad_norm": 0.03645672649145126, "learning_rate": 4.995229148063124e-05, "loss": 0.0647, "num_input_tokens_seen": 13789760, "step": 14430 }, { "epoch": 1.1775022432498572, "grad_norm": 0.1360839456319809, "learning_rate": 4.99520714427709e-05, "loss": 0.6068, "num_input_tokens_seen": 13795008, "step": 14435 }, { "epoch": 1.177910106860266, "grad_norm": 0.11585614085197449, "learning_rate": 4.995185089914268e-05, "loss": 0.0496, "num_input_tokens_seen": 13799328, "step": 14440 }, { "epoch": 1.1783179704706745, "grad_norm": 2.4853155612945557, "learning_rate": 4.995162984975104e-05, "loss": 0.0517, "num_input_tokens_seen": 13804240, "step": 14445 }, { "epoch": 1.1787258340810833, "grad_norm": 0.039220914244651794, "learning_rate": 4.9951408294600464e-05, "loss": 0.4003, "num_input_tokens_seen": 13808752, "step": 14450 }, { "epoch": 1.179133697691492, "grad_norm": 0.17702051997184753, "learning_rate": 4.995118623369545e-05, "loss": 0.0548, "num_input_tokens_seen": 13813552, "step": 14455 }, { "epoch": 1.1795415613019007, "grad_norm": 0.048391688615083694, "learning_rate": 4.995096366704049e-05, "loss": 0.0459, "num_input_tokens_seen": 13817856, "step": 14460 }, { "epoch": 1.1799494249123093, "grad_norm": 19.297361373901367, "learning_rate": 4.995074059464009e-05, "loss": 0.3997, "num_input_tokens_seen": 13823056, "step": 14465 }, { "epoch": 1.180357288522718, "grad_norm": 0.15094512701034546, "learning_rate": 4.995051701649879e-05, "loss": 0.3814, "num_input_tokens_seen": 13827616, "step": 14470 }, { "epoch": 1.1807651521331266, "grad_norm": 0.06024116650223732, "learning_rate": 4.9950292932621114e-05, "loss": 0.2662, "num_input_tokens_seen": 13832928, "step": 14475 }, { "epoch": 1.1811730157435354, "grad_norm": 0.4255431890487671, "learning_rate": 4.9950068343011596e-05, "loss": 0.0211, "num_input_tokens_seen": 13838080, "step": 14480 }, { "epoch": 1.181580879353944, "grad_norm": 1.2931770086288452, "learning_rate": 4.99498432476748e-05, "loss": 0.1743, "num_input_tokens_seen": 13843648, "step": 14485 }, { "epoch": 1.1819887429643527, "grad_norm": 0.0833563506603241, "learning_rate": 4.9949617646615276e-05, "loss": 0.2952, "num_input_tokens_seen": 13847808, "step": 14490 }, { "epoch": 1.1823966065747613, "grad_norm": 14.76605224609375, "learning_rate": 4.9949391539837606e-05, "loss": 0.0876, "num_input_tokens_seen": 13852848, "step": 14495 }, { "epoch": 1.18280447018517, "grad_norm": 0.08665364980697632, "learning_rate": 4.994916492734638e-05, "loss": 0.0144, "num_input_tokens_seen": 13858080, "step": 14500 }, { "epoch": 1.1832123337955787, "grad_norm": 0.13318859040737152, "learning_rate": 4.994893780914618e-05, "loss": 0.5067, "num_input_tokens_seen": 13862736, "step": 14505 }, { "epoch": 1.1836201974059875, "grad_norm": 13.126238822937012, "learning_rate": 4.9948710185241605e-05, "loss": 0.3147, "num_input_tokens_seen": 13867296, "step": 14510 }, { "epoch": 1.184028061016396, "grad_norm": 0.15153345465660095, "learning_rate": 4.994848205563728e-05, "loss": 0.0139, "num_input_tokens_seen": 13871568, "step": 14515 }, { "epoch": 1.1844359246268048, "grad_norm": 0.05782618746161461, "learning_rate": 4.994825342033782e-05, "loss": 0.3547, "num_input_tokens_seen": 13875664, "step": 14520 }, { "epoch": 1.1848437882372136, "grad_norm": 2.1936168670654297, "learning_rate": 4.994802427934787e-05, "loss": 0.4555, "num_input_tokens_seen": 13880928, "step": 14525 }, { "epoch": 1.1852516518476222, "grad_norm": 0.3759691119194031, "learning_rate": 4.994779463267207e-05, "loss": 0.2037, "num_input_tokens_seen": 13885440, "step": 14530 }, { "epoch": 1.1856595154580307, "grad_norm": 2.149742364883423, "learning_rate": 4.9947564480315065e-05, "loss": 0.1133, "num_input_tokens_seen": 13890512, "step": 14535 }, { "epoch": 1.1860673790684395, "grad_norm": 0.06611920893192291, "learning_rate": 4.994733382228154e-05, "loss": 0.535, "num_input_tokens_seen": 13894960, "step": 14540 }, { "epoch": 1.1864752426788483, "grad_norm": 0.20024915039539337, "learning_rate": 4.9947102658576154e-05, "loss": 0.2183, "num_input_tokens_seen": 13900224, "step": 14545 }, { "epoch": 1.1868831062892569, "grad_norm": 12.17383861541748, "learning_rate": 4.99468709892036e-05, "loss": 0.2928, "num_input_tokens_seen": 13905392, "step": 14550 }, { "epoch": 1.1872909698996654, "grad_norm": 0.16942167282104492, "learning_rate": 4.994663881416857e-05, "loss": 0.308, "num_input_tokens_seen": 13911264, "step": 14555 }, { "epoch": 1.1876988335100742, "grad_norm": 0.34323903918266296, "learning_rate": 4.9946406133475774e-05, "loss": 0.1189, "num_input_tokens_seen": 13916816, "step": 14560 }, { "epoch": 1.188106697120483, "grad_norm": 25.74456214904785, "learning_rate": 4.9946172947129924e-05, "loss": 0.2101, "num_input_tokens_seen": 13920944, "step": 14565 }, { "epoch": 1.1885145607308916, "grad_norm": 0.1024784967303276, "learning_rate": 4.994593925513574e-05, "loss": 0.1781, "num_input_tokens_seen": 13926416, "step": 14570 }, { "epoch": 1.1889224243413004, "grad_norm": 0.530659556388855, "learning_rate": 4.994570505749798e-05, "loss": 0.252, "num_input_tokens_seen": 13931872, "step": 14575 }, { "epoch": 1.189330287951709, "grad_norm": 9.04384994506836, "learning_rate": 4.994547035422137e-05, "loss": 0.5343, "num_input_tokens_seen": 13936112, "step": 14580 }, { "epoch": 1.1897381515621177, "grad_norm": 3.1695759296417236, "learning_rate": 4.9945235145310684e-05, "loss": 0.3383, "num_input_tokens_seen": 13940352, "step": 14585 }, { "epoch": 1.1901460151725263, "grad_norm": 7.218469619750977, "learning_rate": 4.9944999430770675e-05, "loss": 0.3583, "num_input_tokens_seen": 13945216, "step": 14590 }, { "epoch": 1.190553878782935, "grad_norm": 6.059757709503174, "learning_rate": 4.9944763210606125e-05, "loss": 0.1513, "num_input_tokens_seen": 13950256, "step": 14595 }, { "epoch": 1.1909617423933436, "grad_norm": 0.03137647360563278, "learning_rate": 4.994452648482183e-05, "loss": 0.1458, "num_input_tokens_seen": 13954416, "step": 14600 }, { "epoch": 1.1913696060037524, "grad_norm": 4.621180057525635, "learning_rate": 4.994428925342258e-05, "loss": 0.5763, "num_input_tokens_seen": 13958720, "step": 14605 }, { "epoch": 1.191777469614161, "grad_norm": 2.7878870964050293, "learning_rate": 4.9944051516413194e-05, "loss": 0.2648, "num_input_tokens_seen": 13962960, "step": 14610 }, { "epoch": 1.1921853332245698, "grad_norm": 0.4009498357772827, "learning_rate": 4.994381327379848e-05, "loss": 0.3409, "num_input_tokens_seen": 13967312, "step": 14615 }, { "epoch": 1.1925931968349783, "grad_norm": 12.186923027038574, "learning_rate": 4.994357452558326e-05, "loss": 0.1827, "num_input_tokens_seen": 13972864, "step": 14620 }, { "epoch": 1.1930010604453871, "grad_norm": 7.995589256286621, "learning_rate": 4.9943335271772394e-05, "loss": 0.3016, "num_input_tokens_seen": 13977808, "step": 14625 }, { "epoch": 1.1934089240557957, "grad_norm": 0.4867385923862457, "learning_rate": 4.994309551237072e-05, "loss": 0.1366, "num_input_tokens_seen": 13983200, "step": 14630 }, { "epoch": 1.1938167876662045, "grad_norm": 2.3559670448303223, "learning_rate": 4.99428552473831e-05, "loss": 0.3333, "num_input_tokens_seen": 13988832, "step": 14635 }, { "epoch": 1.194224651276613, "grad_norm": 5.971883773803711, "learning_rate": 4.99426144768144e-05, "loss": 0.4691, "num_input_tokens_seen": 13994128, "step": 14640 }, { "epoch": 1.1946325148870218, "grad_norm": 0.2965707778930664, "learning_rate": 4.99423732006695e-05, "loss": 0.1072, "num_input_tokens_seen": 13998416, "step": 14645 }, { "epoch": 1.1950403784974304, "grad_norm": 0.12357013672590256, "learning_rate": 4.99421314189533e-05, "loss": 0.0311, "num_input_tokens_seen": 14003200, "step": 14650 }, { "epoch": 1.1954482421078392, "grad_norm": 0.4622859060764313, "learning_rate": 4.9941889131670696e-05, "loss": 0.0591, "num_input_tokens_seen": 14007856, "step": 14655 }, { "epoch": 1.1958561057182477, "grad_norm": 8.217857360839844, "learning_rate": 4.9941646338826585e-05, "loss": 0.1286, "num_input_tokens_seen": 14012976, "step": 14660 }, { "epoch": 1.1962639693286565, "grad_norm": 0.0572112575173378, "learning_rate": 4.994140304042592e-05, "loss": 0.4185, "num_input_tokens_seen": 14018096, "step": 14665 }, { "epoch": 1.196671832939065, "grad_norm": 4.848361492156982, "learning_rate": 4.99411592364736e-05, "loss": 0.3235, "num_input_tokens_seen": 14023056, "step": 14670 }, { "epoch": 1.1970796965494739, "grad_norm": 0.05272354185581207, "learning_rate": 4.994091492697458e-05, "loss": 0.1769, "num_input_tokens_seen": 14027232, "step": 14675 }, { "epoch": 1.1974875601598824, "grad_norm": 0.019823875278234482, "learning_rate": 4.994067011193382e-05, "loss": 0.0037, "num_input_tokens_seen": 14032576, "step": 14680 }, { "epoch": 1.1978954237702912, "grad_norm": 0.04347431659698486, "learning_rate": 4.9940424791356265e-05, "loss": 0.2047, "num_input_tokens_seen": 14037152, "step": 14685 }, { "epoch": 1.1983032873806998, "grad_norm": 0.47989675402641296, "learning_rate": 4.994017896524691e-05, "loss": 0.2755, "num_input_tokens_seen": 14042448, "step": 14690 }, { "epoch": 1.1987111509911086, "grad_norm": 0.02049754373729229, "learning_rate": 4.993993263361071e-05, "loss": 0.004, "num_input_tokens_seen": 14047232, "step": 14695 }, { "epoch": 1.1991190146015174, "grad_norm": 0.03180721029639244, "learning_rate": 4.9939685796452684e-05, "loss": 0.0038, "num_input_tokens_seen": 14051760, "step": 14700 }, { "epoch": 1.199526878211926, "grad_norm": 11.34609317779541, "learning_rate": 4.993943845377782e-05, "loss": 0.1787, "num_input_tokens_seen": 14056608, "step": 14705 }, { "epoch": 1.1999347418223345, "grad_norm": 6.747633457183838, "learning_rate": 4.993919060559114e-05, "loss": 0.1977, "num_input_tokens_seen": 14061264, "step": 14710 }, { "epoch": 1.2003426054327433, "grad_norm": 9.585359573364258, "learning_rate": 4.993894225189766e-05, "loss": 0.2933, "num_input_tokens_seen": 14065312, "step": 14715 }, { "epoch": 1.200750469043152, "grad_norm": 0.15402065217494965, "learning_rate": 4.9938693392702415e-05, "loss": 0.173, "num_input_tokens_seen": 14070544, "step": 14720 }, { "epoch": 1.2011583326535606, "grad_norm": 31.234590530395508, "learning_rate": 4.993844402801047e-05, "loss": 0.2825, "num_input_tokens_seen": 14074880, "step": 14725 }, { "epoch": 1.2015661962639692, "grad_norm": 0.07639284431934357, "learning_rate": 4.993819415782684e-05, "loss": 0.2601, "num_input_tokens_seen": 14079776, "step": 14730 }, { "epoch": 1.201974059874378, "grad_norm": 0.18120424449443817, "learning_rate": 4.993794378215662e-05, "loss": 0.0076, "num_input_tokens_seen": 14083600, "step": 14735 }, { "epoch": 1.2023819234847868, "grad_norm": 0.5687342882156372, "learning_rate": 4.9937692901004876e-05, "loss": 0.7607, "num_input_tokens_seen": 14088784, "step": 14740 }, { "epoch": 1.2027897870951954, "grad_norm": 0.4160524904727936, "learning_rate": 4.9937441514376694e-05, "loss": 0.2621, "num_input_tokens_seen": 14093904, "step": 14745 }, { "epoch": 1.2031976507056041, "grad_norm": 0.18296599388122559, "learning_rate": 4.993718962227717e-05, "loss": 0.0077, "num_input_tokens_seen": 14098944, "step": 14750 }, { "epoch": 1.2036055143160127, "grad_norm": 0.1978035718202591, "learning_rate": 4.9936937224711404e-05, "loss": 0.0076, "num_input_tokens_seen": 14103152, "step": 14755 }, { "epoch": 1.2040133779264215, "grad_norm": 0.16785737872123718, "learning_rate": 4.993668432168452e-05, "loss": 0.1691, "num_input_tokens_seen": 14107952, "step": 14760 }, { "epoch": 1.20442124153683, "grad_norm": 6.329675674438477, "learning_rate": 4.993643091320164e-05, "loss": 0.364, "num_input_tokens_seen": 14113008, "step": 14765 }, { "epoch": 1.2048291051472388, "grad_norm": 4.479642868041992, "learning_rate": 4.99361769992679e-05, "loss": 0.3059, "num_input_tokens_seen": 14117168, "step": 14770 }, { "epoch": 1.2052369687576474, "grad_norm": 1.3277685642242432, "learning_rate": 4.993592257988845e-05, "loss": 0.1589, "num_input_tokens_seen": 14122112, "step": 14775 }, { "epoch": 1.2056448323680562, "grad_norm": 0.03255479782819748, "learning_rate": 4.9935667655068454e-05, "loss": 0.4674, "num_input_tokens_seen": 14126912, "step": 14780 }, { "epoch": 1.2060526959784648, "grad_norm": 0.3010357618331909, "learning_rate": 4.9935412224813063e-05, "loss": 0.0225, "num_input_tokens_seen": 14132592, "step": 14785 }, { "epoch": 1.2064605595888735, "grad_norm": 0.1829637587070465, "learning_rate": 4.9935156289127455e-05, "loss": 0.0118, "num_input_tokens_seen": 14137440, "step": 14790 }, { "epoch": 1.2068684231992821, "grad_norm": 0.22747229039669037, "learning_rate": 4.993489984801683e-05, "loss": 0.0074, "num_input_tokens_seen": 14142816, "step": 14795 }, { "epoch": 1.207276286809691, "grad_norm": 0.040762223303318024, "learning_rate": 4.993464290148638e-05, "loss": 0.0778, "num_input_tokens_seen": 14147936, "step": 14800 }, { "epoch": 1.2076841504200995, "grad_norm": 0.023244984447956085, "learning_rate": 4.9934385449541324e-05, "loss": 0.0162, "num_input_tokens_seen": 14152880, "step": 14805 }, { "epoch": 1.2080920140305083, "grad_norm": 0.12600384652614594, "learning_rate": 4.993412749218686e-05, "loss": 0.4722, "num_input_tokens_seen": 14157040, "step": 14810 }, { "epoch": 1.2084998776409168, "grad_norm": 0.01811780408024788, "learning_rate": 4.9933869029428223e-05, "loss": 0.655, "num_input_tokens_seen": 14162208, "step": 14815 }, { "epoch": 1.2089077412513256, "grad_norm": 4.622838497161865, "learning_rate": 4.993361006127066e-05, "loss": 0.7837, "num_input_tokens_seen": 14166896, "step": 14820 }, { "epoch": 1.2093156048617342, "grad_norm": 0.09135965257883072, "learning_rate": 4.9933350587719416e-05, "loss": 0.0067, "num_input_tokens_seen": 14171856, "step": 14825 }, { "epoch": 1.209723468472143, "grad_norm": 0.026861120015382767, "learning_rate": 4.9933090608779754e-05, "loss": 0.0543, "num_input_tokens_seen": 14176960, "step": 14830 }, { "epoch": 1.2101313320825515, "grad_norm": 4.773749828338623, "learning_rate": 4.9932830124456936e-05, "loss": 0.0146, "num_input_tokens_seen": 14181184, "step": 14835 }, { "epoch": 1.2105391956929603, "grad_norm": 1.3329427242279053, "learning_rate": 4.9932569134756246e-05, "loss": 0.1344, "num_input_tokens_seen": 14186448, "step": 14840 }, { "epoch": 1.2109470593033689, "grad_norm": 0.15270355343818665, "learning_rate": 4.993230763968298e-05, "loss": 0.1217, "num_input_tokens_seen": 14191680, "step": 14845 }, { "epoch": 1.2113549229137777, "grad_norm": 14.394495010375977, "learning_rate": 4.993204563924243e-05, "loss": 0.4928, "num_input_tokens_seen": 14196704, "step": 14850 }, { "epoch": 1.2117627865241862, "grad_norm": 0.03327140212059021, "learning_rate": 4.99317831334399e-05, "loss": 0.613, "num_input_tokens_seen": 14201360, "step": 14855 }, { "epoch": 1.212170650134595, "grad_norm": 0.1110139712691307, "learning_rate": 4.993152012228073e-05, "loss": 0.1206, "num_input_tokens_seen": 14206208, "step": 14860 }, { "epoch": 1.2125785137450036, "grad_norm": 0.158119797706604, "learning_rate": 4.9931256605770236e-05, "loss": 0.0121, "num_input_tokens_seen": 14211536, "step": 14865 }, { "epoch": 1.2129863773554124, "grad_norm": 0.40901607275009155, "learning_rate": 4.993099258391377e-05, "loss": 0.4638, "num_input_tokens_seen": 14216400, "step": 14870 }, { "epoch": 1.213394240965821, "grad_norm": 0.09389742463827133, "learning_rate": 4.993072805671667e-05, "loss": 0.0149, "num_input_tokens_seen": 14221424, "step": 14875 }, { "epoch": 1.2138021045762297, "grad_norm": 2.5878212451934814, "learning_rate": 4.993046302418431e-05, "loss": 0.3468, "num_input_tokens_seen": 14225680, "step": 14880 }, { "epoch": 1.2142099681866383, "grad_norm": 0.27245062589645386, "learning_rate": 4.9930197486322066e-05, "loss": 0.0582, "num_input_tokens_seen": 14230064, "step": 14885 }, { "epoch": 1.214617831797047, "grad_norm": 0.5677114129066467, "learning_rate": 4.9929931443135304e-05, "loss": 0.1786, "num_input_tokens_seen": 14235984, "step": 14890 }, { "epoch": 1.2150256954074559, "grad_norm": 2.695666551589966, "learning_rate": 4.992966489462942e-05, "loss": 0.171, "num_input_tokens_seen": 14240832, "step": 14895 }, { "epoch": 1.2154335590178644, "grad_norm": 2.12855863571167, "learning_rate": 4.992939784080983e-05, "loss": 0.5011, "num_input_tokens_seen": 14246048, "step": 14900 }, { "epoch": 1.215841422628273, "grad_norm": 0.1257176697254181, "learning_rate": 4.992913028168194e-05, "loss": 0.1278, "num_input_tokens_seen": 14250800, "step": 14905 }, { "epoch": 1.2162492862386818, "grad_norm": 0.049760330468416214, "learning_rate": 4.992886221725117e-05, "loss": 0.0327, "num_input_tokens_seen": 14255920, "step": 14910 }, { "epoch": 1.2166571498490906, "grad_norm": 0.027462847530841827, "learning_rate": 4.992859364752296e-05, "loss": 0.0482, "num_input_tokens_seen": 14260880, "step": 14915 }, { "epoch": 1.2170650134594991, "grad_norm": 0.24916264414787292, "learning_rate": 4.9928324572502745e-05, "loss": 0.242, "num_input_tokens_seen": 14265856, "step": 14920 }, { "epoch": 1.2174728770699077, "grad_norm": 0.22111889719963074, "learning_rate": 4.9928054992195985e-05, "loss": 0.3528, "num_input_tokens_seen": 14270672, "step": 14925 }, { "epoch": 1.2178807406803165, "grad_norm": 0.03504597023129463, "learning_rate": 4.992778490660814e-05, "loss": 0.2032, "num_input_tokens_seen": 14275280, "step": 14930 }, { "epoch": 1.2182886042907253, "grad_norm": 0.0953364148736, "learning_rate": 4.992751431574469e-05, "loss": 0.3066, "num_input_tokens_seen": 14279968, "step": 14935 }, { "epoch": 1.2186964679011338, "grad_norm": 3.220334529876709, "learning_rate": 4.9927243219611116e-05, "loss": 0.3884, "num_input_tokens_seen": 14284592, "step": 14940 }, { "epoch": 1.2191043315115426, "grad_norm": 0.2405054122209549, "learning_rate": 4.992697161821291e-05, "loss": 0.1694, "num_input_tokens_seen": 14288992, "step": 14945 }, { "epoch": 1.2195121951219512, "grad_norm": 1.2004899978637695, "learning_rate": 4.992669951155559e-05, "loss": 0.0974, "num_input_tokens_seen": 14293408, "step": 14950 }, { "epoch": 1.21992005873236, "grad_norm": 7.0467634201049805, "learning_rate": 4.992642689964466e-05, "loss": 0.3492, "num_input_tokens_seen": 14297264, "step": 14955 }, { "epoch": 1.2203279223427685, "grad_norm": 0.0807182639837265, "learning_rate": 4.992615378248564e-05, "loss": 0.1917, "num_input_tokens_seen": 14302544, "step": 14960 }, { "epoch": 1.2207357859531773, "grad_norm": 0.3893492519855499, "learning_rate": 4.9925880160084085e-05, "loss": 0.028, "num_input_tokens_seen": 14306992, "step": 14965 }, { "epoch": 1.221143649563586, "grad_norm": 10.966254234313965, "learning_rate": 4.992560603244553e-05, "loss": 0.1397, "num_input_tokens_seen": 14311696, "step": 14970 }, { "epoch": 1.2215515131739947, "grad_norm": 0.1276102066040039, "learning_rate": 4.992533139957553e-05, "loss": 0.1, "num_input_tokens_seen": 14315920, "step": 14975 }, { "epoch": 1.2219593767844033, "grad_norm": 0.018186166882514954, "learning_rate": 4.9925056261479654e-05, "loss": 0.0287, "num_input_tokens_seen": 14320752, "step": 14980 }, { "epoch": 1.222367240394812, "grad_norm": 0.0544610470533371, "learning_rate": 4.992478061816348e-05, "loss": 0.5176, "num_input_tokens_seen": 14324912, "step": 14985 }, { "epoch": 1.2227751040052206, "grad_norm": 31.686906814575195, "learning_rate": 4.992450446963259e-05, "loss": 0.3355, "num_input_tokens_seen": 14329856, "step": 14990 }, { "epoch": 1.2231829676156294, "grad_norm": 1.003019094467163, "learning_rate": 4.992422781589259e-05, "loss": 0.2115, "num_input_tokens_seen": 14334832, "step": 14995 }, { "epoch": 1.223590831226038, "grad_norm": 0.0799194946885109, "learning_rate": 4.992395065694908e-05, "loss": 0.1886, "num_input_tokens_seen": 14339632, "step": 15000 }, { "epoch": 1.2239986948364467, "grad_norm": 0.10769493132829666, "learning_rate": 4.992367299280768e-05, "loss": 0.1229, "num_input_tokens_seen": 14345056, "step": 15005 }, { "epoch": 1.2244065584468553, "grad_norm": 0.10617868602275848, "learning_rate": 4.992339482347402e-05, "loss": 0.1025, "num_input_tokens_seen": 14350336, "step": 15010 }, { "epoch": 1.224814422057264, "grad_norm": 0.19378195703029633, "learning_rate": 4.992311614895374e-05, "loss": 0.3569, "num_input_tokens_seen": 14355280, "step": 15015 }, { "epoch": 1.2252222856676727, "grad_norm": 13.930987358093262, "learning_rate": 4.9922836969252484e-05, "loss": 0.2002, "num_input_tokens_seen": 14359168, "step": 15020 }, { "epoch": 1.2256301492780814, "grad_norm": 7.384514808654785, "learning_rate": 4.9922557284375914e-05, "loss": 0.3748, "num_input_tokens_seen": 14364048, "step": 15025 }, { "epoch": 1.22603801288849, "grad_norm": 221.18580627441406, "learning_rate": 4.99222770943297e-05, "loss": 0.1117, "num_input_tokens_seen": 14369152, "step": 15030 }, { "epoch": 1.2264458764988988, "grad_norm": 1.9508862495422363, "learning_rate": 4.992199639911952e-05, "loss": 0.2035, "num_input_tokens_seen": 14373280, "step": 15035 }, { "epoch": 1.2268537401093074, "grad_norm": 0.12514296174049377, "learning_rate": 4.9921715198751065e-05, "loss": 0.0175, "num_input_tokens_seen": 14378272, "step": 15040 }, { "epoch": 1.2272616037197162, "grad_norm": 0.09459799528121948, "learning_rate": 4.992143349323002e-05, "loss": 0.035, "num_input_tokens_seen": 14383392, "step": 15045 }, { "epoch": 1.2276694673301247, "grad_norm": 0.06357795745134354, "learning_rate": 4.992115128256212e-05, "loss": 0.0863, "num_input_tokens_seen": 14387568, "step": 15050 }, { "epoch": 1.2280773309405335, "grad_norm": 0.0546521432697773, "learning_rate": 4.992086856675306e-05, "loss": 0.1864, "num_input_tokens_seen": 14392368, "step": 15055 }, { "epoch": 1.228485194550942, "grad_norm": 0.057118527591228485, "learning_rate": 4.9920585345808604e-05, "loss": 0.0331, "num_input_tokens_seen": 14396544, "step": 15060 }, { "epoch": 1.2288930581613509, "grad_norm": 0.1513623744249344, "learning_rate": 4.9920301619734454e-05, "loss": 0.0436, "num_input_tokens_seen": 14401104, "step": 15065 }, { "epoch": 1.2293009217717596, "grad_norm": 7.89931583404541, "learning_rate": 4.992001738853639e-05, "loss": 0.3973, "num_input_tokens_seen": 14406176, "step": 15070 }, { "epoch": 1.2297087853821682, "grad_norm": 0.08036282658576965, "learning_rate": 4.9919732652220164e-05, "loss": 0.1626, "num_input_tokens_seen": 14410080, "step": 15075 }, { "epoch": 1.2301166489925768, "grad_norm": 6.183897972106934, "learning_rate": 4.9919447410791535e-05, "loss": 0.3496, "num_input_tokens_seen": 14415328, "step": 15080 }, { "epoch": 1.2305245126029856, "grad_norm": 0.10162230581045151, "learning_rate": 4.991916166425631e-05, "loss": 0.1966, "num_input_tokens_seen": 14420480, "step": 15085 }, { "epoch": 1.2309323762133944, "grad_norm": 0.11017423123121262, "learning_rate": 4.991887541262026e-05, "loss": 0.4087, "num_input_tokens_seen": 14424960, "step": 15090 }, { "epoch": 1.231340239823803, "grad_norm": 0.044613227248191833, "learning_rate": 4.991858865588919e-05, "loss": 0.2535, "num_input_tokens_seen": 14429888, "step": 15095 }, { "epoch": 1.2317481034342115, "grad_norm": 0.12825489044189453, "learning_rate": 4.991830139406892e-05, "loss": 0.6467, "num_input_tokens_seen": 14434256, "step": 15100 }, { "epoch": 1.2321559670446203, "grad_norm": 0.025837861001491547, "learning_rate": 4.9918013627165265e-05, "loss": 0.0208, "num_input_tokens_seen": 14440304, "step": 15105 }, { "epoch": 1.232563830655029, "grad_norm": 19.044923782348633, "learning_rate": 4.991772535518407e-05, "loss": 0.3465, "num_input_tokens_seen": 14444336, "step": 15110 }, { "epoch": 1.2329716942654376, "grad_norm": 6.552525520324707, "learning_rate": 4.991743657813117e-05, "loss": 0.3177, "num_input_tokens_seen": 14448864, "step": 15115 }, { "epoch": 1.2333795578758464, "grad_norm": 0.012803667224943638, "learning_rate": 4.991714729601241e-05, "loss": 0.4899, "num_input_tokens_seen": 14453728, "step": 15120 }, { "epoch": 1.233787421486255, "grad_norm": 0.07806868106126785, "learning_rate": 4.991685750883367e-05, "loss": 0.312, "num_input_tokens_seen": 14458992, "step": 15125 }, { "epoch": 1.2341952850966638, "grad_norm": 13.435542106628418, "learning_rate": 4.991656721660081e-05, "loss": 0.107, "num_input_tokens_seen": 14464048, "step": 15130 }, { "epoch": 1.2346031487070723, "grad_norm": 0.06592865288257599, "learning_rate": 4.9916276419319726e-05, "loss": 0.2079, "num_input_tokens_seen": 14468592, "step": 15135 }, { "epoch": 1.2350110123174811, "grad_norm": 0.06505356729030609, "learning_rate": 4.99159851169963e-05, "loss": 0.2636, "num_input_tokens_seen": 14473072, "step": 15140 }, { "epoch": 1.2354188759278897, "grad_norm": 0.14721691608428955, "learning_rate": 4.991569330963645e-05, "loss": 0.0494, "num_input_tokens_seen": 14478560, "step": 15145 }, { "epoch": 1.2358267395382985, "grad_norm": 0.0822732150554657, "learning_rate": 4.991540099724608e-05, "loss": 0.0143, "num_input_tokens_seen": 14483680, "step": 15150 }, { "epoch": 1.236234603148707, "grad_norm": 9.019136428833008, "learning_rate": 4.991510817983112e-05, "loss": 0.143, "num_input_tokens_seen": 14488496, "step": 15155 }, { "epoch": 1.2366424667591158, "grad_norm": 7.933577060699463, "learning_rate": 4.991481485739751e-05, "loss": 0.3435, "num_input_tokens_seen": 14493072, "step": 15160 }, { "epoch": 1.2370503303695244, "grad_norm": 0.02093414217233658, "learning_rate": 4.9914521029951175e-05, "loss": 0.1626, "num_input_tokens_seen": 14498480, "step": 15165 }, { "epoch": 1.2374581939799332, "grad_norm": 0.024739008396863937, "learning_rate": 4.99142266974981e-05, "loss": 0.2197, "num_input_tokens_seen": 14502736, "step": 15170 }, { "epoch": 1.2378660575903417, "grad_norm": 3.1536407470703125, "learning_rate": 4.9913931860044235e-05, "loss": 0.4174, "num_input_tokens_seen": 14507936, "step": 15175 }, { "epoch": 1.2382739212007505, "grad_norm": 1.6286276578903198, "learning_rate": 4.991363651759555e-05, "loss": 0.5394, "num_input_tokens_seen": 14512528, "step": 15180 }, { "epoch": 1.238681784811159, "grad_norm": 5.728386878967285, "learning_rate": 4.9913340670158046e-05, "loss": 0.1467, "num_input_tokens_seen": 14517120, "step": 15185 }, { "epoch": 1.2390896484215679, "grad_norm": 11.579188346862793, "learning_rate": 4.9913044317737713e-05, "loss": 0.4916, "num_input_tokens_seen": 14522112, "step": 15190 }, { "epoch": 1.2394975120319764, "grad_norm": 0.4182945787906647, "learning_rate": 4.9912747460340556e-05, "loss": 0.0483, "num_input_tokens_seen": 14527152, "step": 15195 }, { "epoch": 1.2399053756423852, "grad_norm": 0.13959293067455292, "learning_rate": 4.991245009797259e-05, "loss": 0.0415, "num_input_tokens_seen": 14531600, "step": 15200 }, { "epoch": 1.2403132392527938, "grad_norm": 0.08441486954689026, "learning_rate": 4.991215223063985e-05, "loss": 0.0345, "num_input_tokens_seen": 14536304, "step": 15205 }, { "epoch": 1.2407211028632026, "grad_norm": 0.6812829375267029, "learning_rate": 4.991185385834837e-05, "loss": 0.5752, "num_input_tokens_seen": 14540576, "step": 15210 }, { "epoch": 1.2411289664736112, "grad_norm": 11.45086669921875, "learning_rate": 4.9911554981104194e-05, "loss": 0.4034, "num_input_tokens_seen": 14544832, "step": 15215 }, { "epoch": 1.24153683008402, "grad_norm": 45.51823425292969, "learning_rate": 4.991125559891339e-05, "loss": 0.2706, "num_input_tokens_seen": 14549680, "step": 15220 }, { "epoch": 1.2419446936944285, "grad_norm": 7.479496002197266, "learning_rate": 4.991095571178202e-05, "loss": 0.1671, "num_input_tokens_seen": 14553776, "step": 15225 }, { "epoch": 1.2423525573048373, "grad_norm": 0.30978405475616455, "learning_rate": 4.991065531971616e-05, "loss": 0.0959, "num_input_tokens_seen": 14557648, "step": 15230 }, { "epoch": 1.2427604209152459, "grad_norm": 3.350783586502075, "learning_rate": 4.9910354422721906e-05, "loss": 0.3021, "num_input_tokens_seen": 14563120, "step": 15235 }, { "epoch": 1.2431682845256546, "grad_norm": 0.34359580278396606, "learning_rate": 4.9910053020805345e-05, "loss": 0.2574, "num_input_tokens_seen": 14567632, "step": 15240 }, { "epoch": 1.2435761481360632, "grad_norm": 0.06819593906402588, "learning_rate": 4.99097511139726e-05, "loss": 0.2988, "num_input_tokens_seen": 14572400, "step": 15245 }, { "epoch": 1.243984011746472, "grad_norm": 22.129947662353516, "learning_rate": 4.990944870222979e-05, "loss": 0.2537, "num_input_tokens_seen": 14576752, "step": 15250 }, { "epoch": 1.2443918753568806, "grad_norm": 0.0435405857861042, "learning_rate": 4.9909145785583034e-05, "loss": 0.1907, "num_input_tokens_seen": 14580704, "step": 15255 }, { "epoch": 1.2447997389672893, "grad_norm": 18.079364776611328, "learning_rate": 4.990884236403847e-05, "loss": 0.2484, "num_input_tokens_seen": 14585488, "step": 15260 }, { "epoch": 1.2452076025776981, "grad_norm": 1.089681625366211, "learning_rate": 4.990853843760227e-05, "loss": 0.015, "num_input_tokens_seen": 14590256, "step": 15265 }, { "epoch": 1.2456154661881067, "grad_norm": 11.531121253967285, "learning_rate": 4.990823400628057e-05, "loss": 0.2275, "num_input_tokens_seen": 14595648, "step": 15270 }, { "epoch": 1.2460233297985153, "grad_norm": 0.05543415620923042, "learning_rate": 4.990792907007955e-05, "loss": 0.0161, "num_input_tokens_seen": 14600720, "step": 15275 }, { "epoch": 1.246431193408924, "grad_norm": 1.564185619354248, "learning_rate": 4.99076236290054e-05, "loss": 0.1972, "num_input_tokens_seen": 14605680, "step": 15280 }, { "epoch": 1.2468390570193328, "grad_norm": 0.09020987898111343, "learning_rate": 4.990731768306429e-05, "loss": 0.2171, "num_input_tokens_seen": 14610560, "step": 15285 }, { "epoch": 1.2472469206297414, "grad_norm": 11.505433082580566, "learning_rate": 4.990701123226244e-05, "loss": 0.0363, "num_input_tokens_seen": 14615888, "step": 15290 }, { "epoch": 1.24765478424015, "grad_norm": 0.051122311502695084, "learning_rate": 4.990670427660606e-05, "loss": 0.0122, "num_input_tokens_seen": 14620800, "step": 15295 }, { "epoch": 1.2480626478505588, "grad_norm": 0.05754733830690384, "learning_rate": 4.9906396816101374e-05, "loss": 0.3368, "num_input_tokens_seen": 14626224, "step": 15300 }, { "epoch": 1.2484705114609675, "grad_norm": 0.09695638716220856, "learning_rate": 4.99060888507546e-05, "loss": 0.2567, "num_input_tokens_seen": 14630864, "step": 15305 }, { "epoch": 1.2488783750713761, "grad_norm": 8.47470760345459, "learning_rate": 4.990578038057199e-05, "loss": 0.2689, "num_input_tokens_seen": 14634864, "step": 15310 }, { "epoch": 1.249286238681785, "grad_norm": 0.13872858881950378, "learning_rate": 4.990547140555979e-05, "loss": 0.1663, "num_input_tokens_seen": 14639712, "step": 15315 }, { "epoch": 1.2496941022921935, "grad_norm": 0.14473657310009003, "learning_rate": 4.990516192572428e-05, "loss": 0.2573, "num_input_tokens_seen": 14644000, "step": 15320 }, { "epoch": 1.2501019659026023, "grad_norm": 0.10549650341272354, "learning_rate": 4.990485194107171e-05, "loss": 0.1861, "num_input_tokens_seen": 14648944, "step": 15325 }, { "epoch": 1.2505098295130108, "grad_norm": 45.533138275146484, "learning_rate": 4.9904541451608364e-05, "loss": 0.5744, "num_input_tokens_seen": 14653840, "step": 15330 }, { "epoch": 1.2509176931234196, "grad_norm": 0.31566378474235535, "learning_rate": 4.9904230457340564e-05, "loss": 0.3789, "num_input_tokens_seen": 14659680, "step": 15335 }, { "epoch": 1.2513255567338282, "grad_norm": 7.316086769104004, "learning_rate": 4.990391895827459e-05, "loss": 0.5003, "num_input_tokens_seen": 14664528, "step": 15340 }, { "epoch": 1.251733420344237, "grad_norm": 0.46645960211753845, "learning_rate": 4.990360695441676e-05, "loss": 0.242, "num_input_tokens_seen": 14669808, "step": 15345 }, { "epoch": 1.2521412839546455, "grad_norm": 0.23586684465408325, "learning_rate": 4.990329444577339e-05, "loss": 0.1398, "num_input_tokens_seen": 14673744, "step": 15350 }, { "epoch": 1.2525491475650543, "grad_norm": 10.784097671508789, "learning_rate": 4.990298143235084e-05, "loss": 0.0898, "num_input_tokens_seen": 14678928, "step": 15355 }, { "epoch": 1.2529570111754629, "grad_norm": 0.872157871723175, "learning_rate": 4.9902667914155425e-05, "loss": 0.1904, "num_input_tokens_seen": 14683584, "step": 15360 }, { "epoch": 1.2533648747858717, "grad_norm": 0.31459498405456543, "learning_rate": 4.990235389119352e-05, "loss": 0.0867, "num_input_tokens_seen": 14687760, "step": 15365 }, { "epoch": 1.2537727383962802, "grad_norm": 0.20178015530109406, "learning_rate": 4.990203936347148e-05, "loss": 0.0398, "num_input_tokens_seen": 14693056, "step": 15370 }, { "epoch": 1.254180602006689, "grad_norm": 1.698847770690918, "learning_rate": 4.9901724330995684e-05, "loss": 0.2904, "num_input_tokens_seen": 14698496, "step": 15375 }, { "epoch": 1.2545884656170976, "grad_norm": 0.20906345546245575, "learning_rate": 4.9901408793772517e-05, "loss": 0.2894, "num_input_tokens_seen": 14703632, "step": 15380 }, { "epoch": 1.2549963292275064, "grad_norm": 2.1284892559051514, "learning_rate": 4.990109275180838e-05, "loss": 0.1952, "num_input_tokens_seen": 14708896, "step": 15385 }, { "epoch": 1.255404192837915, "grad_norm": 0.17349405586719513, "learning_rate": 4.990077620510967e-05, "loss": 0.0176, "num_input_tokens_seen": 14713040, "step": 15390 }, { "epoch": 1.2558120564483237, "grad_norm": 5.8276543617248535, "learning_rate": 4.990045915368281e-05, "loss": 0.046, "num_input_tokens_seen": 14718640, "step": 15395 }, { "epoch": 1.2562199200587323, "grad_norm": 0.05859418958425522, "learning_rate": 4.9900141597534215e-05, "loss": 0.2789, "num_input_tokens_seen": 14723200, "step": 15400 }, { "epoch": 1.256627783669141, "grad_norm": 0.026155835017561913, "learning_rate": 4.989982353667034e-05, "loss": 0.1874, "num_input_tokens_seen": 14728032, "step": 15405 }, { "epoch": 1.2570356472795496, "grad_norm": 0.1493755578994751, "learning_rate": 4.989950497109762e-05, "loss": 0.1778, "num_input_tokens_seen": 14733008, "step": 15410 }, { "epoch": 1.2574435108899584, "grad_norm": 0.10154082626104355, "learning_rate": 4.989918590082252e-05, "loss": 0.1787, "num_input_tokens_seen": 14737184, "step": 15415 }, { "epoch": 1.2578513745003672, "grad_norm": 0.08978988230228424, "learning_rate": 4.989886632585149e-05, "loss": 0.0103, "num_input_tokens_seen": 14741856, "step": 15420 }, { "epoch": 1.2582592381107758, "grad_norm": 0.14249880611896515, "learning_rate": 4.989854624619102e-05, "loss": 0.1617, "num_input_tokens_seen": 14745632, "step": 15425 }, { "epoch": 1.2586671017211843, "grad_norm": 0.08767508715391159, "learning_rate": 4.989822566184761e-05, "loss": 0.0268, "num_input_tokens_seen": 14750192, "step": 15430 }, { "epoch": 1.2590749653315931, "grad_norm": 0.052610959857702255, "learning_rate": 4.9897904572827735e-05, "loss": 0.1214, "num_input_tokens_seen": 14755264, "step": 15435 }, { "epoch": 1.259482828942002, "grad_norm": 0.037066277116537094, "learning_rate": 4.9897582979137926e-05, "loss": 0.2438, "num_input_tokens_seen": 14760912, "step": 15440 }, { "epoch": 1.2598906925524105, "grad_norm": 0.05914777144789696, "learning_rate": 4.989726088078468e-05, "loss": 0.1652, "num_input_tokens_seen": 14765136, "step": 15445 }, { "epoch": 1.260298556162819, "grad_norm": 0.11021581292152405, "learning_rate": 4.989693827777454e-05, "loss": 0.1515, "num_input_tokens_seen": 14769664, "step": 15450 }, { "epoch": 1.2607064197732278, "grad_norm": 34.17871856689453, "learning_rate": 4.989661517011404e-05, "loss": 0.2473, "num_input_tokens_seen": 14774720, "step": 15455 }, { "epoch": 1.2611142833836366, "grad_norm": 2.939382314682007, "learning_rate": 4.989629155780972e-05, "loss": 0.149, "num_input_tokens_seen": 14778960, "step": 15460 }, { "epoch": 1.2615221469940452, "grad_norm": 0.05187418684363365, "learning_rate": 4.9895967440868155e-05, "loss": 0.0288, "num_input_tokens_seen": 14784064, "step": 15465 }, { "epoch": 1.2619300106044538, "grad_norm": 0.0634278878569603, "learning_rate": 4.989564281929592e-05, "loss": 0.2358, "num_input_tokens_seen": 14788976, "step": 15470 }, { "epoch": 1.2623378742148625, "grad_norm": 0.25181493163108826, "learning_rate": 4.989531769309957e-05, "loss": 0.0155, "num_input_tokens_seen": 14793520, "step": 15475 }, { "epoch": 1.2627457378252713, "grad_norm": 0.0391397699713707, "learning_rate": 4.9894992062285715e-05, "loss": 0.0101, "num_input_tokens_seen": 14798544, "step": 15480 }, { "epoch": 1.26315360143568, "grad_norm": 0.10399241000413895, "learning_rate": 4.9894665926860943e-05, "loss": 0.0949, "num_input_tokens_seen": 14803728, "step": 15485 }, { "epoch": 1.2635614650460885, "grad_norm": 15.394986152648926, "learning_rate": 4.989433928683188e-05, "loss": 0.073, "num_input_tokens_seen": 14808960, "step": 15490 }, { "epoch": 1.2639693286564972, "grad_norm": 1.8909049034118652, "learning_rate": 4.989401214220513e-05, "loss": 0.3978, "num_input_tokens_seen": 14813232, "step": 15495 }, { "epoch": 1.264377192266906, "grad_norm": 5.633296012878418, "learning_rate": 4.9893684492987334e-05, "loss": 0.402, "num_input_tokens_seen": 14817136, "step": 15500 }, { "epoch": 1.2647850558773146, "grad_norm": 2.9116666316986084, "learning_rate": 4.989335633918514e-05, "loss": 0.5976, "num_input_tokens_seen": 14822192, "step": 15505 }, { "epoch": 1.2651929194877232, "grad_norm": 0.1375514715909958, "learning_rate": 4.989302768080518e-05, "loss": 0.0896, "num_input_tokens_seen": 14827024, "step": 15510 }, { "epoch": 1.265600783098132, "grad_norm": 0.14360375702381134, "learning_rate": 4.989269851785413e-05, "loss": 0.0367, "num_input_tokens_seen": 14831392, "step": 15515 }, { "epoch": 1.2660086467085407, "grad_norm": 0.3594802916049957, "learning_rate": 4.989236885033866e-05, "loss": 0.0335, "num_input_tokens_seen": 14836128, "step": 15520 }, { "epoch": 1.2664165103189493, "grad_norm": 0.0705619677901268, "learning_rate": 4.9892038678265454e-05, "loss": 0.1454, "num_input_tokens_seen": 14841184, "step": 15525 }, { "epoch": 1.266824373929358, "grad_norm": 3.9363365173339844, "learning_rate": 4.989170800164119e-05, "loss": 0.2239, "num_input_tokens_seen": 14846160, "step": 15530 }, { "epoch": 1.2672322375397667, "grad_norm": 0.09241580963134766, "learning_rate": 4.989137682047259e-05, "loss": 0.0386, "num_input_tokens_seen": 14850480, "step": 15535 }, { "epoch": 1.2676401011501754, "grad_norm": 0.1895240992307663, "learning_rate": 4.989104513476636e-05, "loss": 0.0696, "num_input_tokens_seen": 14855984, "step": 15540 }, { "epoch": 1.268047964760584, "grad_norm": 0.8709878325462341, "learning_rate": 4.9890712944529224e-05, "loss": 0.0494, "num_input_tokens_seen": 14861664, "step": 15545 }, { "epoch": 1.2684558283709928, "grad_norm": 6.707047939300537, "learning_rate": 4.989038024976791e-05, "loss": 0.2481, "num_input_tokens_seen": 14866928, "step": 15550 }, { "epoch": 1.2688636919814014, "grad_norm": 5.574248313903809, "learning_rate": 4.989004705048916e-05, "loss": 0.2055, "num_input_tokens_seen": 14870768, "step": 15555 }, { "epoch": 1.2692715555918102, "grad_norm": 0.030755089595913887, "learning_rate": 4.988971334669974e-05, "loss": 0.0931, "num_input_tokens_seen": 14876400, "step": 15560 }, { "epoch": 1.2696794192022187, "grad_norm": 25.834386825561523, "learning_rate": 4.98893791384064e-05, "loss": 0.4391, "num_input_tokens_seen": 14881392, "step": 15565 }, { "epoch": 1.2700872828126275, "grad_norm": 0.06653454899787903, "learning_rate": 4.9889044425615926e-05, "loss": 0.2193, "num_input_tokens_seen": 14885984, "step": 15570 }, { "epoch": 1.270495146423036, "grad_norm": 0.17536647617816925, "learning_rate": 4.98887092083351e-05, "loss": 0.3228, "num_input_tokens_seen": 14891360, "step": 15575 }, { "epoch": 1.2709030100334449, "grad_norm": 0.046662405133247375, "learning_rate": 4.9888373486570705e-05, "loss": 0.0128, "num_input_tokens_seen": 14895904, "step": 15580 }, { "epoch": 1.2713108736438534, "grad_norm": 0.0338958315551281, "learning_rate": 4.988803726032957e-05, "loss": 0.0307, "num_input_tokens_seen": 14900160, "step": 15585 }, { "epoch": 1.2717187372542622, "grad_norm": 0.03290216252207756, "learning_rate": 4.9887700529618484e-05, "loss": 0.1765, "num_input_tokens_seen": 14904896, "step": 15590 }, { "epoch": 1.2721266008646708, "grad_norm": 0.03560012951493263, "learning_rate": 4.988736329444428e-05, "loss": 0.3589, "num_input_tokens_seen": 14909424, "step": 15595 }, { "epoch": 1.2725344644750796, "grad_norm": 0.779495358467102, "learning_rate": 4.9887025554813804e-05, "loss": 0.0143, "num_input_tokens_seen": 14914128, "step": 15600 }, { "epoch": 1.2729423280854881, "grad_norm": 0.10787390917539597, "learning_rate": 4.988668731073389e-05, "loss": 0.3604, "num_input_tokens_seen": 14918864, "step": 15605 }, { "epoch": 1.273350191695897, "grad_norm": 0.10500174760818481, "learning_rate": 4.9886348562211405e-05, "loss": 0.2323, "num_input_tokens_seen": 14923696, "step": 15610 }, { "epoch": 1.2737580553063057, "grad_norm": 0.060605525970458984, "learning_rate": 4.9886009309253214e-05, "loss": 0.0109, "num_input_tokens_seen": 14928128, "step": 15615 }, { "epoch": 1.2741659189167143, "grad_norm": 0.0389571487903595, "learning_rate": 4.9885669551866174e-05, "loss": 0.3256, "num_input_tokens_seen": 14932880, "step": 15620 }, { "epoch": 1.2745737825271228, "grad_norm": 4.624256610870361, "learning_rate": 4.9885329290057204e-05, "loss": 0.1448, "num_input_tokens_seen": 14936992, "step": 15625 }, { "epoch": 1.2749816461375316, "grad_norm": 0.15114043653011322, "learning_rate": 4.988498852383317e-05, "loss": 0.121, "num_input_tokens_seen": 14942128, "step": 15630 }, { "epoch": 1.2753895097479404, "grad_norm": 2.8449289798736572, "learning_rate": 4.9884647253201e-05, "loss": 0.2402, "num_input_tokens_seen": 14947088, "step": 15635 }, { "epoch": 1.275797373358349, "grad_norm": 0.1299755871295929, "learning_rate": 4.98843054781676e-05, "loss": 0.0171, "num_input_tokens_seen": 14952272, "step": 15640 }, { "epoch": 1.2762052369687575, "grad_norm": 3.1591086387634277, "learning_rate": 4.98839631987399e-05, "loss": 0.386, "num_input_tokens_seen": 14956368, "step": 15645 }, { "epoch": 1.2766131005791663, "grad_norm": 0.12893232703208923, "learning_rate": 4.988362041492485e-05, "loss": 0.2287, "num_input_tokens_seen": 14961296, "step": 15650 }, { "epoch": 1.2770209641895751, "grad_norm": 2.5727741718292236, "learning_rate": 4.9883277126729384e-05, "loss": 0.1784, "num_input_tokens_seen": 14966560, "step": 15655 }, { "epoch": 1.2774288277999837, "grad_norm": 0.24093826115131378, "learning_rate": 4.988293333416047e-05, "loss": 0.0766, "num_input_tokens_seen": 14971632, "step": 15660 }, { "epoch": 1.2778366914103922, "grad_norm": 0.11833029985427856, "learning_rate": 4.988258903722505e-05, "loss": 0.3172, "num_input_tokens_seen": 14976720, "step": 15665 }, { "epoch": 1.278244555020801, "grad_norm": 0.21061007678508759, "learning_rate": 4.988224423593014e-05, "loss": 0.1767, "num_input_tokens_seen": 14982496, "step": 15670 }, { "epoch": 1.2786524186312098, "grad_norm": 2.8532660007476807, "learning_rate": 4.988189893028271e-05, "loss": 0.2349, "num_input_tokens_seen": 14987584, "step": 15675 }, { "epoch": 1.2790602822416184, "grad_norm": 1.1220898628234863, "learning_rate": 4.9881553120289756e-05, "loss": 0.3191, "num_input_tokens_seen": 14992368, "step": 15680 }, { "epoch": 1.279468145852027, "grad_norm": 0.10769116133451462, "learning_rate": 4.98812068059583e-05, "loss": 0.1962, "num_input_tokens_seen": 14997008, "step": 15685 }, { "epoch": 1.2798760094624357, "grad_norm": 29.42388153076172, "learning_rate": 4.9880859987295356e-05, "loss": 0.6613, "num_input_tokens_seen": 15001856, "step": 15690 }, { "epoch": 1.2802838730728445, "grad_norm": 0.48728975653648376, "learning_rate": 4.988051266430794e-05, "loss": 0.2066, "num_input_tokens_seen": 15006064, "step": 15695 }, { "epoch": 1.280691736683253, "grad_norm": 0.08466193079948425, "learning_rate": 4.988016483700312e-05, "loss": 0.0314, "num_input_tokens_seen": 15011360, "step": 15700 }, { "epoch": 1.2810996002936619, "grad_norm": 0.630311131477356, "learning_rate": 4.987981650538792e-05, "loss": 0.1811, "num_input_tokens_seen": 15015776, "step": 15705 }, { "epoch": 1.2815074639040704, "grad_norm": 0.11985081434249878, "learning_rate": 4.987946766946942e-05, "loss": 0.0186, "num_input_tokens_seen": 15019904, "step": 15710 }, { "epoch": 1.2819153275144792, "grad_norm": 0.1076786145567894, "learning_rate": 4.987911832925468e-05, "loss": 0.1023, "num_input_tokens_seen": 15024576, "step": 15715 }, { "epoch": 1.2823231911248878, "grad_norm": 0.21773605048656464, "learning_rate": 4.987876848475078e-05, "loss": 0.0334, "num_input_tokens_seen": 15029456, "step": 15720 }, { "epoch": 1.2827310547352966, "grad_norm": 0.10755647718906403, "learning_rate": 4.9878418135964817e-05, "loss": 0.1687, "num_input_tokens_seen": 15033952, "step": 15725 }, { "epoch": 1.2831389183457051, "grad_norm": 4.397225379943848, "learning_rate": 4.9878067282903884e-05, "loss": 0.1156, "num_input_tokens_seen": 15039056, "step": 15730 }, { "epoch": 1.283546781956114, "grad_norm": 0.029559900984168053, "learning_rate": 4.987771592557511e-05, "loss": 0.0566, "num_input_tokens_seen": 15043296, "step": 15735 }, { "epoch": 1.2839546455665225, "grad_norm": 0.025991618633270264, "learning_rate": 4.98773640639856e-05, "loss": 0.0104, "num_input_tokens_seen": 15047568, "step": 15740 }, { "epoch": 1.2843625091769313, "grad_norm": 7.032015800476074, "learning_rate": 4.9877011698142486e-05, "loss": 0.7607, "num_input_tokens_seen": 15052048, "step": 15745 }, { "epoch": 1.2847703727873399, "grad_norm": 2.025076150894165, "learning_rate": 4.987665882805292e-05, "loss": 0.2979, "num_input_tokens_seen": 15056672, "step": 15750 }, { "epoch": 1.2851782363977486, "grad_norm": 0.1686541736125946, "learning_rate": 4.987630545372405e-05, "loss": 0.0407, "num_input_tokens_seen": 15061584, "step": 15755 }, { "epoch": 1.2855861000081572, "grad_norm": 0.1895906925201416, "learning_rate": 4.987595157516304e-05, "loss": 0.0361, "num_input_tokens_seen": 15066592, "step": 15760 }, { "epoch": 1.285993963618566, "grad_norm": 0.12705913186073303, "learning_rate": 4.987559719237706e-05, "loss": 0.0154, "num_input_tokens_seen": 15071456, "step": 15765 }, { "epoch": 1.2864018272289746, "grad_norm": 0.029008671641349792, "learning_rate": 4.9875242305373295e-05, "loss": 0.0128, "num_input_tokens_seen": 15076640, "step": 15770 }, { "epoch": 1.2868096908393833, "grad_norm": 0.01584462821483612, "learning_rate": 4.987488691415894e-05, "loss": 0.0127, "num_input_tokens_seen": 15082016, "step": 15775 }, { "epoch": 1.287217554449792, "grad_norm": 14.541749000549316, "learning_rate": 4.98745310187412e-05, "loss": 0.0909, "num_input_tokens_seen": 15086288, "step": 15780 }, { "epoch": 1.2876254180602007, "grad_norm": 0.03463202342391014, "learning_rate": 4.987417461912728e-05, "loss": 0.2457, "num_input_tokens_seen": 15091424, "step": 15785 }, { "epoch": 1.2880332816706095, "grad_norm": 4.349978923797607, "learning_rate": 4.987381771532441e-05, "loss": 0.3895, "num_input_tokens_seen": 15096144, "step": 15790 }, { "epoch": 1.288441145281018, "grad_norm": 16.353126525878906, "learning_rate": 4.9873460307339833e-05, "loss": 0.2397, "num_input_tokens_seen": 15101248, "step": 15795 }, { "epoch": 1.2888490088914266, "grad_norm": 0.08046596497297287, "learning_rate": 4.987310239518078e-05, "loss": 0.179, "num_input_tokens_seen": 15105968, "step": 15800 }, { "epoch": 1.2892568725018354, "grad_norm": 0.08796229213476181, "learning_rate": 4.987274397885451e-05, "loss": 0.5655, "num_input_tokens_seen": 15110400, "step": 15805 }, { "epoch": 1.2896647361122442, "grad_norm": 2.212165594100952, "learning_rate": 4.987238505836829e-05, "loss": 0.1849, "num_input_tokens_seen": 15115424, "step": 15810 }, { "epoch": 1.2900725997226528, "grad_norm": 4.999525547027588, "learning_rate": 4.987202563372939e-05, "loss": 0.2429, "num_input_tokens_seen": 15120512, "step": 15815 }, { "epoch": 1.2904804633330613, "grad_norm": 22.130496978759766, "learning_rate": 4.9871665704945105e-05, "loss": 0.4649, "num_input_tokens_seen": 15125616, "step": 15820 }, { "epoch": 1.29088832694347, "grad_norm": 1.7563791275024414, "learning_rate": 4.987130527202272e-05, "loss": 0.1096, "num_input_tokens_seen": 15130544, "step": 15825 }, { "epoch": 1.291296190553879, "grad_norm": 7.024492263793945, "learning_rate": 4.987094433496955e-05, "loss": 0.0637, "num_input_tokens_seen": 15135440, "step": 15830 }, { "epoch": 1.2917040541642875, "grad_norm": 5.789759635925293, "learning_rate": 4.987058289379291e-05, "loss": 0.0919, "num_input_tokens_seen": 15139968, "step": 15835 }, { "epoch": 1.292111917774696, "grad_norm": 0.07486942410469055, "learning_rate": 4.987022094850011e-05, "loss": 0.1777, "num_input_tokens_seen": 15145120, "step": 15840 }, { "epoch": 1.2925197813851048, "grad_norm": 0.042406514286994934, "learning_rate": 4.986985849909851e-05, "loss": 0.2458, "num_input_tokens_seen": 15150272, "step": 15845 }, { "epoch": 1.2929276449955136, "grad_norm": 0.21721701323986053, "learning_rate": 4.9869495545595436e-05, "loss": 0.349, "num_input_tokens_seen": 15155024, "step": 15850 }, { "epoch": 1.2933355086059222, "grad_norm": 1.1393389701843262, "learning_rate": 4.986913208799826e-05, "loss": 0.068, "num_input_tokens_seen": 15159872, "step": 15855 }, { "epoch": 1.2937433722163307, "grad_norm": 0.07471833378076553, "learning_rate": 4.986876812631435e-05, "loss": 0.0302, "num_input_tokens_seen": 15164208, "step": 15860 }, { "epoch": 1.2941512358267395, "grad_norm": 1.2380220890045166, "learning_rate": 4.9868403660551075e-05, "loss": 0.0153, "num_input_tokens_seen": 15169440, "step": 15865 }, { "epoch": 1.2945590994371483, "grad_norm": 16.24224090576172, "learning_rate": 4.9868038690715824e-05, "loss": 0.2415, "num_input_tokens_seen": 15174448, "step": 15870 }, { "epoch": 1.2949669630475569, "grad_norm": 0.2358371764421463, "learning_rate": 4.986767321681599e-05, "loss": 0.2232, "num_input_tokens_seen": 15178224, "step": 15875 }, { "epoch": 1.2953748266579654, "grad_norm": 0.34909117221832275, "learning_rate": 4.9867307238859e-05, "loss": 0.0971, "num_input_tokens_seen": 15182976, "step": 15880 }, { "epoch": 1.2957826902683742, "grad_norm": 0.06663485616445541, "learning_rate": 4.986694075685224e-05, "loss": 0.0866, "num_input_tokens_seen": 15187456, "step": 15885 }, { "epoch": 1.296190553878783, "grad_norm": 0.06995070725679398, "learning_rate": 4.9866573770803174e-05, "loss": 0.2627, "num_input_tokens_seen": 15192224, "step": 15890 }, { "epoch": 1.2965984174891916, "grad_norm": 4.59690523147583, "learning_rate": 4.986620628071922e-05, "loss": 0.2165, "num_input_tokens_seen": 15197536, "step": 15895 }, { "epoch": 1.2970062810996004, "grad_norm": 4.031141757965088, "learning_rate": 4.9865838286607825e-05, "loss": 0.55, "num_input_tokens_seen": 15202272, "step": 15900 }, { "epoch": 1.297414144710009, "grad_norm": 1.0749582052230835, "learning_rate": 4.986546978847646e-05, "loss": 0.1853, "num_input_tokens_seen": 15207824, "step": 15905 }, { "epoch": 1.2978220083204177, "grad_norm": 0.22201356291770935, "learning_rate": 4.9865100786332586e-05, "loss": 0.2941, "num_input_tokens_seen": 15212560, "step": 15910 }, { "epoch": 1.2982298719308263, "grad_norm": 185.31617736816406, "learning_rate": 4.986473128018369e-05, "loss": 0.4062, "num_input_tokens_seen": 15217968, "step": 15915 }, { "epoch": 1.298637735541235, "grad_norm": 0.8254187703132629, "learning_rate": 4.9864361270037244e-05, "loss": 0.1373, "num_input_tokens_seen": 15222480, "step": 15920 }, { "epoch": 1.2990455991516436, "grad_norm": 8.546483039855957, "learning_rate": 4.986399075590077e-05, "loss": 0.2465, "num_input_tokens_seen": 15227664, "step": 15925 }, { "epoch": 1.2994534627620524, "grad_norm": 1.2776236534118652, "learning_rate": 4.9863619737781764e-05, "loss": 0.2636, "num_input_tokens_seen": 15232928, "step": 15930 }, { "epoch": 1.299861326372461, "grad_norm": 5.944625377655029, "learning_rate": 4.986324821568776e-05, "loss": 0.264, "num_input_tokens_seen": 15237440, "step": 15935 }, { "epoch": 1.3002691899828698, "grad_norm": 0.12520724534988403, "learning_rate": 4.986287618962627e-05, "loss": 0.3593, "num_input_tokens_seen": 15242544, "step": 15940 }, { "epoch": 1.3006770535932783, "grad_norm": 4.745885372161865, "learning_rate": 4.986250365960485e-05, "loss": 0.0524, "num_input_tokens_seen": 15246928, "step": 15945 }, { "epoch": 1.3010849172036871, "grad_norm": 54.49592971801758, "learning_rate": 4.9862130625631044e-05, "loss": 0.8039, "num_input_tokens_seen": 15251584, "step": 15950 }, { "epoch": 1.3014927808140957, "grad_norm": 0.3556700050830841, "learning_rate": 4.986175708771241e-05, "loss": 2.589, "num_input_tokens_seen": 15257152, "step": 15955 }, { "epoch": 1.3019006444245045, "grad_norm": 0.7142993211746216, "learning_rate": 4.986138304585654e-05, "loss": 0.2945, "num_input_tokens_seen": 15261200, "step": 15960 }, { "epoch": 1.302308508034913, "grad_norm": 0.12399541586637497, "learning_rate": 4.9861008500070984e-05, "loss": 0.0546, "num_input_tokens_seen": 15266000, "step": 15965 }, { "epoch": 1.3027163716453218, "grad_norm": 0.10365559160709381, "learning_rate": 4.986063345036336e-05, "loss": 0.1599, "num_input_tokens_seen": 15270400, "step": 15970 }, { "epoch": 1.3031242352557304, "grad_norm": 0.05923720821738243, "learning_rate": 4.986025789674125e-05, "loss": 0.015, "num_input_tokens_seen": 15275680, "step": 15975 }, { "epoch": 1.3035320988661392, "grad_norm": 0.0692192018032074, "learning_rate": 4.985988183921228e-05, "loss": 0.1415, "num_input_tokens_seen": 15280688, "step": 15980 }, { "epoch": 1.303939962476548, "grad_norm": 0.04332439973950386, "learning_rate": 4.985950527778407e-05, "loss": 0.0125, "num_input_tokens_seen": 15284992, "step": 15985 }, { "epoch": 1.3043478260869565, "grad_norm": 0.1952522248029709, "learning_rate": 4.985912821246425e-05, "loss": 0.0116, "num_input_tokens_seen": 15289440, "step": 15990 }, { "epoch": 1.304755689697365, "grad_norm": 13.166116714477539, "learning_rate": 4.9858750643260464e-05, "loss": 0.0596, "num_input_tokens_seen": 15293712, "step": 15995 }, { "epoch": 1.305163553307774, "grad_norm": 0.03445987030863762, "learning_rate": 4.9858372570180364e-05, "loss": 0.1276, "num_input_tokens_seen": 15298032, "step": 16000 }, { "epoch": 1.3055714169181827, "grad_norm": 0.018583569675683975, "learning_rate": 4.9857993993231624e-05, "loss": 0.3085, "num_input_tokens_seen": 15303088, "step": 16005 }, { "epoch": 1.3059792805285912, "grad_norm": 3.2352182865142822, "learning_rate": 4.98576149124219e-05, "loss": 0.4841, "num_input_tokens_seen": 15307376, "step": 16010 }, { "epoch": 1.3063871441389998, "grad_norm": 8.419805526733398, "learning_rate": 4.985723532775889e-05, "loss": 0.0336, "num_input_tokens_seen": 15311584, "step": 16015 }, { "epoch": 1.3067950077494086, "grad_norm": 0.08733632415533066, "learning_rate": 4.985685523925028e-05, "loss": 0.2565, "num_input_tokens_seen": 15315984, "step": 16020 }, { "epoch": 1.3072028713598174, "grad_norm": 5.091055393218994, "learning_rate": 4.9856474646903777e-05, "loss": 0.4276, "num_input_tokens_seen": 15321216, "step": 16025 }, { "epoch": 1.307610734970226, "grad_norm": 0.0654141828417778, "learning_rate": 4.985609355072709e-05, "loss": 0.2558, "num_input_tokens_seen": 15325888, "step": 16030 }, { "epoch": 1.3080185985806345, "grad_norm": 0.8858017921447754, "learning_rate": 4.9855711950727955e-05, "loss": 0.2447, "num_input_tokens_seen": 15331008, "step": 16035 }, { "epoch": 1.3084264621910433, "grad_norm": 0.05735119432210922, "learning_rate": 4.98553298469141e-05, "loss": 0.0468, "num_input_tokens_seen": 15335632, "step": 16040 }, { "epoch": 1.308834325801452, "grad_norm": 0.20769548416137695, "learning_rate": 4.985494723929327e-05, "loss": 0.1492, "num_input_tokens_seen": 15341232, "step": 16045 }, { "epoch": 1.3092421894118607, "grad_norm": 5.1519694328308105, "learning_rate": 4.9854564127873213e-05, "loss": 0.498, "num_input_tokens_seen": 15345136, "step": 16050 }, { "epoch": 1.3096500530222692, "grad_norm": 348.2441711425781, "learning_rate": 4.985418051266172e-05, "loss": 0.8123, "num_input_tokens_seen": 15349840, "step": 16055 }, { "epoch": 1.310057916632678, "grad_norm": 0.6151241064071655, "learning_rate": 4.9853796393666534e-05, "loss": 0.0318, "num_input_tokens_seen": 15354912, "step": 16060 }, { "epoch": 1.3104657802430868, "grad_norm": 0.08537105470895767, "learning_rate": 4.985341177089546e-05, "loss": 0.2814, "num_input_tokens_seen": 15359360, "step": 16065 }, { "epoch": 1.3108736438534954, "grad_norm": 0.120635986328125, "learning_rate": 4.9853026644356296e-05, "loss": 0.064, "num_input_tokens_seen": 15363744, "step": 16070 }, { "epoch": 1.3112815074639042, "grad_norm": 2.4520468711853027, "learning_rate": 4.9852641014056833e-05, "loss": 0.1485, "num_input_tokens_seen": 15368304, "step": 16075 }, { "epoch": 1.3116893710743127, "grad_norm": 0.17881488800048828, "learning_rate": 4.98522548800049e-05, "loss": 0.2448, "num_input_tokens_seen": 15371008, "step": 16080 }, { "epoch": 1.3120972346847215, "grad_norm": 0.11888821423053741, "learning_rate": 4.985186824220833e-05, "loss": 0.1306, "num_input_tokens_seen": 15376224, "step": 16085 }, { "epoch": 1.31250509829513, "grad_norm": 0.207301527261734, "learning_rate": 4.985148110067494e-05, "loss": 0.1978, "num_input_tokens_seen": 15380768, "step": 16090 }, { "epoch": 1.3129129619055389, "grad_norm": 0.22315263748168945, "learning_rate": 4.985109345541259e-05, "loss": 0.016, "num_input_tokens_seen": 15384976, "step": 16095 }, { "epoch": 1.3133208255159474, "grad_norm": 0.08158819377422333, "learning_rate": 4.985070530642914e-05, "loss": 0.1126, "num_input_tokens_seen": 15388704, "step": 16100 }, { "epoch": 1.3137286891263562, "grad_norm": 0.2873685956001282, "learning_rate": 4.9850316653732456e-05, "loss": 0.3199, "num_input_tokens_seen": 15393216, "step": 16105 }, { "epoch": 1.3141365527367648, "grad_norm": 0.11345576494932175, "learning_rate": 4.98499274973304e-05, "loss": 0.03, "num_input_tokens_seen": 15397552, "step": 16110 }, { "epoch": 1.3145444163471736, "grad_norm": 0.021577857434749603, "learning_rate": 4.9849537837230884e-05, "loss": 0.3486, "num_input_tokens_seen": 15401856, "step": 16115 }, { "epoch": 1.3149522799575821, "grad_norm": 0.01181072648614645, "learning_rate": 4.984914767344179e-05, "loss": 0.2639, "num_input_tokens_seen": 15406688, "step": 16120 }, { "epoch": 1.315360143567991, "grad_norm": 0.02713293768465519, "learning_rate": 4.984875700597104e-05, "loss": 0.0054, "num_input_tokens_seen": 15412032, "step": 16125 }, { "epoch": 1.3157680071783995, "grad_norm": 46.568965911865234, "learning_rate": 4.984836583482653e-05, "loss": 0.2271, "num_input_tokens_seen": 15417008, "step": 16130 }, { "epoch": 1.3161758707888083, "grad_norm": 0.027849135920405388, "learning_rate": 4.984797416001621e-05, "loss": 0.2219, "num_input_tokens_seen": 15421504, "step": 16135 }, { "epoch": 1.3165837343992168, "grad_norm": 0.8765387535095215, "learning_rate": 4.984758198154802e-05, "loss": 0.2021, "num_input_tokens_seen": 15426128, "step": 16140 }, { "epoch": 1.3169915980096256, "grad_norm": 0.034462783485651016, "learning_rate": 4.984718929942989e-05, "loss": 0.013, "num_input_tokens_seen": 15431488, "step": 16145 }, { "epoch": 1.3173994616200342, "grad_norm": 0.17821143567562103, "learning_rate": 4.9846796113669796e-05, "loss": 0.4043, "num_input_tokens_seen": 15435376, "step": 16150 }, { "epoch": 1.317807325230443, "grad_norm": 2.8723974227905273, "learning_rate": 4.984640242427571e-05, "loss": 0.3633, "num_input_tokens_seen": 15440176, "step": 16155 }, { "epoch": 1.3182151888408515, "grad_norm": 0.1649199277162552, "learning_rate": 4.984600823125559e-05, "loss": 0.0934, "num_input_tokens_seen": 15444992, "step": 16160 }, { "epoch": 1.3186230524512603, "grad_norm": 0.09395736455917358, "learning_rate": 4.9845613534617453e-05, "loss": 0.0138, "num_input_tokens_seen": 15449312, "step": 16165 }, { "epoch": 1.319030916061669, "grad_norm": 0.22203677892684937, "learning_rate": 4.984521833436928e-05, "loss": 0.022, "num_input_tokens_seen": 15454048, "step": 16170 }, { "epoch": 1.3194387796720777, "grad_norm": 0.05173235386610031, "learning_rate": 4.9844822630519096e-05, "loss": 0.0528, "num_input_tokens_seen": 15458944, "step": 16175 }, { "epoch": 1.3198466432824865, "grad_norm": 0.04888910427689552, "learning_rate": 4.984442642307491e-05, "loss": 0.3494, "num_input_tokens_seen": 15464192, "step": 16180 }, { "epoch": 1.320254506892895, "grad_norm": 11.175435066223145, "learning_rate": 4.9844029712044754e-05, "loss": 0.0245, "num_input_tokens_seen": 15468000, "step": 16185 }, { "epoch": 1.3206623705033036, "grad_norm": 0.03952006623148918, "learning_rate": 4.984363249743668e-05, "loss": 0.0081, "num_input_tokens_seen": 15472768, "step": 16190 }, { "epoch": 1.3210702341137124, "grad_norm": 0.04928985610604286, "learning_rate": 4.984323477925872e-05, "loss": 0.1446, "num_input_tokens_seen": 15477680, "step": 16195 }, { "epoch": 1.3214780977241212, "grad_norm": 0.019813988357782364, "learning_rate": 4.984283655751896e-05, "loss": 0.0086, "num_input_tokens_seen": 15482224, "step": 16200 }, { "epoch": 1.3218859613345297, "grad_norm": 17.905662536621094, "learning_rate": 4.984243783222545e-05, "loss": 0.1659, "num_input_tokens_seen": 15486512, "step": 16205 }, { "epoch": 1.3222938249449383, "grad_norm": 0.1692231446504593, "learning_rate": 4.984203860338629e-05, "loss": 0.2073, "num_input_tokens_seen": 15492432, "step": 16210 }, { "epoch": 1.322701688555347, "grad_norm": 0.025546347722411156, "learning_rate": 4.984163887100956e-05, "loss": 0.1814, "num_input_tokens_seen": 15496080, "step": 16215 }, { "epoch": 1.3231095521657559, "grad_norm": 0.042023882269859314, "learning_rate": 4.984123863510336e-05, "loss": 0.0075, "num_input_tokens_seen": 15501280, "step": 16220 }, { "epoch": 1.3235174157761644, "grad_norm": 0.28961673378944397, "learning_rate": 4.984083789567582e-05, "loss": 0.1764, "num_input_tokens_seen": 15505856, "step": 16225 }, { "epoch": 1.323925279386573, "grad_norm": 2.802152633666992, "learning_rate": 4.9840436652735044e-05, "loss": 0.8641, "num_input_tokens_seen": 15510272, "step": 16230 }, { "epoch": 1.3243331429969818, "grad_norm": 2.9894192218780518, "learning_rate": 4.984003490628918e-05, "loss": 0.2707, "num_input_tokens_seen": 15514512, "step": 16235 }, { "epoch": 1.3247410066073906, "grad_norm": 3.227180242538452, "learning_rate": 4.983963265634635e-05, "loss": 0.3498, "num_input_tokens_seen": 15518816, "step": 16240 }, { "epoch": 1.3251488702177991, "grad_norm": 0.07653550803661346, "learning_rate": 4.983922990291473e-05, "loss": 0.0201, "num_input_tokens_seen": 15523984, "step": 16245 }, { "epoch": 1.3255567338282077, "grad_norm": 0.18018053472042084, "learning_rate": 4.983882664600248e-05, "loss": 0.119, "num_input_tokens_seen": 15528896, "step": 16250 }, { "epoch": 1.3259645974386165, "grad_norm": 12.390074729919434, "learning_rate": 4.983842288561776e-05, "loss": 0.2557, "num_input_tokens_seen": 15533696, "step": 16255 }, { "epoch": 1.3263724610490253, "grad_norm": 6.288187503814697, "learning_rate": 4.983801862176876e-05, "loss": 0.3857, "num_input_tokens_seen": 15538080, "step": 16260 }, { "epoch": 1.3267803246594339, "grad_norm": 0.04956216365098953, "learning_rate": 4.9837613854463684e-05, "loss": 0.0422, "num_input_tokens_seen": 15543216, "step": 16265 }, { "epoch": 1.3271881882698426, "grad_norm": 5.063998222351074, "learning_rate": 4.9837208583710726e-05, "loss": 0.1911, "num_input_tokens_seen": 15548512, "step": 16270 }, { "epoch": 1.3275960518802512, "grad_norm": 0.20320643484592438, "learning_rate": 4.9836802809518104e-05, "loss": 0.1881, "num_input_tokens_seen": 15553824, "step": 16275 }, { "epoch": 1.32800391549066, "grad_norm": 0.042305003851652145, "learning_rate": 4.9836396531894046e-05, "loss": 0.0271, "num_input_tokens_seen": 15558720, "step": 16280 }, { "epoch": 1.3284117791010686, "grad_norm": 0.10745332390069962, "learning_rate": 4.983598975084678e-05, "loss": 0.0189, "num_input_tokens_seen": 15563504, "step": 16285 }, { "epoch": 1.3288196427114773, "grad_norm": 0.11585472524166107, "learning_rate": 4.983558246638456e-05, "loss": 0.018, "num_input_tokens_seen": 15567808, "step": 16290 }, { "epoch": 1.329227506321886, "grad_norm": 1.4493950605392456, "learning_rate": 4.983517467851563e-05, "loss": 0.0155, "num_input_tokens_seen": 15572640, "step": 16295 }, { "epoch": 1.3296353699322947, "grad_norm": 0.3461235761642456, "learning_rate": 4.9834766387248264e-05, "loss": 0.1172, "num_input_tokens_seen": 15577264, "step": 16300 }, { "epoch": 1.3300432335427033, "grad_norm": 0.015053018927574158, "learning_rate": 4.983435759259074e-05, "loss": 0.1875, "num_input_tokens_seen": 15582688, "step": 16305 }, { "epoch": 1.330451097153112, "grad_norm": 6.739504814147949, "learning_rate": 4.983394829455134e-05, "loss": 0.8031, "num_input_tokens_seen": 15587552, "step": 16310 }, { "epoch": 1.3308589607635206, "grad_norm": 0.7777006030082703, "learning_rate": 4.9833538493138354e-05, "loss": 0.0888, "num_input_tokens_seen": 15592080, "step": 16315 }, { "epoch": 1.3312668243739294, "grad_norm": 0.04759807884693146, "learning_rate": 4.98331281883601e-05, "loss": 0.0099, "num_input_tokens_seen": 15596976, "step": 16320 }, { "epoch": 1.331674687984338, "grad_norm": 0.09160571545362473, "learning_rate": 4.983271738022489e-05, "loss": 0.0123, "num_input_tokens_seen": 15601712, "step": 16325 }, { "epoch": 1.3320825515947468, "grad_norm": 2.311877965927124, "learning_rate": 4.983230606874105e-05, "loss": 0.4947, "num_input_tokens_seen": 15606048, "step": 16330 }, { "epoch": 1.3324904152051553, "grad_norm": 0.09371908009052277, "learning_rate": 4.983189425391692e-05, "loss": 0.3151, "num_input_tokens_seen": 15611440, "step": 16335 }, { "epoch": 1.332898278815564, "grad_norm": 0.08423082530498505, "learning_rate": 4.983148193576085e-05, "loss": 0.0141, "num_input_tokens_seen": 15615328, "step": 16340 }, { "epoch": 1.3333061424259727, "grad_norm": 0.07626733183860779, "learning_rate": 4.9831069114281184e-05, "loss": 0.2233, "num_input_tokens_seen": 15620320, "step": 16345 }, { "epoch": 1.3337140060363815, "grad_norm": 0.1298886090517044, "learning_rate": 4.9830655789486295e-05, "loss": 0.1327, "num_input_tokens_seen": 15624592, "step": 16350 }, { "epoch": 1.3341218696467902, "grad_norm": 3.843391180038452, "learning_rate": 4.983024196138457e-05, "loss": 0.0255, "num_input_tokens_seen": 15628976, "step": 16355 }, { "epoch": 1.3345297332571988, "grad_norm": 19.32269287109375, "learning_rate": 4.9829827629984396e-05, "loss": 0.3588, "num_input_tokens_seen": 15634528, "step": 16360 }, { "epoch": 1.3349375968676074, "grad_norm": 0.06983978301286697, "learning_rate": 4.982941279529416e-05, "loss": 0.2341, "num_input_tokens_seen": 15639248, "step": 16365 }, { "epoch": 1.3353454604780162, "grad_norm": 0.09243965893983841, "learning_rate": 4.982899745732228e-05, "loss": 0.1569, "num_input_tokens_seen": 15644816, "step": 16370 }, { "epoch": 1.335753324088425, "grad_norm": 0.1405673772096634, "learning_rate": 4.982858161607718e-05, "loss": 0.2829, "num_input_tokens_seen": 15649744, "step": 16375 }, { "epoch": 1.3361611876988335, "grad_norm": 0.12005685269832611, "learning_rate": 4.982816527156726e-05, "loss": 0.0134, "num_input_tokens_seen": 15654528, "step": 16380 }, { "epoch": 1.336569051309242, "grad_norm": 0.1709682047367096, "learning_rate": 4.9827748423801004e-05, "loss": 0.1149, "num_input_tokens_seen": 15659296, "step": 16385 }, { "epoch": 1.3369769149196509, "grad_norm": 0.06478729099035263, "learning_rate": 4.982733107278682e-05, "loss": 0.0118, "num_input_tokens_seen": 15663824, "step": 16390 }, { "epoch": 1.3373847785300597, "grad_norm": 0.05218895524740219, "learning_rate": 4.982691321853319e-05, "loss": 0.0951, "num_input_tokens_seen": 15668544, "step": 16395 }, { "epoch": 1.3377926421404682, "grad_norm": 0.21035388112068176, "learning_rate": 4.9826494861048576e-05, "loss": 0.1603, "num_input_tokens_seen": 15673216, "step": 16400 }, { "epoch": 1.3382005057508768, "grad_norm": 0.13621853291988373, "learning_rate": 4.982607600034147e-05, "loss": 0.2335, "num_input_tokens_seen": 15677984, "step": 16405 }, { "epoch": 1.3386083693612856, "grad_norm": 4.622291088104248, "learning_rate": 4.9825656636420346e-05, "loss": 0.1992, "num_input_tokens_seen": 15682464, "step": 16410 }, { "epoch": 1.3390162329716944, "grad_norm": 0.1836620718240738, "learning_rate": 4.982523676929372e-05, "loss": 0.1906, "num_input_tokens_seen": 15687120, "step": 16415 }, { "epoch": 1.339424096582103, "grad_norm": 0.04515290632843971, "learning_rate": 4.982481639897008e-05, "loss": 0.0155, "num_input_tokens_seen": 15691136, "step": 16420 }, { "epoch": 1.3398319601925115, "grad_norm": 0.07271143794059753, "learning_rate": 4.982439552545797e-05, "loss": 0.0262, "num_input_tokens_seen": 15695424, "step": 16425 }, { "epoch": 1.3402398238029203, "grad_norm": 0.057308610528707504, "learning_rate": 4.98239741487659e-05, "loss": 0.4831, "num_input_tokens_seen": 15700784, "step": 16430 }, { "epoch": 1.340647687413329, "grad_norm": 0.07934673875570297, "learning_rate": 4.982355226890243e-05, "loss": 0.1624, "num_input_tokens_seen": 15706320, "step": 16435 }, { "epoch": 1.3410555510237376, "grad_norm": 19.750577926635742, "learning_rate": 4.982312988587611e-05, "loss": 0.6403, "num_input_tokens_seen": 15712080, "step": 16440 }, { "epoch": 1.3414634146341464, "grad_norm": 13.41639518737793, "learning_rate": 4.982270699969548e-05, "loss": 0.1486, "num_input_tokens_seen": 15716336, "step": 16445 }, { "epoch": 1.341871278244555, "grad_norm": 3.524421453475952, "learning_rate": 4.9822283610369136e-05, "loss": 0.1408, "num_input_tokens_seen": 15720736, "step": 16450 }, { "epoch": 1.3422791418549638, "grad_norm": 0.12320239841938019, "learning_rate": 4.982185971790565e-05, "loss": 0.0197, "num_input_tokens_seen": 15725824, "step": 16455 }, { "epoch": 1.3426870054653723, "grad_norm": 0.04238595813512802, "learning_rate": 4.9821435322313614e-05, "loss": 0.0053, "num_input_tokens_seen": 15729952, "step": 16460 }, { "epoch": 1.3430948690757811, "grad_norm": 0.14803530275821686, "learning_rate": 4.9821010423601635e-05, "loss": 0.0191, "num_input_tokens_seen": 15734592, "step": 16465 }, { "epoch": 1.3435027326861897, "grad_norm": 0.048194076865911484, "learning_rate": 4.982058502177832e-05, "loss": 0.053, "num_input_tokens_seen": 15739968, "step": 16470 }, { "epoch": 1.3439105962965985, "grad_norm": 1.0445526838302612, "learning_rate": 4.98201591168523e-05, "loss": 0.0969, "num_input_tokens_seen": 15744208, "step": 16475 }, { "epoch": 1.344318459907007, "grad_norm": 0.03963864967226982, "learning_rate": 4.981973270883219e-05, "loss": 0.0158, "num_input_tokens_seen": 15749488, "step": 16480 }, { "epoch": 1.3447263235174158, "grad_norm": 0.043877337127923965, "learning_rate": 4.9819305797726644e-05, "loss": 0.2619, "num_input_tokens_seen": 15754528, "step": 16485 }, { "epoch": 1.3451341871278244, "grad_norm": 0.04728380963206291, "learning_rate": 4.981887838354432e-05, "loss": 0.113, "num_input_tokens_seen": 15759264, "step": 16490 }, { "epoch": 1.3455420507382332, "grad_norm": 0.03500904142856598, "learning_rate": 4.981845046629388e-05, "loss": 0.0057, "num_input_tokens_seen": 15763488, "step": 16495 }, { "epoch": 1.3459499143486418, "grad_norm": 0.027272628620266914, "learning_rate": 4.981802204598399e-05, "loss": 0.1196, "num_input_tokens_seen": 15768096, "step": 16500 }, { "epoch": 1.3463577779590505, "grad_norm": 0.023866774514317513, "learning_rate": 4.981759312262334e-05, "loss": 0.1521, "num_input_tokens_seen": 15772944, "step": 16505 }, { "epoch": 1.346765641569459, "grad_norm": 6.057704448699951, "learning_rate": 4.981716369622063e-05, "loss": 0.3674, "num_input_tokens_seen": 15777296, "step": 16510 }, { "epoch": 1.347173505179868, "grad_norm": 5.076775550842285, "learning_rate": 4.981673376678455e-05, "loss": 0.2089, "num_input_tokens_seen": 15782080, "step": 16515 }, { "epoch": 1.3475813687902765, "grad_norm": 0.010007278062403202, "learning_rate": 4.9816303334323824e-05, "loss": 0.1913, "num_input_tokens_seen": 15786720, "step": 16520 }, { "epoch": 1.3479892324006852, "grad_norm": 0.015128466300666332, "learning_rate": 4.9815872398847176e-05, "loss": 0.0023, "num_input_tokens_seen": 15790896, "step": 16525 }, { "epoch": 1.3483970960110938, "grad_norm": 0.05484367534518242, "learning_rate": 4.9815440960363333e-05, "loss": 0.0055, "num_input_tokens_seen": 15795504, "step": 16530 }, { "epoch": 1.3488049596215026, "grad_norm": 4.016417026519775, "learning_rate": 4.981500901888106e-05, "loss": 0.3368, "num_input_tokens_seen": 15800688, "step": 16535 }, { "epoch": 1.3492128232319112, "grad_norm": 0.25430476665496826, "learning_rate": 4.981457657440909e-05, "loss": 0.0085, "num_input_tokens_seen": 15806000, "step": 16540 }, { "epoch": 1.34962068684232, "grad_norm": 0.021261341869831085, "learning_rate": 4.9814143626956195e-05, "loss": 0.0075, "num_input_tokens_seen": 15811216, "step": 16545 }, { "epoch": 1.3500285504527287, "grad_norm": 0.025039514526724815, "learning_rate": 4.981371017653116e-05, "loss": 0.0491, "num_input_tokens_seen": 15816800, "step": 16550 }, { "epoch": 1.3504364140631373, "grad_norm": 0.05960063263773918, "learning_rate": 4.981327622314276e-05, "loss": 0.1932, "num_input_tokens_seen": 15821200, "step": 16555 }, { "epoch": 1.3508442776735459, "grad_norm": 44.46498489379883, "learning_rate": 4.981284176679979e-05, "loss": 0.2095, "num_input_tokens_seen": 15825952, "step": 16560 }, { "epoch": 1.3512521412839547, "grad_norm": 0.10614106059074402, "learning_rate": 4.981240680751106e-05, "loss": 0.0069, "num_input_tokens_seen": 15830848, "step": 16565 }, { "epoch": 1.3516600048943634, "grad_norm": 0.09849001467227936, "learning_rate": 4.981197134528539e-05, "loss": 0.4298, "num_input_tokens_seen": 15835376, "step": 16570 }, { "epoch": 1.352067868504772, "grad_norm": 0.13044188916683197, "learning_rate": 4.9811535380131614e-05, "loss": 0.2296, "num_input_tokens_seen": 15840128, "step": 16575 }, { "epoch": 1.3524757321151806, "grad_norm": 0.253856360912323, "learning_rate": 4.9811098912058545e-05, "loss": 0.0075, "num_input_tokens_seen": 15844624, "step": 16580 }, { "epoch": 1.3528835957255894, "grad_norm": 0.034572623670101166, "learning_rate": 4.981066194107505e-05, "loss": 0.2786, "num_input_tokens_seen": 15849696, "step": 16585 }, { "epoch": 1.3532914593359981, "grad_norm": 0.023658689111471176, "learning_rate": 4.981022446718998e-05, "loss": 0.1261, "num_input_tokens_seen": 15853824, "step": 16590 }, { "epoch": 1.3536993229464067, "grad_norm": 0.09156006574630737, "learning_rate": 4.98097864904122e-05, "loss": 0.0073, "num_input_tokens_seen": 15858688, "step": 16595 }, { "epoch": 1.3541071865568153, "grad_norm": 0.30277469754219055, "learning_rate": 4.98093480107506e-05, "loss": 0.0059, "num_input_tokens_seen": 15863104, "step": 16600 }, { "epoch": 1.354515050167224, "grad_norm": 0.19497720897197723, "learning_rate": 4.9808909028214045e-05, "loss": 0.0619, "num_input_tokens_seen": 15867616, "step": 16605 }, { "epoch": 1.3549229137776329, "grad_norm": 4.147480487823486, "learning_rate": 4.980846954281145e-05, "loss": 0.2551, "num_input_tokens_seen": 15872352, "step": 16610 }, { "epoch": 1.3553307773880414, "grad_norm": 3.8757457733154297, "learning_rate": 4.9808029554551726e-05, "loss": 0.2079, "num_input_tokens_seen": 15876992, "step": 16615 }, { "epoch": 1.35573864099845, "grad_norm": 0.04489341750741005, "learning_rate": 4.980758906344378e-05, "loss": 0.1087, "num_input_tokens_seen": 15881424, "step": 16620 }, { "epoch": 1.3561465046088588, "grad_norm": 0.05901694297790527, "learning_rate": 4.9807148069496544e-05, "loss": 0.0113, "num_input_tokens_seen": 15886544, "step": 16625 }, { "epoch": 1.3565543682192676, "grad_norm": 0.3323043882846832, "learning_rate": 4.980670657271896e-05, "loss": 0.1604, "num_input_tokens_seen": 15891472, "step": 16630 }, { "epoch": 1.3569622318296761, "grad_norm": 9.606125831604004, "learning_rate": 4.980626457311997e-05, "loss": 0.0976, "num_input_tokens_seen": 15896480, "step": 16635 }, { "epoch": 1.357370095440085, "grad_norm": 15.464812278747559, "learning_rate": 4.980582207070854e-05, "loss": 0.158, "num_input_tokens_seen": 15900768, "step": 16640 }, { "epoch": 1.3577779590504935, "grad_norm": 49.897682189941406, "learning_rate": 4.980537906549364e-05, "loss": 0.0636, "num_input_tokens_seen": 15905552, "step": 16645 }, { "epoch": 1.3581858226609023, "grad_norm": 5.745718479156494, "learning_rate": 4.9804935557484245e-05, "loss": 0.2722, "num_input_tokens_seen": 15911024, "step": 16650 }, { "epoch": 1.3585936862713108, "grad_norm": 5.2516655921936035, "learning_rate": 4.9804491546689344e-05, "loss": 0.3651, "num_input_tokens_seen": 15916128, "step": 16655 }, { "epoch": 1.3590015498817196, "grad_norm": 14.965215682983398, "learning_rate": 4.9804047033117946e-05, "loss": 0.3128, "num_input_tokens_seen": 15921472, "step": 16660 }, { "epoch": 1.3594094134921282, "grad_norm": 0.06986527889966965, "learning_rate": 4.980360201677905e-05, "loss": 0.0209, "num_input_tokens_seen": 15926848, "step": 16665 }, { "epoch": 1.359817277102537, "grad_norm": 13.467705726623535, "learning_rate": 4.980315649768168e-05, "loss": 0.4307, "num_input_tokens_seen": 15931440, "step": 16670 }, { "epoch": 1.3602251407129455, "grad_norm": 6.825736045837402, "learning_rate": 4.980271047583487e-05, "loss": 0.2576, "num_input_tokens_seen": 15937072, "step": 16675 }, { "epoch": 1.3606330043233543, "grad_norm": 0.050811391323804855, "learning_rate": 4.9802263951247654e-05, "loss": 0.0078, "num_input_tokens_seen": 15942032, "step": 16680 }, { "epoch": 1.3610408679337629, "grad_norm": 0.21656909584999084, "learning_rate": 4.980181692392909e-05, "loss": 0.573, "num_input_tokens_seen": 15947344, "step": 16685 }, { "epoch": 1.3614487315441717, "grad_norm": 0.2925996482372284, "learning_rate": 4.980136939388824e-05, "loss": 0.7735, "num_input_tokens_seen": 15952208, "step": 16690 }, { "epoch": 1.3618565951545802, "grad_norm": 9.51955795288086, "learning_rate": 4.980092136113416e-05, "loss": 0.2593, "num_input_tokens_seen": 15956512, "step": 16695 }, { "epoch": 1.362264458764989, "grad_norm": 2.0613579750061035, "learning_rate": 4.980047282567595e-05, "loss": 0.0254, "num_input_tokens_seen": 15960944, "step": 16700 }, { "epoch": 1.3626723223753976, "grad_norm": 0.2513541579246521, "learning_rate": 4.980002378752269e-05, "loss": 0.1663, "num_input_tokens_seen": 15965328, "step": 16705 }, { "epoch": 1.3630801859858064, "grad_norm": 0.5594724416732788, "learning_rate": 4.979957424668349e-05, "loss": 0.1515, "num_input_tokens_seen": 15969504, "step": 16710 }, { "epoch": 1.363488049596215, "grad_norm": 5.390789031982422, "learning_rate": 4.979912420316746e-05, "loss": 0.0705, "num_input_tokens_seen": 15974288, "step": 16715 }, { "epoch": 1.3638959132066237, "grad_norm": 11.56428337097168, "learning_rate": 4.979867365698372e-05, "loss": 0.3429, "num_input_tokens_seen": 15979296, "step": 16720 }, { "epoch": 1.3643037768170325, "grad_norm": 1.8545905351638794, "learning_rate": 4.97982226081414e-05, "loss": 0.2019, "num_input_tokens_seen": 15984064, "step": 16725 }, { "epoch": 1.364711640427441, "grad_norm": 1.9344419240951538, "learning_rate": 4.9797771056649645e-05, "loss": 0.0191, "num_input_tokens_seen": 15988752, "step": 16730 }, { "epoch": 1.3651195040378497, "grad_norm": 0.13231414556503296, "learning_rate": 4.979731900251761e-05, "loss": 0.1767, "num_input_tokens_seen": 15993584, "step": 16735 }, { "epoch": 1.3655273676482584, "grad_norm": 0.7210158705711365, "learning_rate": 4.979686644575445e-05, "loss": 0.0396, "num_input_tokens_seen": 15998208, "step": 16740 }, { "epoch": 1.3659352312586672, "grad_norm": 0.028003429993987083, "learning_rate": 4.979641338636935e-05, "loss": 0.1934, "num_input_tokens_seen": 16002176, "step": 16745 }, { "epoch": 1.3663430948690758, "grad_norm": 1.2249112129211426, "learning_rate": 4.979595982437148e-05, "loss": 0.0125, "num_input_tokens_seen": 16007200, "step": 16750 }, { "epoch": 1.3667509584794844, "grad_norm": 6.19495964050293, "learning_rate": 4.979550575977004e-05, "loss": 0.3734, "num_input_tokens_seen": 16012080, "step": 16755 }, { "epoch": 1.3671588220898931, "grad_norm": 2.2445926666259766, "learning_rate": 4.979505119257425e-05, "loss": 0.532, "num_input_tokens_seen": 16016576, "step": 16760 }, { "epoch": 1.367566685700302, "grad_norm": 0.07782842963933945, "learning_rate": 4.979459612279329e-05, "loss": 0.3511, "num_input_tokens_seen": 16020992, "step": 16765 }, { "epoch": 1.3679745493107105, "grad_norm": 0.5782884955406189, "learning_rate": 4.979414055043642e-05, "loss": 0.029, "num_input_tokens_seen": 16025504, "step": 16770 }, { "epoch": 1.368382412921119, "grad_norm": 0.09326799213886261, "learning_rate": 4.9793684475512845e-05, "loss": 0.1729, "num_input_tokens_seen": 16030288, "step": 16775 }, { "epoch": 1.3687902765315278, "grad_norm": 1.381770372390747, "learning_rate": 4.979322789803182e-05, "loss": 0.0389, "num_input_tokens_seen": 16034896, "step": 16780 }, { "epoch": 1.3691981401419366, "grad_norm": 0.2916417717933655, "learning_rate": 4.979277081800261e-05, "loss": 0.1337, "num_input_tokens_seen": 16039632, "step": 16785 }, { "epoch": 1.3696060037523452, "grad_norm": 0.023386748507618904, "learning_rate": 4.9792313235434464e-05, "loss": 0.0199, "num_input_tokens_seen": 16044720, "step": 16790 }, { "epoch": 1.3700138673627538, "grad_norm": 0.16928879916667938, "learning_rate": 4.979185515033666e-05, "loss": 0.0155, "num_input_tokens_seen": 16050096, "step": 16795 }, { "epoch": 1.3704217309731626, "grad_norm": 0.025739796459674835, "learning_rate": 4.979139656271849e-05, "loss": 0.1005, "num_input_tokens_seen": 16055072, "step": 16800 }, { "epoch": 1.3708295945835713, "grad_norm": 3.0594429969787598, "learning_rate": 4.979093747258925e-05, "loss": 0.1811, "num_input_tokens_seen": 16060224, "step": 16805 }, { "epoch": 1.37123745819398, "grad_norm": 4.844731330871582, "learning_rate": 4.9790477879958244e-05, "loss": 0.1673, "num_input_tokens_seen": 16064464, "step": 16810 }, { "epoch": 1.3716453218043887, "grad_norm": 3.0302734375, "learning_rate": 4.979001778483478e-05, "loss": 0.8184, "num_input_tokens_seen": 16069200, "step": 16815 }, { "epoch": 1.3720531854147973, "grad_norm": 0.09407499432563782, "learning_rate": 4.9789557187228194e-05, "loss": 0.6761, "num_input_tokens_seen": 16073536, "step": 16820 }, { "epoch": 1.372461049025206, "grad_norm": 36.45442199707031, "learning_rate": 4.9789096087147815e-05, "loss": 0.4301, "num_input_tokens_seen": 16077808, "step": 16825 }, { "epoch": 1.3728689126356146, "grad_norm": 5.733433723449707, "learning_rate": 4.9788634484602985e-05, "loss": 0.35, "num_input_tokens_seen": 16083200, "step": 16830 }, { "epoch": 1.3732767762460234, "grad_norm": 18.65199089050293, "learning_rate": 4.9788172379603076e-05, "loss": 0.4057, "num_input_tokens_seen": 16088384, "step": 16835 }, { "epoch": 1.373684639856432, "grad_norm": 5.87918758392334, "learning_rate": 4.978770977215744e-05, "loss": 0.4961, "num_input_tokens_seen": 16092704, "step": 16840 }, { "epoch": 1.3740925034668408, "grad_norm": 8.547685623168945, "learning_rate": 4.978724666227547e-05, "loss": 0.3656, "num_input_tokens_seen": 16097808, "step": 16845 }, { "epoch": 1.3745003670772493, "grad_norm": 3.7988953590393066, "learning_rate": 4.978678304996654e-05, "loss": 0.4349, "num_input_tokens_seen": 16102896, "step": 16850 }, { "epoch": 1.374908230687658, "grad_norm": 4.153595924377441, "learning_rate": 4.978631893524004e-05, "loss": 0.3487, "num_input_tokens_seen": 16107936, "step": 16855 }, { "epoch": 1.3753160942980667, "grad_norm": 4.905442714691162, "learning_rate": 4.978585431810539e-05, "loss": 0.2645, "num_input_tokens_seen": 16112080, "step": 16860 }, { "epoch": 1.3757239579084755, "grad_norm": 14.857789993286133, "learning_rate": 4.978538919857201e-05, "loss": 0.5819, "num_input_tokens_seen": 16116256, "step": 16865 }, { "epoch": 1.376131821518884, "grad_norm": 30.923276901245117, "learning_rate": 4.9784923576649325e-05, "loss": 0.2627, "num_input_tokens_seen": 16121616, "step": 16870 }, { "epoch": 1.3765396851292928, "grad_norm": 13.041625022888184, "learning_rate": 4.9784457452346764e-05, "loss": 0.2309, "num_input_tokens_seen": 16127168, "step": 16875 }, { "epoch": 1.3769475487397014, "grad_norm": 13.891817092895508, "learning_rate": 4.978399082567379e-05, "loss": 0.5193, "num_input_tokens_seen": 16131632, "step": 16880 }, { "epoch": 1.3773554123501102, "grad_norm": 9.806975364685059, "learning_rate": 4.978352369663985e-05, "loss": 0.5919, "num_input_tokens_seen": 16136224, "step": 16885 }, { "epoch": 1.3777632759605187, "grad_norm": 5.723084926605225, "learning_rate": 4.978305606525441e-05, "loss": 0.335, "num_input_tokens_seen": 16140384, "step": 16890 }, { "epoch": 1.3781711395709275, "grad_norm": 1.0283797979354858, "learning_rate": 4.978258793152696e-05, "loss": 0.2446, "num_input_tokens_seen": 16145088, "step": 16895 }, { "epoch": 1.378579003181336, "grad_norm": 86.44513702392578, "learning_rate": 4.978211929546698e-05, "loss": 0.5679, "num_input_tokens_seen": 16150496, "step": 16900 }, { "epoch": 1.3789868667917449, "grad_norm": 2.842395067214966, "learning_rate": 4.9781650157083976e-05, "loss": 0.3672, "num_input_tokens_seen": 16155024, "step": 16905 }, { "epoch": 1.3793947304021534, "grad_norm": 38.39208221435547, "learning_rate": 4.978118051638745e-05, "loss": 0.535, "num_input_tokens_seen": 16159664, "step": 16910 }, { "epoch": 1.3798025940125622, "grad_norm": 33.5463981628418, "learning_rate": 4.978071037338693e-05, "loss": 1.299, "num_input_tokens_seen": 16164480, "step": 16915 }, { "epoch": 1.380210457622971, "grad_norm": 10.570138931274414, "learning_rate": 4.978023972809194e-05, "loss": 0.7992, "num_input_tokens_seen": 16169360, "step": 16920 }, { "epoch": 1.3806183212333796, "grad_norm": 1.7446919679641724, "learning_rate": 4.977976858051202e-05, "loss": 0.2191, "num_input_tokens_seen": 16175456, "step": 16925 }, { "epoch": 1.3810261848437881, "grad_norm": 16.63631820678711, "learning_rate": 4.977929693065672e-05, "loss": 0.1173, "num_input_tokens_seen": 16180016, "step": 16930 }, { "epoch": 1.381434048454197, "grad_norm": 0.1537170112133026, "learning_rate": 4.977882477853559e-05, "loss": 0.0354, "num_input_tokens_seen": 16184576, "step": 16935 }, { "epoch": 1.3818419120646057, "grad_norm": 0.22656790912151337, "learning_rate": 4.977835212415823e-05, "loss": 0.179, "num_input_tokens_seen": 16189456, "step": 16940 }, { "epoch": 1.3822497756750143, "grad_norm": 0.07958007603883743, "learning_rate": 4.977787896753419e-05, "loss": 0.0172, "num_input_tokens_seen": 16193408, "step": 16945 }, { "epoch": 1.3826576392854228, "grad_norm": 0.6217358708381653, "learning_rate": 4.977740530867307e-05, "loss": 0.151, "num_input_tokens_seen": 16197520, "step": 16950 }, { "epoch": 1.3830655028958316, "grad_norm": 0.04318803548812866, "learning_rate": 4.9776931147584486e-05, "loss": 0.2119, "num_input_tokens_seen": 16202864, "step": 16955 }, { "epoch": 1.3834733665062404, "grad_norm": 0.06048263609409332, "learning_rate": 4.977645648427802e-05, "loss": 0.1703, "num_input_tokens_seen": 16207712, "step": 16960 }, { "epoch": 1.383881230116649, "grad_norm": 10.054442405700684, "learning_rate": 4.977598131876332e-05, "loss": 0.185, "num_input_tokens_seen": 16213264, "step": 16965 }, { "epoch": 1.3842890937270576, "grad_norm": 7.95301628112793, "learning_rate": 4.977550565105e-05, "loss": 0.3711, "num_input_tokens_seen": 16217984, "step": 16970 }, { "epoch": 1.3846969573374663, "grad_norm": 0.042788486927747726, "learning_rate": 4.977502948114772e-05, "loss": 0.0175, "num_input_tokens_seen": 16223120, "step": 16975 }, { "epoch": 1.3851048209478751, "grad_norm": 1.5345263481140137, "learning_rate": 4.97745528090661e-05, "loss": 0.2216, "num_input_tokens_seen": 16227632, "step": 16980 }, { "epoch": 1.3855126845582837, "grad_norm": 12.183478355407715, "learning_rate": 4.977407563481484e-05, "loss": 0.1533, "num_input_tokens_seen": 16232912, "step": 16985 }, { "epoch": 1.3859205481686923, "grad_norm": 3.1879124641418457, "learning_rate": 4.977359795840358e-05, "loss": 0.1681, "num_input_tokens_seen": 16238080, "step": 16990 }, { "epoch": 1.386328411779101, "grad_norm": 0.17280472815036774, "learning_rate": 4.977311977984202e-05, "loss": 0.0126, "num_input_tokens_seen": 16242336, "step": 16995 }, { "epoch": 1.3867362753895098, "grad_norm": 9.932579040527344, "learning_rate": 4.977264109913986e-05, "loss": 0.3664, "num_input_tokens_seen": 16247504, "step": 17000 }, { "epoch": 1.3871441389999184, "grad_norm": 0.40174615383148193, "learning_rate": 4.977216191630679e-05, "loss": 0.0989, "num_input_tokens_seen": 16252624, "step": 17005 }, { "epoch": 1.3875520026103272, "grad_norm": 1.7463507652282715, "learning_rate": 4.9771682231352515e-05, "loss": 0.2283, "num_input_tokens_seen": 16256608, "step": 17010 }, { "epoch": 1.3879598662207357, "grad_norm": 0.24406929314136505, "learning_rate": 4.9771202044286766e-05, "loss": 0.2517, "num_input_tokens_seen": 16260896, "step": 17015 }, { "epoch": 1.3883677298311445, "grad_norm": 0.04602167755365372, "learning_rate": 4.977072135511929e-05, "loss": 0.2582, "num_input_tokens_seen": 16265536, "step": 17020 }, { "epoch": 1.388775593441553, "grad_norm": 0.03295042738318443, "learning_rate": 4.977024016385981e-05, "loss": 0.3492, "num_input_tokens_seen": 16271168, "step": 17025 }, { "epoch": 1.389183457051962, "grad_norm": 0.07714510709047318, "learning_rate": 4.976975847051808e-05, "loss": 0.0126, "num_input_tokens_seen": 16276272, "step": 17030 }, { "epoch": 1.3895913206623705, "grad_norm": 0.04528915137052536, "learning_rate": 4.976927627510388e-05, "loss": 0.0193, "num_input_tokens_seen": 16280528, "step": 17035 }, { "epoch": 1.3899991842727792, "grad_norm": 0.7432146072387695, "learning_rate": 4.9768793577626974e-05, "loss": 0.0146, "num_input_tokens_seen": 16285344, "step": 17040 }, { "epoch": 1.3904070478831878, "grad_norm": 0.14200803637504578, "learning_rate": 4.976831037809714e-05, "loss": 0.3484, "num_input_tokens_seen": 16289840, "step": 17045 }, { "epoch": 1.3908149114935966, "grad_norm": 0.04204944521188736, "learning_rate": 4.9767826676524184e-05, "loss": 0.0105, "num_input_tokens_seen": 16294256, "step": 17050 }, { "epoch": 1.3912227751040052, "grad_norm": 0.041638996452093124, "learning_rate": 4.976734247291791e-05, "loss": 0.0096, "num_input_tokens_seen": 16299024, "step": 17055 }, { "epoch": 1.391630638714414, "grad_norm": 0.1998320072889328, "learning_rate": 4.976685776728813e-05, "loss": 0.293, "num_input_tokens_seen": 16304544, "step": 17060 }, { "epoch": 1.3920385023248225, "grad_norm": 0.0433361791074276, "learning_rate": 4.976637255964466e-05, "loss": 0.0041, "num_input_tokens_seen": 16308000, "step": 17065 }, { "epoch": 1.3924463659352313, "grad_norm": 0.0684359148144722, "learning_rate": 4.9765886849997345e-05, "loss": 0.1662, "num_input_tokens_seen": 16312176, "step": 17070 }, { "epoch": 1.3928542295456399, "grad_norm": 2.3578572273254395, "learning_rate": 4.976540063835603e-05, "loss": 0.285, "num_input_tokens_seen": 16316240, "step": 17075 }, { "epoch": 1.3932620931560487, "grad_norm": 22.773723602294922, "learning_rate": 4.976491392473056e-05, "loss": 0.1442, "num_input_tokens_seen": 16321376, "step": 17080 }, { "epoch": 1.3936699567664572, "grad_norm": 5.098193168640137, "learning_rate": 4.976442670913082e-05, "loss": 0.3577, "num_input_tokens_seen": 16325872, "step": 17085 }, { "epoch": 1.394077820376866, "grad_norm": 2.4085352420806885, "learning_rate": 4.976393899156666e-05, "loss": 0.0223, "num_input_tokens_seen": 16331200, "step": 17090 }, { "epoch": 1.3944856839872748, "grad_norm": 0.08838895708322525, "learning_rate": 4.9763450772047996e-05, "loss": 0.0098, "num_input_tokens_seen": 16335968, "step": 17095 }, { "epoch": 1.3948935475976834, "grad_norm": 0.10608473420143127, "learning_rate": 4.976296205058469e-05, "loss": 0.1641, "num_input_tokens_seen": 16340672, "step": 17100 }, { "epoch": 1.395301411208092, "grad_norm": 2.704179286956787, "learning_rate": 4.9762472827186674e-05, "loss": 0.5135, "num_input_tokens_seen": 16345984, "step": 17105 }, { "epoch": 1.3957092748185007, "grad_norm": 0.0791001170873642, "learning_rate": 4.976198310186385e-05, "loss": 0.0074, "num_input_tokens_seen": 16351280, "step": 17110 }, { "epoch": 1.3961171384289095, "grad_norm": 0.03218761086463928, "learning_rate": 4.9761492874626156e-05, "loss": 0.4229, "num_input_tokens_seen": 16356112, "step": 17115 }, { "epoch": 1.396525002039318, "grad_norm": 30.729520797729492, "learning_rate": 4.9761002145483525e-05, "loss": 0.3984, "num_input_tokens_seen": 16360832, "step": 17120 }, { "epoch": 1.3969328656497266, "grad_norm": 8.331297874450684, "learning_rate": 4.97605109144459e-05, "loss": 0.2177, "num_input_tokens_seen": 16365984, "step": 17125 }, { "epoch": 1.3973407292601354, "grad_norm": 5.4779133796691895, "learning_rate": 4.976001918152324e-05, "loss": 0.3052, "num_input_tokens_seen": 16370896, "step": 17130 }, { "epoch": 1.3977485928705442, "grad_norm": 14.611783027648926, "learning_rate": 4.975952694672551e-05, "loss": 0.2169, "num_input_tokens_seen": 16375568, "step": 17135 }, { "epoch": 1.3981564564809528, "grad_norm": 9.824945449829102, "learning_rate": 4.9759034210062696e-05, "loss": 0.0901, "num_input_tokens_seen": 16380576, "step": 17140 }, { "epoch": 1.3985643200913613, "grad_norm": 0.1883082091808319, "learning_rate": 4.9758540971544775e-05, "loss": 0.0116, "num_input_tokens_seen": 16384880, "step": 17145 }, { "epoch": 1.3989721837017701, "grad_norm": 0.8855413198471069, "learning_rate": 4.975804723118175e-05, "loss": 0.0173, "num_input_tokens_seen": 16389216, "step": 17150 }, { "epoch": 1.399380047312179, "grad_norm": 0.43005529046058655, "learning_rate": 4.975755298898362e-05, "loss": 0.1045, "num_input_tokens_seen": 16393632, "step": 17155 }, { "epoch": 1.3997879109225875, "grad_norm": 0.1122979149222374, "learning_rate": 4.975705824496043e-05, "loss": 0.179, "num_input_tokens_seen": 16399072, "step": 17160 }, { "epoch": 1.400195774532996, "grad_norm": 0.22857841849327087, "learning_rate": 4.975656299912217e-05, "loss": 0.2597, "num_input_tokens_seen": 16403120, "step": 17165 }, { "epoch": 1.4006036381434048, "grad_norm": 10.820076942443848, "learning_rate": 4.975606725147891e-05, "loss": 0.491, "num_input_tokens_seen": 16408080, "step": 17170 }, { "epoch": 1.4010115017538136, "grad_norm": 0.03296218067407608, "learning_rate": 4.975557100204068e-05, "loss": 0.0286, "num_input_tokens_seen": 16413200, "step": 17175 }, { "epoch": 1.4014193653642222, "grad_norm": 0.25990530848503113, "learning_rate": 4.9755074250817545e-05, "loss": 0.041, "num_input_tokens_seen": 16417744, "step": 17180 }, { "epoch": 1.4018272289746307, "grad_norm": 9.176572799682617, "learning_rate": 4.975457699781958e-05, "loss": 0.9374, "num_input_tokens_seen": 16422656, "step": 17185 }, { "epoch": 1.4022350925850395, "grad_norm": 0.8865094780921936, "learning_rate": 4.975407924305685e-05, "loss": 0.4738, "num_input_tokens_seen": 16427248, "step": 17190 }, { "epoch": 1.4026429561954483, "grad_norm": 0.48500704765319824, "learning_rate": 4.975358098653946e-05, "loss": 0.0987, "num_input_tokens_seen": 16431808, "step": 17195 }, { "epoch": 1.4030508198058569, "grad_norm": 3.6019492149353027, "learning_rate": 4.9753082228277494e-05, "loss": 0.3381, "num_input_tokens_seen": 16436288, "step": 17200 }, { "epoch": 1.4034586834162657, "grad_norm": 0.14017382264137268, "learning_rate": 4.975258296828108e-05, "loss": 0.0501, "num_input_tokens_seen": 16440976, "step": 17205 }, { "epoch": 1.4038665470266742, "grad_norm": 0.09520106762647629, "learning_rate": 4.9752083206560315e-05, "loss": 0.487, "num_input_tokens_seen": 16445024, "step": 17210 }, { "epoch": 1.404274410637083, "grad_norm": 0.39277952909469604, "learning_rate": 4.975158294312535e-05, "loss": 0.01, "num_input_tokens_seen": 16450272, "step": 17215 }, { "epoch": 1.4046822742474916, "grad_norm": 5.697965621948242, "learning_rate": 4.975108217798631e-05, "loss": 0.2598, "num_input_tokens_seen": 16454464, "step": 17220 }, { "epoch": 1.4050901378579004, "grad_norm": 4.109641075134277, "learning_rate": 4.975058091115336e-05, "loss": 0.0629, "num_input_tokens_seen": 16459776, "step": 17225 }, { "epoch": 1.405498001468309, "grad_norm": 0.1693934053182602, "learning_rate": 4.975007914263664e-05, "loss": 0.0188, "num_input_tokens_seen": 16464448, "step": 17230 }, { "epoch": 1.4059058650787177, "grad_norm": 0.11288553476333618, "learning_rate": 4.9749576872446344e-05, "loss": 0.1039, "num_input_tokens_seen": 16469552, "step": 17235 }, { "epoch": 1.4063137286891263, "grad_norm": 0.26929008960723877, "learning_rate": 4.974907410059263e-05, "loss": 0.2449, "num_input_tokens_seen": 16473344, "step": 17240 }, { "epoch": 1.406721592299535, "grad_norm": 0.15374216437339783, "learning_rate": 4.974857082708572e-05, "loss": 0.4717, "num_input_tokens_seen": 16478224, "step": 17245 }, { "epoch": 1.4071294559099436, "grad_norm": 18.542041778564453, "learning_rate": 4.974806705193577e-05, "loss": 0.0416, "num_input_tokens_seen": 16482944, "step": 17250 }, { "epoch": 1.4075373195203524, "grad_norm": 0.05047396570444107, "learning_rate": 4.9747562775153034e-05, "loss": 0.0066, "num_input_tokens_seen": 16487872, "step": 17255 }, { "epoch": 1.407945183130761, "grad_norm": 0.14456826448440552, "learning_rate": 4.9747057996747714e-05, "loss": 0.2062, "num_input_tokens_seen": 16492208, "step": 17260 }, { "epoch": 1.4083530467411698, "grad_norm": 0.024206317961215973, "learning_rate": 4.9746552716730045e-05, "loss": 0.0047, "num_input_tokens_seen": 16496768, "step": 17265 }, { "epoch": 1.4087609103515784, "grad_norm": 4.824012279510498, "learning_rate": 4.9746046935110266e-05, "loss": 0.4003, "num_input_tokens_seen": 16501312, "step": 17270 }, { "epoch": 1.4091687739619871, "grad_norm": 0.04455108940601349, "learning_rate": 4.974554065189863e-05, "loss": 0.1915, "num_input_tokens_seen": 16505504, "step": 17275 }, { "epoch": 1.4095766375723957, "grad_norm": 3.0919740200042725, "learning_rate": 4.97450338671054e-05, "loss": 0.4741, "num_input_tokens_seen": 16510784, "step": 17280 }, { "epoch": 1.4099845011828045, "grad_norm": 7.175971031188965, "learning_rate": 4.974452658074085e-05, "loss": 0.1428, "num_input_tokens_seen": 16514848, "step": 17285 }, { "epoch": 1.4103923647932133, "grad_norm": 0.07819433510303497, "learning_rate": 4.974401879281526e-05, "loss": 0.3381, "num_input_tokens_seen": 16520400, "step": 17290 }, { "epoch": 1.4108002284036218, "grad_norm": 0.05548805370926857, "learning_rate": 4.974351050333892e-05, "loss": 0.3031, "num_input_tokens_seen": 16525248, "step": 17295 }, { "epoch": 1.4112080920140304, "grad_norm": 0.14894799888134003, "learning_rate": 4.9743001712322146e-05, "loss": 0.0059, "num_input_tokens_seen": 16529712, "step": 17300 }, { "epoch": 1.4116159556244392, "grad_norm": 0.19058319926261902, "learning_rate": 4.9742492419775236e-05, "loss": 0.2204, "num_input_tokens_seen": 16533552, "step": 17305 }, { "epoch": 1.412023819234848, "grad_norm": 2.805053234100342, "learning_rate": 4.974198262570851e-05, "loss": 0.2384, "num_input_tokens_seen": 16538256, "step": 17310 }, { "epoch": 1.4124316828452566, "grad_norm": 0.10350334644317627, "learning_rate": 4.974147233013232e-05, "loss": 0.008, "num_input_tokens_seen": 16543520, "step": 17315 }, { "epoch": 1.4128395464556651, "grad_norm": 0.2911554276943207, "learning_rate": 4.9740961533056994e-05, "loss": 0.1043, "num_input_tokens_seen": 16548080, "step": 17320 }, { "epoch": 1.413247410066074, "grad_norm": 0.042026083916425705, "learning_rate": 4.97404502344929e-05, "loss": 0.2965, "num_input_tokens_seen": 16553456, "step": 17325 }, { "epoch": 1.4136552736764827, "grad_norm": 1.8418214321136475, "learning_rate": 4.973993843445038e-05, "loss": 0.0205, "num_input_tokens_seen": 16558352, "step": 17330 }, { "epoch": 1.4140631372868913, "grad_norm": 9.872722625732422, "learning_rate": 4.973942613293983e-05, "loss": 0.5426, "num_input_tokens_seen": 16563232, "step": 17335 }, { "epoch": 1.4144710008972998, "grad_norm": 14.545231819152832, "learning_rate": 4.973891332997161e-05, "loss": 0.4822, "num_input_tokens_seen": 16568016, "step": 17340 }, { "epoch": 1.4148788645077086, "grad_norm": 0.07931163161993027, "learning_rate": 4.973840002555614e-05, "loss": 0.326, "num_input_tokens_seen": 16572960, "step": 17345 }, { "epoch": 1.4152867281181174, "grad_norm": 1.6884690523147583, "learning_rate": 4.973788621970382e-05, "loss": 0.0175, "num_input_tokens_seen": 16577904, "step": 17350 }, { "epoch": 1.415694591728526, "grad_norm": 0.3212142884731293, "learning_rate": 4.9737371912425046e-05, "loss": 0.0108, "num_input_tokens_seen": 16582880, "step": 17355 }, { "epoch": 1.4161024553389345, "grad_norm": 8.129899978637695, "learning_rate": 4.973685710373026e-05, "loss": 0.3275, "num_input_tokens_seen": 16588256, "step": 17360 }, { "epoch": 1.4165103189493433, "grad_norm": 6.238020896911621, "learning_rate": 4.973634179362989e-05, "loss": 0.1795, "num_input_tokens_seen": 16593392, "step": 17365 }, { "epoch": 1.416918182559752, "grad_norm": 0.037590473890304565, "learning_rate": 4.9735825982134385e-05, "loss": 0.0907, "num_input_tokens_seen": 16599024, "step": 17370 }, { "epoch": 1.4173260461701607, "grad_norm": 0.03930465131998062, "learning_rate": 4.9735309669254206e-05, "loss": 0.0052, "num_input_tokens_seen": 16603088, "step": 17375 }, { "epoch": 1.4177339097805695, "grad_norm": 1.21486234664917, "learning_rate": 4.97347928549998e-05, "loss": 0.0078, "num_input_tokens_seen": 16608096, "step": 17380 }, { "epoch": 1.418141773390978, "grad_norm": 0.012737388722598553, "learning_rate": 4.973427553938165e-05, "loss": 0.0035, "num_input_tokens_seen": 16613216, "step": 17385 }, { "epoch": 1.4185496370013868, "grad_norm": 1.0370124578475952, "learning_rate": 4.9733757722410254e-05, "loss": 0.4092, "num_input_tokens_seen": 16618064, "step": 17390 }, { "epoch": 1.4189575006117954, "grad_norm": 0.2722232937812805, "learning_rate": 4.97332394040961e-05, "loss": 0.1258, "num_input_tokens_seen": 16621920, "step": 17395 }, { "epoch": 1.4193653642222042, "grad_norm": 6.606302738189697, "learning_rate": 4.9732720584449685e-05, "loss": 0.0176, "num_input_tokens_seen": 16626848, "step": 17400 }, { "epoch": 1.4197732278326127, "grad_norm": 0.044211309403181076, "learning_rate": 4.973220126348154e-05, "loss": 0.0051, "num_input_tokens_seen": 16630800, "step": 17405 }, { "epoch": 1.4201810914430215, "grad_norm": 0.017718251794576645, "learning_rate": 4.973168144120218e-05, "loss": 0.3305, "num_input_tokens_seen": 16635104, "step": 17410 }, { "epoch": 1.42058895505343, "grad_norm": 8.42772102355957, "learning_rate": 4.973116111762215e-05, "loss": 0.1518, "num_input_tokens_seen": 16640336, "step": 17415 }, { "epoch": 1.4209968186638389, "grad_norm": 3.177159309387207, "learning_rate": 4.973064029275199e-05, "loss": 0.6556, "num_input_tokens_seen": 16644528, "step": 17420 }, { "epoch": 1.4214046822742474, "grad_norm": 0.03017403744161129, "learning_rate": 4.973011896660226e-05, "loss": 0.0064, "num_input_tokens_seen": 16649568, "step": 17425 }, { "epoch": 1.4218125458846562, "grad_norm": 0.021438106894493103, "learning_rate": 4.972959713918352e-05, "loss": 0.249, "num_input_tokens_seen": 16655072, "step": 17430 }, { "epoch": 1.4222204094950648, "grad_norm": 0.1476106196641922, "learning_rate": 4.972907481050637e-05, "loss": 0.1204, "num_input_tokens_seen": 16659904, "step": 17435 }, { "epoch": 1.4226282731054736, "grad_norm": 0.09505510330200195, "learning_rate": 4.9728551980581376e-05, "loss": 0.4627, "num_input_tokens_seen": 16664928, "step": 17440 }, { "epoch": 1.4230361367158821, "grad_norm": 0.12922999262809753, "learning_rate": 4.9728028649419136e-05, "loss": 0.4295, "num_input_tokens_seen": 16670320, "step": 17445 }, { "epoch": 1.423444000326291, "grad_norm": 3.9949591159820557, "learning_rate": 4.9727504817030267e-05, "loss": 0.109, "num_input_tokens_seen": 16674864, "step": 17450 }, { "epoch": 1.4238518639366995, "grad_norm": 11.363914489746094, "learning_rate": 4.972698048342539e-05, "loss": 0.3199, "num_input_tokens_seen": 16678944, "step": 17455 }, { "epoch": 1.4242597275471083, "grad_norm": 8.329444885253906, "learning_rate": 4.972645564861511e-05, "loss": 0.1818, "num_input_tokens_seen": 16683456, "step": 17460 }, { "epoch": 1.4246675911575168, "grad_norm": 1.5792765617370605, "learning_rate": 4.972593031261009e-05, "loss": 0.2035, "num_input_tokens_seen": 16689104, "step": 17465 }, { "epoch": 1.4250754547679256, "grad_norm": 3.2282328605651855, "learning_rate": 4.9725404475420966e-05, "loss": 0.2704, "num_input_tokens_seen": 16694608, "step": 17470 }, { "epoch": 1.4254833183783342, "grad_norm": 14.843925476074219, "learning_rate": 4.97248781370584e-05, "loss": 0.3048, "num_input_tokens_seen": 16700224, "step": 17475 }, { "epoch": 1.425891181988743, "grad_norm": 0.05204574391245842, "learning_rate": 4.972435129753307e-05, "loss": 0.2833, "num_input_tokens_seen": 16705600, "step": 17480 }, { "epoch": 1.4262990455991518, "grad_norm": 0.2023276686668396, "learning_rate": 4.972382395685563e-05, "loss": 0.27, "num_input_tokens_seen": 16710672, "step": 17485 }, { "epoch": 1.4267069092095603, "grad_norm": 0.060193099081516266, "learning_rate": 4.97232961150368e-05, "loss": 0.0124, "num_input_tokens_seen": 16716032, "step": 17490 }, { "epoch": 1.427114772819969, "grad_norm": 11.127422332763672, "learning_rate": 4.9722767772087256e-05, "loss": 0.2161, "num_input_tokens_seen": 16720576, "step": 17495 }, { "epoch": 1.4275226364303777, "grad_norm": 8.087296485900879, "learning_rate": 4.972223892801771e-05, "loss": 0.1917, "num_input_tokens_seen": 16724784, "step": 17500 }, { "epoch": 1.4279305000407865, "grad_norm": 0.23976387083530426, "learning_rate": 4.9721709582838885e-05, "loss": 0.0195, "num_input_tokens_seen": 16730400, "step": 17505 }, { "epoch": 1.428338363651195, "grad_norm": 0.6220074892044067, "learning_rate": 4.972117973656152e-05, "loss": 0.2013, "num_input_tokens_seen": 16735424, "step": 17510 }, { "epoch": 1.4287462272616036, "grad_norm": 0.029624471440911293, "learning_rate": 4.972064938919634e-05, "loss": 0.0085, "num_input_tokens_seen": 16740256, "step": 17515 }, { "epoch": 1.4291540908720124, "grad_norm": 5.517033100128174, "learning_rate": 4.97201185407541e-05, "loss": 0.3169, "num_input_tokens_seen": 16744928, "step": 17520 }, { "epoch": 1.4295619544824212, "grad_norm": 0.03207021206617355, "learning_rate": 4.971958719124557e-05, "loss": 0.0607, "num_input_tokens_seen": 16749232, "step": 17525 }, { "epoch": 1.4299698180928297, "grad_norm": 0.10326128453016281, "learning_rate": 4.971905534068151e-05, "loss": 0.187, "num_input_tokens_seen": 16754352, "step": 17530 }, { "epoch": 1.4303776817032383, "grad_norm": 0.4809122085571289, "learning_rate": 4.971852298907269e-05, "loss": 0.1668, "num_input_tokens_seen": 16759184, "step": 17535 }, { "epoch": 1.430785545313647, "grad_norm": 2.229212760925293, "learning_rate": 4.971799013642992e-05, "loss": 0.3803, "num_input_tokens_seen": 16764704, "step": 17540 }, { "epoch": 1.4311934089240559, "grad_norm": 0.03329102322459221, "learning_rate": 4.9717456782763996e-05, "loss": 0.0066, "num_input_tokens_seen": 16768064, "step": 17545 }, { "epoch": 1.4316012725344645, "grad_norm": 0.1357954889535904, "learning_rate": 4.971692292808573e-05, "loss": 0.1654, "num_input_tokens_seen": 16773072, "step": 17550 }, { "epoch": 1.432009136144873, "grad_norm": 0.05457673594355583, "learning_rate": 4.971638857240594e-05, "loss": 0.0053, "num_input_tokens_seen": 16778160, "step": 17555 }, { "epoch": 1.4324169997552818, "grad_norm": 0.30797097086906433, "learning_rate": 4.971585371573544e-05, "loss": 0.2727, "num_input_tokens_seen": 16783344, "step": 17560 }, { "epoch": 1.4328248633656906, "grad_norm": 0.09947561472654343, "learning_rate": 4.97153183580851e-05, "loss": 0.0276, "num_input_tokens_seen": 16787872, "step": 17565 }, { "epoch": 1.4332327269760992, "grad_norm": 0.07118216156959534, "learning_rate": 4.9714782499465755e-05, "loss": 0.029, "num_input_tokens_seen": 16792384, "step": 17570 }, { "epoch": 1.433640590586508, "grad_norm": 0.03160784766077995, "learning_rate": 4.9714246139888276e-05, "loss": 0.0212, "num_input_tokens_seen": 16796784, "step": 17575 }, { "epoch": 1.4340484541969165, "grad_norm": 0.09967941045761108, "learning_rate": 4.9713709279363524e-05, "loss": 0.0205, "num_input_tokens_seen": 16801952, "step": 17580 }, { "epoch": 1.4344563178073253, "grad_norm": 2.8981990814208984, "learning_rate": 4.971317191790239e-05, "loss": 0.1374, "num_input_tokens_seen": 16806864, "step": 17585 }, { "epoch": 1.4348641814177339, "grad_norm": 0.051585711538791656, "learning_rate": 4.971263405551576e-05, "loss": 0.0682, "num_input_tokens_seen": 16811744, "step": 17590 }, { "epoch": 1.4352720450281427, "grad_norm": 0.01109510287642479, "learning_rate": 4.9712095692214536e-05, "loss": 0.3189, "num_input_tokens_seen": 16816016, "step": 17595 }, { "epoch": 1.4356799086385512, "grad_norm": 0.09288172423839569, "learning_rate": 4.9711556828009644e-05, "loss": 0.0019, "num_input_tokens_seen": 16820560, "step": 17600 }, { "epoch": 1.43608777224896, "grad_norm": 0.01192405167967081, "learning_rate": 4.971101746291198e-05, "loss": 0.2953, "num_input_tokens_seen": 16825136, "step": 17605 }, { "epoch": 1.4364956358593686, "grad_norm": 0.01964900642633438, "learning_rate": 4.9710477596932505e-05, "loss": 0.2154, "num_input_tokens_seen": 16830704, "step": 17610 }, { "epoch": 1.4369034994697774, "grad_norm": 0.1828071027994156, "learning_rate": 4.9709937230082146e-05, "loss": 0.1584, "num_input_tokens_seen": 16835280, "step": 17615 }, { "epoch": 1.437311363080186, "grad_norm": 0.21322086453437805, "learning_rate": 4.970939636237185e-05, "loss": 0.0739, "num_input_tokens_seen": 16840608, "step": 17620 }, { "epoch": 1.4377192266905947, "grad_norm": 0.053286392241716385, "learning_rate": 4.9708854993812605e-05, "loss": 0.2546, "num_input_tokens_seen": 16845008, "step": 17625 }, { "epoch": 1.4381270903010033, "grad_norm": 0.090025395154953, "learning_rate": 4.970831312441536e-05, "loss": 0.009, "num_input_tokens_seen": 16850896, "step": 17630 }, { "epoch": 1.438534953911412, "grad_norm": 0.0461403988301754, "learning_rate": 4.9707770754191106e-05, "loss": 0.0048, "num_input_tokens_seen": 16855424, "step": 17635 }, { "epoch": 1.4389428175218206, "grad_norm": 14.148781776428223, "learning_rate": 4.970722788315084e-05, "loss": 0.0586, "num_input_tokens_seen": 16860448, "step": 17640 }, { "epoch": 1.4393506811322294, "grad_norm": 7.699225902557373, "learning_rate": 4.970668451130557e-05, "loss": 0.0369, "num_input_tokens_seen": 16865040, "step": 17645 }, { "epoch": 1.439758544742638, "grad_norm": 0.26933571696281433, "learning_rate": 4.9706140638666296e-05, "loss": 0.0073, "num_input_tokens_seen": 16869344, "step": 17650 }, { "epoch": 1.4401664083530468, "grad_norm": 0.06036584824323654, "learning_rate": 4.970559626524405e-05, "loss": 0.2344, "num_input_tokens_seen": 16874592, "step": 17655 }, { "epoch": 1.4405742719634556, "grad_norm": 6.6863274574279785, "learning_rate": 4.970505139104986e-05, "loss": 0.1458, "num_input_tokens_seen": 16880224, "step": 17660 }, { "epoch": 1.4409821355738641, "grad_norm": 11.935131072998047, "learning_rate": 4.9704506016094786e-05, "loss": 0.1413, "num_input_tokens_seen": 16885088, "step": 17665 }, { "epoch": 1.4413899991842727, "grad_norm": 0.1341041475534439, "learning_rate": 4.970396014038987e-05, "loss": 0.3567, "num_input_tokens_seen": 16889744, "step": 17670 }, { "epoch": 1.4417978627946815, "grad_norm": 8.221360206604004, "learning_rate": 4.9703413763946176e-05, "loss": 0.8281, "num_input_tokens_seen": 16894400, "step": 17675 }, { "epoch": 1.4422057264050903, "grad_norm": 2.869941473007202, "learning_rate": 4.9702866886774786e-05, "loss": 0.5357, "num_input_tokens_seen": 16899408, "step": 17680 }, { "epoch": 1.4426135900154988, "grad_norm": 0.015992656350135803, "learning_rate": 4.970231950888678e-05, "loss": 0.0264, "num_input_tokens_seen": 16903536, "step": 17685 }, { "epoch": 1.4430214536259074, "grad_norm": 0.06987165659666061, "learning_rate": 4.9701771630293255e-05, "loss": 0.0112, "num_input_tokens_seen": 16907744, "step": 17690 }, { "epoch": 1.4434293172363162, "grad_norm": 10.32866382598877, "learning_rate": 4.970122325100531e-05, "loss": 0.2097, "num_input_tokens_seen": 16912464, "step": 17695 }, { "epoch": 1.443837180846725, "grad_norm": 0.09442561119794846, "learning_rate": 4.9700674371034074e-05, "loss": 0.2252, "num_input_tokens_seen": 16917072, "step": 17700 }, { "epoch": 1.4442450444571335, "grad_norm": 0.07328007370233536, "learning_rate": 4.970012499039066e-05, "loss": 0.379, "num_input_tokens_seen": 16921984, "step": 17705 }, { "epoch": 1.444652908067542, "grad_norm": 2.483884572982788, "learning_rate": 4.969957510908621e-05, "loss": 0.0863, "num_input_tokens_seen": 16926864, "step": 17710 }, { "epoch": 1.4450607716779509, "grad_norm": 0.06334403157234192, "learning_rate": 4.969902472713187e-05, "loss": 0.3103, "num_input_tokens_seen": 16930176, "step": 17715 }, { "epoch": 1.4454686352883597, "grad_norm": 0.0356796570122242, "learning_rate": 4.9698473844538786e-05, "loss": 0.0113, "num_input_tokens_seen": 16933904, "step": 17720 }, { "epoch": 1.4458764988987682, "grad_norm": 6.317897319793701, "learning_rate": 4.9697922461318135e-05, "loss": 0.2675, "num_input_tokens_seen": 16938992, "step": 17725 }, { "epoch": 1.4462843625091768, "grad_norm": 0.043059270828962326, "learning_rate": 4.96973705774811e-05, "loss": 0.1128, "num_input_tokens_seen": 16944224, "step": 17730 }, { "epoch": 1.4466922261195856, "grad_norm": 3.5135512351989746, "learning_rate": 4.969681819303884e-05, "loss": 0.146, "num_input_tokens_seen": 16948720, "step": 17735 }, { "epoch": 1.4471000897299944, "grad_norm": 1.8807436227798462, "learning_rate": 4.969626530800259e-05, "loss": 0.0115, "num_input_tokens_seen": 16954448, "step": 17740 }, { "epoch": 1.447507953340403, "grad_norm": 0.39042019844055176, "learning_rate": 4.969571192238352e-05, "loss": 0.3882, "num_input_tokens_seen": 16959424, "step": 17745 }, { "epoch": 1.4479158169508117, "grad_norm": 0.0479096919298172, "learning_rate": 4.969515803619287e-05, "loss": 0.0085, "num_input_tokens_seen": 16964528, "step": 17750 }, { "epoch": 1.4483236805612203, "grad_norm": 0.13833099603652954, "learning_rate": 4.9694603649441863e-05, "loss": 0.4474, "num_input_tokens_seen": 16968736, "step": 17755 }, { "epoch": 1.448731544171629, "grad_norm": 0.10291606187820435, "learning_rate": 4.969404876214173e-05, "loss": 0.0377, "num_input_tokens_seen": 16973616, "step": 17760 }, { "epoch": 1.4491394077820376, "grad_norm": 0.5104349255561829, "learning_rate": 4.969349337430372e-05, "loss": 0.2435, "num_input_tokens_seen": 16978912, "step": 17765 }, { "epoch": 1.4495472713924464, "grad_norm": 0.2858007252216339, "learning_rate": 4.9692937485939096e-05, "loss": 0.4316, "num_input_tokens_seen": 16983424, "step": 17770 }, { "epoch": 1.449955135002855, "grad_norm": 14.112544059753418, "learning_rate": 4.9692381097059114e-05, "loss": 0.1599, "num_input_tokens_seen": 16988800, "step": 17775 }, { "epoch": 1.4503629986132638, "grad_norm": 24.41695785522461, "learning_rate": 4.969182420767507e-05, "loss": 0.1674, "num_input_tokens_seen": 16993008, "step": 17780 }, { "epoch": 1.4507708622236724, "grad_norm": 2.0114476680755615, "learning_rate": 4.969126681779823e-05, "loss": 0.011, "num_input_tokens_seen": 16997936, "step": 17785 }, { "epoch": 1.4511787258340811, "grad_norm": 0.12631112337112427, "learning_rate": 4.969070892743992e-05, "loss": 0.2018, "num_input_tokens_seen": 17002832, "step": 17790 }, { "epoch": 1.4515865894444897, "grad_norm": 0.564950704574585, "learning_rate": 4.9690150536611416e-05, "loss": 0.1628, "num_input_tokens_seen": 17007600, "step": 17795 }, { "epoch": 1.4519944530548985, "grad_norm": 9.978111267089844, "learning_rate": 4.968959164532406e-05, "loss": 0.2898, "num_input_tokens_seen": 17012544, "step": 17800 }, { "epoch": 1.452402316665307, "grad_norm": 0.05986715480685234, "learning_rate": 4.9689032253589175e-05, "loss": 0.0278, "num_input_tokens_seen": 17017440, "step": 17805 }, { "epoch": 1.4528101802757158, "grad_norm": 8.136107444763184, "learning_rate": 4.968847236141809e-05, "loss": 0.5494, "num_input_tokens_seen": 17022576, "step": 17810 }, { "epoch": 1.4532180438861244, "grad_norm": 0.05365322157740593, "learning_rate": 4.968791196882216e-05, "loss": 0.2687, "num_input_tokens_seen": 17027712, "step": 17815 }, { "epoch": 1.4536259074965332, "grad_norm": 2.888864755630493, "learning_rate": 4.968735107581275e-05, "loss": 0.0857, "num_input_tokens_seen": 17032896, "step": 17820 }, { "epoch": 1.4540337711069418, "grad_norm": 0.02901296690106392, "learning_rate": 4.968678968240122e-05, "loss": 0.1541, "num_input_tokens_seen": 17037440, "step": 17825 }, { "epoch": 1.4544416347173506, "grad_norm": 0.04384608566761017, "learning_rate": 4.968622778859896e-05, "loss": 0.1777, "num_input_tokens_seen": 17042432, "step": 17830 }, { "epoch": 1.4548494983277591, "grad_norm": 0.748457133769989, "learning_rate": 4.968566539441735e-05, "loss": 0.2239, "num_input_tokens_seen": 17046720, "step": 17835 }, { "epoch": 1.455257361938168, "grad_norm": 0.016915014013648033, "learning_rate": 4.9685102499867795e-05, "loss": 0.0136, "num_input_tokens_seen": 17052000, "step": 17840 }, { "epoch": 1.4556652255485765, "grad_norm": 2.207092046737671, "learning_rate": 4.968453910496169e-05, "loss": 0.0569, "num_input_tokens_seen": 17056592, "step": 17845 }, { "epoch": 1.4560730891589853, "grad_norm": 0.19179323315620422, "learning_rate": 4.9683975209710476e-05, "loss": 0.1761, "num_input_tokens_seen": 17062176, "step": 17850 }, { "epoch": 1.456480952769394, "grad_norm": 3.7278764247894287, "learning_rate": 4.9683410814125575e-05, "loss": 0.3299, "num_input_tokens_seen": 17067344, "step": 17855 }, { "epoch": 1.4568888163798026, "grad_norm": 0.06013714149594307, "learning_rate": 4.9682845918218424e-05, "loss": 0.1654, "num_input_tokens_seen": 17072480, "step": 17860 }, { "epoch": 1.4572966799902112, "grad_norm": 0.0631699487566948, "learning_rate": 4.968228052200047e-05, "loss": 0.0182, "num_input_tokens_seen": 17076816, "step": 17865 }, { "epoch": 1.45770454360062, "grad_norm": 2.4598336219787598, "learning_rate": 4.968171462548318e-05, "loss": 0.4244, "num_input_tokens_seen": 17082576, "step": 17870 }, { "epoch": 1.4581124072110287, "grad_norm": 0.14398294687271118, "learning_rate": 4.9681148228678024e-05, "loss": 0.0116, "num_input_tokens_seen": 17087888, "step": 17875 }, { "epoch": 1.4585202708214373, "grad_norm": 1.5822595357894897, "learning_rate": 4.968058133159647e-05, "loss": 0.1416, "num_input_tokens_seen": 17093072, "step": 17880 }, { "epoch": 1.4589281344318459, "grad_norm": 0.062067750841379166, "learning_rate": 4.9680013934250033e-05, "loss": 0.0864, "num_input_tokens_seen": 17097872, "step": 17885 }, { "epoch": 1.4593359980422547, "grad_norm": 0.26874232292175293, "learning_rate": 4.9679446036650204e-05, "loss": 0.2705, "num_input_tokens_seen": 17102592, "step": 17890 }, { "epoch": 1.4597438616526635, "grad_norm": 0.20434311032295227, "learning_rate": 4.9678877638808485e-05, "loss": 0.277, "num_input_tokens_seen": 17107888, "step": 17895 }, { "epoch": 1.460151725263072, "grad_norm": 0.09898677468299866, "learning_rate": 4.96783087407364e-05, "loss": 0.4955, "num_input_tokens_seen": 17112896, "step": 17900 }, { "epoch": 1.4605595888734806, "grad_norm": 0.08857458084821701, "learning_rate": 4.967773934244549e-05, "loss": 0.2319, "num_input_tokens_seen": 17118080, "step": 17905 }, { "epoch": 1.4609674524838894, "grad_norm": 4.394218444824219, "learning_rate": 4.967716944394728e-05, "loss": 0.4091, "num_input_tokens_seen": 17122288, "step": 17910 }, { "epoch": 1.4613753160942982, "grad_norm": 4.2425103187561035, "learning_rate": 4.9676599045253334e-05, "loss": 0.1779, "num_input_tokens_seen": 17127248, "step": 17915 }, { "epoch": 1.4617831797047067, "grad_norm": 0.42606180906295776, "learning_rate": 4.9676028146375215e-05, "loss": 0.0831, "num_input_tokens_seen": 17132144, "step": 17920 }, { "epoch": 1.4621910433151153, "grad_norm": 0.07580097764730453, "learning_rate": 4.9675456747324486e-05, "loss": 0.0661, "num_input_tokens_seen": 17137088, "step": 17925 }, { "epoch": 1.462598906925524, "grad_norm": 0.08745155483484268, "learning_rate": 4.967488484811274e-05, "loss": 0.1773, "num_input_tokens_seen": 17142736, "step": 17930 }, { "epoch": 1.4630067705359329, "grad_norm": 0.1747184544801712, "learning_rate": 4.9674312448751556e-05, "loss": 0.1224, "num_input_tokens_seen": 17147472, "step": 17935 }, { "epoch": 1.4634146341463414, "grad_norm": 0.4198273718357086, "learning_rate": 4.967373954925255e-05, "loss": 0.3223, "num_input_tokens_seen": 17151504, "step": 17940 }, { "epoch": 1.4638224977567502, "grad_norm": 0.2405976951122284, "learning_rate": 4.9673166149627324e-05, "loss": 0.0174, "num_input_tokens_seen": 17156000, "step": 17945 }, { "epoch": 1.4642303613671588, "grad_norm": 0.05600392073392868, "learning_rate": 4.9672592249887514e-05, "loss": 0.0073, "num_input_tokens_seen": 17160496, "step": 17950 }, { "epoch": 1.4646382249775676, "grad_norm": 0.09851295500993729, "learning_rate": 4.9672017850044733e-05, "loss": 0.0153, "num_input_tokens_seen": 17165696, "step": 17955 }, { "epoch": 1.4650460885879761, "grad_norm": 0.05386212095618248, "learning_rate": 4.967144295011064e-05, "loss": 0.0092, "num_input_tokens_seen": 17170560, "step": 17960 }, { "epoch": 1.465453952198385, "grad_norm": 0.21306298673152924, "learning_rate": 4.9670867550096876e-05, "loss": 0.187, "num_input_tokens_seen": 17175952, "step": 17965 }, { "epoch": 1.4658618158087935, "grad_norm": 0.10529454052448273, "learning_rate": 4.967029165001511e-05, "loss": 0.0267, "num_input_tokens_seen": 17180336, "step": 17970 }, { "epoch": 1.4662696794192023, "grad_norm": 0.2987322509288788, "learning_rate": 4.9669715249877014e-05, "loss": 0.4622, "num_input_tokens_seen": 17185504, "step": 17975 }, { "epoch": 1.4666775430296108, "grad_norm": 0.037681944668293, "learning_rate": 4.966913834969428e-05, "loss": 0.2595, "num_input_tokens_seen": 17190224, "step": 17980 }, { "epoch": 1.4670854066400196, "grad_norm": 0.07688453048467636, "learning_rate": 4.966856094947859e-05, "loss": 0.1398, "num_input_tokens_seen": 17195712, "step": 17985 }, { "epoch": 1.4674932702504282, "grad_norm": 4.747501373291016, "learning_rate": 4.966798304924165e-05, "loss": 0.233, "num_input_tokens_seen": 17200624, "step": 17990 }, { "epoch": 1.467901133860837, "grad_norm": 0.5791275501251221, "learning_rate": 4.966740464899518e-05, "loss": 0.2065, "num_input_tokens_seen": 17206128, "step": 17995 }, { "epoch": 1.4683089974712455, "grad_norm": 0.02531600557267666, "learning_rate": 4.96668257487509e-05, "loss": 0.2888, "num_input_tokens_seen": 17211120, "step": 18000 }, { "epoch": 1.4687168610816543, "grad_norm": 0.02494114451110363, "learning_rate": 4.966624634852054e-05, "loss": 0.0107, "num_input_tokens_seen": 17215472, "step": 18005 }, { "epoch": 1.469124724692063, "grad_norm": 0.16528812050819397, "learning_rate": 4.966566644831585e-05, "loss": 0.6827, "num_input_tokens_seen": 17220336, "step": 18010 }, { "epoch": 1.4695325883024717, "grad_norm": 0.05847376212477684, "learning_rate": 4.966508604814858e-05, "loss": 0.1311, "num_input_tokens_seen": 17223952, "step": 18015 }, { "epoch": 1.4699404519128803, "grad_norm": 1.412003517150879, "learning_rate": 4.966450514803049e-05, "loss": 0.1466, "num_input_tokens_seen": 17228512, "step": 18020 }, { "epoch": 1.470348315523289, "grad_norm": 0.11837160587310791, "learning_rate": 4.966392374797337e-05, "loss": 0.1956, "num_input_tokens_seen": 17232528, "step": 18025 }, { "epoch": 1.4707561791336978, "grad_norm": 0.027115046977996826, "learning_rate": 4.9663341847989e-05, "loss": 0.116, "num_input_tokens_seen": 17236384, "step": 18030 }, { "epoch": 1.4711640427441064, "grad_norm": 2.332404851913452, "learning_rate": 4.9662759448089167e-05, "loss": 0.3209, "num_input_tokens_seen": 17241888, "step": 18035 }, { "epoch": 1.471571906354515, "grad_norm": 0.34052199125289917, "learning_rate": 4.9662176548285675e-05, "loss": 0.2725, "num_input_tokens_seen": 17246384, "step": 18040 }, { "epoch": 1.4719797699649237, "grad_norm": 0.10029929131269455, "learning_rate": 4.9661593148590355e-05, "loss": 0.2722, "num_input_tokens_seen": 17251088, "step": 18045 }, { "epoch": 1.4723876335753325, "grad_norm": 0.07759091258049011, "learning_rate": 4.966100924901501e-05, "loss": 0.0997, "num_input_tokens_seen": 17255328, "step": 18050 }, { "epoch": 1.472795497185741, "grad_norm": 2.3000025749206543, "learning_rate": 4.9660424849571494e-05, "loss": 0.4117, "num_input_tokens_seen": 17259984, "step": 18055 }, { "epoch": 1.4732033607961497, "grad_norm": 1.7485295534133911, "learning_rate": 4.9659839950271635e-05, "loss": 0.2527, "num_input_tokens_seen": 17263344, "step": 18060 }, { "epoch": 1.4736112244065585, "grad_norm": 0.2214120775461197, "learning_rate": 4.965925455112731e-05, "loss": 0.0356, "num_input_tokens_seen": 17267888, "step": 18065 }, { "epoch": 1.4740190880169672, "grad_norm": 0.09369491040706635, "learning_rate": 4.965866865215036e-05, "loss": 0.0653, "num_input_tokens_seen": 17272928, "step": 18070 }, { "epoch": 1.4744269516273758, "grad_norm": 0.3226485252380371, "learning_rate": 4.9658082253352686e-05, "loss": 0.115, "num_input_tokens_seen": 17277648, "step": 18075 }, { "epoch": 1.4748348152377844, "grad_norm": 0.1380353420972824, "learning_rate": 4.965749535474616e-05, "loss": 0.0994, "num_input_tokens_seen": 17281552, "step": 18080 }, { "epoch": 1.4752426788481932, "grad_norm": 0.9987866282463074, "learning_rate": 4.9656907956342686e-05, "loss": 0.1386, "num_input_tokens_seen": 17286576, "step": 18085 }, { "epoch": 1.475650542458602, "grad_norm": 0.07089639455080032, "learning_rate": 4.965632005815416e-05, "loss": 0.2934, "num_input_tokens_seen": 17291776, "step": 18090 }, { "epoch": 1.4760584060690105, "grad_norm": 0.12054537981748581, "learning_rate": 4.9655731660192507e-05, "loss": 0.0994, "num_input_tokens_seen": 17297184, "step": 18095 }, { "epoch": 1.476466269679419, "grad_norm": 1.332135796546936, "learning_rate": 4.965514276246964e-05, "loss": 0.3602, "num_input_tokens_seen": 17302032, "step": 18100 }, { "epoch": 1.4768741332898279, "grad_norm": 12.194742202758789, "learning_rate": 4.965455336499751e-05, "loss": 0.0502, "num_input_tokens_seen": 17307632, "step": 18105 }, { "epoch": 1.4772819969002366, "grad_norm": 0.09089597314596176, "learning_rate": 4.965396346778807e-05, "loss": 0.0264, "num_input_tokens_seen": 17313056, "step": 18110 }, { "epoch": 1.4776898605106452, "grad_norm": 34.90291213989258, "learning_rate": 4.9653373070853257e-05, "loss": 0.3215, "num_input_tokens_seen": 17317472, "step": 18115 }, { "epoch": 1.478097724121054, "grad_norm": 3.2678425312042236, "learning_rate": 4.965278217420505e-05, "loss": 0.2049, "num_input_tokens_seen": 17321696, "step": 18120 }, { "epoch": 1.4785055877314626, "grad_norm": 4.571150302886963, "learning_rate": 4.9652190777855426e-05, "loss": 0.2931, "num_input_tokens_seen": 17326000, "step": 18125 }, { "epoch": 1.4789134513418714, "grad_norm": 0.6350138783454895, "learning_rate": 4.965159888181636e-05, "loss": 0.227, "num_input_tokens_seen": 17330864, "step": 18130 }, { "epoch": 1.47932131495228, "grad_norm": 0.14136207103729248, "learning_rate": 4.965100648609987e-05, "loss": 0.2223, "num_input_tokens_seen": 17336000, "step": 18135 }, { "epoch": 1.4797291785626887, "grad_norm": 0.4002867043018341, "learning_rate": 4.965041359071795e-05, "loss": 0.1763, "num_input_tokens_seen": 17340736, "step": 18140 }, { "epoch": 1.4801370421730973, "grad_norm": 0.01829250529408455, "learning_rate": 4.9649820195682624e-05, "loss": 0.0252, "num_input_tokens_seen": 17345824, "step": 18145 }, { "epoch": 1.480544905783506, "grad_norm": 0.015567840076982975, "learning_rate": 4.9649226301005914e-05, "loss": 0.0238, "num_input_tokens_seen": 17350848, "step": 18150 }, { "epoch": 1.4809527693939146, "grad_norm": 0.005163283087313175, "learning_rate": 4.964863190669986e-05, "loss": 0.1809, "num_input_tokens_seen": 17356224, "step": 18155 }, { "epoch": 1.4813606330043234, "grad_norm": 0.04885099455714226, "learning_rate": 4.9648037012776514e-05, "loss": 0.077, "num_input_tokens_seen": 17361872, "step": 18160 }, { "epoch": 1.481768496614732, "grad_norm": 0.015393584035336971, "learning_rate": 4.964744161924793e-05, "loss": 0.0202, "num_input_tokens_seen": 17367776, "step": 18165 }, { "epoch": 1.4821763602251408, "grad_norm": 0.0577550083398819, "learning_rate": 4.964684572612618e-05, "loss": 0.018, "num_input_tokens_seen": 17372608, "step": 18170 }, { "epoch": 1.4825842238355493, "grad_norm": 0.02707006223499775, "learning_rate": 4.964624933342333e-05, "loss": 0.0023, "num_input_tokens_seen": 17376112, "step": 18175 }, { "epoch": 1.4829920874459581, "grad_norm": 0.019607199355959892, "learning_rate": 4.964565244115149e-05, "loss": 0.635, "num_input_tokens_seen": 17381456, "step": 18180 }, { "epoch": 1.4833999510563667, "grad_norm": 53.8525505065918, "learning_rate": 4.964505504932275e-05, "loss": 0.0991, "num_input_tokens_seen": 17386624, "step": 18185 }, { "epoch": 1.4838078146667755, "grad_norm": 0.13275951147079468, "learning_rate": 4.9644457157949206e-05, "loss": 0.3294, "num_input_tokens_seen": 17391488, "step": 18190 }, { "epoch": 1.484215678277184, "grad_norm": 4.714134216308594, "learning_rate": 4.9643858767043e-05, "loss": 0.3533, "num_input_tokens_seen": 17396832, "step": 18195 }, { "epoch": 1.4846235418875928, "grad_norm": 1.1941847801208496, "learning_rate": 4.964325987661623e-05, "loss": 0.0211, "num_input_tokens_seen": 17401536, "step": 18200 }, { "epoch": 1.4850314054980014, "grad_norm": 0.39278602600097656, "learning_rate": 4.964266048668107e-05, "loss": 0.1536, "num_input_tokens_seen": 17407120, "step": 18205 }, { "epoch": 1.4854392691084102, "grad_norm": 0.2811613976955414, "learning_rate": 4.9642060597249654e-05, "loss": 0.0782, "num_input_tokens_seen": 17411472, "step": 18210 }, { "epoch": 1.4858471327188187, "grad_norm": 0.08050656318664551, "learning_rate": 4.9641460208334136e-05, "loss": 0.2158, "num_input_tokens_seen": 17415744, "step": 18215 }, { "epoch": 1.4862549963292275, "grad_norm": 0.08451345562934875, "learning_rate": 4.964085931994669e-05, "loss": 0.0133, "num_input_tokens_seen": 17420592, "step": 18220 }, { "epoch": 1.4866628599396363, "grad_norm": 2.8564112186431885, "learning_rate": 4.9640257932099486e-05, "loss": 0.32, "num_input_tokens_seen": 17424928, "step": 18225 }, { "epoch": 1.4870707235500449, "grad_norm": 3.5973458290100098, "learning_rate": 4.963965604480474e-05, "loss": 0.4488, "num_input_tokens_seen": 17430256, "step": 18230 }, { "epoch": 1.4874785871604534, "grad_norm": 0.078397236764431, "learning_rate": 4.9639053658074623e-05, "loss": 0.1424, "num_input_tokens_seen": 17435344, "step": 18235 }, { "epoch": 1.4878864507708622, "grad_norm": 0.2744252383708954, "learning_rate": 4.9638450771921365e-05, "loss": 0.2487, "num_input_tokens_seen": 17440432, "step": 18240 }, { "epoch": 1.488294314381271, "grad_norm": 1.8909881114959717, "learning_rate": 4.963784738635718e-05, "loss": 0.5039, "num_input_tokens_seen": 17445904, "step": 18245 }, { "epoch": 1.4887021779916796, "grad_norm": 0.28800711035728455, "learning_rate": 4.96372435013943e-05, "loss": 0.104, "num_input_tokens_seen": 17450992, "step": 18250 }, { "epoch": 1.4891100416020882, "grad_norm": 2.3364241123199463, "learning_rate": 4.9636639117044956e-05, "loss": 0.119, "num_input_tokens_seen": 17455968, "step": 18255 }, { "epoch": 1.489517905212497, "grad_norm": 0.23439545929431915, "learning_rate": 4.963603423332141e-05, "loss": 0.0781, "num_input_tokens_seen": 17460928, "step": 18260 }, { "epoch": 1.4899257688229057, "grad_norm": 0.45465782284736633, "learning_rate": 4.9635428850235923e-05, "loss": 0.0313, "num_input_tokens_seen": 17466416, "step": 18265 }, { "epoch": 1.4903336324333143, "grad_norm": 0.3772275745868683, "learning_rate": 4.963482296780075e-05, "loss": 0.0779, "num_input_tokens_seen": 17471344, "step": 18270 }, { "epoch": 1.4907414960437229, "grad_norm": 0.3788255751132965, "learning_rate": 4.963421658602819e-05, "loss": 0.1223, "num_input_tokens_seen": 17476960, "step": 18275 }, { "epoch": 1.4911493596541316, "grad_norm": 0.052689384669065475, "learning_rate": 4.963360970493053e-05, "loss": 0.2091, "num_input_tokens_seen": 17482304, "step": 18280 }, { "epoch": 1.4915572232645404, "grad_norm": 0.1545860320329666, "learning_rate": 4.963300232452007e-05, "loss": 0.033, "num_input_tokens_seen": 17487568, "step": 18285 }, { "epoch": 1.491965086874949, "grad_norm": 0.05061991140246391, "learning_rate": 4.963239444480911e-05, "loss": 0.0072, "num_input_tokens_seen": 17492880, "step": 18290 }, { "epoch": 1.4923729504853576, "grad_norm": 0.7521027326583862, "learning_rate": 4.963178606580999e-05, "loss": 0.248, "num_input_tokens_seen": 17498496, "step": 18295 }, { "epoch": 1.4927808140957664, "grad_norm": 0.04556389898061752, "learning_rate": 4.963117718753503e-05, "loss": 0.0141, "num_input_tokens_seen": 17504400, "step": 18300 }, { "epoch": 1.4931886777061751, "grad_norm": 4.588309288024902, "learning_rate": 4.9630567809996576e-05, "loss": 0.1959, "num_input_tokens_seen": 17509328, "step": 18305 }, { "epoch": 1.4935965413165837, "grad_norm": 5.213405132293701, "learning_rate": 4.962995793320698e-05, "loss": 0.5033, "num_input_tokens_seen": 17513184, "step": 18310 }, { "epoch": 1.4940044049269925, "grad_norm": 0.042291633784770966, "learning_rate": 4.96293475571786e-05, "loss": 0.146, "num_input_tokens_seen": 17518032, "step": 18315 }, { "epoch": 1.494412268537401, "grad_norm": 0.19438301026821136, "learning_rate": 4.9628736681923817e-05, "loss": 0.0277, "num_input_tokens_seen": 17522928, "step": 18320 }, { "epoch": 1.4948201321478098, "grad_norm": 4.519681930541992, "learning_rate": 4.9628125307455e-05, "loss": 0.1963, "num_input_tokens_seen": 17527856, "step": 18325 }, { "epoch": 1.4952279957582184, "grad_norm": 51.43040084838867, "learning_rate": 4.962751343378455e-05, "loss": 0.1064, "num_input_tokens_seen": 17531712, "step": 18330 }, { "epoch": 1.4956358593686272, "grad_norm": 2.6373438835144043, "learning_rate": 4.962690106092487e-05, "loss": 0.6283, "num_input_tokens_seen": 17535712, "step": 18335 }, { "epoch": 1.4960437229790358, "grad_norm": 0.11918114870786667, "learning_rate": 4.9626288188888364e-05, "loss": 0.2791, "num_input_tokens_seen": 17540608, "step": 18340 }, { "epoch": 1.4964515865894445, "grad_norm": 6.210867881774902, "learning_rate": 4.962567481768746e-05, "loss": 0.2807, "num_input_tokens_seen": 17545280, "step": 18345 }, { "epoch": 1.4968594501998531, "grad_norm": 0.08558879792690277, "learning_rate": 4.96250609473346e-05, "loss": 0.0477, "num_input_tokens_seen": 17549936, "step": 18350 }, { "epoch": 1.497267313810262, "grad_norm": 0.260850727558136, "learning_rate": 4.962444657784221e-05, "loss": 0.0445, "num_input_tokens_seen": 17555520, "step": 18355 }, { "epoch": 1.4976751774206705, "grad_norm": 0.08780068159103394, "learning_rate": 4.9623831709222756e-05, "loss": 0.0294, "num_input_tokens_seen": 17561168, "step": 18360 }, { "epoch": 1.4980830410310793, "grad_norm": 0.17676055431365967, "learning_rate": 4.96232163414887e-05, "loss": 0.1571, "num_input_tokens_seen": 17566336, "step": 18365 }, { "epoch": 1.4984909046414878, "grad_norm": 0.15269224345684052, "learning_rate": 4.962260047465251e-05, "loss": 0.1147, "num_input_tokens_seen": 17570656, "step": 18370 }, { "epoch": 1.4988987682518966, "grad_norm": 0.06806357949972153, "learning_rate": 4.9621984108726663e-05, "loss": 0.1826, "num_input_tokens_seen": 17574608, "step": 18375 }, { "epoch": 1.4993066318623052, "grad_norm": 0.09053242206573486, "learning_rate": 4.962136724372367e-05, "loss": 0.1558, "num_input_tokens_seen": 17579120, "step": 18380 }, { "epoch": 1.499714495472714, "grad_norm": 1.5385690927505493, "learning_rate": 4.9620749879656026e-05, "loss": 0.0929, "num_input_tokens_seen": 17583664, "step": 18385 }, { "epoch": 1.5001223590831225, "grad_norm": 0.06451454013586044, "learning_rate": 4.9620132016536235e-05, "loss": 0.0109, "num_input_tokens_seen": 17588368, "step": 18390 }, { "epoch": 1.5001223590831225, "eval_loss": 0.1670130044221878, "eval_runtime": 570.8428, "eval_samples_per_second": 4.774, "eval_steps_per_second": 2.388, "num_input_tokens_seen": 17588368, "step": 18390 }, { "epoch": 1.5005302226935313, "grad_norm": 0.15398061275482178, "learning_rate": 4.961951365437684e-05, "loss": 0.0104, "num_input_tokens_seen": 17593264, "step": 18395 }, { "epoch": 1.50093808630394, "grad_norm": 0.016430437564849854, "learning_rate": 4.961889479319035e-05, "loss": 0.0072, "num_input_tokens_seen": 17598160, "step": 18400 }, { "epoch": 1.5013459499143487, "grad_norm": 2.2981338500976562, "learning_rate": 4.9618275432989335e-05, "loss": 0.3248, "num_input_tokens_seen": 17603264, "step": 18405 }, { "epoch": 1.5017538135247572, "grad_norm": 0.041569631546735764, "learning_rate": 4.9617655573786334e-05, "loss": 0.1387, "num_input_tokens_seen": 17608112, "step": 18410 }, { "epoch": 1.502161677135166, "grad_norm": 0.04212503135204315, "learning_rate": 4.961703521559392e-05, "loss": 0.1235, "num_input_tokens_seen": 17613184, "step": 18415 }, { "epoch": 1.5025695407455748, "grad_norm": 0.029504923149943352, "learning_rate": 4.961641435842466e-05, "loss": 0.013, "num_input_tokens_seen": 17617968, "step": 18420 }, { "epoch": 1.5029774043559834, "grad_norm": 0.024514425545930862, "learning_rate": 4.9615793002291144e-05, "loss": 0.2045, "num_input_tokens_seen": 17622736, "step": 18425 }, { "epoch": 1.503385267966392, "grad_norm": 0.03310847654938698, "learning_rate": 4.9615171147205964e-05, "loss": 0.1214, "num_input_tokens_seen": 17627248, "step": 18430 }, { "epoch": 1.5037931315768007, "grad_norm": 2.6148581504821777, "learning_rate": 4.961454879318171e-05, "loss": 0.039, "num_input_tokens_seen": 17632720, "step": 18435 }, { "epoch": 1.5042009951872095, "grad_norm": 0.021267086267471313, "learning_rate": 4.961392594023103e-05, "loss": 0.2104, "num_input_tokens_seen": 17637664, "step": 18440 }, { "epoch": 1.504608858797618, "grad_norm": 0.04377388581633568, "learning_rate": 4.961330258836652e-05, "loss": 0.1107, "num_input_tokens_seen": 17641696, "step": 18445 }, { "epoch": 1.5050167224080266, "grad_norm": 0.01942310482263565, "learning_rate": 4.9612678737600824e-05, "loss": 0.0042, "num_input_tokens_seen": 17646192, "step": 18450 }, { "epoch": 1.5054245860184354, "grad_norm": 12.930603981018066, "learning_rate": 4.96120543879466e-05, "loss": 0.1085, "num_input_tokens_seen": 17651536, "step": 18455 }, { "epoch": 1.5058324496288442, "grad_norm": 2.838764190673828, "learning_rate": 4.961142953941648e-05, "loss": 0.3651, "num_input_tokens_seen": 17655984, "step": 18460 }, { "epoch": 1.5062403132392528, "grad_norm": 0.06518755108118057, "learning_rate": 4.961080419202315e-05, "loss": 0.1777, "num_input_tokens_seen": 17661296, "step": 18465 }, { "epoch": 1.5066481768496613, "grad_norm": 5.737739562988281, "learning_rate": 4.961017834577927e-05, "loss": 0.3085, "num_input_tokens_seen": 17666208, "step": 18470 }, { "epoch": 1.5070560404600701, "grad_norm": 0.09406457096338272, "learning_rate": 4.960955200069754e-05, "loss": 0.1034, "num_input_tokens_seen": 17670272, "step": 18475 }, { "epoch": 1.507463904070479, "grad_norm": 2.350229263305664, "learning_rate": 4.9608925156790644e-05, "loss": 0.3418, "num_input_tokens_seen": 17675584, "step": 18480 }, { "epoch": 1.5078717676808875, "grad_norm": 0.33885636925697327, "learning_rate": 4.9608297814071286e-05, "loss": 0.0292, "num_input_tokens_seen": 17680784, "step": 18485 }, { "epoch": 1.508279631291296, "grad_norm": 0.01644902490079403, "learning_rate": 4.9607669972552196e-05, "loss": 0.0093, "num_input_tokens_seen": 17685712, "step": 18490 }, { "epoch": 1.5086874949017048, "grad_norm": 0.037614788860082626, "learning_rate": 4.9607041632246087e-05, "loss": 0.1906, "num_input_tokens_seen": 17691008, "step": 18495 }, { "epoch": 1.5090953585121136, "grad_norm": 0.1704341024160385, "learning_rate": 4.9606412793165703e-05, "loss": 0.4475, "num_input_tokens_seen": 17695264, "step": 18500 }, { "epoch": 1.5095032221225222, "grad_norm": 0.23365768790245056, "learning_rate": 4.9605783455323786e-05, "loss": 0.0239, "num_input_tokens_seen": 17699920, "step": 18505 }, { "epoch": 1.5099110857329308, "grad_norm": 9.826419830322266, "learning_rate": 4.9605153618733094e-05, "loss": 0.3102, "num_input_tokens_seen": 17705136, "step": 18510 }, { "epoch": 1.5103189493433395, "grad_norm": 0.07667089253664017, "learning_rate": 4.9604523283406394e-05, "loss": 0.0139, "num_input_tokens_seen": 17709552, "step": 18515 }, { "epoch": 1.5107268129537483, "grad_norm": 0.027997907251119614, "learning_rate": 4.9603892449356467e-05, "loss": 0.1103, "num_input_tokens_seen": 17714176, "step": 18520 }, { "epoch": 1.511134676564157, "grad_norm": 0.041330184787511826, "learning_rate": 4.960326111659609e-05, "loss": 0.3441, "num_input_tokens_seen": 17718048, "step": 18525 }, { "epoch": 1.5115425401745655, "grad_norm": 0.13504759967327118, "learning_rate": 4.960262928513806e-05, "loss": 0.0126, "num_input_tokens_seen": 17722032, "step": 18530 }, { "epoch": 1.5119504037849743, "grad_norm": 0.14562052488327026, "learning_rate": 4.96019969549952e-05, "loss": 0.1394, "num_input_tokens_seen": 17726496, "step": 18535 }, { "epoch": 1.512358267395383, "grad_norm": 0.035135604441165924, "learning_rate": 4.960136412618031e-05, "loss": 0.0214, "num_input_tokens_seen": 17731072, "step": 18540 }, { "epoch": 1.5127661310057916, "grad_norm": 0.12403476983308792, "learning_rate": 4.960073079870622e-05, "loss": 0.2694, "num_input_tokens_seen": 17736160, "step": 18545 }, { "epoch": 1.5131739946162004, "grad_norm": 0.043977729976177216, "learning_rate": 4.960009697258578e-05, "loss": 0.3222, "num_input_tokens_seen": 17740832, "step": 18550 }, { "epoch": 1.5135818582266092, "grad_norm": 0.1034039556980133, "learning_rate": 4.959946264783181e-05, "loss": 0.0362, "num_input_tokens_seen": 17745696, "step": 18555 }, { "epoch": 1.5139897218370177, "grad_norm": 9.9581880569458, "learning_rate": 4.959882782445719e-05, "loss": 0.1621, "num_input_tokens_seen": 17750736, "step": 18560 }, { "epoch": 1.5143975854474263, "grad_norm": 0.07717373967170715, "learning_rate": 4.959819250247479e-05, "loss": 0.2705, "num_input_tokens_seen": 17754672, "step": 18565 }, { "epoch": 1.514805449057835, "grad_norm": 1.3751072883605957, "learning_rate": 4.9597556681897485e-05, "loss": 0.1426, "num_input_tokens_seen": 17759440, "step": 18570 }, { "epoch": 1.5152133126682439, "grad_norm": 0.010119447484612465, "learning_rate": 4.9596920362738154e-05, "loss": 0.0167, "num_input_tokens_seen": 17762736, "step": 18575 }, { "epoch": 1.5156211762786524, "grad_norm": 2.3853468894958496, "learning_rate": 4.9596283545009694e-05, "loss": 0.2895, "num_input_tokens_seen": 17767408, "step": 18580 }, { "epoch": 1.516029039889061, "grad_norm": 9.082366943359375, "learning_rate": 4.959564622872502e-05, "loss": 0.47, "num_input_tokens_seen": 17772304, "step": 18585 }, { "epoch": 1.5164369034994698, "grad_norm": 13.855201721191406, "learning_rate": 4.959500841389705e-05, "loss": 0.1159, "num_input_tokens_seen": 17776752, "step": 18590 }, { "epoch": 1.5168447671098786, "grad_norm": 0.8758640289306641, "learning_rate": 4.959437010053871e-05, "loss": 0.0274, "num_input_tokens_seen": 17782288, "step": 18595 }, { "epoch": 1.5172526307202872, "grad_norm": 5.884263038635254, "learning_rate": 4.959373128866294e-05, "loss": 0.1345, "num_input_tokens_seen": 17786112, "step": 18600 }, { "epoch": 1.5176604943306957, "grad_norm": 7.883790016174316, "learning_rate": 4.959309197828269e-05, "loss": 0.3247, "num_input_tokens_seen": 17791008, "step": 18605 }, { "epoch": 1.5180683579411045, "grad_norm": 0.1491614729166031, "learning_rate": 4.9592452169410905e-05, "loss": 0.1485, "num_input_tokens_seen": 17796016, "step": 18610 }, { "epoch": 1.5184762215515133, "grad_norm": 4.310735702514648, "learning_rate": 4.959181186206057e-05, "loss": 0.2878, "num_input_tokens_seen": 17800592, "step": 18615 }, { "epoch": 1.5188840851619219, "grad_norm": 0.06199514493346214, "learning_rate": 4.959117105624466e-05, "loss": 0.0169, "num_input_tokens_seen": 17805552, "step": 18620 }, { "epoch": 1.5192919487723304, "grad_norm": 11.280778884887695, "learning_rate": 4.9590529751976164e-05, "loss": 0.2261, "num_input_tokens_seen": 17810384, "step": 18625 }, { "epoch": 1.5196998123827392, "grad_norm": 0.7684001922607422, "learning_rate": 4.958988794926808e-05, "loss": 0.0942, "num_input_tokens_seen": 17815520, "step": 18630 }, { "epoch": 1.520107675993148, "grad_norm": 8.158584594726562, "learning_rate": 4.958924564813341e-05, "loss": 0.0487, "num_input_tokens_seen": 17820784, "step": 18635 }, { "epoch": 1.5205155396035566, "grad_norm": 1.9640682935714722, "learning_rate": 4.958860284858518e-05, "loss": 0.2216, "num_input_tokens_seen": 17825200, "step": 18640 }, { "epoch": 1.5209234032139651, "grad_norm": 0.03529301658272743, "learning_rate": 4.958795955063642e-05, "loss": 0.0156, "num_input_tokens_seen": 17829776, "step": 18645 }, { "epoch": 1.521331266824374, "grad_norm": 0.0293881855905056, "learning_rate": 4.9587315754300166e-05, "loss": 0.0086, "num_input_tokens_seen": 17834576, "step": 18650 }, { "epoch": 1.5217391304347827, "grad_norm": 109.56026458740234, "learning_rate": 4.958667145958948e-05, "loss": 0.172, "num_input_tokens_seen": 17839184, "step": 18655 }, { "epoch": 1.5221469940451913, "grad_norm": 15.477461814880371, "learning_rate": 4.9586026666517406e-05, "loss": 0.0385, "num_input_tokens_seen": 17843968, "step": 18660 }, { "epoch": 1.5225548576555998, "grad_norm": 0.0207776241004467, "learning_rate": 4.9585381375097003e-05, "loss": 0.1811, "num_input_tokens_seen": 17848768, "step": 18665 }, { "epoch": 1.5229627212660086, "grad_norm": 0.04269253462553024, "learning_rate": 4.958473558534139e-05, "loss": 0.1188, "num_input_tokens_seen": 17852960, "step": 18670 }, { "epoch": 1.5233705848764174, "grad_norm": 3.8866708278656006, "learning_rate": 4.958408929726362e-05, "loss": 1.0961, "num_input_tokens_seen": 17857552, "step": 18675 }, { "epoch": 1.523778448486826, "grad_norm": 0.11840447783470154, "learning_rate": 4.958344251087681e-05, "loss": 0.1581, "num_input_tokens_seen": 17861952, "step": 18680 }, { "epoch": 1.5241863120972345, "grad_norm": 0.15749415755271912, "learning_rate": 4.9582795226194066e-05, "loss": 0.0085, "num_input_tokens_seen": 17866112, "step": 18685 }, { "epoch": 1.5245941757076433, "grad_norm": 0.007888766005635262, "learning_rate": 4.958214744322851e-05, "loss": 0.0055, "num_input_tokens_seen": 17870240, "step": 18690 }, { "epoch": 1.5250020393180521, "grad_norm": 0.008853422477841377, "learning_rate": 4.9581499161993266e-05, "loss": 0.2798, "num_input_tokens_seen": 17874880, "step": 18695 }, { "epoch": 1.5254099029284607, "grad_norm": 0.0975106805562973, "learning_rate": 4.958085038250148e-05, "loss": 0.4083, "num_input_tokens_seen": 17878528, "step": 18700 }, { "epoch": 1.5258177665388692, "grad_norm": 0.01695535145699978, "learning_rate": 4.9580201104766306e-05, "loss": 0.2112, "num_input_tokens_seen": 17883008, "step": 18705 }, { "epoch": 1.526225630149278, "grad_norm": 0.022073226049542427, "learning_rate": 4.95795513288009e-05, "loss": 0.0152, "num_input_tokens_seen": 17888400, "step": 18710 }, { "epoch": 1.5266334937596868, "grad_norm": 0.05483236908912659, "learning_rate": 4.957890105461843e-05, "loss": 0.2246, "num_input_tokens_seen": 17893616, "step": 18715 }, { "epoch": 1.5270413573700954, "grad_norm": 1.3580986261367798, "learning_rate": 4.957825028223208e-05, "loss": 0.584, "num_input_tokens_seen": 17898576, "step": 18720 }, { "epoch": 1.5274492209805042, "grad_norm": 1.3942911624908447, "learning_rate": 4.957759901165504e-05, "loss": 0.0355, "num_input_tokens_seen": 17903520, "step": 18725 }, { "epoch": 1.527857084590913, "grad_norm": 3.029956340789795, "learning_rate": 4.957694724290052e-05, "loss": 0.2517, "num_input_tokens_seen": 17908080, "step": 18730 }, { "epoch": 1.5282649482013215, "grad_norm": 0.07258487492799759, "learning_rate": 4.957629497598171e-05, "loss": 0.1008, "num_input_tokens_seen": 17912560, "step": 18735 }, { "epoch": 1.52867281181173, "grad_norm": 0.10273641347885132, "learning_rate": 4.9575642210911856e-05, "loss": 0.2115, "num_input_tokens_seen": 17917472, "step": 18740 }, { "epoch": 1.5290806754221389, "grad_norm": 2.0721073150634766, "learning_rate": 4.957498894770417e-05, "loss": 0.0394, "num_input_tokens_seen": 17922416, "step": 18745 }, { "epoch": 1.5294885390325477, "grad_norm": 0.1466996967792511, "learning_rate": 4.95743351863719e-05, "loss": 0.103, "num_input_tokens_seen": 17926704, "step": 18750 }, { "epoch": 1.5298964026429562, "grad_norm": 0.11630839109420776, "learning_rate": 4.95736809269283e-05, "loss": 0.0288, "num_input_tokens_seen": 17931248, "step": 18755 }, { "epoch": 1.5303042662533648, "grad_norm": 0.052670132368803024, "learning_rate": 4.957302616938663e-05, "loss": 0.1459, "num_input_tokens_seen": 17935984, "step": 18760 }, { "epoch": 1.5307121298637736, "grad_norm": 0.14195656776428223, "learning_rate": 4.957237091376016e-05, "loss": 0.3401, "num_input_tokens_seen": 17941072, "step": 18765 }, { "epoch": 1.5311199934741824, "grad_norm": 0.29706767201423645, "learning_rate": 4.9571715160062176e-05, "loss": 0.1208, "num_input_tokens_seen": 17945440, "step": 18770 }, { "epoch": 1.531527857084591, "grad_norm": 0.05691885948181152, "learning_rate": 4.9571058908305964e-05, "loss": 0.2335, "num_input_tokens_seen": 17950208, "step": 18775 }, { "epoch": 1.5319357206949995, "grad_norm": 0.08652297407388687, "learning_rate": 4.9570402158504836e-05, "loss": 0.0992, "num_input_tokens_seen": 17955376, "step": 18780 }, { "epoch": 1.5323435843054083, "grad_norm": 0.31924450397491455, "learning_rate": 4.956974491067209e-05, "loss": 0.3062, "num_input_tokens_seen": 17960576, "step": 18785 }, { "epoch": 1.532751447915817, "grad_norm": 10.310843467712402, "learning_rate": 4.9569087164821055e-05, "loss": 0.141, "num_input_tokens_seen": 17964704, "step": 18790 }, { "epoch": 1.5331593115262256, "grad_norm": 0.07892680913209915, "learning_rate": 4.956842892096506e-05, "loss": 0.3592, "num_input_tokens_seen": 17969024, "step": 18795 }, { "epoch": 1.5335671751366342, "grad_norm": 28.375661849975586, "learning_rate": 4.956777017911746e-05, "loss": 0.4661, "num_input_tokens_seen": 17974768, "step": 18800 }, { "epoch": 1.533975038747043, "grad_norm": 0.4350358545780182, "learning_rate": 4.956711093929159e-05, "loss": 0.2248, "num_input_tokens_seen": 17980112, "step": 18805 }, { "epoch": 1.5343829023574518, "grad_norm": 0.09513199329376221, "learning_rate": 4.956645120150082e-05, "loss": 0.4222, "num_input_tokens_seen": 17984576, "step": 18810 }, { "epoch": 1.5347907659678603, "grad_norm": 0.5564838647842407, "learning_rate": 4.956579096575853e-05, "loss": 0.3041, "num_input_tokens_seen": 17988832, "step": 18815 }, { "epoch": 1.535198629578269, "grad_norm": 0.08847320824861526, "learning_rate": 4.956513023207809e-05, "loss": 0.0182, "num_input_tokens_seen": 17993616, "step": 18820 }, { "epoch": 1.5356064931886777, "grad_norm": 0.042204562574625015, "learning_rate": 4.95644690004729e-05, "loss": 0.0164, "num_input_tokens_seen": 17998560, "step": 18825 }, { "epoch": 1.5360143567990865, "grad_norm": 0.23500625789165497, "learning_rate": 4.956380727095637e-05, "loss": 0.2684, "num_input_tokens_seen": 18003696, "step": 18830 }, { "epoch": 1.536422220409495, "grad_norm": 0.36600014567375183, "learning_rate": 4.956314504354189e-05, "loss": 0.2233, "num_input_tokens_seen": 18008448, "step": 18835 }, { "epoch": 1.5368300840199036, "grad_norm": 0.14319729804992676, "learning_rate": 4.95624823182429e-05, "loss": 0.0223, "num_input_tokens_seen": 18014352, "step": 18840 }, { "epoch": 1.5372379476303124, "grad_norm": 0.15956515073776245, "learning_rate": 4.956181909507284e-05, "loss": 0.0984, "num_input_tokens_seen": 18018560, "step": 18845 }, { "epoch": 1.5376458112407212, "grad_norm": 4.530796051025391, "learning_rate": 4.9561155374045134e-05, "loss": 0.0311, "num_input_tokens_seen": 18023712, "step": 18850 }, { "epoch": 1.5380536748511298, "grad_norm": 0.5949535965919495, "learning_rate": 4.956049115517325e-05, "loss": 0.1833, "num_input_tokens_seen": 18028240, "step": 18855 }, { "epoch": 1.5384615384615383, "grad_norm": 1.946697473526001, "learning_rate": 4.955982643847065e-05, "loss": 0.0241, "num_input_tokens_seen": 18032832, "step": 18860 }, { "epoch": 1.5388694020719471, "grad_norm": 19.62043571472168, "learning_rate": 4.955916122395079e-05, "loss": 0.0312, "num_input_tokens_seen": 18038336, "step": 18865 }, { "epoch": 1.539277265682356, "grad_norm": 0.04578131437301636, "learning_rate": 4.955849551162718e-05, "loss": 0.0364, "num_input_tokens_seen": 18043088, "step": 18870 }, { "epoch": 1.5396851292927645, "grad_norm": 0.009916329756379128, "learning_rate": 4.9557829301513305e-05, "loss": 0.0169, "num_input_tokens_seen": 18048656, "step": 18875 }, { "epoch": 1.540092992903173, "grad_norm": 16.881027221679688, "learning_rate": 4.9557162593622655e-05, "loss": 0.1029, "num_input_tokens_seen": 18052960, "step": 18880 }, { "epoch": 1.5405008565135818, "grad_norm": 0.007881207391619682, "learning_rate": 4.955649538796876e-05, "loss": 0.0108, "num_input_tokens_seen": 18058048, "step": 18885 }, { "epoch": 1.5409087201239906, "grad_norm": 5.232394695281982, "learning_rate": 4.9555827684565134e-05, "loss": 0.1096, "num_input_tokens_seen": 18062800, "step": 18890 }, { "epoch": 1.5413165837343992, "grad_norm": 6.2426958084106445, "learning_rate": 4.9555159483425316e-05, "loss": 0.3817, "num_input_tokens_seen": 18068672, "step": 18895 }, { "epoch": 1.5417244473448077, "grad_norm": 9.179146766662598, "learning_rate": 4.955449078456286e-05, "loss": 0.2177, "num_input_tokens_seen": 18073376, "step": 18900 }, { "epoch": 1.5421323109552165, "grad_norm": 2.5500123500823975, "learning_rate": 4.95538215879913e-05, "loss": 0.377, "num_input_tokens_seen": 18077808, "step": 18905 }, { "epoch": 1.5425401745656253, "grad_norm": 0.013923327438533306, "learning_rate": 4.9553151893724206e-05, "loss": 0.0271, "num_input_tokens_seen": 18082112, "step": 18910 }, { "epoch": 1.5429480381760339, "grad_norm": 0.10661807656288147, "learning_rate": 4.9552481701775164e-05, "loss": 0.1241, "num_input_tokens_seen": 18087456, "step": 18915 }, { "epoch": 1.5433559017864427, "grad_norm": 0.10934842377901077, "learning_rate": 4.955181101215775e-05, "loss": 0.0088, "num_input_tokens_seen": 18092704, "step": 18920 }, { "epoch": 1.5437637653968515, "grad_norm": 15.839800834655762, "learning_rate": 4.9551139824885564e-05, "loss": 0.057, "num_input_tokens_seen": 18097984, "step": 18925 }, { "epoch": 1.54417162900726, "grad_norm": 3.2041587829589844, "learning_rate": 4.955046813997219e-05, "loss": 0.1839, "num_input_tokens_seen": 18102928, "step": 18930 }, { "epoch": 1.5445794926176686, "grad_norm": 35.29936981201172, "learning_rate": 4.9549795957431275e-05, "loss": 0.0855, "num_input_tokens_seen": 18108096, "step": 18935 }, { "epoch": 1.5449873562280774, "grad_norm": 5.88492488861084, "learning_rate": 4.9549123277276424e-05, "loss": 0.1796, "num_input_tokens_seen": 18113056, "step": 18940 }, { "epoch": 1.5453952198384862, "grad_norm": 56.28171920776367, "learning_rate": 4.9548450099521274e-05, "loss": 0.353, "num_input_tokens_seen": 18117872, "step": 18945 }, { "epoch": 1.5458030834488947, "grad_norm": 8.77684497833252, "learning_rate": 4.954777642417947e-05, "loss": 0.4541, "num_input_tokens_seen": 18122880, "step": 18950 }, { "epoch": 1.5462109470593033, "grad_norm": 3.189321756362915, "learning_rate": 4.9547102251264664e-05, "loss": 0.2777, "num_input_tokens_seen": 18127040, "step": 18955 }, { "epoch": 1.546618810669712, "grad_norm": 0.15714941918849945, "learning_rate": 4.9546427580790533e-05, "loss": 0.0061, "num_input_tokens_seen": 18132096, "step": 18960 }, { "epoch": 1.5470266742801209, "grad_norm": 0.023259101435542107, "learning_rate": 4.9545752412770754e-05, "loss": 0.5673, "num_input_tokens_seen": 18136416, "step": 18965 }, { "epoch": 1.5474345378905294, "grad_norm": 3.0665700435638428, "learning_rate": 4.954507674721899e-05, "loss": 0.1562, "num_input_tokens_seen": 18140672, "step": 18970 }, { "epoch": 1.547842401500938, "grad_norm": 3.890698194503784, "learning_rate": 4.954440058414895e-05, "loss": 0.2314, "num_input_tokens_seen": 18146176, "step": 18975 }, { "epoch": 1.5482502651113468, "grad_norm": 12.017335891723633, "learning_rate": 4.954372392357435e-05, "loss": 0.3737, "num_input_tokens_seen": 18151360, "step": 18980 }, { "epoch": 1.5486581287217556, "grad_norm": 80.13636016845703, "learning_rate": 4.954304676550889e-05, "loss": 0.7375, "num_input_tokens_seen": 18156608, "step": 18985 }, { "epoch": 1.5490659923321641, "grad_norm": 28.452560424804688, "learning_rate": 4.95423691099663e-05, "loss": 0.5347, "num_input_tokens_seen": 18161520, "step": 18990 }, { "epoch": 1.5494738559425727, "grad_norm": 13.66498851776123, "learning_rate": 4.954169095696032e-05, "loss": 0.5385, "num_input_tokens_seen": 18166768, "step": 18995 }, { "epoch": 1.5498817195529815, "grad_norm": 2.4920125007629395, "learning_rate": 4.9541012306504684e-05, "loss": 0.2806, "num_input_tokens_seen": 18172320, "step": 19000 }, { "epoch": 1.5502895831633903, "grad_norm": 0.09537677466869354, "learning_rate": 4.954033315861316e-05, "loss": 0.2325, "num_input_tokens_seen": 18177488, "step": 19005 }, { "epoch": 1.5506974467737988, "grad_norm": 0.42038968205451965, "learning_rate": 4.953965351329952e-05, "loss": 0.2394, "num_input_tokens_seen": 18182768, "step": 19010 }, { "epoch": 1.5511053103842074, "grad_norm": 0.20301908254623413, "learning_rate": 4.9538973370577525e-05, "loss": 0.0243, "num_input_tokens_seen": 18188048, "step": 19015 }, { "epoch": 1.5515131739946162, "grad_norm": 0.1817374974489212, "learning_rate": 4.9538292730460964e-05, "loss": 0.0686, "num_input_tokens_seen": 18192384, "step": 19020 }, { "epoch": 1.551921037605025, "grad_norm": 3.253493070602417, "learning_rate": 4.953761159296364e-05, "loss": 0.1058, "num_input_tokens_seen": 18197488, "step": 19025 }, { "epoch": 1.5523289012154335, "grad_norm": 2.307297945022583, "learning_rate": 4.953692995809935e-05, "loss": 0.1441, "num_input_tokens_seen": 18201792, "step": 19030 }, { "epoch": 1.552736764825842, "grad_norm": 7.606870174407959, "learning_rate": 4.953624782588192e-05, "loss": 0.1681, "num_input_tokens_seen": 18206368, "step": 19035 }, { "epoch": 1.553144628436251, "grad_norm": 0.12794533371925354, "learning_rate": 4.953556519632517e-05, "loss": 0.2218, "num_input_tokens_seen": 18211760, "step": 19040 }, { "epoch": 1.5535524920466597, "grad_norm": 0.04811399057507515, "learning_rate": 4.953488206944295e-05, "loss": 0.0129, "num_input_tokens_seen": 18216160, "step": 19045 }, { "epoch": 1.5539603556570682, "grad_norm": 0.015666093677282333, "learning_rate": 4.9534198445249095e-05, "loss": 0.2236, "num_input_tokens_seen": 18220272, "step": 19050 }, { "epoch": 1.5543682192674768, "grad_norm": 0.019969122484326363, "learning_rate": 4.953351432375745e-05, "loss": 0.1985, "num_input_tokens_seen": 18224464, "step": 19055 }, { "epoch": 1.5547760828778856, "grad_norm": 0.1374175101518631, "learning_rate": 4.95328297049819e-05, "loss": 0.0153, "num_input_tokens_seen": 18229648, "step": 19060 }, { "epoch": 1.5551839464882944, "grad_norm": 0.02428651787340641, "learning_rate": 4.9532144588936316e-05, "loss": 0.2657, "num_input_tokens_seen": 18234736, "step": 19065 }, { "epoch": 1.555591810098703, "grad_norm": 20.627695083618164, "learning_rate": 4.953145897563459e-05, "loss": 0.0967, "num_input_tokens_seen": 18239936, "step": 19070 }, { "epoch": 1.5559996737091115, "grad_norm": 0.020875418558716774, "learning_rate": 4.9530772865090606e-05, "loss": 0.0328, "num_input_tokens_seen": 18245280, "step": 19075 }, { "epoch": 1.5564075373195203, "grad_norm": 31.26117706298828, "learning_rate": 4.9530086257318286e-05, "loss": 0.068, "num_input_tokens_seen": 18250064, "step": 19080 }, { "epoch": 1.556815400929929, "grad_norm": 5.996613025665283, "learning_rate": 4.952939915233155e-05, "loss": 0.41, "num_input_tokens_seen": 18254352, "step": 19085 }, { "epoch": 1.5572232645403377, "grad_norm": 0.02135862037539482, "learning_rate": 4.9528711550144303e-05, "loss": 0.2333, "num_input_tokens_seen": 18259296, "step": 19090 }, { "epoch": 1.5576311281507462, "grad_norm": 0.020883407443761826, "learning_rate": 4.95280234507705e-05, "loss": 0.0631, "num_input_tokens_seen": 18263376, "step": 19095 }, { "epoch": 1.5580389917611552, "grad_norm": 5.074875831604004, "learning_rate": 4.9527334854224076e-05, "loss": 0.1457, "num_input_tokens_seen": 18266832, "step": 19100 }, { "epoch": 1.5584468553715638, "grad_norm": 0.27746230363845825, "learning_rate": 4.9526645760519005e-05, "loss": 0.0894, "num_input_tokens_seen": 18271792, "step": 19105 }, { "epoch": 1.5588547189819724, "grad_norm": 0.9654653668403625, "learning_rate": 4.9525956169669244e-05, "loss": 0.3521, "num_input_tokens_seen": 18276784, "step": 19110 }, { "epoch": 1.5592625825923812, "grad_norm": 0.18087568879127502, "learning_rate": 4.9525266081688784e-05, "loss": 0.2649, "num_input_tokens_seen": 18281488, "step": 19115 }, { "epoch": 1.55967044620279, "grad_norm": 0.2312154620885849, "learning_rate": 4.952457549659159e-05, "loss": 0.0129, "num_input_tokens_seen": 18286464, "step": 19120 }, { "epoch": 1.5600783098131985, "grad_norm": 17.166168212890625, "learning_rate": 4.952388441439168e-05, "loss": 0.2441, "num_input_tokens_seen": 18291152, "step": 19125 }, { "epoch": 1.560486173423607, "grad_norm": 0.1633739173412323, "learning_rate": 4.9523192835103036e-05, "loss": 0.006, "num_input_tokens_seen": 18296480, "step": 19130 }, { "epoch": 1.5608940370340159, "grad_norm": 0.10792500525712967, "learning_rate": 4.952250075873971e-05, "loss": 0.1762, "num_input_tokens_seen": 18300624, "step": 19135 }, { "epoch": 1.5613019006444246, "grad_norm": 0.02958725206553936, "learning_rate": 4.952180818531571e-05, "loss": 0.0052, "num_input_tokens_seen": 18306176, "step": 19140 }, { "epoch": 1.5617097642548332, "grad_norm": 0.03942986950278282, "learning_rate": 4.952111511484508e-05, "loss": 0.0974, "num_input_tokens_seen": 18311200, "step": 19145 }, { "epoch": 1.5621176278652418, "grad_norm": 0.013013306073844433, "learning_rate": 4.952042154734187e-05, "loss": 0.2508, "num_input_tokens_seen": 18316608, "step": 19150 }, { "epoch": 1.5625254914756506, "grad_norm": 4.087923526763916, "learning_rate": 4.9519727482820123e-05, "loss": 0.3454, "num_input_tokens_seen": 18321712, "step": 19155 }, { "epoch": 1.5629333550860594, "grad_norm": 5.5040202140808105, "learning_rate": 4.9519032921293924e-05, "loss": 0.2524, "num_input_tokens_seen": 18326064, "step": 19160 }, { "epoch": 1.563341218696468, "grad_norm": 0.8204525113105774, "learning_rate": 4.951833786277735e-05, "loss": 0.3663, "num_input_tokens_seen": 18331104, "step": 19165 }, { "epoch": 1.5637490823068765, "grad_norm": 0.3113190829753876, "learning_rate": 4.951764230728448e-05, "loss": 0.033, "num_input_tokens_seen": 18335888, "step": 19170 }, { "epoch": 1.5641569459172853, "grad_norm": 0.09412718564271927, "learning_rate": 4.951694625482942e-05, "loss": 0.0184, "num_input_tokens_seen": 18339904, "step": 19175 }, { "epoch": 1.564564809527694, "grad_norm": 0.0820666179060936, "learning_rate": 4.9516249705426283e-05, "loss": 0.3534, "num_input_tokens_seen": 18343552, "step": 19180 }, { "epoch": 1.5649726731381026, "grad_norm": 0.055644240230321884, "learning_rate": 4.951555265908918e-05, "loss": 0.1547, "num_input_tokens_seen": 18348720, "step": 19185 }, { "epoch": 1.5653805367485112, "grad_norm": 0.21556197106838226, "learning_rate": 4.9514855115832236e-05, "loss": 0.1492, "num_input_tokens_seen": 18353056, "step": 19190 }, { "epoch": 1.56578840035892, "grad_norm": 4.261964321136475, "learning_rate": 4.95141570756696e-05, "loss": 0.31, "num_input_tokens_seen": 18357456, "step": 19195 }, { "epoch": 1.5661962639693288, "grad_norm": 0.05629557743668556, "learning_rate": 4.951345853861542e-05, "loss": 0.0306, "num_input_tokens_seen": 18361984, "step": 19200 }, { "epoch": 1.5666041275797373, "grad_norm": 14.289680480957031, "learning_rate": 4.9512759504683845e-05, "loss": 0.0375, "num_input_tokens_seen": 18366896, "step": 19205 }, { "epoch": 1.567011991190146, "grad_norm": 0.10736986994743347, "learning_rate": 4.951205997388906e-05, "loss": 0.0087, "num_input_tokens_seen": 18370544, "step": 19210 }, { "epoch": 1.5674198548005547, "grad_norm": 11.420852661132812, "learning_rate": 4.951135994624523e-05, "loss": 0.2134, "num_input_tokens_seen": 18375472, "step": 19215 }, { "epoch": 1.5678277184109635, "grad_norm": 0.24742959439754486, "learning_rate": 4.9510659421766546e-05, "loss": 0.0078, "num_input_tokens_seen": 18380608, "step": 19220 }, { "epoch": 1.568235582021372, "grad_norm": 0.3748226463794708, "learning_rate": 4.9509958400467213e-05, "loss": 0.2463, "num_input_tokens_seen": 18384640, "step": 19225 }, { "epoch": 1.5686434456317806, "grad_norm": 0.015772778540849686, "learning_rate": 4.9509256882361435e-05, "loss": 0.1817, "num_input_tokens_seen": 18389728, "step": 19230 }, { "epoch": 1.5690513092421894, "grad_norm": 0.1445237547159195, "learning_rate": 4.950855486746344e-05, "loss": 0.009, "num_input_tokens_seen": 18394400, "step": 19235 }, { "epoch": 1.5694591728525982, "grad_norm": 0.026413314044475555, "learning_rate": 4.950785235578745e-05, "loss": 0.0079, "num_input_tokens_seen": 18398896, "step": 19240 }, { "epoch": 1.5698670364630067, "grad_norm": 0.031762879341840744, "learning_rate": 4.9507149347347706e-05, "loss": 0.1109, "num_input_tokens_seen": 18404160, "step": 19245 }, { "epoch": 1.5702749000734153, "grad_norm": 0.07417972385883331, "learning_rate": 4.950644584215847e-05, "loss": 0.2107, "num_input_tokens_seen": 18409488, "step": 19250 }, { "epoch": 1.570682763683824, "grad_norm": 3.962584972381592, "learning_rate": 4.950574184023398e-05, "loss": 0.236, "num_input_tokens_seen": 18414320, "step": 19255 }, { "epoch": 1.5710906272942329, "grad_norm": 0.020241305232048035, "learning_rate": 4.950503734158851e-05, "loss": 0.2836, "num_input_tokens_seen": 18418752, "step": 19260 }, { "epoch": 1.5714984909046414, "grad_norm": 3.250755548477173, "learning_rate": 4.9504332346236346e-05, "loss": 0.4812, "num_input_tokens_seen": 18424240, "step": 19265 }, { "epoch": 1.57190635451505, "grad_norm": 0.028180936351418495, "learning_rate": 4.950362685419179e-05, "loss": 0.0096, "num_input_tokens_seen": 18428464, "step": 19270 }, { "epoch": 1.5723142181254588, "grad_norm": 0.02080872468650341, "learning_rate": 4.9502920865469126e-05, "loss": 0.0954, "num_input_tokens_seen": 18432944, "step": 19275 }, { "epoch": 1.5727220817358676, "grad_norm": 0.369676798582077, "learning_rate": 4.9502214380082664e-05, "loss": 0.0166, "num_input_tokens_seen": 18438032, "step": 19280 }, { "epoch": 1.5731299453462761, "grad_norm": 0.08004046976566315, "learning_rate": 4.9501507398046734e-05, "loss": 0.1124, "num_input_tokens_seen": 18443248, "step": 19285 }, { "epoch": 1.573537808956685, "grad_norm": 0.022165995091199875, "learning_rate": 4.950079991937565e-05, "loss": 0.2587, "num_input_tokens_seen": 18447568, "step": 19290 }, { "epoch": 1.5739456725670937, "grad_norm": 2.03117036819458, "learning_rate": 4.950009194408377e-05, "loss": 0.1779, "num_input_tokens_seen": 18452592, "step": 19295 }, { "epoch": 1.5743535361775023, "grad_norm": 0.07767529040575027, "learning_rate": 4.949938347218544e-05, "loss": 0.1053, "num_input_tokens_seen": 18458192, "step": 19300 }, { "epoch": 1.5747613997879109, "grad_norm": 17.450897216796875, "learning_rate": 4.949867450369501e-05, "loss": 0.1112, "num_input_tokens_seen": 18462192, "step": 19305 }, { "epoch": 1.5751692633983196, "grad_norm": 1.5949705839157104, "learning_rate": 4.949796503862686e-05, "loss": 0.0336, "num_input_tokens_seen": 18466144, "step": 19310 }, { "epoch": 1.5755771270087284, "grad_norm": 3.4118525981903076, "learning_rate": 4.949725507699537e-05, "loss": 0.0805, "num_input_tokens_seen": 18470752, "step": 19315 }, { "epoch": 1.575984990619137, "grad_norm": 0.03634452819824219, "learning_rate": 4.949654461881493e-05, "loss": 0.3514, "num_input_tokens_seen": 18474848, "step": 19320 }, { "epoch": 1.5763928542295456, "grad_norm": 0.12801946699619293, "learning_rate": 4.949583366409993e-05, "loss": 0.4363, "num_input_tokens_seen": 18479616, "step": 19325 }, { "epoch": 1.5768007178399543, "grad_norm": 0.028408989310264587, "learning_rate": 4.949512221286481e-05, "loss": 0.226, "num_input_tokens_seen": 18485024, "step": 19330 }, { "epoch": 1.5772085814503631, "grad_norm": 3.7211010456085205, "learning_rate": 4.949441026512396e-05, "loss": 0.3483, "num_input_tokens_seen": 18489248, "step": 19335 }, { "epoch": 1.5776164450607717, "grad_norm": 3.999729871749878, "learning_rate": 4.949369782089182e-05, "loss": 0.9189, "num_input_tokens_seen": 18493040, "step": 19340 }, { "epoch": 1.5780243086711803, "grad_norm": 0.1597064882516861, "learning_rate": 4.9492984880182833e-05, "loss": 0.2782, "num_input_tokens_seen": 18497776, "step": 19345 }, { "epoch": 1.578432172281589, "grad_norm": 0.17178961634635925, "learning_rate": 4.949227144301146e-05, "loss": 0.1446, "num_input_tokens_seen": 18502544, "step": 19350 }, { "epoch": 1.5788400358919978, "grad_norm": 1.5996142625808716, "learning_rate": 4.949155750939214e-05, "loss": 0.0722, "num_input_tokens_seen": 18506992, "step": 19355 }, { "epoch": 1.5792478995024064, "grad_norm": 0.13007473945617676, "learning_rate": 4.949084307933937e-05, "loss": 0.0669, "num_input_tokens_seen": 18512352, "step": 19360 }, { "epoch": 1.579655763112815, "grad_norm": 0.5968101024627686, "learning_rate": 4.94901281528676e-05, "loss": 0.2223, "num_input_tokens_seen": 18517568, "step": 19365 }, { "epoch": 1.5800636267232238, "grad_norm": 8.389349937438965, "learning_rate": 4.9489412729991344e-05, "loss": 0.25, "num_input_tokens_seen": 18522608, "step": 19370 }, { "epoch": 1.5804714903336325, "grad_norm": 0.17505532503128052, "learning_rate": 4.948869681072511e-05, "loss": 0.1241, "num_input_tokens_seen": 18527584, "step": 19375 }, { "epoch": 1.580879353944041, "grad_norm": 0.101381316781044, "learning_rate": 4.948798039508339e-05, "loss": 0.0215, "num_input_tokens_seen": 18532672, "step": 19380 }, { "epoch": 1.5812872175544497, "grad_norm": 0.017488673329353333, "learning_rate": 4.948726348308071e-05, "loss": 0.2536, "num_input_tokens_seen": 18537376, "step": 19385 }, { "epoch": 1.5816950811648585, "grad_norm": 5.535292148590088, "learning_rate": 4.948654607473161e-05, "loss": 0.1514, "num_input_tokens_seen": 18541584, "step": 19390 }, { "epoch": 1.5821029447752673, "grad_norm": 0.022950707003474236, "learning_rate": 4.9485828170050615e-05, "loss": 0.032, "num_input_tokens_seen": 18546928, "step": 19395 }, { "epoch": 1.5825108083856758, "grad_norm": 1.346414566040039, "learning_rate": 4.94851097690523e-05, "loss": 0.0352, "num_input_tokens_seen": 18552528, "step": 19400 }, { "epoch": 1.5829186719960844, "grad_norm": 0.6027689576148987, "learning_rate": 4.9484390871751215e-05, "loss": 0.2531, "num_input_tokens_seen": 18557424, "step": 19405 }, { "epoch": 1.5833265356064932, "grad_norm": 0.0272117517888546, "learning_rate": 4.9483671478161926e-05, "loss": 0.0733, "num_input_tokens_seen": 18562624, "step": 19410 }, { "epoch": 1.583734399216902, "grad_norm": 0.03608863428235054, "learning_rate": 4.948295158829902e-05, "loss": 0.0044, "num_input_tokens_seen": 18567600, "step": 19415 }, { "epoch": 1.5841422628273105, "grad_norm": 0.08901354670524597, "learning_rate": 4.9482231202177085e-05, "loss": 0.5101, "num_input_tokens_seen": 18571952, "step": 19420 }, { "epoch": 1.584550126437719, "grad_norm": 0.1845189779996872, "learning_rate": 4.948151031981073e-05, "loss": 0.1921, "num_input_tokens_seen": 18576272, "step": 19425 }, { "epoch": 1.5849579900481279, "grad_norm": 0.033770669251680374, "learning_rate": 4.948078894121457e-05, "loss": 0.5371, "num_input_tokens_seen": 18580896, "step": 19430 }, { "epoch": 1.5853658536585367, "grad_norm": 0.14328427612781525, "learning_rate": 4.948006706640321e-05, "loss": 0.0355, "num_input_tokens_seen": 18586448, "step": 19435 }, { "epoch": 1.5857737172689452, "grad_norm": 5.628300666809082, "learning_rate": 4.94793446953913e-05, "loss": 0.3023, "num_input_tokens_seen": 18590752, "step": 19440 }, { "epoch": 1.5861815808793538, "grad_norm": 0.14865145087242126, "learning_rate": 4.947862182819347e-05, "loss": 0.1137, "num_input_tokens_seen": 18595056, "step": 19445 }, { "epoch": 1.5865894444897626, "grad_norm": 3.8594648838043213, "learning_rate": 4.947789846482438e-05, "loss": 0.2452, "num_input_tokens_seen": 18599520, "step": 19450 }, { "epoch": 1.5869973081001714, "grad_norm": 0.24979965388774872, "learning_rate": 4.947717460529869e-05, "loss": 0.244, "num_input_tokens_seen": 18603296, "step": 19455 }, { "epoch": 1.58740517171058, "grad_norm": 0.057148560881614685, "learning_rate": 4.9476450249631075e-05, "loss": 0.2369, "num_input_tokens_seen": 18608304, "step": 19460 }, { "epoch": 1.5878130353209885, "grad_norm": 5.691132068634033, "learning_rate": 4.94757253978362e-05, "loss": 0.2514, "num_input_tokens_seen": 18612624, "step": 19465 }, { "epoch": 1.5882208989313975, "grad_norm": 0.049914706498384476, "learning_rate": 4.947500004992879e-05, "loss": 0.0231, "num_input_tokens_seen": 18617552, "step": 19470 }, { "epoch": 1.588628762541806, "grad_norm": 0.24187220633029938, "learning_rate": 4.947427420592352e-05, "loss": 0.0962, "num_input_tokens_seen": 18622736, "step": 19475 }, { "epoch": 1.5890366261522146, "grad_norm": 5.600076675415039, "learning_rate": 4.9473547865835115e-05, "loss": 0.2649, "num_input_tokens_seen": 18628400, "step": 19480 }, { "epoch": 1.5894444897626234, "grad_norm": 7.232548713684082, "learning_rate": 4.947282102967828e-05, "loss": 0.1151, "num_input_tokens_seen": 18633680, "step": 19485 }, { "epoch": 1.5898523533730322, "grad_norm": 0.415971577167511, "learning_rate": 4.9472093697467784e-05, "loss": 0.135, "num_input_tokens_seen": 18638912, "step": 19490 }, { "epoch": 1.5902602169834408, "grad_norm": 7.859434127807617, "learning_rate": 4.947136586921833e-05, "loss": 0.2019, "num_input_tokens_seen": 18643248, "step": 19495 }, { "epoch": 1.5906680805938493, "grad_norm": 8.687005996704102, "learning_rate": 4.94706375449447e-05, "loss": 0.3437, "num_input_tokens_seen": 18647408, "step": 19500 }, { "epoch": 1.5910759442042581, "grad_norm": 9.644918441772461, "learning_rate": 4.946990872466164e-05, "loss": 0.1022, "num_input_tokens_seen": 18651360, "step": 19505 }, { "epoch": 1.591483807814667, "grad_norm": 4.951041221618652, "learning_rate": 4.9469179408383925e-05, "loss": 0.464, "num_input_tokens_seen": 18657008, "step": 19510 }, { "epoch": 1.5918916714250755, "grad_norm": 5.581594467163086, "learning_rate": 4.946844959612635e-05, "loss": 0.2983, "num_input_tokens_seen": 18662224, "step": 19515 }, { "epoch": 1.592299535035484, "grad_norm": 7.850131988525391, "learning_rate": 4.946771928790369e-05, "loss": 0.6046, "num_input_tokens_seen": 18667296, "step": 19520 }, { "epoch": 1.5927073986458928, "grad_norm": 29.554861068725586, "learning_rate": 4.946698848373076e-05, "loss": 0.4911, "num_input_tokens_seen": 18672336, "step": 19525 }, { "epoch": 1.5931152622563016, "grad_norm": 31.031330108642578, "learning_rate": 4.946625718362237e-05, "loss": 0.9337, "num_input_tokens_seen": 18676304, "step": 19530 }, { "epoch": 1.5935231258667102, "grad_norm": 16.102495193481445, "learning_rate": 4.946552538759335e-05, "loss": 0.4273, "num_input_tokens_seen": 18681216, "step": 19535 }, { "epoch": 1.5939309894771188, "grad_norm": 5.992430686950684, "learning_rate": 4.9464793095658516e-05, "loss": 0.4433, "num_input_tokens_seen": 18685440, "step": 19540 }, { "epoch": 1.5943388530875275, "grad_norm": 1.9187651872634888, "learning_rate": 4.946406030783273e-05, "loss": 0.3931, "num_input_tokens_seen": 18690272, "step": 19545 }, { "epoch": 1.5947467166979363, "grad_norm": 11.27187442779541, "learning_rate": 4.946332702413083e-05, "loss": 0.6154, "num_input_tokens_seen": 18695264, "step": 19550 }, { "epoch": 1.595154580308345, "grad_norm": 4.39964485168457, "learning_rate": 4.946259324456769e-05, "loss": 0.2145, "num_input_tokens_seen": 18699856, "step": 19555 }, { "epoch": 1.5955624439187535, "grad_norm": 9.297755241394043, "learning_rate": 4.946185896915818e-05, "loss": 0.3101, "num_input_tokens_seen": 18705072, "step": 19560 }, { "epoch": 1.5959703075291622, "grad_norm": 10.041035652160645, "learning_rate": 4.9461124197917175e-05, "loss": 0.3251, "num_input_tokens_seen": 18709664, "step": 19565 }, { "epoch": 1.596378171139571, "grad_norm": 0.053290773183107376, "learning_rate": 4.946038893085959e-05, "loss": 0.1329, "num_input_tokens_seen": 18714288, "step": 19570 }, { "epoch": 1.5967860347499796, "grad_norm": 10.834162712097168, "learning_rate": 4.9459653168000306e-05, "loss": 0.0622, "num_input_tokens_seen": 18719952, "step": 19575 }, { "epoch": 1.5971938983603882, "grad_norm": 0.1593756079673767, "learning_rate": 4.945891690935425e-05, "loss": 0.2934, "num_input_tokens_seen": 18724736, "step": 19580 }, { "epoch": 1.597601761970797, "grad_norm": 0.5282103419303894, "learning_rate": 4.945818015493634e-05, "loss": 0.0154, "num_input_tokens_seen": 18728944, "step": 19585 }, { "epoch": 1.5980096255812057, "grad_norm": 1.2783328294754028, "learning_rate": 4.945744290476151e-05, "loss": 0.216, "num_input_tokens_seen": 18733584, "step": 19590 }, { "epoch": 1.5984174891916143, "grad_norm": 0.02146613411605358, "learning_rate": 4.945670515884471e-05, "loss": 0.0941, "num_input_tokens_seen": 18738288, "step": 19595 }, { "epoch": 1.5988253528020229, "grad_norm": 0.0651465430855751, "learning_rate": 4.945596691720088e-05, "loss": 0.2183, "num_input_tokens_seen": 18743632, "step": 19600 }, { "epoch": 1.5992332164124317, "grad_norm": 0.11292393505573273, "learning_rate": 4.9455228179845e-05, "loss": 0.1317, "num_input_tokens_seen": 18748896, "step": 19605 }, { "epoch": 1.5996410800228404, "grad_norm": 0.053552716970443726, "learning_rate": 4.9454488946792035e-05, "loss": 0.1855, "num_input_tokens_seen": 18754096, "step": 19610 }, { "epoch": 1.600048943633249, "grad_norm": 0.0284368135035038, "learning_rate": 4.945374921805697e-05, "loss": 0.1168, "num_input_tokens_seen": 18758448, "step": 19615 }, { "epoch": 1.6004568072436576, "grad_norm": 0.055606428533792496, "learning_rate": 4.9453008993654795e-05, "loss": 0.1948, "num_input_tokens_seen": 18762992, "step": 19620 }, { "epoch": 1.6008646708540664, "grad_norm": 0.7053053975105286, "learning_rate": 4.945226827360052e-05, "loss": 0.0275, "num_input_tokens_seen": 18768144, "step": 19625 }, { "epoch": 1.6012725344644752, "grad_norm": 0.08042552322149277, "learning_rate": 4.945152705790916e-05, "loss": 0.0077, "num_input_tokens_seen": 18773536, "step": 19630 }, { "epoch": 1.6016803980748837, "grad_norm": 4.843117713928223, "learning_rate": 4.945078534659574e-05, "loss": 0.0689, "num_input_tokens_seen": 18778128, "step": 19635 }, { "epoch": 1.6020882616852923, "grad_norm": 0.052354514598846436, "learning_rate": 4.9450043139675286e-05, "loss": 0.128, "num_input_tokens_seen": 18781968, "step": 19640 }, { "epoch": 1.602496125295701, "grad_norm": 3.261622190475464, "learning_rate": 4.9449300437162846e-05, "loss": 0.448, "num_input_tokens_seen": 18787008, "step": 19645 }, { "epoch": 1.6029039889061099, "grad_norm": 0.19990545511245728, "learning_rate": 4.944855723907348e-05, "loss": 0.2634, "num_input_tokens_seen": 18792064, "step": 19650 }, { "epoch": 1.6033118525165184, "grad_norm": 3.68473219871521, "learning_rate": 4.944781354542224e-05, "loss": 0.1843, "num_input_tokens_seen": 18796784, "step": 19655 }, { "epoch": 1.6037197161269272, "grad_norm": 0.1776457279920578, "learning_rate": 4.944706935622422e-05, "loss": 0.1716, "num_input_tokens_seen": 18802144, "step": 19660 }, { "epoch": 1.604127579737336, "grad_norm": 0.4346478283405304, "learning_rate": 4.944632467149448e-05, "loss": 0.0203, "num_input_tokens_seen": 18806624, "step": 19665 }, { "epoch": 1.6045354433477446, "grad_norm": 0.25370362401008606, "learning_rate": 4.944557949124814e-05, "loss": 0.1059, "num_input_tokens_seen": 18811520, "step": 19670 }, { "epoch": 1.6049433069581531, "grad_norm": 0.2554357647895813, "learning_rate": 4.944483381550028e-05, "loss": 0.2313, "num_input_tokens_seen": 18815952, "step": 19675 }, { "epoch": 1.605351170568562, "grad_norm": 6.804846286773682, "learning_rate": 4.944408764426603e-05, "loss": 0.3092, "num_input_tokens_seen": 18820528, "step": 19680 }, { "epoch": 1.6057590341789707, "grad_norm": 3.499617338180542, "learning_rate": 4.9443340977560505e-05, "loss": 0.1467, "num_input_tokens_seen": 18825760, "step": 19685 }, { "epoch": 1.6061668977893793, "grad_norm": 0.04105742275714874, "learning_rate": 4.944259381539885e-05, "loss": 0.028, "num_input_tokens_seen": 18830928, "step": 19690 }, { "epoch": 1.6065747613997878, "grad_norm": 0.1570267230272293, "learning_rate": 4.94418461577962e-05, "loss": 0.3982, "num_input_tokens_seen": 18836192, "step": 19695 }, { "epoch": 1.6069826250101966, "grad_norm": 0.10729013383388519, "learning_rate": 4.944109800476772e-05, "loss": 0.2405, "num_input_tokens_seen": 18841200, "step": 19700 }, { "epoch": 1.6073904886206054, "grad_norm": 0.048199936747550964, "learning_rate": 4.944034935632857e-05, "loss": 0.0471, "num_input_tokens_seen": 18845360, "step": 19705 }, { "epoch": 1.607798352231014, "grad_norm": 0.0752587616443634, "learning_rate": 4.943960021249392e-05, "loss": 0.1576, "num_input_tokens_seen": 18850208, "step": 19710 }, { "epoch": 1.6082062158414225, "grad_norm": 0.8480772972106934, "learning_rate": 4.943885057327896e-05, "loss": 0.1283, "num_input_tokens_seen": 18855136, "step": 19715 }, { "epoch": 1.6086140794518313, "grad_norm": 0.09038218855857849, "learning_rate": 4.9438100438698886e-05, "loss": 0.1561, "num_input_tokens_seen": 18859776, "step": 19720 }, { "epoch": 1.6090219430622401, "grad_norm": 0.20161478221416473, "learning_rate": 4.9437349808768894e-05, "loss": 0.0254, "num_input_tokens_seen": 18864384, "step": 19725 }, { "epoch": 1.6094298066726487, "grad_norm": 0.06852541863918304, "learning_rate": 4.943659868350421e-05, "loss": 0.0132, "num_input_tokens_seen": 18868800, "step": 19730 }, { "epoch": 1.6098376702830572, "grad_norm": 3.1084792613983154, "learning_rate": 4.9435847062920054e-05, "loss": 0.0392, "num_input_tokens_seen": 18872928, "step": 19735 }, { "epoch": 1.610245533893466, "grad_norm": 4.197193622589111, "learning_rate": 4.9435094947031665e-05, "loss": 0.0413, "num_input_tokens_seen": 18878640, "step": 19740 }, { "epoch": 1.6106533975038748, "grad_norm": 0.03817307949066162, "learning_rate": 4.943434233585428e-05, "loss": 0.1385, "num_input_tokens_seen": 18883328, "step": 19745 }, { "epoch": 1.6110612611142834, "grad_norm": 0.024318328127264977, "learning_rate": 4.943358922940316e-05, "loss": 0.0389, "num_input_tokens_seen": 18888608, "step": 19750 }, { "epoch": 1.611469124724692, "grad_norm": 0.09126190096139908, "learning_rate": 4.9432835627693563e-05, "loss": 0.1108, "num_input_tokens_seen": 18892352, "step": 19755 }, { "epoch": 1.6118769883351007, "grad_norm": 0.5965274572372437, "learning_rate": 4.943208153074078e-05, "loss": 0.0221, "num_input_tokens_seen": 18896560, "step": 19760 }, { "epoch": 1.6122848519455095, "grad_norm": 4.917475700378418, "learning_rate": 4.943132693856008e-05, "loss": 0.6436, "num_input_tokens_seen": 18901184, "step": 19765 }, { "epoch": 1.612692715555918, "grad_norm": 0.021260712295770645, "learning_rate": 4.943057185116676e-05, "loss": 0.4262, "num_input_tokens_seen": 18906272, "step": 19770 }, { "epoch": 1.6131005791663267, "grad_norm": 0.11782843619585037, "learning_rate": 4.9429816268576146e-05, "loss": 0.0059, "num_input_tokens_seen": 18911952, "step": 19775 }, { "epoch": 1.6135084427767354, "grad_norm": 0.019650598987936974, "learning_rate": 4.9429060190803524e-05, "loss": 0.4509, "num_input_tokens_seen": 18917264, "step": 19780 }, { "epoch": 1.6139163063871442, "grad_norm": 15.80191707611084, "learning_rate": 4.942830361786424e-05, "loss": 0.4247, "num_input_tokens_seen": 18922240, "step": 19785 }, { "epoch": 1.6143241699975528, "grad_norm": 0.07718734443187714, "learning_rate": 4.942754654977361e-05, "loss": 0.4969, "num_input_tokens_seen": 18927296, "step": 19790 }, { "epoch": 1.6147320336079614, "grad_norm": 0.2829466760158539, "learning_rate": 4.9426788986547e-05, "loss": 0.1177, "num_input_tokens_seen": 18931808, "step": 19795 }, { "epoch": 1.6151398972183701, "grad_norm": 0.3075456917285919, "learning_rate": 4.942603092819975e-05, "loss": 0.1184, "num_input_tokens_seen": 18936224, "step": 19800 }, { "epoch": 1.615547760828779, "grad_norm": 4.922769546508789, "learning_rate": 4.942527237474724e-05, "loss": 0.0585, "num_input_tokens_seen": 18940720, "step": 19805 }, { "epoch": 1.6159556244391875, "grad_norm": 0.33138731122016907, "learning_rate": 4.942451332620483e-05, "loss": 0.0334, "num_input_tokens_seen": 18945488, "step": 19810 }, { "epoch": 1.616363488049596, "grad_norm": 0.08991144597530365, "learning_rate": 4.942375378258793e-05, "loss": 0.0449, "num_input_tokens_seen": 18949984, "step": 19815 }, { "epoch": 1.6167713516600049, "grad_norm": 0.18477369844913483, "learning_rate": 4.94229937439119e-05, "loss": 0.0196, "num_input_tokens_seen": 18953984, "step": 19820 }, { "epoch": 1.6171792152704136, "grad_norm": 0.7614602446556091, "learning_rate": 4.942223321019218e-05, "loss": 0.3864, "num_input_tokens_seen": 18959664, "step": 19825 }, { "epoch": 1.6175870788808222, "grad_norm": 3.0564544200897217, "learning_rate": 4.942147218144416e-05, "loss": 0.3738, "num_input_tokens_seen": 18964304, "step": 19830 }, { "epoch": 1.6179949424912308, "grad_norm": 0.023938896134495735, "learning_rate": 4.942071065768328e-05, "loss": 0.0953, "num_input_tokens_seen": 18968288, "step": 19835 }, { "epoch": 1.6184028061016396, "grad_norm": 2.3704206943511963, "learning_rate": 4.941994863892497e-05, "loss": 0.1424, "num_input_tokens_seen": 18972800, "step": 19840 }, { "epoch": 1.6188106697120483, "grad_norm": 1.6470234394073486, "learning_rate": 4.941918612518468e-05, "loss": 0.0949, "num_input_tokens_seen": 18977072, "step": 19845 }, { "epoch": 1.619218533322457, "grad_norm": 0.045289263129234314, "learning_rate": 4.941842311647786e-05, "loss": 0.0293, "num_input_tokens_seen": 18982432, "step": 19850 }, { "epoch": 1.6196263969328657, "grad_norm": 0.8221303224563599, "learning_rate": 4.941765961281999e-05, "loss": 0.1291, "num_input_tokens_seen": 18986912, "step": 19855 }, { "epoch": 1.6200342605432745, "grad_norm": 0.05784674733877182, "learning_rate": 4.941689561422652e-05, "loss": 0.1968, "num_input_tokens_seen": 18992048, "step": 19860 }, { "epoch": 1.620442124153683, "grad_norm": 4.831726551055908, "learning_rate": 4.941613112071296e-05, "loss": 0.2478, "num_input_tokens_seen": 18997408, "step": 19865 }, { "epoch": 1.6208499877640916, "grad_norm": 0.09518469125032425, "learning_rate": 4.94153661322948e-05, "loss": 0.2769, "num_input_tokens_seen": 19002368, "step": 19870 }, { "epoch": 1.6212578513745004, "grad_norm": 0.06662797927856445, "learning_rate": 4.941460064898754e-05, "loss": 0.3852, "num_input_tokens_seen": 19006912, "step": 19875 }, { "epoch": 1.6216657149849092, "grad_norm": 1.1845769882202148, "learning_rate": 4.9413834670806696e-05, "loss": 0.0343, "num_input_tokens_seen": 19011776, "step": 19880 }, { "epoch": 1.6220735785953178, "grad_norm": 0.09983492642641068, "learning_rate": 4.94130681977678e-05, "loss": 0.1402, "num_input_tokens_seen": 19016880, "step": 19885 }, { "epoch": 1.6224814422057263, "grad_norm": 0.11992083489894867, "learning_rate": 4.9412301229886386e-05, "loss": 0.2342, "num_input_tokens_seen": 19021168, "step": 19890 }, { "epoch": 1.622889305816135, "grad_norm": 0.09650745242834091, "learning_rate": 4.9411533767178e-05, "loss": 0.0295, "num_input_tokens_seen": 19026144, "step": 19895 }, { "epoch": 1.623297169426544, "grad_norm": 1.698657751083374, "learning_rate": 4.94107658096582e-05, "loss": 0.1338, "num_input_tokens_seen": 19031072, "step": 19900 }, { "epoch": 1.6237050330369525, "grad_norm": 3.182133674621582, "learning_rate": 4.940999735734254e-05, "loss": 0.4614, "num_input_tokens_seen": 19035200, "step": 19905 }, { "epoch": 1.624112896647361, "grad_norm": 0.05414064601063728, "learning_rate": 4.940922841024661e-05, "loss": 0.1015, "num_input_tokens_seen": 19040000, "step": 19910 }, { "epoch": 1.6245207602577698, "grad_norm": 1.4555065631866455, "learning_rate": 4.9408458968385994e-05, "loss": 0.0206, "num_input_tokens_seen": 19044464, "step": 19915 }, { "epoch": 1.6249286238681786, "grad_norm": 0.04854395613074303, "learning_rate": 4.940768903177629e-05, "loss": 0.0252, "num_input_tokens_seen": 19048304, "step": 19920 }, { "epoch": 1.6253364874785872, "grad_norm": 7.592998504638672, "learning_rate": 4.94069186004331e-05, "loss": 0.3394, "num_input_tokens_seen": 19052192, "step": 19925 }, { "epoch": 1.6257443510889957, "grad_norm": 3.4867115020751953, "learning_rate": 4.940614767437204e-05, "loss": 0.5438, "num_input_tokens_seen": 19057504, "step": 19930 }, { "epoch": 1.6261522146994045, "grad_norm": 0.19530747830867767, "learning_rate": 4.940537625360874e-05, "loss": 0.0393, "num_input_tokens_seen": 19062512, "step": 19935 }, { "epoch": 1.6265600783098133, "grad_norm": 1.8921141624450684, "learning_rate": 4.9404604338158824e-05, "loss": 0.0883, "num_input_tokens_seen": 19067536, "step": 19940 }, { "epoch": 1.6269679419202219, "grad_norm": 6.16801118850708, "learning_rate": 4.9403831928037955e-05, "loss": 0.1132, "num_input_tokens_seen": 19072704, "step": 19945 }, { "epoch": 1.6273758055306304, "grad_norm": 0.04936852678656578, "learning_rate": 4.9403059023261784e-05, "loss": 0.1102, "num_input_tokens_seen": 19077456, "step": 19950 }, { "epoch": 1.6277836691410392, "grad_norm": 0.18135899305343628, "learning_rate": 4.9402285623845965e-05, "loss": 0.2586, "num_input_tokens_seen": 19082480, "step": 19955 }, { "epoch": 1.628191532751448, "grad_norm": 0.17300544679164886, "learning_rate": 4.9401511729806204e-05, "loss": 0.5264, "num_input_tokens_seen": 19087056, "step": 19960 }, { "epoch": 1.6285993963618566, "grad_norm": 0.4331591725349426, "learning_rate": 4.9400737341158156e-05, "loss": 0.1832, "num_input_tokens_seen": 19091936, "step": 19965 }, { "epoch": 1.6290072599722651, "grad_norm": 0.3834858238697052, "learning_rate": 4.9399962457917535e-05, "loss": 0.1557, "num_input_tokens_seen": 19096416, "step": 19970 }, { "epoch": 1.629415123582674, "grad_norm": 0.20149758458137512, "learning_rate": 4.939918708010004e-05, "loss": 0.0358, "num_input_tokens_seen": 19101024, "step": 19975 }, { "epoch": 1.6298229871930827, "grad_norm": 0.16355468332767487, "learning_rate": 4.9398411207721394e-05, "loss": 0.2809, "num_input_tokens_seen": 19105520, "step": 19980 }, { "epoch": 1.6302308508034913, "grad_norm": 0.3371947109699249, "learning_rate": 4.9397634840797324e-05, "loss": 0.0488, "num_input_tokens_seen": 19110848, "step": 19985 }, { "epoch": 1.6306387144138998, "grad_norm": 0.3426332473754883, "learning_rate": 4.9396857979343556e-05, "loss": 0.1852, "num_input_tokens_seen": 19116592, "step": 19990 }, { "epoch": 1.6310465780243086, "grad_norm": 0.23148827254772186, "learning_rate": 4.939608062337585e-05, "loss": 0.2307, "num_input_tokens_seen": 19120832, "step": 19995 }, { "epoch": 1.6314544416347174, "grad_norm": 0.06395597755908966, "learning_rate": 4.9395302772909944e-05, "loss": 0.1145, "num_input_tokens_seen": 19125984, "step": 20000 }, { "epoch": 1.631862305245126, "grad_norm": 0.035602495074272156, "learning_rate": 4.939452442796163e-05, "loss": 0.0177, "num_input_tokens_seen": 19130752, "step": 20005 }, { "epoch": 1.6322701688555346, "grad_norm": 0.2787283957004547, "learning_rate": 4.939374558854667e-05, "loss": 0.0537, "num_input_tokens_seen": 19135264, "step": 20010 }, { "epoch": 1.6326780324659433, "grad_norm": 0.10359416902065277, "learning_rate": 4.9392966254680845e-05, "loss": 0.1625, "num_input_tokens_seen": 19140560, "step": 20015 }, { "epoch": 1.6330858960763521, "grad_norm": 0.05603097006678581, "learning_rate": 4.939218642637996e-05, "loss": 0.0079, "num_input_tokens_seen": 19145632, "step": 20020 }, { "epoch": 1.6334937596867607, "grad_norm": 0.36059510707855225, "learning_rate": 4.939140610365983e-05, "loss": 0.1129, "num_input_tokens_seen": 19150320, "step": 20025 }, { "epoch": 1.6339016232971695, "grad_norm": 0.09119151532649994, "learning_rate": 4.939062528653625e-05, "loss": 0.1895, "num_input_tokens_seen": 19155760, "step": 20030 }, { "epoch": 1.6343094869075783, "grad_norm": 1.0683683156967163, "learning_rate": 4.938984397502508e-05, "loss": 0.0138, "num_input_tokens_seen": 19161056, "step": 20035 }, { "epoch": 1.6347173505179868, "grad_norm": 0.029997892677783966, "learning_rate": 4.938906216914212e-05, "loss": 0.1335, "num_input_tokens_seen": 19165584, "step": 20040 }, { "epoch": 1.6351252141283954, "grad_norm": 0.010715960524976254, "learning_rate": 4.938827986890323e-05, "loss": 0.4471, "num_input_tokens_seen": 19170704, "step": 20045 }, { "epoch": 1.6355330777388042, "grad_norm": 0.027092020958662033, "learning_rate": 4.938749707432428e-05, "loss": 0.1966, "num_input_tokens_seen": 19175008, "step": 20050 }, { "epoch": 1.635940941349213, "grad_norm": 0.08631006628274918, "learning_rate": 4.938671378542112e-05, "loss": 0.0105, "num_input_tokens_seen": 19179760, "step": 20055 }, { "epoch": 1.6363488049596215, "grad_norm": 6.257284164428711, "learning_rate": 4.9385930002209644e-05, "loss": 0.1362, "num_input_tokens_seen": 19184672, "step": 20060 }, { "epoch": 1.63675666857003, "grad_norm": 0.02127194032073021, "learning_rate": 4.938514572470572e-05, "loss": 0.0956, "num_input_tokens_seen": 19189680, "step": 20065 }, { "epoch": 1.637164532180439, "grad_norm": 1.0615944862365723, "learning_rate": 4.938436095292526e-05, "loss": 0.0122, "num_input_tokens_seen": 19194752, "step": 20070 }, { "epoch": 1.6375723957908477, "grad_norm": 0.09817341715097427, "learning_rate": 4.938357568688417e-05, "loss": 0.0066, "num_input_tokens_seen": 19199344, "step": 20075 }, { "epoch": 1.6379802594012562, "grad_norm": 0.05557721108198166, "learning_rate": 4.938278992659835e-05, "loss": 0.0041, "num_input_tokens_seen": 19203792, "step": 20080 }, { "epoch": 1.6383881230116648, "grad_norm": 0.17551392316818237, "learning_rate": 4.9382003672083744e-05, "loss": 0.0166, "num_input_tokens_seen": 19208976, "step": 20085 }, { "epoch": 1.6387959866220736, "grad_norm": 0.21542112529277802, "learning_rate": 4.938121692335628e-05, "loss": 0.1413, "num_input_tokens_seen": 19213120, "step": 20090 }, { "epoch": 1.6392038502324824, "grad_norm": 0.007951753214001656, "learning_rate": 4.938042968043192e-05, "loss": 0.0029, "num_input_tokens_seen": 19217632, "step": 20095 }, { "epoch": 1.639611713842891, "grad_norm": 0.07902917265892029, "learning_rate": 4.9379641943326596e-05, "loss": 0.074, "num_input_tokens_seen": 19222800, "step": 20100 }, { "epoch": 1.6400195774532995, "grad_norm": 2.583880662918091, "learning_rate": 4.9378853712056296e-05, "loss": 0.2136, "num_input_tokens_seen": 19227776, "step": 20105 }, { "epoch": 1.6404274410637083, "grad_norm": 0.006244329269975424, "learning_rate": 4.9378064986636996e-05, "loss": 0.002, "num_input_tokens_seen": 19232512, "step": 20110 }, { "epoch": 1.640835304674117, "grad_norm": 0.02915559709072113, "learning_rate": 4.937727576708467e-05, "loss": 0.3579, "num_input_tokens_seen": 19237728, "step": 20115 }, { "epoch": 1.6412431682845257, "grad_norm": 0.4059053063392639, "learning_rate": 4.9376486053415326e-05, "loss": 0.0117, "num_input_tokens_seen": 19242320, "step": 20120 }, { "epoch": 1.6416510318949342, "grad_norm": 0.06302224844694138, "learning_rate": 4.937569584564497e-05, "loss": 0.0046, "num_input_tokens_seen": 19247136, "step": 20125 }, { "epoch": 1.642058895505343, "grad_norm": 0.025432124733924866, "learning_rate": 4.937490514378961e-05, "loss": 0.1685, "num_input_tokens_seen": 19251280, "step": 20130 }, { "epoch": 1.6424667591157518, "grad_norm": 0.01041568536311388, "learning_rate": 4.937411394786529e-05, "loss": 0.0182, "num_input_tokens_seen": 19255792, "step": 20135 }, { "epoch": 1.6428746227261604, "grad_norm": 0.19715192914009094, "learning_rate": 4.937332225788803e-05, "loss": 0.1633, "num_input_tokens_seen": 19260208, "step": 20140 }, { "epoch": 1.643282486336569, "grad_norm": 0.17849965393543243, "learning_rate": 4.937253007387389e-05, "loss": 0.2764, "num_input_tokens_seen": 19264096, "step": 20145 }, { "epoch": 1.6436903499469777, "grad_norm": 0.01128388848155737, "learning_rate": 4.9371737395838915e-05, "loss": 0.2152, "num_input_tokens_seen": 19268704, "step": 20150 }, { "epoch": 1.6440982135573865, "grad_norm": 0.05463002622127533, "learning_rate": 4.937094422379918e-05, "loss": 0.0147, "num_input_tokens_seen": 19273840, "step": 20155 }, { "epoch": 1.644506077167795, "grad_norm": 0.15528561174869537, "learning_rate": 4.937015055777077e-05, "loss": 0.1305, "num_input_tokens_seen": 19278528, "step": 20160 }, { "epoch": 1.6449139407782036, "grad_norm": 0.12192332744598389, "learning_rate": 4.936935639776975e-05, "loss": 0.5043, "num_input_tokens_seen": 19283440, "step": 20165 }, { "epoch": 1.6453218043886124, "grad_norm": 4.393192291259766, "learning_rate": 4.936856174381224e-05, "loss": 0.3772, "num_input_tokens_seen": 19288208, "step": 20170 }, { "epoch": 1.6457296679990212, "grad_norm": 0.08558815717697144, "learning_rate": 4.936776659591434e-05, "loss": 0.571, "num_input_tokens_seen": 19293808, "step": 20175 }, { "epoch": 1.6461375316094298, "grad_norm": 4.562851428985596, "learning_rate": 4.936697095409216e-05, "loss": 0.1217, "num_input_tokens_seen": 19298304, "step": 20180 }, { "epoch": 1.6465453952198383, "grad_norm": 5.070316791534424, "learning_rate": 4.9366174818361835e-05, "loss": 0.1623, "num_input_tokens_seen": 19303424, "step": 20185 }, { "epoch": 1.6469532588302471, "grad_norm": 7.227706432342529, "learning_rate": 4.93653781887395e-05, "loss": 0.0912, "num_input_tokens_seen": 19308288, "step": 20190 }, { "epoch": 1.647361122440656, "grad_norm": 0.06908437609672546, "learning_rate": 4.936458106524131e-05, "loss": 0.1243, "num_input_tokens_seen": 19313536, "step": 20195 }, { "epoch": 1.6477689860510645, "grad_norm": 0.09489067643880844, "learning_rate": 4.9363783447883406e-05, "loss": 0.011, "num_input_tokens_seen": 19318496, "step": 20200 }, { "epoch": 1.648176849661473, "grad_norm": 4.493704319000244, "learning_rate": 4.936298533668196e-05, "loss": 0.2641, "num_input_tokens_seen": 19323472, "step": 20205 }, { "epoch": 1.6485847132718818, "grad_norm": 1.6708273887634277, "learning_rate": 4.936218673165317e-05, "loss": 0.5189, "num_input_tokens_seen": 19328848, "step": 20210 }, { "epoch": 1.6489925768822906, "grad_norm": 2.730168104171753, "learning_rate": 4.9361387632813195e-05, "loss": 0.2356, "num_input_tokens_seen": 19333488, "step": 20215 }, { "epoch": 1.6494004404926992, "grad_norm": 0.42742034792900085, "learning_rate": 4.9360588040178246e-05, "loss": 0.24, "num_input_tokens_seen": 19338336, "step": 20220 }, { "epoch": 1.649808304103108, "grad_norm": 227.64125061035156, "learning_rate": 4.935978795376453e-05, "loss": 0.6115, "num_input_tokens_seen": 19343872, "step": 20225 }, { "epoch": 1.6502161677135168, "grad_norm": 0.566416323184967, "learning_rate": 4.935898737358826e-05, "loss": 0.467, "num_input_tokens_seen": 19348912, "step": 20230 }, { "epoch": 1.6506240313239253, "grad_norm": 0.10231582820415497, "learning_rate": 4.9358186299665674e-05, "loss": 0.2631, "num_input_tokens_seen": 19353408, "step": 20235 }, { "epoch": 1.6510318949343339, "grad_norm": 0.14264728128910065, "learning_rate": 4.9357384732013e-05, "loss": 0.0373, "num_input_tokens_seen": 19358176, "step": 20240 }, { "epoch": 1.6514397585447427, "grad_norm": 0.08758833259344101, "learning_rate": 4.9356582670646486e-05, "loss": 0.1513, "num_input_tokens_seen": 19363520, "step": 20245 }, { "epoch": 1.6518476221551515, "grad_norm": 0.20719261467456818, "learning_rate": 4.935578011558239e-05, "loss": 0.2181, "num_input_tokens_seen": 19368096, "step": 20250 }, { "epoch": 1.65225548576556, "grad_norm": 5.829585075378418, "learning_rate": 4.9354977066836986e-05, "loss": 0.5242, "num_input_tokens_seen": 19373072, "step": 20255 }, { "epoch": 1.6526633493759686, "grad_norm": 0.134490504860878, "learning_rate": 4.935417352442653e-05, "loss": 0.1033, "num_input_tokens_seen": 19377680, "step": 20260 }, { "epoch": 1.6530712129863774, "grad_norm": 9.982416152954102, "learning_rate": 4.935336948836734e-05, "loss": 0.4222, "num_input_tokens_seen": 19382928, "step": 20265 }, { "epoch": 1.6534790765967862, "grad_norm": 0.5736533403396606, "learning_rate": 4.93525649586757e-05, "loss": 0.3447, "num_input_tokens_seen": 19387584, "step": 20270 }, { "epoch": 1.6538869402071947, "grad_norm": 0.11580225825309753, "learning_rate": 4.9351759935367904e-05, "loss": 0.266, "num_input_tokens_seen": 19392624, "step": 20275 }, { "epoch": 1.6542948038176033, "grad_norm": 0.08627956360578537, "learning_rate": 4.935095441846029e-05, "loss": 0.0736, "num_input_tokens_seen": 19397008, "step": 20280 }, { "epoch": 1.654702667428012, "grad_norm": 0.07779008150100708, "learning_rate": 4.9350148407969175e-05, "loss": 0.0891, "num_input_tokens_seen": 19401792, "step": 20285 }, { "epoch": 1.6551105310384209, "grad_norm": 0.10620062053203583, "learning_rate": 4.934934190391091e-05, "loss": 0.0187, "num_input_tokens_seen": 19407392, "step": 20290 }, { "epoch": 1.6555183946488294, "grad_norm": 0.20063261687755585, "learning_rate": 4.934853490630181e-05, "loss": 0.2475, "num_input_tokens_seen": 19411680, "step": 20295 }, { "epoch": 1.655926258259238, "grad_norm": 0.8069648742675781, "learning_rate": 4.9347727415158273e-05, "loss": 0.0871, "num_input_tokens_seen": 19416368, "step": 20300 }, { "epoch": 1.6563341218696468, "grad_norm": 0.12417145073413849, "learning_rate": 4.934691943049664e-05, "loss": 0.0151, "num_input_tokens_seen": 19421152, "step": 20305 }, { "epoch": 1.6567419854800556, "grad_norm": 0.27730312943458557, "learning_rate": 4.934611095233329e-05, "loss": 0.0167, "num_input_tokens_seen": 19425632, "step": 20310 }, { "epoch": 1.6571498490904641, "grad_norm": 0.03689054399728775, "learning_rate": 4.934530198068462e-05, "loss": 0.3272, "num_input_tokens_seen": 19430048, "step": 20315 }, { "epoch": 1.6575577127008727, "grad_norm": 1.7139681577682495, "learning_rate": 4.9344492515567034e-05, "loss": 0.2832, "num_input_tokens_seen": 19435424, "step": 20320 }, { "epoch": 1.6579655763112815, "grad_norm": 0.8509698510169983, "learning_rate": 4.934368255699692e-05, "loss": 0.0612, "num_input_tokens_seen": 19440224, "step": 20325 }, { "epoch": 1.6583734399216903, "grad_norm": 3.2002947330474854, "learning_rate": 4.93428721049907e-05, "loss": 0.2252, "num_input_tokens_seen": 19443968, "step": 20330 }, { "epoch": 1.6587813035320988, "grad_norm": 8.382608413696289, "learning_rate": 4.9342061159564815e-05, "loss": 0.1911, "num_input_tokens_seen": 19448640, "step": 20335 }, { "epoch": 1.6591891671425074, "grad_norm": 0.030044345185160637, "learning_rate": 4.9341249720735695e-05, "loss": 0.0159, "num_input_tokens_seen": 19452672, "step": 20340 }, { "epoch": 1.6595970307529162, "grad_norm": 0.018121909350156784, "learning_rate": 4.9340437788519776e-05, "loss": 0.1858, "num_input_tokens_seen": 19457568, "step": 20345 }, { "epoch": 1.660004894363325, "grad_norm": 0.06972166895866394, "learning_rate": 4.933962536293353e-05, "loss": 0.0886, "num_input_tokens_seen": 19462304, "step": 20350 }, { "epoch": 1.6604127579737336, "grad_norm": 1.9100757837295532, "learning_rate": 4.933881244399343e-05, "loss": 0.3733, "num_input_tokens_seen": 19467072, "step": 20355 }, { "epoch": 1.6608206215841421, "grad_norm": 0.07074480503797531, "learning_rate": 4.9337999031715934e-05, "loss": 0.0528, "num_input_tokens_seen": 19470992, "step": 20360 }, { "epoch": 1.661228485194551, "grad_norm": 0.04650735482573509, "learning_rate": 4.933718512611754e-05, "loss": 0.1429, "num_input_tokens_seen": 19475424, "step": 20365 }, { "epoch": 1.6616363488049597, "grad_norm": 2.4339427947998047, "learning_rate": 4.9336370727214745e-05, "loss": 0.2715, "num_input_tokens_seen": 19479200, "step": 20370 }, { "epoch": 1.6620442124153683, "grad_norm": 1.510124921798706, "learning_rate": 4.933555583502406e-05, "loss": 0.177, "num_input_tokens_seen": 19483488, "step": 20375 }, { "epoch": 1.6624520760257768, "grad_norm": 0.8759607076644897, "learning_rate": 4.9334740449562e-05, "loss": 0.0281, "num_input_tokens_seen": 19487504, "step": 20380 }, { "epoch": 1.6628599396361856, "grad_norm": 0.4330200254917145, "learning_rate": 4.933392457084509e-05, "loss": 0.3545, "num_input_tokens_seen": 19492288, "step": 20385 }, { "epoch": 1.6632678032465944, "grad_norm": 0.7630143761634827, "learning_rate": 4.933310819888987e-05, "loss": 0.035, "num_input_tokens_seen": 19497440, "step": 20390 }, { "epoch": 1.663675666857003, "grad_norm": 1.754393219947815, "learning_rate": 4.933229133371288e-05, "loss": 0.1598, "num_input_tokens_seen": 19501440, "step": 20395 }, { "epoch": 1.6640835304674115, "grad_norm": 0.4147678017616272, "learning_rate": 4.933147397533069e-05, "loss": 0.0513, "num_input_tokens_seen": 19506256, "step": 20400 }, { "epoch": 1.6644913940778205, "grad_norm": 0.056333594024181366, "learning_rate": 4.933065612375985e-05, "loss": 0.007, "num_input_tokens_seen": 19511728, "step": 20405 }, { "epoch": 1.664899257688229, "grad_norm": 12.69257926940918, "learning_rate": 4.932983777901697e-05, "loss": 0.1429, "num_input_tokens_seen": 19516336, "step": 20410 }, { "epoch": 1.6653071212986377, "grad_norm": 0.26176777482032776, "learning_rate": 4.9329018941118606e-05, "loss": 0.1671, "num_input_tokens_seen": 19520976, "step": 20415 }, { "epoch": 1.6657149849090465, "grad_norm": 0.10013357549905777, "learning_rate": 4.9328199610081366e-05, "loss": 0.0178, "num_input_tokens_seen": 19525984, "step": 20420 }, { "epoch": 1.6661228485194552, "grad_norm": 3.289250135421753, "learning_rate": 4.932737978592186e-05, "loss": 0.1339, "num_input_tokens_seen": 19530720, "step": 20425 }, { "epoch": 1.6665307121298638, "grad_norm": 5.860978603363037, "learning_rate": 4.93265594686567e-05, "loss": 0.0798, "num_input_tokens_seen": 19535664, "step": 20430 }, { "epoch": 1.6669385757402724, "grad_norm": 0.09452613443136215, "learning_rate": 4.932573865830251e-05, "loss": 0.4898, "num_input_tokens_seen": 19540528, "step": 20435 }, { "epoch": 1.6673464393506812, "grad_norm": 0.03646945208311081, "learning_rate": 4.9324917354875945e-05, "loss": 0.301, "num_input_tokens_seen": 19545840, "step": 20440 }, { "epoch": 1.66775430296109, "grad_norm": 0.04482194781303406, "learning_rate": 4.932409555839364e-05, "loss": 0.2063, "num_input_tokens_seen": 19550320, "step": 20445 }, { "epoch": 1.6681621665714985, "grad_norm": 0.15210513770580292, "learning_rate": 4.932327326887225e-05, "loss": 0.0107, "num_input_tokens_seen": 19555616, "step": 20450 }, { "epoch": 1.668570030181907, "grad_norm": 0.40861767530441284, "learning_rate": 4.932245048632845e-05, "loss": 0.295, "num_input_tokens_seen": 19560768, "step": 20455 }, { "epoch": 1.6689778937923159, "grad_norm": 0.7271597981452942, "learning_rate": 4.932162721077891e-05, "loss": 0.246, "num_input_tokens_seen": 19565008, "step": 20460 }, { "epoch": 1.6693857574027247, "grad_norm": 0.20637789368629456, "learning_rate": 4.9320803442240325e-05, "loss": 0.0409, "num_input_tokens_seen": 19569280, "step": 20465 }, { "epoch": 1.6697936210131332, "grad_norm": 3.5426688194274902, "learning_rate": 4.9319979180729385e-05, "loss": 0.2708, "num_input_tokens_seen": 19574656, "step": 20470 }, { "epoch": 1.6702014846235418, "grad_norm": 1.724758267402649, "learning_rate": 4.9319154426262806e-05, "loss": 0.0838, "num_input_tokens_seen": 19579616, "step": 20475 }, { "epoch": 1.6706093482339506, "grad_norm": 7.355896949768066, "learning_rate": 4.93183291788573e-05, "loss": 0.0656, "num_input_tokens_seen": 19584624, "step": 20480 }, { "epoch": 1.6710172118443594, "grad_norm": 0.21579012274742126, "learning_rate": 4.931750343852959e-05, "loss": 0.0944, "num_input_tokens_seen": 19589952, "step": 20485 }, { "epoch": 1.671425075454768, "grad_norm": 0.12333057075738907, "learning_rate": 4.9316677205296424e-05, "loss": 0.0876, "num_input_tokens_seen": 19594800, "step": 20490 }, { "epoch": 1.6718329390651765, "grad_norm": 5.779553413391113, "learning_rate": 4.9315850479174544e-05, "loss": 0.6298, "num_input_tokens_seen": 19598448, "step": 20495 }, { "epoch": 1.6722408026755853, "grad_norm": 0.10246195644140244, "learning_rate": 4.9315023260180706e-05, "loss": 0.2745, "num_input_tokens_seen": 19603568, "step": 20500 }, { "epoch": 1.672648666285994, "grad_norm": 0.1403300017118454, "learning_rate": 4.9314195548331673e-05, "loss": 0.151, "num_input_tokens_seen": 19608512, "step": 20505 }, { "epoch": 1.6730565298964026, "grad_norm": 0.02709594927728176, "learning_rate": 4.931336734364423e-05, "loss": 0.0946, "num_input_tokens_seen": 19613920, "step": 20510 }, { "epoch": 1.6734643935068112, "grad_norm": 0.03449578210711479, "learning_rate": 4.931253864613517e-05, "loss": 0.1296, "num_input_tokens_seen": 19619648, "step": 20515 }, { "epoch": 1.67387225711722, "grad_norm": 1.8087211847305298, "learning_rate": 4.9311709455821276e-05, "loss": 0.2239, "num_input_tokens_seen": 19623888, "step": 20520 }, { "epoch": 1.6742801207276288, "grad_norm": 0.12170131504535675, "learning_rate": 4.931087977271936e-05, "loss": 0.1258, "num_input_tokens_seen": 19629360, "step": 20525 }, { "epoch": 1.6746879843380373, "grad_norm": 0.09692240506410599, "learning_rate": 4.931004959684624e-05, "loss": 0.1913, "num_input_tokens_seen": 19633904, "step": 20530 }, { "epoch": 1.675095847948446, "grad_norm": 0.13915111124515533, "learning_rate": 4.9309218928218746e-05, "loss": 0.1929, "num_input_tokens_seen": 19638720, "step": 20535 }, { "epoch": 1.6755037115588547, "grad_norm": 10.988381385803223, "learning_rate": 4.9308387766853725e-05, "loss": 0.1513, "num_input_tokens_seen": 19643712, "step": 20540 }, { "epoch": 1.6759115751692635, "grad_norm": 6.476643085479736, "learning_rate": 4.9307556112768005e-05, "loss": 0.1327, "num_input_tokens_seen": 19648704, "step": 20545 }, { "epoch": 1.676319438779672, "grad_norm": 0.07919485867023468, "learning_rate": 4.930672396597845e-05, "loss": 0.1202, "num_input_tokens_seen": 19653664, "step": 20550 }, { "epoch": 1.6767273023900806, "grad_norm": 5.580765247344971, "learning_rate": 4.930589132650193e-05, "loss": 0.1919, "num_input_tokens_seen": 19658896, "step": 20555 }, { "epoch": 1.6771351660004894, "grad_norm": 0.4206722676753998, "learning_rate": 4.930505819435532e-05, "loss": 0.1542, "num_input_tokens_seen": 19663984, "step": 20560 }, { "epoch": 1.6775430296108982, "grad_norm": 0.04121459275484085, "learning_rate": 4.930422456955551e-05, "loss": 0.2294, "num_input_tokens_seen": 19668128, "step": 20565 }, { "epoch": 1.6779508932213067, "grad_norm": 0.03906034678220749, "learning_rate": 4.93033904521194e-05, "loss": 0.1593, "num_input_tokens_seen": 19673248, "step": 20570 }, { "epoch": 1.6783587568317153, "grad_norm": 0.036833662539720535, "learning_rate": 4.930255584206389e-05, "loss": 0.2068, "num_input_tokens_seen": 19676912, "step": 20575 }, { "epoch": 1.678766620442124, "grad_norm": 1.6740328073501587, "learning_rate": 4.9301720739405896e-05, "loss": 0.3838, "num_input_tokens_seen": 19681920, "step": 20580 }, { "epoch": 1.679174484052533, "grad_norm": 0.13878914713859558, "learning_rate": 4.9300885144162344e-05, "loss": 0.1458, "num_input_tokens_seen": 19687184, "step": 20585 }, { "epoch": 1.6795823476629415, "grad_norm": 0.2388063222169876, "learning_rate": 4.930004905635018e-05, "loss": 0.0304, "num_input_tokens_seen": 19692112, "step": 20590 }, { "epoch": 1.6799902112733502, "grad_norm": 0.1781129240989685, "learning_rate": 4.929921247598635e-05, "loss": 0.0688, "num_input_tokens_seen": 19696656, "step": 20595 }, { "epoch": 1.680398074883759, "grad_norm": 0.07078303396701813, "learning_rate": 4.9298375403087815e-05, "loss": 0.1635, "num_input_tokens_seen": 19701264, "step": 20600 }, { "epoch": 1.6808059384941676, "grad_norm": 0.7438822388648987, "learning_rate": 4.929753783767153e-05, "loss": 0.2343, "num_input_tokens_seen": 19705872, "step": 20605 }, { "epoch": 1.6812138021045762, "grad_norm": 0.425910085439682, "learning_rate": 4.929669977975447e-05, "loss": 0.132, "num_input_tokens_seen": 19711008, "step": 20610 }, { "epoch": 1.681621665714985, "grad_norm": 1.4047657251358032, "learning_rate": 4.929586122935365e-05, "loss": 0.2204, "num_input_tokens_seen": 19716352, "step": 20615 }, { "epoch": 1.6820295293253937, "grad_norm": 0.10091666132211685, "learning_rate": 4.929502218648603e-05, "loss": 0.0434, "num_input_tokens_seen": 19719872, "step": 20620 }, { "epoch": 1.6824373929358023, "grad_norm": 0.09022921323776245, "learning_rate": 4.929418265116864e-05, "loss": 0.1816, "num_input_tokens_seen": 19725360, "step": 20625 }, { "epoch": 1.6828452565462109, "grad_norm": 2.0698623657226562, "learning_rate": 4.9293342623418485e-05, "loss": 0.3658, "num_input_tokens_seen": 19729808, "step": 20630 }, { "epoch": 1.6832531201566197, "grad_norm": 4.972890377044678, "learning_rate": 4.92925021032526e-05, "loss": 0.2213, "num_input_tokens_seen": 19734848, "step": 20635 }, { "epoch": 1.6836609837670284, "grad_norm": 0.20788560807704926, "learning_rate": 4.9291661090688024e-05, "loss": 0.0286, "num_input_tokens_seen": 19739616, "step": 20640 }, { "epoch": 1.684068847377437, "grad_norm": 0.2110227793455124, "learning_rate": 4.9290819585741793e-05, "loss": 0.0658, "num_input_tokens_seen": 19744272, "step": 20645 }, { "epoch": 1.6844767109878456, "grad_norm": 0.026791265234351158, "learning_rate": 4.928997758843098e-05, "loss": 0.1465, "num_input_tokens_seen": 19749344, "step": 20650 }, { "epoch": 1.6848845745982544, "grad_norm": 0.10109859704971313, "learning_rate": 4.928913509877263e-05, "loss": 0.1472, "num_input_tokens_seen": 19753648, "step": 20655 }, { "epoch": 1.6852924382086631, "grad_norm": 0.04182365536689758, "learning_rate": 4.928829211678384e-05, "loss": 0.1743, "num_input_tokens_seen": 19757216, "step": 20660 }, { "epoch": 1.6857003018190717, "grad_norm": 7.188681125640869, "learning_rate": 4.928744864248169e-05, "loss": 0.133, "num_input_tokens_seen": 19762720, "step": 20665 }, { "epoch": 1.6861081654294803, "grad_norm": 0.05620019882917404, "learning_rate": 4.9286604675883273e-05, "loss": 0.1288, "num_input_tokens_seen": 19767440, "step": 20670 }, { "epoch": 1.686516029039889, "grad_norm": 0.08626919239759445, "learning_rate": 4.9285760217005706e-05, "loss": 0.4347, "num_input_tokens_seen": 19771600, "step": 20675 }, { "epoch": 1.6869238926502979, "grad_norm": 0.0720604658126831, "learning_rate": 4.928491526586609e-05, "loss": 0.0212, "num_input_tokens_seen": 19775760, "step": 20680 }, { "epoch": 1.6873317562607064, "grad_norm": 0.07115118205547333, "learning_rate": 4.928406982248157e-05, "loss": 0.033, "num_input_tokens_seen": 19780896, "step": 20685 }, { "epoch": 1.687739619871115, "grad_norm": 0.20881903171539307, "learning_rate": 4.928322388686927e-05, "loss": 0.1952, "num_input_tokens_seen": 19786096, "step": 20690 }, { "epoch": 1.6881474834815238, "grad_norm": 1.7258867025375366, "learning_rate": 4.928237745904634e-05, "loss": 0.0789, "num_input_tokens_seen": 19790704, "step": 20695 }, { "epoch": 1.6885553470919326, "grad_norm": 0.06540057808160782, "learning_rate": 4.928153053902994e-05, "loss": 0.1095, "num_input_tokens_seen": 19795936, "step": 20700 }, { "epoch": 1.6889632107023411, "grad_norm": 9.72614860534668, "learning_rate": 4.928068312683723e-05, "loss": 0.5009, "num_input_tokens_seen": 19801232, "step": 20705 }, { "epoch": 1.6893710743127497, "grad_norm": 1.9170643091201782, "learning_rate": 4.927983522248539e-05, "loss": 0.3374, "num_input_tokens_seen": 19806048, "step": 20710 }, { "epoch": 1.6897789379231585, "grad_norm": 0.04651537537574768, "learning_rate": 4.9278986825991614e-05, "loss": 0.0178, "num_input_tokens_seen": 19811040, "step": 20715 }, { "epoch": 1.6901868015335673, "grad_norm": 0.07252512127161026, "learning_rate": 4.927813793737308e-05, "loss": 0.2054, "num_input_tokens_seen": 19816032, "step": 20720 }, { "epoch": 1.6905946651439758, "grad_norm": 3.089005947113037, "learning_rate": 4.927728855664702e-05, "loss": 0.1782, "num_input_tokens_seen": 19820544, "step": 20725 }, { "epoch": 1.6910025287543844, "grad_norm": 0.17677949368953705, "learning_rate": 4.927643868383063e-05, "loss": 0.1646, "num_input_tokens_seen": 19825200, "step": 20730 }, { "epoch": 1.6914103923647932, "grad_norm": 0.02869187481701374, "learning_rate": 4.927558831894114e-05, "loss": 0.0406, "num_input_tokens_seen": 19829440, "step": 20735 }, { "epoch": 1.691818255975202, "grad_norm": 0.26981645822525024, "learning_rate": 4.927473746199579e-05, "loss": 0.2119, "num_input_tokens_seen": 19834352, "step": 20740 }, { "epoch": 1.6922261195856105, "grad_norm": 0.1289217174053192, "learning_rate": 4.927388611301184e-05, "loss": 0.1068, "num_input_tokens_seen": 19840000, "step": 20745 }, { "epoch": 1.692633983196019, "grad_norm": 0.25531354546546936, "learning_rate": 4.927303427200651e-05, "loss": 0.2073, "num_input_tokens_seen": 19844048, "step": 20750 }, { "epoch": 1.6930418468064279, "grad_norm": 0.07512696832418442, "learning_rate": 4.927218193899711e-05, "loss": 0.1415, "num_input_tokens_seen": 19848672, "step": 20755 }, { "epoch": 1.6934497104168367, "grad_norm": 0.09281603246927261, "learning_rate": 4.927132911400089e-05, "loss": 0.0163, "num_input_tokens_seen": 19853664, "step": 20760 }, { "epoch": 1.6938575740272452, "grad_norm": 0.2295180708169937, "learning_rate": 4.927047579703514e-05, "loss": 0.2227, "num_input_tokens_seen": 19858576, "step": 20765 }, { "epoch": 1.6942654376376538, "grad_norm": 0.1411314755678177, "learning_rate": 4.926962198811716e-05, "loss": 0.0859, "num_input_tokens_seen": 19863744, "step": 20770 }, { "epoch": 1.6946733012480628, "grad_norm": 0.061490438878536224, "learning_rate": 4.926876768726425e-05, "loss": 0.0213, "num_input_tokens_seen": 19869280, "step": 20775 }, { "epoch": 1.6950811648584714, "grad_norm": 15.506086349487305, "learning_rate": 4.926791289449374e-05, "loss": 0.1967, "num_input_tokens_seen": 19874480, "step": 20780 }, { "epoch": 1.69548902846888, "grad_norm": 7.7045111656188965, "learning_rate": 4.926705760982294e-05, "loss": 0.0723, "num_input_tokens_seen": 19879072, "step": 20785 }, { "epoch": 1.6958968920792887, "grad_norm": 3.0156102180480957, "learning_rate": 4.9266201833269206e-05, "loss": 0.2876, "num_input_tokens_seen": 19883552, "step": 20790 }, { "epoch": 1.6963047556896975, "grad_norm": 0.12342444062232971, "learning_rate": 4.9265345564849864e-05, "loss": 0.1899, "num_input_tokens_seen": 19888704, "step": 20795 }, { "epoch": 1.696712619300106, "grad_norm": 0.13840343058109283, "learning_rate": 4.9264488804582274e-05, "loss": 0.1643, "num_input_tokens_seen": 19893968, "step": 20800 }, { "epoch": 1.6971204829105146, "grad_norm": 4.281818866729736, "learning_rate": 4.926363155248381e-05, "loss": 0.096, "num_input_tokens_seen": 19898928, "step": 20805 }, { "epoch": 1.6975283465209234, "grad_norm": 1.8525522947311401, "learning_rate": 4.926277380857185e-05, "loss": 0.2238, "num_input_tokens_seen": 19902576, "step": 20810 }, { "epoch": 1.6979362101313322, "grad_norm": 0.01984655112028122, "learning_rate": 4.926191557286377e-05, "loss": 0.5336, "num_input_tokens_seen": 19906368, "step": 20815 }, { "epoch": 1.6983440737417408, "grad_norm": 0.05712069571018219, "learning_rate": 4.926105684537697e-05, "loss": 0.0163, "num_input_tokens_seen": 19911264, "step": 20820 }, { "epoch": 1.6987519373521494, "grad_norm": 0.16099244356155396, "learning_rate": 4.9260197626128853e-05, "loss": 0.0935, "num_input_tokens_seen": 19915232, "step": 20825 }, { "epoch": 1.6991598009625581, "grad_norm": 0.10449623316526413, "learning_rate": 4.925933791513685e-05, "loss": 0.023, "num_input_tokens_seen": 19919760, "step": 20830 }, { "epoch": 1.699567664572967, "grad_norm": 0.03586520627140999, "learning_rate": 4.925847771241837e-05, "loss": 0.226, "num_input_tokens_seen": 19923968, "step": 20835 }, { "epoch": 1.6999755281833755, "grad_norm": 0.035806652158498764, "learning_rate": 4.925761701799085e-05, "loss": 0.0159, "num_input_tokens_seen": 19929440, "step": 20840 }, { "epoch": 1.700383391793784, "grad_norm": 0.08183456212282181, "learning_rate": 4.925675583187175e-05, "loss": 0.1218, "num_input_tokens_seen": 19934672, "step": 20845 }, { "epoch": 1.7007912554041928, "grad_norm": 0.03440104052424431, "learning_rate": 4.925589415407851e-05, "loss": 0.129, "num_input_tokens_seen": 19938864, "step": 20850 }, { "epoch": 1.7011991190146016, "grad_norm": 0.07552356272935867, "learning_rate": 4.925503198462861e-05, "loss": 0.1834, "num_input_tokens_seen": 19944688, "step": 20855 }, { "epoch": 1.7016069826250102, "grad_norm": 0.1073141098022461, "learning_rate": 4.925416932353951e-05, "loss": 0.1663, "num_input_tokens_seen": 19949584, "step": 20860 }, { "epoch": 1.7020148462354188, "grad_norm": 0.08040902763605118, "learning_rate": 4.9253306170828705e-05, "loss": 0.3577, "num_input_tokens_seen": 19955488, "step": 20865 }, { "epoch": 1.7024227098458276, "grad_norm": 0.08797387033700943, "learning_rate": 4.92524425265137e-05, "loss": 0.0115, "num_input_tokens_seen": 19959968, "step": 20870 }, { "epoch": 1.7028305734562363, "grad_norm": 0.2154473513364792, "learning_rate": 4.9251578390611976e-05, "loss": 0.1224, "num_input_tokens_seen": 19964640, "step": 20875 }, { "epoch": 1.703238437066645, "grad_norm": 0.33921748399734497, "learning_rate": 4.925071376314107e-05, "loss": 0.0795, "num_input_tokens_seen": 19969808, "step": 20880 }, { "epoch": 1.7036463006770535, "grad_norm": 0.8365267515182495, "learning_rate": 4.92498486441185e-05, "loss": 0.0712, "num_input_tokens_seen": 19974368, "step": 20885 }, { "epoch": 1.7040541642874623, "grad_norm": 0.05443559214472771, "learning_rate": 4.92489830335618e-05, "loss": 0.0163, "num_input_tokens_seen": 19979344, "step": 20890 }, { "epoch": 1.704462027897871, "grad_norm": 0.028860444203019142, "learning_rate": 4.9248116931488526e-05, "loss": 0.1239, "num_input_tokens_seen": 19985264, "step": 20895 }, { "epoch": 1.7048698915082796, "grad_norm": 0.028032522648572922, "learning_rate": 4.924725033791622e-05, "loss": 0.1449, "num_input_tokens_seen": 19989488, "step": 20900 }, { "epoch": 1.7052777551186882, "grad_norm": 0.21901778876781464, "learning_rate": 4.924638325286245e-05, "loss": 0.3122, "num_input_tokens_seen": 19994976, "step": 20905 }, { "epoch": 1.705685618729097, "grad_norm": 11.178082466125488, "learning_rate": 4.92455156763448e-05, "loss": 0.076, "num_input_tokens_seen": 19999488, "step": 20910 }, { "epoch": 1.7060934823395058, "grad_norm": 0.13149426877498627, "learning_rate": 4.9244647608380847e-05, "loss": 0.4343, "num_input_tokens_seen": 20003776, "step": 20915 }, { "epoch": 1.7065013459499143, "grad_norm": 2.4473135471343994, "learning_rate": 4.9243779048988185e-05, "loss": 0.4078, "num_input_tokens_seen": 20008368, "step": 20920 }, { "epoch": 1.7069092095603229, "grad_norm": 0.06517224758863449, "learning_rate": 4.9242909998184425e-05, "loss": 0.0123, "num_input_tokens_seen": 20012912, "step": 20925 }, { "epoch": 1.7073170731707317, "grad_norm": 0.06357535719871521, "learning_rate": 4.924204045598719e-05, "loss": 0.0172, "num_input_tokens_seen": 20016496, "step": 20930 }, { "epoch": 1.7077249367811405, "grad_norm": 0.06935293227434158, "learning_rate": 4.924117042241409e-05, "loss": 0.2688, "num_input_tokens_seen": 20021280, "step": 20935 }, { "epoch": 1.708132800391549, "grad_norm": 0.08888450264930725, "learning_rate": 4.9240299897482766e-05, "loss": 0.1512, "num_input_tokens_seen": 20026096, "step": 20940 }, { "epoch": 1.7085406640019576, "grad_norm": 3.249274492263794, "learning_rate": 4.923942888121086e-05, "loss": 0.2187, "num_input_tokens_seen": 20030224, "step": 20945 }, { "epoch": 1.7089485276123664, "grad_norm": 5.960554599761963, "learning_rate": 4.923855737361604e-05, "loss": 0.2415, "num_input_tokens_seen": 20034528, "step": 20950 }, { "epoch": 1.7093563912227752, "grad_norm": 0.057152170687913895, "learning_rate": 4.923768537471595e-05, "loss": 0.178, "num_input_tokens_seen": 20038768, "step": 20955 }, { "epoch": 1.7097642548331837, "grad_norm": 6.38995361328125, "learning_rate": 4.9236812884528286e-05, "loss": 0.1903, "num_input_tokens_seen": 20043504, "step": 20960 }, { "epoch": 1.7101721184435925, "grad_norm": 0.29050713777542114, "learning_rate": 4.9235939903070717e-05, "loss": 0.3945, "num_input_tokens_seen": 20047760, "step": 20965 }, { "epoch": 1.7105799820540013, "grad_norm": 0.26446259021759033, "learning_rate": 4.9235066430360946e-05, "loss": 0.0502, "num_input_tokens_seen": 20052528, "step": 20970 }, { "epoch": 1.7109878456644099, "grad_norm": 0.03397630900144577, "learning_rate": 4.9234192466416675e-05, "loss": 0.4041, "num_input_tokens_seen": 20057824, "step": 20975 }, { "epoch": 1.7113957092748184, "grad_norm": 3.020568609237671, "learning_rate": 4.923331801125562e-05, "loss": 0.0332, "num_input_tokens_seen": 20062576, "step": 20980 }, { "epoch": 1.7118035728852272, "grad_norm": 0.1774807870388031, "learning_rate": 4.9232443064895515e-05, "loss": 0.168, "num_input_tokens_seen": 20067376, "step": 20985 }, { "epoch": 1.712211436495636, "grad_norm": 0.07012232393026352, "learning_rate": 4.923156762735408e-05, "loss": 0.0317, "num_input_tokens_seen": 20072304, "step": 20990 }, { "epoch": 1.7126193001060446, "grad_norm": 0.7830320000648499, "learning_rate": 4.923069169864907e-05, "loss": 0.0144, "num_input_tokens_seen": 20077056, "step": 20995 }, { "epoch": 1.7130271637164531, "grad_norm": 0.27565568685531616, "learning_rate": 4.9229815278798233e-05, "loss": 0.1533, "num_input_tokens_seen": 20081792, "step": 21000 }, { "epoch": 1.713435027326862, "grad_norm": 8.718328475952148, "learning_rate": 4.922893836781933e-05, "loss": 0.4117, "num_input_tokens_seen": 20085824, "step": 21005 }, { "epoch": 1.7138428909372707, "grad_norm": 0.2539675533771515, "learning_rate": 4.922806096573015e-05, "loss": 0.1124, "num_input_tokens_seen": 20090960, "step": 21010 }, { "epoch": 1.7142507545476793, "grad_norm": 0.05022556334733963, "learning_rate": 4.922718307254847e-05, "loss": 0.156, "num_input_tokens_seen": 20095040, "step": 21015 }, { "epoch": 1.7146586181580878, "grad_norm": 1.5333908796310425, "learning_rate": 4.922630468829207e-05, "loss": 0.1833, "num_input_tokens_seen": 20099680, "step": 21020 }, { "epoch": 1.7150664817684966, "grad_norm": 0.24393615126609802, "learning_rate": 4.922542581297878e-05, "loss": 0.1268, "num_input_tokens_seen": 20105216, "step": 21025 }, { "epoch": 1.7154743453789054, "grad_norm": 45.778621673583984, "learning_rate": 4.92245464466264e-05, "loss": 0.2202, "num_input_tokens_seen": 20109344, "step": 21030 }, { "epoch": 1.715882208989314, "grad_norm": 4.207413196563721, "learning_rate": 4.922366658925276e-05, "loss": 0.402, "num_input_tokens_seen": 20113632, "step": 21035 }, { "epoch": 1.7162900725997225, "grad_norm": 5.454944610595703, "learning_rate": 4.922278624087569e-05, "loss": 0.2529, "num_input_tokens_seen": 20118560, "step": 21040 }, { "epoch": 1.7166979362101313, "grad_norm": 3.0987300872802734, "learning_rate": 4.9221905401513035e-05, "loss": 0.3336, "num_input_tokens_seen": 20123552, "step": 21045 }, { "epoch": 1.7171057998205401, "grad_norm": 0.3987905979156494, "learning_rate": 4.922102407118265e-05, "loss": 0.1423, "num_input_tokens_seen": 20128064, "step": 21050 }, { "epoch": 1.7175136634309487, "grad_norm": 0.45769500732421875, "learning_rate": 4.92201422499024e-05, "loss": 0.1289, "num_input_tokens_seen": 20133664, "step": 21055 }, { "epoch": 1.7179215270413573, "grad_norm": 1.431674599647522, "learning_rate": 4.921925993769016e-05, "loss": 0.1352, "num_input_tokens_seen": 20138944, "step": 21060 }, { "epoch": 1.718329390651766, "grad_norm": 0.28111329674720764, "learning_rate": 4.921837713456381e-05, "loss": 0.0616, "num_input_tokens_seen": 20144640, "step": 21065 }, { "epoch": 1.7187372542621748, "grad_norm": 0.030001724138855934, "learning_rate": 4.921749384054125e-05, "loss": 0.1251, "num_input_tokens_seen": 20149024, "step": 21070 }, { "epoch": 1.7191451178725834, "grad_norm": 15.959760665893555, "learning_rate": 4.9216610055640374e-05, "loss": 0.2155, "num_input_tokens_seen": 20153856, "step": 21075 }, { "epoch": 1.719552981482992, "grad_norm": 0.08057913929224014, "learning_rate": 4.921572577987911e-05, "loss": 0.3626, "num_input_tokens_seen": 20159536, "step": 21080 }, { "epoch": 1.7199608450934007, "grad_norm": 1.6587259769439697, "learning_rate": 4.921484101327537e-05, "loss": 0.3556, "num_input_tokens_seen": 20164528, "step": 21085 }, { "epoch": 1.7203687087038095, "grad_norm": 11.0960054397583, "learning_rate": 4.921395575584709e-05, "loss": 0.2166, "num_input_tokens_seen": 20169120, "step": 21090 }, { "epoch": 1.720776572314218, "grad_norm": 13.242502212524414, "learning_rate": 4.921307000761221e-05, "loss": 0.2827, "num_input_tokens_seen": 20173920, "step": 21095 }, { "epoch": 1.7211844359246267, "grad_norm": 0.12045981734991074, "learning_rate": 4.921218376858871e-05, "loss": 0.1985, "num_input_tokens_seen": 20178688, "step": 21100 }, { "epoch": 1.7215922995350355, "grad_norm": 8.095290184020996, "learning_rate": 4.9211297038794515e-05, "loss": 0.3111, "num_input_tokens_seen": 20183312, "step": 21105 }, { "epoch": 1.7220001631454442, "grad_norm": 5.835057735443115, "learning_rate": 4.921040981824763e-05, "loss": 0.3316, "num_input_tokens_seen": 20187760, "step": 21110 }, { "epoch": 1.7224080267558528, "grad_norm": 0.42615965008735657, "learning_rate": 4.9209522106966015e-05, "loss": 0.0193, "num_input_tokens_seen": 20192400, "step": 21115 }, { "epoch": 1.7228158903662614, "grad_norm": 4.042187690734863, "learning_rate": 4.920863390496768e-05, "loss": 0.1506, "num_input_tokens_seen": 20197536, "step": 21120 }, { "epoch": 1.7232237539766702, "grad_norm": 6.261936664581299, "learning_rate": 4.920774521227062e-05, "loss": 0.0883, "num_input_tokens_seen": 20202496, "step": 21125 }, { "epoch": 1.723631617587079, "grad_norm": 0.07166095823049545, "learning_rate": 4.9206856028892854e-05, "loss": 0.0121, "num_input_tokens_seen": 20206224, "step": 21130 }, { "epoch": 1.7240394811974875, "grad_norm": 2.7034265995025635, "learning_rate": 4.92059663548524e-05, "loss": 0.177, "num_input_tokens_seen": 20209952, "step": 21135 }, { "epoch": 1.724447344807896, "grad_norm": 0.11425624787807465, "learning_rate": 4.9205076190167294e-05, "loss": 0.0133, "num_input_tokens_seen": 20214144, "step": 21140 }, { "epoch": 1.7248552084183049, "grad_norm": 2.6699306964874268, "learning_rate": 4.920418553485558e-05, "loss": 0.3891, "num_input_tokens_seen": 20219408, "step": 21145 }, { "epoch": 1.7252630720287137, "grad_norm": 0.049146369099617004, "learning_rate": 4.9203294388935304e-05, "loss": 0.1582, "num_input_tokens_seen": 20223744, "step": 21150 }, { "epoch": 1.7256709356391222, "grad_norm": 2.5029799938201904, "learning_rate": 4.9202402752424545e-05, "loss": 0.1622, "num_input_tokens_seen": 20229376, "step": 21155 }, { "epoch": 1.726078799249531, "grad_norm": 0.09656193852424622, "learning_rate": 4.9201510625341366e-05, "loss": 0.3758, "num_input_tokens_seen": 20233712, "step": 21160 }, { "epoch": 1.7264866628599398, "grad_norm": 0.10518470406532288, "learning_rate": 4.920061800770384e-05, "loss": 0.2184, "num_input_tokens_seen": 20238736, "step": 21165 }, { "epoch": 1.7268945264703484, "grad_norm": 0.24510687589645386, "learning_rate": 4.919972489953008e-05, "loss": 0.1561, "num_input_tokens_seen": 20243376, "step": 21170 }, { "epoch": 1.727302390080757, "grad_norm": 0.0698879063129425, "learning_rate": 4.919883130083818e-05, "loss": 0.0406, "num_input_tokens_seen": 20248208, "step": 21175 }, { "epoch": 1.7277102536911657, "grad_norm": 4.671753883361816, "learning_rate": 4.9197937211646245e-05, "loss": 0.0929, "num_input_tokens_seen": 20252512, "step": 21180 }, { "epoch": 1.7281181173015745, "grad_norm": 0.016248878091573715, "learning_rate": 4.919704263197241e-05, "loss": 0.0253, "num_input_tokens_seen": 20257248, "step": 21185 }, { "epoch": 1.728525980911983, "grad_norm": 0.04141195863485336, "learning_rate": 4.9196147561834796e-05, "loss": 0.0795, "num_input_tokens_seen": 20261376, "step": 21190 }, { "epoch": 1.7289338445223916, "grad_norm": 1.3561017513275146, "learning_rate": 4.9195252001251564e-05, "loss": 0.4541, "num_input_tokens_seen": 20266224, "step": 21195 }, { "epoch": 1.7293417081328004, "grad_norm": 0.09110162407159805, "learning_rate": 4.9194355950240844e-05, "loss": 0.0314, "num_input_tokens_seen": 20271136, "step": 21200 }, { "epoch": 1.7297495717432092, "grad_norm": 0.07428484410047531, "learning_rate": 4.919345940882082e-05, "loss": 0.5604, "num_input_tokens_seen": 20275280, "step": 21205 }, { "epoch": 1.7301574353536178, "grad_norm": 0.04745294898748398, "learning_rate": 4.9192562377009645e-05, "loss": 0.0262, "num_input_tokens_seen": 20280160, "step": 21210 }, { "epoch": 1.7305652989640263, "grad_norm": 0.05937677249312401, "learning_rate": 4.9191664854825515e-05, "loss": 0.0911, "num_input_tokens_seen": 20285360, "step": 21215 }, { "epoch": 1.7309731625744351, "grad_norm": 0.05388987436890602, "learning_rate": 4.919076684228662e-05, "loss": 0.354, "num_input_tokens_seen": 20289968, "step": 21220 }, { "epoch": 1.731381026184844, "grad_norm": 7.444369792938232, "learning_rate": 4.918986833941116e-05, "loss": 0.4093, "num_input_tokens_seen": 20294080, "step": 21225 }, { "epoch": 1.7317888897952525, "grad_norm": 2.6869266033172607, "learning_rate": 4.918896934621734e-05, "loss": 0.0358, "num_input_tokens_seen": 20297760, "step": 21230 }, { "epoch": 1.732196753405661, "grad_norm": 0.08790378272533417, "learning_rate": 4.91880698627234e-05, "loss": 0.0138, "num_input_tokens_seen": 20301936, "step": 21235 }, { "epoch": 1.7326046170160698, "grad_norm": 2.4325995445251465, "learning_rate": 4.918716988894756e-05, "loss": 0.4231, "num_input_tokens_seen": 20306896, "step": 21240 }, { "epoch": 1.7330124806264786, "grad_norm": 0.07468923926353455, "learning_rate": 4.918626942490806e-05, "loss": 0.1304, "num_input_tokens_seen": 20311952, "step": 21245 }, { "epoch": 1.7334203442368872, "grad_norm": 2.17675518989563, "learning_rate": 4.918536847062316e-05, "loss": 0.279, "num_input_tokens_seen": 20316736, "step": 21250 }, { "epoch": 1.7338282078472957, "grad_norm": 0.2809740900993347, "learning_rate": 4.918446702611111e-05, "loss": 0.1663, "num_input_tokens_seen": 20322624, "step": 21255 }, { "epoch": 1.7342360714577045, "grad_norm": 7.508816242218018, "learning_rate": 4.91835650913902e-05, "loss": 0.2796, "num_input_tokens_seen": 20328432, "step": 21260 }, { "epoch": 1.7346439350681133, "grad_norm": 0.23023369908332825, "learning_rate": 4.9182662666478695e-05, "loss": 0.0209, "num_input_tokens_seen": 20333680, "step": 21265 }, { "epoch": 1.7350517986785219, "grad_norm": 0.32237446308135986, "learning_rate": 4.9181759751394896e-05, "loss": 0.0918, "num_input_tokens_seen": 20338960, "step": 21270 }, { "epoch": 1.7354596622889304, "grad_norm": 0.0718996524810791, "learning_rate": 4.91808563461571e-05, "loss": 0.3316, "num_input_tokens_seen": 20343408, "step": 21275 }, { "epoch": 1.7358675258993392, "grad_norm": 0.06054864451289177, "learning_rate": 4.9179952450783625e-05, "loss": 0.0168, "num_input_tokens_seen": 20348720, "step": 21280 }, { "epoch": 1.736275389509748, "grad_norm": 0.08905985951423645, "learning_rate": 4.9179048065292785e-05, "loss": 0.0226, "num_input_tokens_seen": 20353344, "step": 21285 }, { "epoch": 1.7366832531201566, "grad_norm": 0.037317924201488495, "learning_rate": 4.917814318970292e-05, "loss": 0.1094, "num_input_tokens_seen": 20358368, "step": 21290 }, { "epoch": 1.7370911167305652, "grad_norm": 0.08953918516635895, "learning_rate": 4.9177237824032366e-05, "loss": 0.4124, "num_input_tokens_seen": 20362912, "step": 21295 }, { "epoch": 1.737498980340974, "grad_norm": 10.030956268310547, "learning_rate": 4.917633196829947e-05, "loss": 0.4738, "num_input_tokens_seen": 20366768, "step": 21300 }, { "epoch": 1.7379068439513827, "grad_norm": 0.18466949462890625, "learning_rate": 4.91754256225226e-05, "loss": 0.1481, "num_input_tokens_seen": 20371792, "step": 21305 }, { "epoch": 1.7383147075617913, "grad_norm": 0.05834832414984703, "learning_rate": 4.9174518786720126e-05, "loss": 0.1762, "num_input_tokens_seen": 20376848, "step": 21310 }, { "epoch": 1.7387225711721999, "grad_norm": 0.3461543321609497, "learning_rate": 4.917361146091043e-05, "loss": 0.0723, "num_input_tokens_seen": 20381472, "step": 21315 }, { "epoch": 1.7391304347826086, "grad_norm": 14.311586380004883, "learning_rate": 4.91727036451119e-05, "loss": 0.0453, "num_input_tokens_seen": 20386160, "step": 21320 }, { "epoch": 1.7395382983930174, "grad_norm": 0.8370636701583862, "learning_rate": 4.917179533934294e-05, "loss": 0.3177, "num_input_tokens_seen": 20390992, "step": 21325 }, { "epoch": 1.739946162003426, "grad_norm": 0.7365707159042358, "learning_rate": 4.917088654362196e-05, "loss": 0.2222, "num_input_tokens_seen": 20395232, "step": 21330 }, { "epoch": 1.7403540256138348, "grad_norm": 0.053405147045850754, "learning_rate": 4.916997725796738e-05, "loss": 0.1622, "num_input_tokens_seen": 20399056, "step": 21335 }, { "epoch": 1.7407618892242436, "grad_norm": 99.42863464355469, "learning_rate": 4.916906748239763e-05, "loss": 0.2957, "num_input_tokens_seen": 20404256, "step": 21340 }, { "epoch": 1.7411697528346521, "grad_norm": 0.1877981573343277, "learning_rate": 4.916815721693115e-05, "loss": 0.076, "num_input_tokens_seen": 20408384, "step": 21345 }, { "epoch": 1.7415776164450607, "grad_norm": 2.705216646194458, "learning_rate": 4.9167246461586395e-05, "loss": 0.194, "num_input_tokens_seen": 20413392, "step": 21350 }, { "epoch": 1.7419854800554695, "grad_norm": 4.555440902709961, "learning_rate": 4.916633521638182e-05, "loss": 0.2741, "num_input_tokens_seen": 20418304, "step": 21355 }, { "epoch": 1.7423933436658783, "grad_norm": 3.296741247177124, "learning_rate": 4.916542348133591e-05, "loss": 0.1588, "num_input_tokens_seen": 20422832, "step": 21360 }, { "epoch": 1.7428012072762868, "grad_norm": 18.488157272338867, "learning_rate": 4.9164511256467126e-05, "loss": 0.0597, "num_input_tokens_seen": 20427520, "step": 21365 }, { "epoch": 1.7432090708866954, "grad_norm": 0.14906878769397736, "learning_rate": 4.916359854179397e-05, "loss": 0.3008, "num_input_tokens_seen": 20431648, "step": 21370 }, { "epoch": 1.7436169344971042, "grad_norm": 0.5030289888381958, "learning_rate": 4.916268533733493e-05, "loss": 0.1053, "num_input_tokens_seen": 20436096, "step": 21375 }, { "epoch": 1.744024798107513, "grad_norm": 0.27094194293022156, "learning_rate": 4.916177164310855e-05, "loss": 0.5018, "num_input_tokens_seen": 20440672, "step": 21380 }, { "epoch": 1.7444326617179216, "grad_norm": 0.09403382986783981, "learning_rate": 4.91608574591333e-05, "loss": 0.0099, "num_input_tokens_seen": 20445376, "step": 21385 }, { "epoch": 1.7448405253283301, "grad_norm": 0.2069229930639267, "learning_rate": 4.915994278542775e-05, "loss": 0.1392, "num_input_tokens_seen": 20450256, "step": 21390 }, { "epoch": 1.745248388938739, "grad_norm": 5.986053943634033, "learning_rate": 4.915902762201042e-05, "loss": 0.2064, "num_input_tokens_seen": 20455600, "step": 21395 }, { "epoch": 1.7456562525491477, "grad_norm": 0.4610782861709595, "learning_rate": 4.915811196889987e-05, "loss": 0.017, "num_input_tokens_seen": 20460560, "step": 21400 }, { "epoch": 1.7460641161595563, "grad_norm": 0.04901069402694702, "learning_rate": 4.9157195826114646e-05, "loss": 0.1403, "num_input_tokens_seen": 20465904, "step": 21405 }, { "epoch": 1.7464719797699648, "grad_norm": 4.230154514312744, "learning_rate": 4.9156279193673336e-05, "loss": 0.1908, "num_input_tokens_seen": 20470720, "step": 21410 }, { "epoch": 1.7468798433803736, "grad_norm": 3.6905739307403564, "learning_rate": 4.9155362071594514e-05, "loss": 0.3158, "num_input_tokens_seen": 20475168, "step": 21415 }, { "epoch": 1.7472877069907824, "grad_norm": 0.031839292496442795, "learning_rate": 4.915444445989676e-05, "loss": 0.007, "num_input_tokens_seen": 20480192, "step": 21420 }, { "epoch": 1.747695570601191, "grad_norm": 0.19318842887878418, "learning_rate": 4.915352635859868e-05, "loss": 0.0136, "num_input_tokens_seen": 20484768, "step": 21425 }, { "epoch": 1.7481034342115995, "grad_norm": 8.392305374145508, "learning_rate": 4.915260776771889e-05, "loss": 0.0743, "num_input_tokens_seen": 20488928, "step": 21430 }, { "epoch": 1.7485112978220083, "grad_norm": 0.9069898724555969, "learning_rate": 4.9151688687276e-05, "loss": 0.2358, "num_input_tokens_seen": 20493744, "step": 21435 }, { "epoch": 1.748919161432417, "grad_norm": 0.04264659807085991, "learning_rate": 4.9150769117288644e-05, "loss": 0.014, "num_input_tokens_seen": 20499584, "step": 21440 }, { "epoch": 1.7493270250428257, "grad_norm": 0.010482482612133026, "learning_rate": 4.914984905777546e-05, "loss": 0.1378, "num_input_tokens_seen": 20503824, "step": 21445 }, { "epoch": 1.7497348886532342, "grad_norm": 0.2021671086549759, "learning_rate": 4.9148928508755096e-05, "loss": 0.0185, "num_input_tokens_seen": 20507776, "step": 21450 }, { "epoch": 1.750142752263643, "grad_norm": 0.07911483943462372, "learning_rate": 4.9148007470246214e-05, "loss": 0.1958, "num_input_tokens_seen": 20512368, "step": 21455 }, { "epoch": 1.7505506158740518, "grad_norm": 0.02021023817360401, "learning_rate": 4.914708594226749e-05, "loss": 0.1555, "num_input_tokens_seen": 20517008, "step": 21460 }, { "epoch": 1.7509584794844604, "grad_norm": 16.377334594726562, "learning_rate": 4.9146163924837585e-05, "loss": 0.3392, "num_input_tokens_seen": 20522032, "step": 21465 }, { "epoch": 1.751366343094869, "grad_norm": 0.034436892718076706, "learning_rate": 4.9145241417975195e-05, "loss": 0.0285, "num_input_tokens_seen": 20526032, "step": 21470 }, { "epoch": 1.7517742067052777, "grad_norm": 0.09055189043283463, "learning_rate": 4.914431842169903e-05, "loss": 0.1884, "num_input_tokens_seen": 20530448, "step": 21475 }, { "epoch": 1.7521820703156865, "grad_norm": 0.2334367334842682, "learning_rate": 4.914339493602779e-05, "loss": 0.0068, "num_input_tokens_seen": 20535872, "step": 21480 }, { "epoch": 1.752589933926095, "grad_norm": 0.05359645187854767, "learning_rate": 4.914247096098019e-05, "loss": 0.2733, "num_input_tokens_seen": 20540912, "step": 21485 }, { "epoch": 1.7529977975365036, "grad_norm": 0.021542472764849663, "learning_rate": 4.914154649657496e-05, "loss": 0.0594, "num_input_tokens_seen": 20546080, "step": 21490 }, { "epoch": 1.7534056611469124, "grad_norm": 0.029143700376152992, "learning_rate": 4.914062154283084e-05, "loss": 0.1645, "num_input_tokens_seen": 20550896, "step": 21495 }, { "epoch": 1.7538135247573212, "grad_norm": 2.75169038772583, "learning_rate": 4.9139696099766585e-05, "loss": 0.379, "num_input_tokens_seen": 20555184, "step": 21500 }, { "epoch": 1.7542213883677298, "grad_norm": 12.003904342651367, "learning_rate": 4.913877016740095e-05, "loss": 0.3114, "num_input_tokens_seen": 20559840, "step": 21505 }, { "epoch": 1.7546292519781383, "grad_norm": 6.431666851043701, "learning_rate": 4.913784374575269e-05, "loss": 0.0741, "num_input_tokens_seen": 20564496, "step": 21510 }, { "epoch": 1.7550371155885471, "grad_norm": 0.04040156677365303, "learning_rate": 4.91369168348406e-05, "loss": 0.1377, "num_input_tokens_seen": 20569776, "step": 21515 }, { "epoch": 1.755444979198956, "grad_norm": 4.935336589813232, "learning_rate": 4.913598943468346e-05, "loss": 0.1556, "num_input_tokens_seen": 20574912, "step": 21520 }, { "epoch": 1.7558528428093645, "grad_norm": 0.07760544121265411, "learning_rate": 4.9135061545300075e-05, "loss": 0.1021, "num_input_tokens_seen": 20580144, "step": 21525 }, { "epoch": 1.7562607064197733, "grad_norm": 0.0482720285654068, "learning_rate": 4.913413316670925e-05, "loss": 0.0089, "num_input_tokens_seen": 20585392, "step": 21530 }, { "epoch": 1.756668570030182, "grad_norm": 0.1907062530517578, "learning_rate": 4.913320429892979e-05, "loss": 0.014, "num_input_tokens_seen": 20591136, "step": 21535 }, { "epoch": 1.7570764336405906, "grad_norm": 15.36485767364502, "learning_rate": 4.9132274941980536e-05, "loss": 0.3223, "num_input_tokens_seen": 20596688, "step": 21540 }, { "epoch": 1.7574842972509992, "grad_norm": 0.005194033030420542, "learning_rate": 4.913134509588033e-05, "loss": 0.0134, "num_input_tokens_seen": 20601472, "step": 21545 }, { "epoch": 1.757892160861408, "grad_norm": 0.01713058352470398, "learning_rate": 4.913041476064801e-05, "loss": 0.0049, "num_input_tokens_seen": 20606000, "step": 21550 }, { "epoch": 1.7583000244718168, "grad_norm": 0.010617896914482117, "learning_rate": 4.912948393630244e-05, "loss": 0.0253, "num_input_tokens_seen": 20610832, "step": 21555 }, { "epoch": 1.7587078880822253, "grad_norm": 6.837419509887695, "learning_rate": 4.912855262286248e-05, "loss": 0.3108, "num_input_tokens_seen": 20615488, "step": 21560 }, { "epoch": 1.759115751692634, "grad_norm": 0.027904074639081955, "learning_rate": 4.912762082034701e-05, "loss": 0.2403, "num_input_tokens_seen": 20620096, "step": 21565 }, { "epoch": 1.7595236153030427, "grad_norm": 9.573481559753418, "learning_rate": 4.912668852877492e-05, "loss": 0.3198, "num_input_tokens_seen": 20625600, "step": 21570 }, { "epoch": 1.7599314789134515, "grad_norm": 0.09282638877630234, "learning_rate": 4.912575574816511e-05, "loss": 0.1296, "num_input_tokens_seen": 20630288, "step": 21575 }, { "epoch": 1.76033934252386, "grad_norm": 0.03217676654458046, "learning_rate": 4.9124822478536476e-05, "loss": 0.2069, "num_input_tokens_seen": 20635360, "step": 21580 }, { "epoch": 1.7607472061342686, "grad_norm": 19.606393814086914, "learning_rate": 4.9123888719907945e-05, "loss": 0.2391, "num_input_tokens_seen": 20640528, "step": 21585 }, { "epoch": 1.7611550697446774, "grad_norm": 5.090377330780029, "learning_rate": 4.912295447229844e-05, "loss": 0.3986, "num_input_tokens_seen": 20645232, "step": 21590 }, { "epoch": 1.7615629333550862, "grad_norm": 0.4652276933193207, "learning_rate": 4.9122019735726896e-05, "loss": 0.2202, "num_input_tokens_seen": 20650080, "step": 21595 }, { "epoch": 1.7619707969654947, "grad_norm": 0.026859845966100693, "learning_rate": 4.912108451021227e-05, "loss": 0.0092, "num_input_tokens_seen": 20654800, "step": 21600 }, { "epoch": 1.7623786605759033, "grad_norm": 0.8782646059989929, "learning_rate": 4.9120148795773504e-05, "loss": 0.2494, "num_input_tokens_seen": 20659344, "step": 21605 }, { "epoch": 1.762786524186312, "grad_norm": 2.8380491733551025, "learning_rate": 4.911921259242956e-05, "loss": 0.6931, "num_input_tokens_seen": 20663952, "step": 21610 }, { "epoch": 1.7631943877967209, "grad_norm": 0.18632760643959045, "learning_rate": 4.911827590019944e-05, "loss": 0.0179, "num_input_tokens_seen": 20668880, "step": 21615 }, { "epoch": 1.7636022514071295, "grad_norm": 0.03387105092406273, "learning_rate": 4.911733871910212e-05, "loss": 0.1429, "num_input_tokens_seen": 20672864, "step": 21620 }, { "epoch": 1.764010115017538, "grad_norm": 0.15933877229690552, "learning_rate": 4.911640104915658e-05, "loss": 0.0971, "num_input_tokens_seen": 20677680, "step": 21625 }, { "epoch": 1.7644179786279468, "grad_norm": 0.16883979737758636, "learning_rate": 4.911546289038185e-05, "loss": 0.0136, "num_input_tokens_seen": 20682112, "step": 21630 }, { "epoch": 1.7648258422383556, "grad_norm": 0.09718678891658783, "learning_rate": 4.9114524242796923e-05, "loss": 0.2875, "num_input_tokens_seen": 20687456, "step": 21635 }, { "epoch": 1.7652337058487642, "grad_norm": 0.07777886837720871, "learning_rate": 4.911358510642084e-05, "loss": 0.2866, "num_input_tokens_seen": 20693008, "step": 21640 }, { "epoch": 1.7656415694591727, "grad_norm": 0.042555175721645355, "learning_rate": 4.911264548127264e-05, "loss": 0.0762, "num_input_tokens_seen": 20697824, "step": 21645 }, { "epoch": 1.7660494330695815, "grad_norm": 0.23530787229537964, "learning_rate": 4.9111705367371355e-05, "loss": 0.0236, "num_input_tokens_seen": 20702464, "step": 21650 }, { "epoch": 1.7664572966799903, "grad_norm": 0.12664657831192017, "learning_rate": 4.911076476473605e-05, "loss": 0.0789, "num_input_tokens_seen": 20708384, "step": 21655 }, { "epoch": 1.7668651602903989, "grad_norm": 0.4612950384616852, "learning_rate": 4.910982367338579e-05, "loss": 0.0831, "num_input_tokens_seen": 20713056, "step": 21660 }, { "epoch": 1.7672730239008074, "grad_norm": 0.4857665002346039, "learning_rate": 4.910888209333965e-05, "loss": 0.5435, "num_input_tokens_seen": 20717664, "step": 21665 }, { "epoch": 1.7676808875112162, "grad_norm": 0.0682704746723175, "learning_rate": 4.910794002461671e-05, "loss": 0.0225, "num_input_tokens_seen": 20721104, "step": 21670 }, { "epoch": 1.768088751121625, "grad_norm": 0.06787004321813583, "learning_rate": 4.9106997467236074e-05, "loss": 0.139, "num_input_tokens_seen": 20725808, "step": 21675 }, { "epoch": 1.7684966147320336, "grad_norm": 4.3719868659973145, "learning_rate": 4.910605442121684e-05, "loss": 0.1522, "num_input_tokens_seen": 20729632, "step": 21680 }, { "epoch": 1.7689044783424421, "grad_norm": 2.089925765991211, "learning_rate": 4.910511088657813e-05, "loss": 0.2264, "num_input_tokens_seen": 20734704, "step": 21685 }, { "epoch": 1.769312341952851, "grad_norm": 0.9859006404876709, "learning_rate": 4.910416686333906e-05, "loss": 0.0268, "num_input_tokens_seen": 20739504, "step": 21690 }, { "epoch": 1.7697202055632597, "grad_norm": 0.052272263914346695, "learning_rate": 4.9103222351518776e-05, "loss": 0.0943, "num_input_tokens_seen": 20744352, "step": 21695 }, { "epoch": 1.7701280691736683, "grad_norm": 0.09420914947986603, "learning_rate": 4.9102277351136413e-05, "loss": 0.0906, "num_input_tokens_seen": 20748624, "step": 21700 }, { "epoch": 1.770535932784077, "grad_norm": 0.29212144017219543, "learning_rate": 4.910133186221113e-05, "loss": 0.1098, "num_input_tokens_seen": 20753696, "step": 21705 }, { "epoch": 1.7709437963944858, "grad_norm": 0.7609570026397705, "learning_rate": 4.910038588476209e-05, "loss": 0.2265, "num_input_tokens_seen": 20758464, "step": 21710 }, { "epoch": 1.7713516600048944, "grad_norm": 0.07384327799081802, "learning_rate": 4.9099439418808474e-05, "loss": 0.1033, "num_input_tokens_seen": 20763856, "step": 21715 }, { "epoch": 1.771759523615303, "grad_norm": 0.1174730658531189, "learning_rate": 4.909849246436946e-05, "loss": 0.0192, "num_input_tokens_seen": 20768608, "step": 21720 }, { "epoch": 1.7721673872257118, "grad_norm": 0.05064541846513748, "learning_rate": 4.909754502146424e-05, "loss": 0.0941, "num_input_tokens_seen": 20771824, "step": 21725 }, { "epoch": 1.7725752508361206, "grad_norm": 0.03636607527732849, "learning_rate": 4.9096597090112026e-05, "loss": 0.0119, "num_input_tokens_seen": 20777360, "step": 21730 }, { "epoch": 1.7729831144465291, "grad_norm": 0.5043187737464905, "learning_rate": 4.909564867033203e-05, "loss": 0.2087, "num_input_tokens_seen": 20781328, "step": 21735 }, { "epoch": 1.7733909780569377, "grad_norm": 0.022825859487056732, "learning_rate": 4.909469976214347e-05, "loss": 0.2107, "num_input_tokens_seen": 20786880, "step": 21740 }, { "epoch": 1.7737988416673465, "grad_norm": 4.133779048919678, "learning_rate": 4.909375036556558e-05, "loss": 0.4985, "num_input_tokens_seen": 20791824, "step": 21745 }, { "epoch": 1.7742067052777553, "grad_norm": 0.209125816822052, "learning_rate": 4.909280048061762e-05, "loss": 0.0113, "num_input_tokens_seen": 20796208, "step": 21750 }, { "epoch": 1.7746145688881638, "grad_norm": 0.008149726316332817, "learning_rate": 4.909185010731882e-05, "loss": 0.0245, "num_input_tokens_seen": 20801360, "step": 21755 }, { "epoch": 1.7750224324985724, "grad_norm": 2.707498073577881, "learning_rate": 4.909089924568846e-05, "loss": 0.1071, "num_input_tokens_seen": 20807520, "step": 21760 }, { "epoch": 1.7754302961089812, "grad_norm": 1.5889300107955933, "learning_rate": 4.9089947895745806e-05, "loss": 0.2348, "num_input_tokens_seen": 20812384, "step": 21765 }, { "epoch": 1.77583815971939, "grad_norm": 7.431365013122559, "learning_rate": 4.908899605751015e-05, "loss": 0.0477, "num_input_tokens_seen": 20817568, "step": 21770 }, { "epoch": 1.7762460233297985, "grad_norm": 0.023355752229690552, "learning_rate": 4.9088043731000776e-05, "loss": 0.0123, "num_input_tokens_seen": 20822096, "step": 21775 }, { "epoch": 1.776653886940207, "grad_norm": 14.993450164794922, "learning_rate": 4.9087090916237e-05, "loss": 0.0797, "num_input_tokens_seen": 20826352, "step": 21780 }, { "epoch": 1.7770617505506159, "grad_norm": 4.143021106719971, "learning_rate": 4.908613761323812e-05, "loss": 0.1806, "num_input_tokens_seen": 20831184, "step": 21785 }, { "epoch": 1.7774696141610247, "grad_norm": 0.06762450188398361, "learning_rate": 4.908518382202346e-05, "loss": 0.1878, "num_input_tokens_seen": 20836672, "step": 21790 }, { "epoch": 1.7778774777714332, "grad_norm": 0.034695401787757874, "learning_rate": 4.908422954261237e-05, "loss": 0.0162, "num_input_tokens_seen": 20841248, "step": 21795 }, { "epoch": 1.7782853413818418, "grad_norm": 4.466291904449463, "learning_rate": 4.908327477502417e-05, "loss": 0.2482, "num_input_tokens_seen": 20846640, "step": 21800 }, { "epoch": 1.7786932049922506, "grad_norm": 0.017845742404460907, "learning_rate": 4.908231951927823e-05, "loss": 0.2284, "num_input_tokens_seen": 20851536, "step": 21805 }, { "epoch": 1.7791010686026594, "grad_norm": 0.4362903833389282, "learning_rate": 4.908136377539391e-05, "loss": 0.4257, "num_input_tokens_seen": 20855904, "step": 21810 }, { "epoch": 1.779508932213068, "grad_norm": 0.09344053268432617, "learning_rate": 4.908040754339057e-05, "loss": 0.1767, "num_input_tokens_seen": 20860672, "step": 21815 }, { "epoch": 1.7799167958234765, "grad_norm": 0.23854313790798187, "learning_rate": 4.907945082328761e-05, "loss": 0.1826, "num_input_tokens_seen": 20865168, "step": 21820 }, { "epoch": 1.7803246594338853, "grad_norm": 0.06183473393321037, "learning_rate": 4.9078493615104405e-05, "loss": 0.0699, "num_input_tokens_seen": 20870384, "step": 21825 }, { "epoch": 1.780732523044294, "grad_norm": 0.35088297724723816, "learning_rate": 4.9077535918860374e-05, "loss": 0.1572, "num_input_tokens_seen": 20874656, "step": 21830 }, { "epoch": 1.7811403866547026, "grad_norm": 0.01649315096437931, "learning_rate": 4.907657773457491e-05, "loss": 0.0656, "num_input_tokens_seen": 20879072, "step": 21835 }, { "epoch": 1.7815482502651112, "grad_norm": 0.25444474816322327, "learning_rate": 4.907561906226746e-05, "loss": 0.0714, "num_input_tokens_seen": 20884288, "step": 21840 }, { "epoch": 1.78195611387552, "grad_norm": 2.7360036373138428, "learning_rate": 4.907465990195743e-05, "loss": 0.3325, "num_input_tokens_seen": 20889072, "step": 21845 }, { "epoch": 1.7823639774859288, "grad_norm": 0.5376061797142029, "learning_rate": 4.907370025366428e-05, "loss": 0.3637, "num_input_tokens_seen": 20893744, "step": 21850 }, { "epoch": 1.7827718410963374, "grad_norm": 0.049782995134592056, "learning_rate": 4.907274011740746e-05, "loss": 0.0069, "num_input_tokens_seen": 20897792, "step": 21855 }, { "epoch": 1.783179704706746, "grad_norm": 0.03997502848505974, "learning_rate": 4.9071779493206424e-05, "loss": 0.0049, "num_input_tokens_seen": 20902240, "step": 21860 }, { "epoch": 1.7835875683171547, "grad_norm": 0.4475243091583252, "learning_rate": 4.907081838108064e-05, "loss": 0.0142, "num_input_tokens_seen": 20907120, "step": 21865 }, { "epoch": 1.7839954319275635, "grad_norm": 0.05277125537395477, "learning_rate": 4.9069856781049605e-05, "loss": 0.1316, "num_input_tokens_seen": 20911648, "step": 21870 }, { "epoch": 1.784403295537972, "grad_norm": 0.02150009758770466, "learning_rate": 4.90688946931328e-05, "loss": 0.1553, "num_input_tokens_seen": 20916224, "step": 21875 }, { "epoch": 1.7848111591483806, "grad_norm": 2.533717632293701, "learning_rate": 4.906793211734971e-05, "loss": 0.2482, "num_input_tokens_seen": 20920736, "step": 21880 }, { "epoch": 1.7852190227587894, "grad_norm": 0.058545537292957306, "learning_rate": 4.906696905371988e-05, "loss": 0.1247, "num_input_tokens_seen": 20925760, "step": 21885 }, { "epoch": 1.7856268863691982, "grad_norm": 8.790512084960938, "learning_rate": 4.906600550226281e-05, "loss": 0.0411, "num_input_tokens_seen": 20930368, "step": 21890 }, { "epoch": 1.7860347499796068, "grad_norm": 0.055185962468385696, "learning_rate": 4.906504146299804e-05, "loss": 0.2449, "num_input_tokens_seen": 20935680, "step": 21895 }, { "epoch": 1.7864426135900155, "grad_norm": 1.1502611637115479, "learning_rate": 4.90640769359451e-05, "loss": 0.027, "num_input_tokens_seen": 20940768, "step": 21900 }, { "epoch": 1.7868504772004243, "grad_norm": 2.5684895515441895, "learning_rate": 4.906311192112355e-05, "loss": 0.1583, "num_input_tokens_seen": 20944768, "step": 21905 }, { "epoch": 1.787258340810833, "grad_norm": 13.543770790100098, "learning_rate": 4.906214641855293e-05, "loss": 0.0798, "num_input_tokens_seen": 20949536, "step": 21910 }, { "epoch": 1.7876662044212415, "grad_norm": 0.21502608060836792, "learning_rate": 4.906118042825285e-05, "loss": 0.0077, "num_input_tokens_seen": 20953984, "step": 21915 }, { "epoch": 1.7880740680316503, "grad_norm": 0.43549829721450806, "learning_rate": 4.906021395024285e-05, "loss": 0.1199, "num_input_tokens_seen": 20958528, "step": 21920 }, { "epoch": 1.788481931642059, "grad_norm": 3.748812198638916, "learning_rate": 4.9059246984542545e-05, "loss": 0.1568, "num_input_tokens_seen": 20963824, "step": 21925 }, { "epoch": 1.7888897952524676, "grad_norm": 0.3527303636074066, "learning_rate": 4.905827953117152e-05, "loss": 0.012, "num_input_tokens_seen": 20968432, "step": 21930 }, { "epoch": 1.7892976588628762, "grad_norm": 0.045651186257600784, "learning_rate": 4.90573115901494e-05, "loss": 0.0067, "num_input_tokens_seen": 20974176, "step": 21935 }, { "epoch": 1.789705522473285, "grad_norm": 0.05420264974236488, "learning_rate": 4.90563431614958e-05, "loss": 0.2932, "num_input_tokens_seen": 20979136, "step": 21940 }, { "epoch": 1.7901133860836937, "grad_norm": 0.07728120684623718, "learning_rate": 4.9055374245230336e-05, "loss": 0.2114, "num_input_tokens_seen": 20983648, "step": 21945 }, { "epoch": 1.7905212496941023, "grad_norm": 0.07163811475038528, "learning_rate": 4.905440484137266e-05, "loss": 0.0139, "num_input_tokens_seen": 20989264, "step": 21950 }, { "epoch": 1.7909291133045109, "grad_norm": 10.630094528198242, "learning_rate": 4.905343494994242e-05, "loss": 0.2491, "num_input_tokens_seen": 20994336, "step": 21955 }, { "epoch": 1.7913369769149197, "grad_norm": 0.31721940636634827, "learning_rate": 4.9052464570959275e-05, "loss": 0.3951, "num_input_tokens_seen": 20999760, "step": 21960 }, { "epoch": 1.7917448405253285, "grad_norm": 12.603825569152832, "learning_rate": 4.9051493704442896e-05, "loss": 0.1867, "num_input_tokens_seen": 21005264, "step": 21965 }, { "epoch": 1.792152704135737, "grad_norm": 0.053486648947000504, "learning_rate": 4.905052235041296e-05, "loss": 0.016, "num_input_tokens_seen": 21010288, "step": 21970 }, { "epoch": 1.7925605677461456, "grad_norm": 0.09898269176483154, "learning_rate": 4.904955050888915e-05, "loss": 0.019, "num_input_tokens_seen": 21014736, "step": 21975 }, { "epoch": 1.7929684313565544, "grad_norm": 0.037956662476062775, "learning_rate": 4.904857817989118e-05, "loss": 0.0726, "num_input_tokens_seen": 21019984, "step": 21980 }, { "epoch": 1.7933762949669632, "grad_norm": 7.126067161560059, "learning_rate": 4.904760536343874e-05, "loss": 0.3889, "num_input_tokens_seen": 21024176, "step": 21985 }, { "epoch": 1.7937841585773717, "grad_norm": 0.2726461887359619, "learning_rate": 4.904663205955156e-05, "loss": 0.5363, "num_input_tokens_seen": 21029584, "step": 21990 }, { "epoch": 1.7941920221877803, "grad_norm": 4.882093906402588, "learning_rate": 4.904565826824937e-05, "loss": 0.0848, "num_input_tokens_seen": 21033872, "step": 21995 }, { "epoch": 1.794599885798189, "grad_norm": 0.02772533893585205, "learning_rate": 4.90446839895519e-05, "loss": 0.0077, "num_input_tokens_seen": 21038672, "step": 22000 }, { "epoch": 1.7950077494085979, "grad_norm": 0.07057242095470428, "learning_rate": 4.904370922347891e-05, "loss": 0.154, "num_input_tokens_seen": 21042832, "step": 22005 }, { "epoch": 1.7954156130190064, "grad_norm": 0.012833266519010067, "learning_rate": 4.904273397005014e-05, "loss": 0.0973, "num_input_tokens_seen": 21047088, "step": 22010 }, { "epoch": 1.795823476629415, "grad_norm": 0.08815718442201614, "learning_rate": 4.9041758229285375e-05, "loss": 0.3243, "num_input_tokens_seen": 21052128, "step": 22015 }, { "epoch": 1.7962313402398238, "grad_norm": 0.5297846794128418, "learning_rate": 4.904078200120438e-05, "loss": 0.1444, "num_input_tokens_seen": 21056704, "step": 22020 }, { "epoch": 1.7966392038502326, "grad_norm": 0.06252708286046982, "learning_rate": 4.9039805285826956e-05, "loss": 0.3272, "num_input_tokens_seen": 21061328, "step": 22025 }, { "epoch": 1.7970470674606411, "grad_norm": 0.0743609219789505, "learning_rate": 4.9038828083172895e-05, "loss": 0.0164, "num_input_tokens_seen": 21066048, "step": 22030 }, { "epoch": 1.7974549310710497, "grad_norm": 0.12343791127204895, "learning_rate": 4.9037850393262005e-05, "loss": 0.2208, "num_input_tokens_seen": 21071152, "step": 22035 }, { "epoch": 1.7978627946814585, "grad_norm": 0.012872928753495216, "learning_rate": 4.9036872216114095e-05, "loss": 0.307, "num_input_tokens_seen": 21075200, "step": 22040 }, { "epoch": 1.7982706582918673, "grad_norm": 6.339492321014404, "learning_rate": 4.9035893551749e-05, "loss": 0.0981, "num_input_tokens_seen": 21080192, "step": 22045 }, { "epoch": 1.7986785219022758, "grad_norm": 8.916116714477539, "learning_rate": 4.903491440018656e-05, "loss": 0.2199, "num_input_tokens_seen": 21084704, "step": 22050 }, { "epoch": 1.7990863855126844, "grad_norm": 1.7473857402801514, "learning_rate": 4.903393476144662e-05, "loss": 0.018, "num_input_tokens_seen": 21089552, "step": 22055 }, { "epoch": 1.7994942491230932, "grad_norm": 0.03052746132016182, "learning_rate": 4.903295463554903e-05, "loss": 0.0059, "num_input_tokens_seen": 21094144, "step": 22060 }, { "epoch": 1.799902112733502, "grad_norm": 0.011239446699619293, "learning_rate": 4.903197402251366e-05, "loss": 0.0163, "num_input_tokens_seen": 21099936, "step": 22065 }, { "epoch": 1.8003099763439105, "grad_norm": 0.2050100415945053, "learning_rate": 4.90309929223604e-05, "loss": 0.1705, "num_input_tokens_seen": 21105328, "step": 22070 }, { "epoch": 1.800717839954319, "grad_norm": 0.025196989998221397, "learning_rate": 4.903001133510911e-05, "loss": 0.1036, "num_input_tokens_seen": 21110432, "step": 22075 }, { "epoch": 1.8011257035647281, "grad_norm": 0.17217424511909485, "learning_rate": 4.9029029260779707e-05, "loss": 0.0056, "num_input_tokens_seen": 21115472, "step": 22080 }, { "epoch": 1.8015335671751367, "grad_norm": 0.06596150249242783, "learning_rate": 4.9028046699392094e-05, "loss": 0.2378, "num_input_tokens_seen": 21120176, "step": 22085 }, { "epoch": 1.8019414307855453, "grad_norm": 0.029197396710515022, "learning_rate": 4.902706365096618e-05, "loss": 0.3794, "num_input_tokens_seen": 21124512, "step": 22090 }, { "epoch": 1.802349294395954, "grad_norm": 9.084197998046875, "learning_rate": 4.90260801155219e-05, "loss": 0.1482, "num_input_tokens_seen": 21129616, "step": 22095 }, { "epoch": 1.8027571580063628, "grad_norm": 8.123757362365723, "learning_rate": 4.902509609307918e-05, "loss": 0.161, "num_input_tokens_seen": 21134096, "step": 22100 }, { "epoch": 1.8031650216167714, "grad_norm": 0.04798701032996178, "learning_rate": 4.902411158365798e-05, "loss": 0.2381, "num_input_tokens_seen": 21138576, "step": 22105 }, { "epoch": 1.80357288522718, "grad_norm": 5.555981159210205, "learning_rate": 4.9023126587278245e-05, "loss": 0.2029, "num_input_tokens_seen": 21143536, "step": 22110 }, { "epoch": 1.8039807488375887, "grad_norm": 3.055915117263794, "learning_rate": 4.9022141103959937e-05, "loss": 0.1566, "num_input_tokens_seen": 21148688, "step": 22115 }, { "epoch": 1.8043886124479975, "grad_norm": 6.228394031524658, "learning_rate": 4.902115513372304e-05, "loss": 0.2219, "num_input_tokens_seen": 21153072, "step": 22120 }, { "epoch": 1.804796476058406, "grad_norm": 0.05410335212945938, "learning_rate": 4.902016867658753e-05, "loss": 0.0391, "num_input_tokens_seen": 21157376, "step": 22125 }, { "epoch": 1.8052043396688147, "grad_norm": 0.02806658111512661, "learning_rate": 4.9019181732573413e-05, "loss": 0.0246, "num_input_tokens_seen": 21161504, "step": 22130 }, { "epoch": 1.8056122032792234, "grad_norm": 0.03787832707166672, "learning_rate": 4.901819430170068e-05, "loss": 0.0173, "num_input_tokens_seen": 21166816, "step": 22135 }, { "epoch": 1.8060200668896322, "grad_norm": 0.03143259882926941, "learning_rate": 4.901720638398937e-05, "loss": 0.0054, "num_input_tokens_seen": 21171184, "step": 22140 }, { "epoch": 1.8064279305000408, "grad_norm": 0.05336444079875946, "learning_rate": 4.901621797945948e-05, "loss": 0.6474, "num_input_tokens_seen": 21175696, "step": 22145 }, { "epoch": 1.8068357941104494, "grad_norm": 0.03545519709587097, "learning_rate": 4.901522908813106e-05, "loss": 0.093, "num_input_tokens_seen": 21180384, "step": 22150 }, { "epoch": 1.8072436577208582, "grad_norm": 0.013702197931706905, "learning_rate": 4.901423971002415e-05, "loss": 0.0039, "num_input_tokens_seen": 21185120, "step": 22155 }, { "epoch": 1.807651521331267, "grad_norm": 0.054348815232515335, "learning_rate": 4.901324984515881e-05, "loss": 0.0101, "num_input_tokens_seen": 21190032, "step": 22160 }, { "epoch": 1.8080593849416755, "grad_norm": 0.059559281915426254, "learning_rate": 4.901225949355509e-05, "loss": 0.0978, "num_input_tokens_seen": 21194624, "step": 22165 }, { "epoch": 1.808467248552084, "grad_norm": 0.040272895246744156, "learning_rate": 4.9011268655233085e-05, "loss": 0.1104, "num_input_tokens_seen": 21199056, "step": 22170 }, { "epoch": 1.8088751121624929, "grad_norm": 9.648359298706055, "learning_rate": 4.9010277330212865e-05, "loss": 0.1131, "num_input_tokens_seen": 21203664, "step": 22175 }, { "epoch": 1.8092829757729016, "grad_norm": 0.053537625819444656, "learning_rate": 4.9009285518514516e-05, "loss": 0.0319, "num_input_tokens_seen": 21208576, "step": 22180 }, { "epoch": 1.8096908393833102, "grad_norm": 6.120938301086426, "learning_rate": 4.9008293220158165e-05, "loss": 0.5274, "num_input_tokens_seen": 21212880, "step": 22185 }, { "epoch": 1.8100987029937188, "grad_norm": 0.9056649208068848, "learning_rate": 4.90073004351639e-05, "loss": 0.1043, "num_input_tokens_seen": 21217552, "step": 22190 }, { "epoch": 1.8105065666041276, "grad_norm": 1.2431130409240723, "learning_rate": 4.900630716355187e-05, "loss": 0.0139, "num_input_tokens_seen": 21222112, "step": 22195 }, { "epoch": 1.8109144302145364, "grad_norm": 35.89370346069336, "learning_rate": 4.900531340534218e-05, "loss": 0.3494, "num_input_tokens_seen": 21227472, "step": 22200 }, { "epoch": 1.811322293824945, "grad_norm": 0.42048123478889465, "learning_rate": 4.9004319160555e-05, "loss": 0.2611, "num_input_tokens_seen": 21230848, "step": 22205 }, { "epoch": 1.8117301574353535, "grad_norm": 8.956869125366211, "learning_rate": 4.9003324429210464e-05, "loss": 0.6316, "num_input_tokens_seen": 21235328, "step": 22210 }, { "epoch": 1.8121380210457623, "grad_norm": 0.12476051598787308, "learning_rate": 4.9002329211328746e-05, "loss": 0.2594, "num_input_tokens_seen": 21238960, "step": 22215 }, { "epoch": 1.812545884656171, "grad_norm": 0.19686247408390045, "learning_rate": 4.9001333506930006e-05, "loss": 0.0732, "num_input_tokens_seen": 21243984, "step": 22220 }, { "epoch": 1.8129537482665796, "grad_norm": 0.15775100886821747, "learning_rate": 4.900033731603444e-05, "loss": 0.1544, "num_input_tokens_seen": 21248160, "step": 22225 }, { "epoch": 1.8133616118769882, "grad_norm": 0.6077678799629211, "learning_rate": 4.899934063866224e-05, "loss": 0.032, "num_input_tokens_seen": 21253120, "step": 22230 }, { "epoch": 1.813769475487397, "grad_norm": 0.12220829725265503, "learning_rate": 4.89983434748336e-05, "loss": 0.0388, "num_input_tokens_seen": 21256832, "step": 22235 }, { "epoch": 1.8141773390978058, "grad_norm": 0.06320682913064957, "learning_rate": 4.899734582456873e-05, "loss": 0.1381, "num_input_tokens_seen": 21260832, "step": 22240 }, { "epoch": 1.8145852027082143, "grad_norm": 0.022174540907144547, "learning_rate": 4.8996347687887856e-05, "loss": 0.0043, "num_input_tokens_seen": 21265088, "step": 22245 }, { "epoch": 1.814993066318623, "grad_norm": 3.031717538833618, "learning_rate": 4.8995349064811216e-05, "loss": 0.2435, "num_input_tokens_seen": 21270240, "step": 22250 }, { "epoch": 1.8154009299290317, "grad_norm": 0.030160019174218178, "learning_rate": 4.899434995535905e-05, "loss": 0.1689, "num_input_tokens_seen": 21274304, "step": 22255 }, { "epoch": 1.8158087935394405, "grad_norm": 0.021054839715361595, "learning_rate": 4.8993350359551596e-05, "loss": 0.0232, "num_input_tokens_seen": 21278560, "step": 22260 }, { "epoch": 1.816216657149849, "grad_norm": 0.07748076319694519, "learning_rate": 4.899235027740914e-05, "loss": 0.0611, "num_input_tokens_seen": 21283056, "step": 22265 }, { "epoch": 1.8166245207602578, "grad_norm": 0.07735127210617065, "learning_rate": 4.899134970895193e-05, "loss": 0.2227, "num_input_tokens_seen": 21287296, "step": 22270 }, { "epoch": 1.8170323843706666, "grad_norm": 3.210153818130493, "learning_rate": 4.8990348654200254e-05, "loss": 0.4539, "num_input_tokens_seen": 21291712, "step": 22275 }, { "epoch": 1.8174402479810752, "grad_norm": 0.04869736731052399, "learning_rate": 4.8989347113174414e-05, "loss": 0.2693, "num_input_tokens_seen": 21296288, "step": 22280 }, { "epoch": 1.8178481115914837, "grad_norm": 0.09206201136112213, "learning_rate": 4.898834508589469e-05, "loss": 0.0227, "num_input_tokens_seen": 21300960, "step": 22285 }, { "epoch": 1.8182559752018925, "grad_norm": 0.026450064033269882, "learning_rate": 4.898734257238141e-05, "loss": 0.0084, "num_input_tokens_seen": 21305264, "step": 22290 }, { "epoch": 1.8186638388123013, "grad_norm": 17.387331008911133, "learning_rate": 4.898633957265489e-05, "loss": 0.3072, "num_input_tokens_seen": 21309216, "step": 22295 }, { "epoch": 1.8190717024227099, "grad_norm": 0.1272953450679779, "learning_rate": 4.8985336086735454e-05, "loss": 0.03, "num_input_tokens_seen": 21313584, "step": 22300 }, { "epoch": 1.8194795660331184, "grad_norm": 0.008762015961110592, "learning_rate": 4.898433211464345e-05, "loss": 0.0109, "num_input_tokens_seen": 21317952, "step": 22305 }, { "epoch": 1.8198874296435272, "grad_norm": 0.09275336563587189, "learning_rate": 4.898332765639923e-05, "loss": 0.3288, "num_input_tokens_seen": 21322800, "step": 22310 }, { "epoch": 1.820295293253936, "grad_norm": 0.46086743474006653, "learning_rate": 4.8982322712023147e-05, "loss": 0.0133, "num_input_tokens_seen": 21328192, "step": 22315 }, { "epoch": 1.8207031568643446, "grad_norm": 0.35450276732444763, "learning_rate": 4.898131728153557e-05, "loss": 0.1903, "num_input_tokens_seen": 21332816, "step": 22320 }, { "epoch": 1.8211110204747532, "grad_norm": 0.11913282424211502, "learning_rate": 4.898031136495689e-05, "loss": 0.1512, "num_input_tokens_seen": 21338400, "step": 22325 }, { "epoch": 1.821518884085162, "grad_norm": 0.03577061742544174, "learning_rate": 4.897930496230748e-05, "loss": 0.0099, "num_input_tokens_seen": 21342896, "step": 22330 }, { "epoch": 1.8219267476955707, "grad_norm": 0.014382896944880486, "learning_rate": 4.897829807360775e-05, "loss": 0.5177, "num_input_tokens_seen": 21348272, "step": 22335 }, { "epoch": 1.8223346113059793, "grad_norm": 8.426630973815918, "learning_rate": 4.897729069887811e-05, "loss": 0.0601, "num_input_tokens_seen": 21352656, "step": 22340 }, { "epoch": 1.8227424749163879, "grad_norm": 0.06943771243095398, "learning_rate": 4.897628283813898e-05, "loss": 0.4105, "num_input_tokens_seen": 21357744, "step": 22345 }, { "epoch": 1.8231503385267966, "grad_norm": 16.166200637817383, "learning_rate": 4.897527449141077e-05, "loss": 0.2995, "num_input_tokens_seen": 21363008, "step": 22350 }, { "epoch": 1.8235582021372054, "grad_norm": 18.699607849121094, "learning_rate": 4.897426565871395e-05, "loss": 0.2486, "num_input_tokens_seen": 21367248, "step": 22355 }, { "epoch": 1.823966065747614, "grad_norm": 0.07545937597751617, "learning_rate": 4.897325634006894e-05, "loss": 0.034, "num_input_tokens_seen": 21372096, "step": 22360 }, { "epoch": 1.8243739293580226, "grad_norm": 3.60117244720459, "learning_rate": 4.897224653549622e-05, "loss": 0.3303, "num_input_tokens_seen": 21376736, "step": 22365 }, { "epoch": 1.8247817929684313, "grad_norm": 0.23281525075435638, "learning_rate": 4.897123624501624e-05, "loss": 0.1108, "num_input_tokens_seen": 21381888, "step": 22370 }, { "epoch": 1.8251896565788401, "grad_norm": 0.0945059210062027, "learning_rate": 4.89702254686495e-05, "loss": 0.0324, "num_input_tokens_seen": 21387152, "step": 22375 }, { "epoch": 1.8255975201892487, "grad_norm": 1.663104772567749, "learning_rate": 4.8969214206416464e-05, "loss": 0.2, "num_input_tokens_seen": 21392288, "step": 22380 }, { "epoch": 1.8260053837996573, "grad_norm": 0.05208500847220421, "learning_rate": 4.896820245833764e-05, "loss": 0.2063, "num_input_tokens_seen": 21397392, "step": 22385 }, { "epoch": 1.826413247410066, "grad_norm": 5.463951587677002, "learning_rate": 4.896719022443354e-05, "loss": 0.2012, "num_input_tokens_seen": 21402192, "step": 22390 }, { "epoch": 1.8268211110204748, "grad_norm": 0.15729467570781708, "learning_rate": 4.896617750472469e-05, "loss": 0.2597, "num_input_tokens_seen": 21406688, "step": 22395 }, { "epoch": 1.8272289746308834, "grad_norm": 0.10450185090303421, "learning_rate": 4.896516429923159e-05, "loss": 0.0118, "num_input_tokens_seen": 21412000, "step": 22400 }, { "epoch": 1.827636838241292, "grad_norm": 2.6816301345825195, "learning_rate": 4.89641506079748e-05, "loss": 0.1462, "num_input_tokens_seen": 21416656, "step": 22405 }, { "epoch": 1.8280447018517008, "grad_norm": 0.18204909563064575, "learning_rate": 4.896313643097487e-05, "loss": 0.2677, "num_input_tokens_seen": 21422192, "step": 22410 }, { "epoch": 1.8284525654621095, "grad_norm": 0.21937969326972961, "learning_rate": 4.896212176825233e-05, "loss": 0.2426, "num_input_tokens_seen": 21427152, "step": 22415 }, { "epoch": 1.8288604290725181, "grad_norm": 4.666619777679443, "learning_rate": 4.896110661982777e-05, "loss": 0.0485, "num_input_tokens_seen": 21431392, "step": 22420 }, { "epoch": 1.8292682926829267, "grad_norm": 6.442577362060547, "learning_rate": 4.896009098572176e-05, "loss": 0.0302, "num_input_tokens_seen": 21435760, "step": 22425 }, { "epoch": 1.8296761562933355, "grad_norm": 0.8139817714691162, "learning_rate": 4.895907486595489e-05, "loss": 0.1217, "num_input_tokens_seen": 21439840, "step": 22430 }, { "epoch": 1.8300840199037443, "grad_norm": 0.031716473400592804, "learning_rate": 4.895805826054775e-05, "loss": 0.0207, "num_input_tokens_seen": 21444480, "step": 22435 }, { "epoch": 1.8304918835141528, "grad_norm": 0.08041730523109436, "learning_rate": 4.895704116952096e-05, "loss": 0.0085, "num_input_tokens_seen": 21449360, "step": 22440 }, { "epoch": 1.8308997471245614, "grad_norm": 62.121456146240234, "learning_rate": 4.8956023592895104e-05, "loss": 0.2377, "num_input_tokens_seen": 21454208, "step": 22445 }, { "epoch": 1.8313076107349704, "grad_norm": 3.667550563812256, "learning_rate": 4.895500553069084e-05, "loss": 0.2411, "num_input_tokens_seen": 21458352, "step": 22450 }, { "epoch": 1.831715474345379, "grad_norm": 0.0654812604188919, "learning_rate": 4.89539869829288e-05, "loss": 0.0077, "num_input_tokens_seen": 21462592, "step": 22455 }, { "epoch": 1.8321233379557875, "grad_norm": 0.15171054005622864, "learning_rate": 4.8952967949629617e-05, "loss": 0.2067, "num_input_tokens_seen": 21467552, "step": 22460 }, { "epoch": 1.8325312015661963, "grad_norm": 0.021353069692850113, "learning_rate": 4.895194843081395e-05, "loss": 0.1053, "num_input_tokens_seen": 21472848, "step": 22465 }, { "epoch": 1.832939065176605, "grad_norm": 0.059301674365997314, "learning_rate": 4.895092842650247e-05, "loss": 0.0241, "num_input_tokens_seen": 21477680, "step": 22470 }, { "epoch": 1.8333469287870137, "grad_norm": 0.011452886275947094, "learning_rate": 4.894990793671584e-05, "loss": 0.0048, "num_input_tokens_seen": 21482928, "step": 22475 }, { "epoch": 1.8337547923974222, "grad_norm": 1.9374350309371948, "learning_rate": 4.8948886961474766e-05, "loss": 0.2536, "num_input_tokens_seen": 21487856, "step": 22480 }, { "epoch": 1.834162656007831, "grad_norm": 0.03187637776136398, "learning_rate": 4.8947865500799926e-05, "loss": 0.0032, "num_input_tokens_seen": 21492528, "step": 22485 }, { "epoch": 1.8345705196182398, "grad_norm": 0.11880690604448318, "learning_rate": 4.8946843554712026e-05, "loss": 0.2755, "num_input_tokens_seen": 21497104, "step": 22490 }, { "epoch": 1.8349783832286484, "grad_norm": 0.16093042492866516, "learning_rate": 4.894582112323178e-05, "loss": 0.0241, "num_input_tokens_seen": 21501840, "step": 22495 }, { "epoch": 1.835386246839057, "grad_norm": 0.37376293540000916, "learning_rate": 4.894479820637992e-05, "loss": 0.1062, "num_input_tokens_seen": 21507536, "step": 22500 }, { "epoch": 1.8357941104494657, "grad_norm": 0.11655182391405106, "learning_rate": 4.8943774804177166e-05, "loss": 0.1465, "num_input_tokens_seen": 21512032, "step": 22505 }, { "epoch": 1.8362019740598745, "grad_norm": 0.03802575170993805, "learning_rate": 4.894275091664428e-05, "loss": 0.0378, "num_input_tokens_seen": 21516416, "step": 22510 }, { "epoch": 1.836609837670283, "grad_norm": 0.03260600194334984, "learning_rate": 4.8941726543802e-05, "loss": 0.0065, "num_input_tokens_seen": 21521648, "step": 22515 }, { "epoch": 1.8370177012806916, "grad_norm": 4.455177307128906, "learning_rate": 4.894070168567111e-05, "loss": 0.1479, "num_input_tokens_seen": 21526464, "step": 22520 }, { "epoch": 1.8374255648911004, "grad_norm": 0.0321883000433445, "learning_rate": 4.8939676342272356e-05, "loss": 0.0058, "num_input_tokens_seen": 21530784, "step": 22525 }, { "epoch": 1.8378334285015092, "grad_norm": 0.06529013812541962, "learning_rate": 4.893865051362654e-05, "loss": 0.009, "num_input_tokens_seen": 21535504, "step": 22530 }, { "epoch": 1.8382412921119178, "grad_norm": 4.314476490020752, "learning_rate": 4.893762419975445e-05, "loss": 0.6095, "num_input_tokens_seen": 21540592, "step": 22535 }, { "epoch": 1.8386491557223263, "grad_norm": 0.07072960585355759, "learning_rate": 4.893659740067689e-05, "loss": 0.0045, "num_input_tokens_seen": 21544736, "step": 22540 }, { "epoch": 1.8390570193327351, "grad_norm": 0.09145643562078476, "learning_rate": 4.893557011641467e-05, "loss": 0.0146, "num_input_tokens_seen": 21548960, "step": 22545 }, { "epoch": 1.839464882943144, "grad_norm": 0.37047070264816284, "learning_rate": 4.893454234698862e-05, "loss": 0.21, "num_input_tokens_seen": 21554384, "step": 22550 }, { "epoch": 1.8398727465535525, "grad_norm": 0.47858795523643494, "learning_rate": 4.8933514092419555e-05, "loss": 0.1378, "num_input_tokens_seen": 21560256, "step": 22555 }, { "epoch": 1.840280610163961, "grad_norm": 0.4795656204223633, "learning_rate": 4.893248535272834e-05, "loss": 0.38, "num_input_tokens_seen": 21564592, "step": 22560 }, { "epoch": 1.8406884737743698, "grad_norm": 0.09401823580265045, "learning_rate": 4.893145612793581e-05, "loss": 0.2158, "num_input_tokens_seen": 21568032, "step": 22565 }, { "epoch": 1.8410963373847786, "grad_norm": 0.043001458048820496, "learning_rate": 4.893042641806284e-05, "loss": 0.2017, "num_input_tokens_seen": 21573072, "step": 22570 }, { "epoch": 1.8415042009951872, "grad_norm": 0.06705691665410995, "learning_rate": 4.89293962231303e-05, "loss": 0.0425, "num_input_tokens_seen": 21577936, "step": 22575 }, { "epoch": 1.8419120646055958, "grad_norm": 0.3348334729671478, "learning_rate": 4.892836554315905e-05, "loss": 0.069, "num_input_tokens_seen": 21582304, "step": 22580 }, { "epoch": 1.8423199282160045, "grad_norm": 0.15786603093147278, "learning_rate": 4.892733437817001e-05, "loss": 0.1755, "num_input_tokens_seen": 21586688, "step": 22585 }, { "epoch": 1.8427277918264133, "grad_norm": 0.02640790306031704, "learning_rate": 4.8926302728184065e-05, "loss": 0.3221, "num_input_tokens_seen": 21591248, "step": 22590 }, { "epoch": 1.843135655436822, "grad_norm": 0.16049200296401978, "learning_rate": 4.8925270593222135e-05, "loss": 0.0222, "num_input_tokens_seen": 21596432, "step": 22595 }, { "epoch": 1.8435435190472305, "grad_norm": 0.1826780140399933, "learning_rate": 4.892423797330513e-05, "loss": 0.1728, "num_input_tokens_seen": 21600640, "step": 22600 }, { "epoch": 1.8439513826576392, "grad_norm": 2.0754973888397217, "learning_rate": 4.8923204868453994e-05, "loss": 0.1408, "num_input_tokens_seen": 21604880, "step": 22605 }, { "epoch": 1.844359246268048, "grad_norm": 0.04644068330526352, "learning_rate": 4.892217127868965e-05, "loss": 0.049, "num_input_tokens_seen": 21609296, "step": 22610 }, { "epoch": 1.8447671098784566, "grad_norm": 2.391120195388794, "learning_rate": 4.8921137204033076e-05, "loss": 0.4182, "num_input_tokens_seen": 21614976, "step": 22615 }, { "epoch": 1.8451749734888652, "grad_norm": 0.06476172059774399, "learning_rate": 4.8920102644505206e-05, "loss": 0.062, "num_input_tokens_seen": 21619520, "step": 22620 }, { "epoch": 1.845582837099274, "grad_norm": 1.9711780548095703, "learning_rate": 4.891906760012702e-05, "loss": 0.1679, "num_input_tokens_seen": 21624000, "step": 22625 }, { "epoch": 1.8459907007096827, "grad_norm": 0.049848321825265884, "learning_rate": 4.8918032070919496e-05, "loss": 0.2884, "num_input_tokens_seen": 21628480, "step": 22630 }, { "epoch": 1.8463985643200913, "grad_norm": 0.15409114956855774, "learning_rate": 4.891699605690363e-05, "loss": 0.1228, "num_input_tokens_seen": 21633712, "step": 22635 }, { "epoch": 1.8468064279305, "grad_norm": 0.029777076095342636, "learning_rate": 4.891595955810041e-05, "loss": 0.0088, "num_input_tokens_seen": 21639056, "step": 22640 }, { "epoch": 1.8472142915409089, "grad_norm": 0.08215029537677765, "learning_rate": 4.8914922574530856e-05, "loss": 0.0292, "num_input_tokens_seen": 21644224, "step": 22645 }, { "epoch": 1.8476221551513174, "grad_norm": 0.03512867912650108, "learning_rate": 4.8913885106215985e-05, "loss": 0.1223, "num_input_tokens_seen": 21648640, "step": 22650 }, { "epoch": 1.848030018761726, "grad_norm": 0.08426698297262192, "learning_rate": 4.891284715317682e-05, "loss": 0.232, "num_input_tokens_seen": 21653648, "step": 22655 }, { "epoch": 1.8484378823721348, "grad_norm": 0.04182305559515953, "learning_rate": 4.8911808715434415e-05, "loss": 0.0251, "num_input_tokens_seen": 21658288, "step": 22660 }, { "epoch": 1.8488457459825436, "grad_norm": 0.1385246217250824, "learning_rate": 4.89107697930098e-05, "loss": 0.2048, "num_input_tokens_seen": 21662240, "step": 22665 }, { "epoch": 1.8492536095929522, "grad_norm": 0.3210793733596802, "learning_rate": 4.890973038592404e-05, "loss": 0.0319, "num_input_tokens_seen": 21666944, "step": 22670 }, { "epoch": 1.8496614732033607, "grad_norm": 0.03068569488823414, "learning_rate": 4.890869049419821e-05, "loss": 0.1416, "num_input_tokens_seen": 21672112, "step": 22675 }, { "epoch": 1.8500693368137695, "grad_norm": 300.228271484375, "learning_rate": 4.8907650117853375e-05, "loss": 1.5292, "num_input_tokens_seen": 21676768, "step": 22680 }, { "epoch": 1.8504772004241783, "grad_norm": 1.285477638244629, "learning_rate": 4.890660925691064e-05, "loss": 0.2989, "num_input_tokens_seen": 21680848, "step": 22685 }, { "epoch": 1.8508850640345869, "grad_norm": 5.591148376464844, "learning_rate": 4.890556791139109e-05, "loss": 0.2104, "num_input_tokens_seen": 21685664, "step": 22690 }, { "epoch": 1.8512929276449954, "grad_norm": 4.471333980560303, "learning_rate": 4.890452608131584e-05, "loss": 0.0414, "num_input_tokens_seen": 21690528, "step": 22695 }, { "epoch": 1.8517007912554042, "grad_norm": 0.2525976002216339, "learning_rate": 4.890348376670599e-05, "loss": 0.4435, "num_input_tokens_seen": 21695520, "step": 22700 }, { "epoch": 1.852108654865813, "grad_norm": 0.03429064899682999, "learning_rate": 4.8902440967582694e-05, "loss": 0.1384, "num_input_tokens_seen": 21700096, "step": 22705 }, { "epoch": 1.8525165184762216, "grad_norm": 6.6283111572265625, "learning_rate": 4.890139768396708e-05, "loss": 0.2877, "num_input_tokens_seen": 21705568, "step": 22710 }, { "epoch": 1.8529243820866301, "grad_norm": 0.016209358349442482, "learning_rate": 4.890035391588029e-05, "loss": 0.2148, "num_input_tokens_seen": 21710640, "step": 22715 }, { "epoch": 1.853332245697039, "grad_norm": 0.05440826714038849, "learning_rate": 4.889930966334347e-05, "loss": 0.1994, "num_input_tokens_seen": 21715152, "step": 22720 }, { "epoch": 1.8537401093074477, "grad_norm": 1.541355013847351, "learning_rate": 4.889826492637781e-05, "loss": 0.0266, "num_input_tokens_seen": 21720816, "step": 22725 }, { "epoch": 1.8541479729178563, "grad_norm": 0.02119668573141098, "learning_rate": 4.889721970500447e-05, "loss": 0.0169, "num_input_tokens_seen": 21725168, "step": 22730 }, { "epoch": 1.8545558365282648, "grad_norm": 0.03209182620048523, "learning_rate": 4.8896173999244647e-05, "loss": 0.166, "num_input_tokens_seen": 21729984, "step": 22735 }, { "epoch": 1.8549637001386736, "grad_norm": 0.5905929803848267, "learning_rate": 4.889512780911952e-05, "loss": 0.0116, "num_input_tokens_seen": 21734192, "step": 22740 }, { "epoch": 1.8553715637490824, "grad_norm": 3.0550217628479004, "learning_rate": 4.889408113465032e-05, "loss": 0.1675, "num_input_tokens_seen": 21738864, "step": 22745 }, { "epoch": 1.855779427359491, "grad_norm": 0.091737762093544, "learning_rate": 4.889303397585824e-05, "loss": 0.009, "num_input_tokens_seen": 21743568, "step": 22750 }, { "epoch": 1.8561872909698995, "grad_norm": 0.08981779962778091, "learning_rate": 4.889198633276451e-05, "loss": 0.0195, "num_input_tokens_seen": 21748320, "step": 22755 }, { "epoch": 1.8565951545803083, "grad_norm": 0.030155129730701447, "learning_rate": 4.889093820539038e-05, "loss": 0.3239, "num_input_tokens_seen": 21752912, "step": 22760 }, { "epoch": 1.8570030181907171, "grad_norm": 0.1265561282634735, "learning_rate": 4.8889889593757076e-05, "loss": 0.0494, "num_input_tokens_seen": 21756816, "step": 22765 }, { "epoch": 1.8574108818011257, "grad_norm": 0.3726114332675934, "learning_rate": 4.8888840497885866e-05, "loss": 0.3222, "num_input_tokens_seen": 21761328, "step": 22770 }, { "epoch": 1.8578187454115342, "grad_norm": 0.20984825491905212, "learning_rate": 4.8887790917798006e-05, "loss": 0.4769, "num_input_tokens_seen": 21766032, "step": 22775 }, { "epoch": 1.858226609021943, "grad_norm": 0.06517384946346283, "learning_rate": 4.888674085351478e-05, "loss": 0.2475, "num_input_tokens_seen": 21771392, "step": 22780 }, { "epoch": 1.8586344726323518, "grad_norm": 0.15099473297595978, "learning_rate": 4.8885690305057464e-05, "loss": 0.1078, "num_input_tokens_seen": 21776128, "step": 22785 }, { "epoch": 1.8590423362427604, "grad_norm": 0.4962542951107025, "learning_rate": 4.888463927244735e-05, "loss": 0.1032, "num_input_tokens_seen": 21780192, "step": 22790 }, { "epoch": 1.859450199853169, "grad_norm": 0.6260547637939453, "learning_rate": 4.888358775570576e-05, "loss": 0.027, "num_input_tokens_seen": 21785200, "step": 22795 }, { "epoch": 1.8598580634635777, "grad_norm": 16.636327743530273, "learning_rate": 4.8882535754853984e-05, "loss": 0.0824, "num_input_tokens_seen": 21789920, "step": 22800 }, { "epoch": 1.8602659270739865, "grad_norm": 7.708309173583984, "learning_rate": 4.888148326991336e-05, "loss": 0.1748, "num_input_tokens_seen": 21794304, "step": 22805 }, { "epoch": 1.860673790684395, "grad_norm": 7.040079116821289, "learning_rate": 4.888043030090522e-05, "loss": 0.2055, "num_input_tokens_seen": 21799536, "step": 22810 }, { "epoch": 1.8610816542948037, "grad_norm": 0.15695123374462128, "learning_rate": 4.88793768478509e-05, "loss": 0.0188, "num_input_tokens_seen": 21804288, "step": 22815 }, { "epoch": 1.8614895179052124, "grad_norm": 0.5465173721313477, "learning_rate": 4.887832291077176e-05, "loss": 0.0251, "num_input_tokens_seen": 21808080, "step": 22820 }, { "epoch": 1.8618973815156212, "grad_norm": 1.521131157875061, "learning_rate": 4.887726848968917e-05, "loss": 0.0078, "num_input_tokens_seen": 21812032, "step": 22825 }, { "epoch": 1.8623052451260298, "grad_norm": 0.0821189433336258, "learning_rate": 4.887621358462448e-05, "loss": 0.0535, "num_input_tokens_seen": 21816432, "step": 22830 }, { "epoch": 1.8627131087364386, "grad_norm": 34.50920867919922, "learning_rate": 4.887515819559909e-05, "loss": 0.0581, "num_input_tokens_seen": 21820368, "step": 22835 }, { "epoch": 1.8631209723468474, "grad_norm": 38.4771614074707, "learning_rate": 4.8874102322634395e-05, "loss": 0.2382, "num_input_tokens_seen": 21824288, "step": 22840 }, { "epoch": 1.863528835957256, "grad_norm": 0.2582052946090698, "learning_rate": 4.8873045965751785e-05, "loss": 0.3124, "num_input_tokens_seen": 21830192, "step": 22845 }, { "epoch": 1.8639366995676645, "grad_norm": 0.07545247673988342, "learning_rate": 4.887198912497268e-05, "loss": 0.0058, "num_input_tokens_seen": 21834784, "step": 22850 }, { "epoch": 1.8643445631780733, "grad_norm": 0.20861755311489105, "learning_rate": 4.8870931800318495e-05, "loss": 0.0094, "num_input_tokens_seen": 21840496, "step": 22855 }, { "epoch": 1.864752426788482, "grad_norm": 0.08721375465393066, "learning_rate": 4.886987399181067e-05, "loss": 0.0045, "num_input_tokens_seen": 21844960, "step": 22860 }, { "epoch": 1.8651602903988906, "grad_norm": 0.053430914878845215, "learning_rate": 4.886881569947064e-05, "loss": 0.0091, "num_input_tokens_seen": 21850000, "step": 22865 }, { "epoch": 1.8655681540092992, "grad_norm": 0.060111403465270996, "learning_rate": 4.8867756923319865e-05, "loss": 0.0038, "num_input_tokens_seen": 21854496, "step": 22870 }, { "epoch": 1.865976017619708, "grad_norm": 14.277268409729004, "learning_rate": 4.886669766337979e-05, "loss": 0.0553, "num_input_tokens_seen": 21859088, "step": 22875 }, { "epoch": 1.8663838812301168, "grad_norm": 22.397687911987305, "learning_rate": 4.88656379196719e-05, "loss": 0.2091, "num_input_tokens_seen": 21863088, "step": 22880 }, { "epoch": 1.8667917448405253, "grad_norm": 0.02399369142949581, "learning_rate": 4.886457769221767e-05, "loss": 0.3385, "num_input_tokens_seen": 21867968, "step": 22885 }, { "epoch": 1.867199608450934, "grad_norm": 0.014050401747226715, "learning_rate": 4.8863516981038595e-05, "loss": 0.2426, "num_input_tokens_seen": 21872480, "step": 22890 }, { "epoch": 1.8676074720613427, "grad_norm": 0.057242341339588165, "learning_rate": 4.886245578615616e-05, "loss": 0.0013, "num_input_tokens_seen": 21877232, "step": 22895 }, { "epoch": 1.8680153356717515, "grad_norm": 0.008603906258940697, "learning_rate": 4.88613941075919e-05, "loss": 0.0067, "num_input_tokens_seen": 21882112, "step": 22900 }, { "epoch": 1.86842319928216, "grad_norm": 0.010263350792229176, "learning_rate": 4.886033194536731e-05, "loss": 0.1952, "num_input_tokens_seen": 21886976, "step": 22905 }, { "epoch": 1.8688310628925686, "grad_norm": 0.10219621658325195, "learning_rate": 4.885926929950393e-05, "loss": 0.1784, "num_input_tokens_seen": 21891552, "step": 22910 }, { "epoch": 1.8692389265029774, "grad_norm": 0.04328615218400955, "learning_rate": 4.88582061700233e-05, "loss": 0.5102, "num_input_tokens_seen": 21895248, "step": 22915 }, { "epoch": 1.8696467901133862, "grad_norm": 4.384220600128174, "learning_rate": 4.885714255694698e-05, "loss": 0.1623, "num_input_tokens_seen": 21900608, "step": 22920 }, { "epoch": 1.8700546537237948, "grad_norm": 0.02637096866965294, "learning_rate": 4.885607846029651e-05, "loss": 0.1068, "num_input_tokens_seen": 21905408, "step": 22925 }, { "epoch": 1.8704625173342033, "grad_norm": 3.1252307891845703, "learning_rate": 4.885501388009346e-05, "loss": 0.3546, "num_input_tokens_seen": 21910192, "step": 22930 }, { "epoch": 1.870870380944612, "grad_norm": 0.22790923714637756, "learning_rate": 4.885394881635942e-05, "loss": 0.1875, "num_input_tokens_seen": 21914672, "step": 22935 }, { "epoch": 1.871278244555021, "grad_norm": 0.0032528825104236603, "learning_rate": 4.885288326911598e-05, "loss": 0.349, "num_input_tokens_seen": 21919520, "step": 22940 }, { "epoch": 1.8716861081654295, "grad_norm": 1.1264728307724, "learning_rate": 4.885181723838472e-05, "loss": 0.2132, "num_input_tokens_seen": 21923760, "step": 22945 }, { "epoch": 1.872093971775838, "grad_norm": 1.1163913011550903, "learning_rate": 4.885075072418727e-05, "loss": 0.3115, "num_input_tokens_seen": 21928384, "step": 22950 }, { "epoch": 1.8725018353862468, "grad_norm": 1.6476850509643555, "learning_rate": 4.8849683726545236e-05, "loss": 0.2938, "num_input_tokens_seen": 21932576, "step": 22955 }, { "epoch": 1.8729096989966556, "grad_norm": 0.25144046545028687, "learning_rate": 4.884861624548025e-05, "loss": 0.2208, "num_input_tokens_seen": 21936592, "step": 22960 }, { "epoch": 1.8733175626070642, "grad_norm": 0.3183121383190155, "learning_rate": 4.884754828101394e-05, "loss": 0.1145, "num_input_tokens_seen": 21941040, "step": 22965 }, { "epoch": 1.8737254262174727, "grad_norm": 0.2595442235469818, "learning_rate": 4.8846479833167956e-05, "loss": 0.0922, "num_input_tokens_seen": 21945728, "step": 22970 }, { "epoch": 1.8741332898278815, "grad_norm": 0.11995533108711243, "learning_rate": 4.8845410901963975e-05, "loss": 0.0662, "num_input_tokens_seen": 21949920, "step": 22975 }, { "epoch": 1.8745411534382903, "grad_norm": 0.2089727371931076, "learning_rate": 4.884434148742363e-05, "loss": 0.0369, "num_input_tokens_seen": 21954016, "step": 22980 }, { "epoch": 1.8749490170486989, "grad_norm": 0.16138418018817902, "learning_rate": 4.8843271589568624e-05, "loss": 0.1705, "num_input_tokens_seen": 21959584, "step": 22985 }, { "epoch": 1.8753568806591074, "grad_norm": 0.02823255956172943, "learning_rate": 4.8842201208420636e-05, "loss": 0.015, "num_input_tokens_seen": 21964912, "step": 22990 }, { "epoch": 1.8757647442695162, "grad_norm": 1.8241490125656128, "learning_rate": 4.884113034400135e-05, "loss": 0.2409, "num_input_tokens_seen": 21969696, "step": 22995 }, { "epoch": 1.876172607879925, "grad_norm": 0.933133602142334, "learning_rate": 4.884005899633249e-05, "loss": 0.0179, "num_input_tokens_seen": 21975408, "step": 23000 }, { "epoch": 1.8765804714903336, "grad_norm": 1.2902270555496216, "learning_rate": 4.883898716543577e-05, "loss": 0.1025, "num_input_tokens_seen": 21980528, "step": 23005 }, { "epoch": 1.8769883351007424, "grad_norm": 0.024077897891402245, "learning_rate": 4.883791485133291e-05, "loss": 0.0201, "num_input_tokens_seen": 21985296, "step": 23010 }, { "epoch": 1.8773961987111512, "grad_norm": 0.03789297491312027, "learning_rate": 4.8836842054045636e-05, "loss": 0.0361, "num_input_tokens_seen": 21989728, "step": 23015 }, { "epoch": 1.8778040623215597, "grad_norm": 1.092232346534729, "learning_rate": 4.88357687735957e-05, "loss": 0.3002, "num_input_tokens_seen": 21995472, "step": 23020 }, { "epoch": 1.8782119259319683, "grad_norm": 0.11529184132814407, "learning_rate": 4.883469501000487e-05, "loss": 0.0123, "num_input_tokens_seen": 21999840, "step": 23025 }, { "epoch": 1.878619789542377, "grad_norm": 5.319644927978516, "learning_rate": 4.88336207632949e-05, "loss": 0.2227, "num_input_tokens_seen": 22004944, "step": 23030 }, { "epoch": 1.8790276531527859, "grad_norm": 1.9418559074401855, "learning_rate": 4.883254603348756e-05, "loss": 0.3093, "num_input_tokens_seen": 22010032, "step": 23035 }, { "epoch": 1.8794355167631944, "grad_norm": 4.795605659484863, "learning_rate": 4.883147082060464e-05, "loss": 0.1877, "num_input_tokens_seen": 22013344, "step": 23040 }, { "epoch": 1.879843380373603, "grad_norm": 0.11486993730068207, "learning_rate": 4.883039512466793e-05, "loss": 0.2516, "num_input_tokens_seen": 22018288, "step": 23045 }, { "epoch": 1.8802512439840118, "grad_norm": 1.94468355178833, "learning_rate": 4.882931894569924e-05, "loss": 0.1228, "num_input_tokens_seen": 22022656, "step": 23050 }, { "epoch": 1.8806591075944206, "grad_norm": 0.08898458629846573, "learning_rate": 4.8828242283720383e-05, "loss": 0.0827, "num_input_tokens_seen": 22026976, "step": 23055 }, { "epoch": 1.8810669712048291, "grad_norm": 1.8450438976287842, "learning_rate": 4.8827165138753175e-05, "loss": 0.0737, "num_input_tokens_seen": 22032032, "step": 23060 }, { "epoch": 1.8814748348152377, "grad_norm": 0.07797017693519592, "learning_rate": 4.8826087510819456e-05, "loss": 0.0529, "num_input_tokens_seen": 22037232, "step": 23065 }, { "epoch": 1.8818826984256465, "grad_norm": 2.764596700668335, "learning_rate": 4.8825009399941065e-05, "loss": 0.6267, "num_input_tokens_seen": 22042432, "step": 23070 }, { "epoch": 1.8822905620360553, "grad_norm": 4.6453166007995605, "learning_rate": 4.8823930806139864e-05, "loss": 0.4005, "num_input_tokens_seen": 22046784, "step": 23075 }, { "epoch": 1.8826984256464638, "grad_norm": 0.09241926670074463, "learning_rate": 4.8822851729437705e-05, "loss": 0.0414, "num_input_tokens_seen": 22051616, "step": 23080 }, { "epoch": 1.8831062892568724, "grad_norm": 6.556591510772705, "learning_rate": 4.882177216985647e-05, "loss": 0.1926, "num_input_tokens_seen": 22055104, "step": 23085 }, { "epoch": 1.8835141528672812, "grad_norm": 2.7529854774475098, "learning_rate": 4.8820692127418024e-05, "loss": 0.1614, "num_input_tokens_seen": 22059536, "step": 23090 }, { "epoch": 1.88392201647769, "grad_norm": 0.16449905931949615, "learning_rate": 4.8819611602144276e-05, "loss": 0.0748, "num_input_tokens_seen": 22064192, "step": 23095 }, { "epoch": 1.8843298800880985, "grad_norm": 8.71522331237793, "learning_rate": 4.881853059405713e-05, "loss": 0.057, "num_input_tokens_seen": 22069168, "step": 23100 }, { "epoch": 1.884737743698507, "grad_norm": 7.320668697357178, "learning_rate": 4.881744910317848e-05, "loss": 0.0825, "num_input_tokens_seen": 22074848, "step": 23105 }, { "epoch": 1.885145607308916, "grad_norm": 0.018107082694768906, "learning_rate": 4.881636712953026e-05, "loss": 0.0166, "num_input_tokens_seen": 22079904, "step": 23110 }, { "epoch": 1.8855534709193247, "grad_norm": 0.035826705396175385, "learning_rate": 4.8815284673134396e-05, "loss": 0.1748, "num_input_tokens_seen": 22084048, "step": 23115 }, { "epoch": 1.8859613345297332, "grad_norm": 0.012556508183479309, "learning_rate": 4.881420173401284e-05, "loss": 0.1377, "num_input_tokens_seen": 22088560, "step": 23120 }, { "epoch": 1.8863691981401418, "grad_norm": 0.015627382323145866, "learning_rate": 4.8813118312187536e-05, "loss": 0.1438, "num_input_tokens_seen": 22093456, "step": 23125 }, { "epoch": 1.8867770617505506, "grad_norm": 4.472400188446045, "learning_rate": 4.8812034407680426e-05, "loss": 0.3138, "num_input_tokens_seen": 22098960, "step": 23130 }, { "epoch": 1.8871849253609594, "grad_norm": 0.06302040070295334, "learning_rate": 4.881095002051351e-05, "loss": 0.0339, "num_input_tokens_seen": 22104256, "step": 23135 }, { "epoch": 1.887592788971368, "grad_norm": 0.43732160329818726, "learning_rate": 4.880986515070875e-05, "loss": 0.4394, "num_input_tokens_seen": 22110000, "step": 23140 }, { "epoch": 1.8880006525817765, "grad_norm": 0.05837663263082504, "learning_rate": 4.880877979828814e-05, "loss": 0.0316, "num_input_tokens_seen": 22114896, "step": 23145 }, { "epoch": 1.8884085161921853, "grad_norm": 0.31179115176200867, "learning_rate": 4.880769396327368e-05, "loss": 0.1955, "num_input_tokens_seen": 22119808, "step": 23150 }, { "epoch": 1.888816379802594, "grad_norm": 2.637753963470459, "learning_rate": 4.880660764568739e-05, "loss": 0.2135, "num_input_tokens_seen": 22124672, "step": 23155 }, { "epoch": 1.8892242434130027, "grad_norm": 0.13298849761486053, "learning_rate": 4.880552084555127e-05, "loss": 0.5669, "num_input_tokens_seen": 22129968, "step": 23160 }, { "epoch": 1.8896321070234112, "grad_norm": 0.878157377243042, "learning_rate": 4.8804433562887356e-05, "loss": 0.2123, "num_input_tokens_seen": 22134672, "step": 23165 }, { "epoch": 1.89003997063382, "grad_norm": 1.7937710285186768, "learning_rate": 4.880334579771769e-05, "loss": 0.5384, "num_input_tokens_seen": 22138960, "step": 23170 }, { "epoch": 1.8904478342442288, "grad_norm": 0.34783634543418884, "learning_rate": 4.880225755006433e-05, "loss": 0.0219, "num_input_tokens_seen": 22143776, "step": 23175 }, { "epoch": 1.8908556978546374, "grad_norm": 3.073096990585327, "learning_rate": 4.8801168819949315e-05, "loss": 0.1386, "num_input_tokens_seen": 22149632, "step": 23180 }, { "epoch": 1.891263561465046, "grad_norm": 8.426003456115723, "learning_rate": 4.880007960739472e-05, "loss": 0.604, "num_input_tokens_seen": 22153824, "step": 23185 }, { "epoch": 1.8916714250754547, "grad_norm": 0.5261083245277405, "learning_rate": 4.8798989912422625e-05, "loss": 0.1306, "num_input_tokens_seen": 22158768, "step": 23190 }, { "epoch": 1.8920792886858635, "grad_norm": 3.2518808841705322, "learning_rate": 4.879789973505512e-05, "loss": 0.3904, "num_input_tokens_seen": 22163648, "step": 23195 }, { "epoch": 1.892487152296272, "grad_norm": 0.7504352927207947, "learning_rate": 4.87968090753143e-05, "loss": 0.2987, "num_input_tokens_seen": 22168912, "step": 23200 }, { "epoch": 1.8928950159066809, "grad_norm": 0.16934917867183685, "learning_rate": 4.8795717933222274e-05, "loss": 0.1617, "num_input_tokens_seen": 22173552, "step": 23205 }, { "epoch": 1.8933028795170896, "grad_norm": 0.725093424320221, "learning_rate": 4.879462630880115e-05, "loss": 0.1627, "num_input_tokens_seen": 22178384, "step": 23210 }, { "epoch": 1.8937107431274982, "grad_norm": 1.0244410037994385, "learning_rate": 4.8793534202073065e-05, "loss": 0.0764, "num_input_tokens_seen": 22183008, "step": 23215 }, { "epoch": 1.8941186067379068, "grad_norm": 55.93276596069336, "learning_rate": 4.879244161306015e-05, "loss": 0.1241, "num_input_tokens_seen": 22187840, "step": 23220 }, { "epoch": 1.8945264703483156, "grad_norm": 3.3125081062316895, "learning_rate": 4.879134854178456e-05, "loss": 0.3471, "num_input_tokens_seen": 22192608, "step": 23225 }, { "epoch": 1.8949343339587243, "grad_norm": 0.04129287973046303, "learning_rate": 4.8790254988268434e-05, "loss": 0.2311, "num_input_tokens_seen": 22198144, "step": 23230 }, { "epoch": 1.895342197569133, "grad_norm": 72.80716705322266, "learning_rate": 4.8789160952533965e-05, "loss": 0.3119, "num_input_tokens_seen": 22201824, "step": 23235 }, { "epoch": 1.8957500611795415, "grad_norm": 0.4251243770122528, "learning_rate": 4.8788066434603296e-05, "loss": 0.2171, "num_input_tokens_seen": 22206048, "step": 23240 }, { "epoch": 1.8961579247899503, "grad_norm": 3.5120294094085693, "learning_rate": 4.878697143449864e-05, "loss": 0.2339, "num_input_tokens_seen": 22211552, "step": 23245 }, { "epoch": 1.896565788400359, "grad_norm": 0.8541977405548096, "learning_rate": 4.878587595224217e-05, "loss": 0.0312, "num_input_tokens_seen": 22216096, "step": 23250 }, { "epoch": 1.8969736520107676, "grad_norm": 0.5165766477584839, "learning_rate": 4.878477998785611e-05, "loss": 0.0753, "num_input_tokens_seen": 22220992, "step": 23255 }, { "epoch": 1.8973815156211762, "grad_norm": 0.028734106570482254, "learning_rate": 4.878368354136266e-05, "loss": 0.3627, "num_input_tokens_seen": 22226352, "step": 23260 }, { "epoch": 1.897789379231585, "grad_norm": 2.4268641471862793, "learning_rate": 4.878258661278405e-05, "loss": 0.0284, "num_input_tokens_seen": 22230864, "step": 23265 }, { "epoch": 1.8981972428419938, "grad_norm": 0.12103733420372009, "learning_rate": 4.8781489202142526e-05, "loss": 0.0698, "num_input_tokens_seen": 22235136, "step": 23270 }, { "epoch": 1.8986051064524023, "grad_norm": 0.017272531986236572, "learning_rate": 4.8780391309460316e-05, "loss": 0.1268, "num_input_tokens_seen": 22239232, "step": 23275 }, { "epoch": 1.899012970062811, "grad_norm": 0.0952218547463417, "learning_rate": 4.877929293475968e-05, "loss": 0.2287, "num_input_tokens_seen": 22243984, "step": 23280 }, { "epoch": 1.8994208336732197, "grad_norm": 9.076979637145996, "learning_rate": 4.877819407806288e-05, "loss": 0.0964, "num_input_tokens_seen": 22248576, "step": 23285 }, { "epoch": 1.8998286972836285, "grad_norm": 6.49235200881958, "learning_rate": 4.8777094739392184e-05, "loss": 0.2314, "num_input_tokens_seen": 22253456, "step": 23290 }, { "epoch": 1.900236560894037, "grad_norm": 0.11041894555091858, "learning_rate": 4.877599491876989e-05, "loss": 0.0403, "num_input_tokens_seen": 22258816, "step": 23295 }, { "epoch": 1.9006444245044456, "grad_norm": 1.46220064163208, "learning_rate": 4.8774894616218275e-05, "loss": 0.1857, "num_input_tokens_seen": 22262512, "step": 23300 }, { "epoch": 1.9010522881148544, "grad_norm": 1.9968706369400024, "learning_rate": 4.8773793831759654e-05, "loss": 0.1256, "num_input_tokens_seen": 22267104, "step": 23305 }, { "epoch": 1.9014601517252632, "grad_norm": 0.03298509120941162, "learning_rate": 4.8772692565416325e-05, "loss": 0.0061, "num_input_tokens_seen": 22271536, "step": 23310 }, { "epoch": 1.9018680153356717, "grad_norm": 6.654293537139893, "learning_rate": 4.877159081721063e-05, "loss": 0.2037, "num_input_tokens_seen": 22276320, "step": 23315 }, { "epoch": 1.9022758789460803, "grad_norm": 0.04925215244293213, "learning_rate": 4.877048858716489e-05, "loss": 0.0098, "num_input_tokens_seen": 22281312, "step": 23320 }, { "epoch": 1.902683742556489, "grad_norm": 0.028873641043901443, "learning_rate": 4.8769385875301445e-05, "loss": 0.2557, "num_input_tokens_seen": 22287040, "step": 23325 }, { "epoch": 1.9030916061668979, "grad_norm": 0.014717016369104385, "learning_rate": 4.876828268164264e-05, "loss": 0.0079, "num_input_tokens_seen": 22291872, "step": 23330 }, { "epoch": 1.9034994697773064, "grad_norm": 4.220462322235107, "learning_rate": 4.876717900621085e-05, "loss": 0.1792, "num_input_tokens_seen": 22296336, "step": 23335 }, { "epoch": 1.903907333387715, "grad_norm": 3.4510395526885986, "learning_rate": 4.8766074849028444e-05, "loss": 0.4347, "num_input_tokens_seen": 22301776, "step": 23340 }, { "epoch": 1.9043151969981238, "grad_norm": 0.38179025053977966, "learning_rate": 4.87649702101178e-05, "loss": 0.0942, "num_input_tokens_seen": 22307232, "step": 23345 }, { "epoch": 1.9047230606085326, "grad_norm": 0.006958961021155119, "learning_rate": 4.87638650895013e-05, "loss": 0.0096, "num_input_tokens_seen": 22312080, "step": 23350 }, { "epoch": 1.9051309242189411, "grad_norm": 0.053905535489320755, "learning_rate": 4.8762759487201365e-05, "loss": 0.0063, "num_input_tokens_seen": 22317328, "step": 23355 }, { "epoch": 1.9055387878293497, "grad_norm": 2.8870866298675537, "learning_rate": 4.8761653403240384e-05, "loss": 0.2351, "num_input_tokens_seen": 22321552, "step": 23360 }, { "epoch": 1.9059466514397585, "grad_norm": 0.04225059226155281, "learning_rate": 4.876054683764078e-05, "loss": 0.1661, "num_input_tokens_seen": 22326256, "step": 23365 }, { "epoch": 1.9063545150501673, "grad_norm": 0.02283201739192009, "learning_rate": 4.8759439790425e-05, "loss": 0.1212, "num_input_tokens_seen": 22330576, "step": 23370 }, { "epoch": 1.9067623786605759, "grad_norm": 0.3307463526725769, "learning_rate": 4.875833226161547e-05, "loss": 0.2873, "num_input_tokens_seen": 22335456, "step": 23375 }, { "epoch": 1.9071702422709844, "grad_norm": 0.013318494893610477, "learning_rate": 4.875722425123464e-05, "loss": 0.4244, "num_input_tokens_seen": 22340352, "step": 23380 }, { "epoch": 1.9075781058813934, "grad_norm": 2.1351208686828613, "learning_rate": 4.8756115759304966e-05, "loss": 0.0733, "num_input_tokens_seen": 22344944, "step": 23385 }, { "epoch": 1.907985969491802, "grad_norm": 0.5424027442932129, "learning_rate": 4.8755006785848925e-05, "loss": 0.1135, "num_input_tokens_seen": 22349488, "step": 23390 }, { "epoch": 1.9083938331022106, "grad_norm": 8.431227684020996, "learning_rate": 4.875389733088899e-05, "loss": 0.0588, "num_input_tokens_seen": 22353248, "step": 23395 }, { "epoch": 1.9088016967126193, "grad_norm": 0.8217387795448303, "learning_rate": 4.875278739444764e-05, "loss": 0.3457, "num_input_tokens_seen": 22358256, "step": 23400 }, { "epoch": 1.9092095603230281, "grad_norm": 0.1359102874994278, "learning_rate": 4.875167697654739e-05, "loss": 0.0164, "num_input_tokens_seen": 22362880, "step": 23405 }, { "epoch": 1.9096174239334367, "grad_norm": 3.706773519515991, "learning_rate": 4.875056607721074e-05, "loss": 0.3783, "num_input_tokens_seen": 22367360, "step": 23410 }, { "epoch": 1.9100252875438453, "grad_norm": 0.013994919136166573, "learning_rate": 4.874945469646021e-05, "loss": 0.1299, "num_input_tokens_seen": 22371488, "step": 23415 }, { "epoch": 1.910433151154254, "grad_norm": 5.080211639404297, "learning_rate": 4.874834283431832e-05, "loss": 0.1402, "num_input_tokens_seen": 22376336, "step": 23420 }, { "epoch": 1.9108410147646628, "grad_norm": 3.3861846923828125, "learning_rate": 4.874723049080762e-05, "loss": 0.0097, "num_input_tokens_seen": 22381200, "step": 23425 }, { "epoch": 1.9112488783750714, "grad_norm": 2.9522900581359863, "learning_rate": 4.874611766595064e-05, "loss": 0.3545, "num_input_tokens_seen": 22386720, "step": 23430 }, { "epoch": 1.91165674198548, "grad_norm": 9.013504981994629, "learning_rate": 4.874500435976995e-05, "loss": 0.377, "num_input_tokens_seen": 22391824, "step": 23435 }, { "epoch": 1.9120646055958888, "grad_norm": 0.05822119861841202, "learning_rate": 4.8743890572288106e-05, "loss": 0.012, "num_input_tokens_seen": 22395120, "step": 23440 }, { "epoch": 1.9124724692062975, "grad_norm": 0.1187591552734375, "learning_rate": 4.8742776303527695e-05, "loss": 0.2097, "num_input_tokens_seen": 22400320, "step": 23445 }, { "epoch": 1.912880332816706, "grad_norm": 0.04672613367438316, "learning_rate": 4.87416615535113e-05, "loss": 0.0144, "num_input_tokens_seen": 22404848, "step": 23450 }, { "epoch": 1.9132881964271147, "grad_norm": 0.044023457914590836, "learning_rate": 4.87405463222615e-05, "loss": 0.0582, "num_input_tokens_seen": 22409120, "step": 23455 }, { "epoch": 1.9136960600375235, "grad_norm": 0.023832736536860466, "learning_rate": 4.873943060980093e-05, "loss": 0.1476, "num_input_tokens_seen": 22414304, "step": 23460 }, { "epoch": 1.9141039236479322, "grad_norm": 1.5178203582763672, "learning_rate": 4.8738314416152185e-05, "loss": 0.2126, "num_input_tokens_seen": 22418784, "step": 23465 }, { "epoch": 1.9145117872583408, "grad_norm": 0.018293986096978188, "learning_rate": 4.873719774133789e-05, "loss": 0.087, "num_input_tokens_seen": 22422160, "step": 23470 }, { "epoch": 1.9149196508687494, "grad_norm": 0.03146699070930481, "learning_rate": 4.873608058538068e-05, "loss": 0.3617, "num_input_tokens_seen": 22426368, "step": 23475 }, { "epoch": 1.9153275144791582, "grad_norm": 0.042884957045316696, "learning_rate": 4.873496294830321e-05, "loss": 0.0352, "num_input_tokens_seen": 22431408, "step": 23480 }, { "epoch": 1.915735378089567, "grad_norm": 0.22542686760425568, "learning_rate": 4.8733844830128126e-05, "loss": 0.1638, "num_input_tokens_seen": 22435840, "step": 23485 }, { "epoch": 1.9161432416999755, "grad_norm": 0.19910791516304016, "learning_rate": 4.873272623087809e-05, "loss": 0.0152, "num_input_tokens_seen": 22441248, "step": 23490 }, { "epoch": 1.916551105310384, "grad_norm": 0.2352350950241089, "learning_rate": 4.873160715057578e-05, "loss": 0.2803, "num_input_tokens_seen": 22445632, "step": 23495 }, { "epoch": 1.9169589689207929, "grad_norm": 0.16632281243801117, "learning_rate": 4.873048758924388e-05, "loss": 0.0126, "num_input_tokens_seen": 22449776, "step": 23500 }, { "epoch": 1.9173668325312017, "grad_norm": 0.09618746489286423, "learning_rate": 4.8729367546905075e-05, "loss": 0.0581, "num_input_tokens_seen": 22454800, "step": 23505 }, { "epoch": 1.9177746961416102, "grad_norm": 0.08834615349769592, "learning_rate": 4.8728247023582076e-05, "loss": 0.1068, "num_input_tokens_seen": 22459392, "step": 23510 }, { "epoch": 1.9181825597520188, "grad_norm": 5.089341640472412, "learning_rate": 4.8727126019297587e-05, "loss": 0.4898, "num_input_tokens_seen": 22464800, "step": 23515 }, { "epoch": 1.9185904233624276, "grad_norm": 54.082462310791016, "learning_rate": 4.872600453407434e-05, "loss": 0.5163, "num_input_tokens_seen": 22469808, "step": 23520 }, { "epoch": 1.9189982869728364, "grad_norm": 0.09970656037330627, "learning_rate": 4.872488256793506e-05, "loss": 0.1069, "num_input_tokens_seen": 22475008, "step": 23525 }, { "epoch": 1.919406150583245, "grad_norm": 0.18391917645931244, "learning_rate": 4.8723760120902494e-05, "loss": 0.0249, "num_input_tokens_seen": 22480000, "step": 23530 }, { "epoch": 1.9198140141936535, "grad_norm": 0.05234270915389061, "learning_rate": 4.872263719299939e-05, "loss": 0.0372, "num_input_tokens_seen": 22483936, "step": 23535 }, { "epoch": 1.9202218778040623, "grad_norm": 7.197041988372803, "learning_rate": 4.872151378424852e-05, "loss": 0.4023, "num_input_tokens_seen": 22489248, "step": 23540 }, { "epoch": 1.920629741414471, "grad_norm": 19.897319793701172, "learning_rate": 4.872038989467263e-05, "loss": 0.2635, "num_input_tokens_seen": 22494432, "step": 23545 }, { "epoch": 1.9210376050248796, "grad_norm": 0.2355092465877533, "learning_rate": 4.8719265524294526e-05, "loss": 0.0929, "num_input_tokens_seen": 22498992, "step": 23550 }, { "epoch": 1.9214454686352882, "grad_norm": 9.943642616271973, "learning_rate": 4.871814067313699e-05, "loss": 0.611, "num_input_tokens_seen": 22502896, "step": 23555 }, { "epoch": 1.921853332245697, "grad_norm": 1.7533254623413086, "learning_rate": 4.8717015341222814e-05, "loss": 0.2492, "num_input_tokens_seen": 22507152, "step": 23560 }, { "epoch": 1.9222611958561058, "grad_norm": 0.14955131709575653, "learning_rate": 4.871588952857482e-05, "loss": 0.1037, "num_input_tokens_seen": 22512816, "step": 23565 }, { "epoch": 1.9226690594665143, "grad_norm": 2.3833236694335938, "learning_rate": 4.871476323521582e-05, "loss": 0.2139, "num_input_tokens_seen": 22518160, "step": 23570 }, { "epoch": 1.9230769230769231, "grad_norm": 0.064632847905159, "learning_rate": 4.8713636461168646e-05, "loss": 0.0459, "num_input_tokens_seen": 22522112, "step": 23575 }, { "epoch": 1.923484786687332, "grad_norm": 2.0737264156341553, "learning_rate": 4.871250920645614e-05, "loss": 0.862, "num_input_tokens_seen": 22526512, "step": 23580 }, { "epoch": 1.9238926502977405, "grad_norm": 0.02630642242729664, "learning_rate": 4.871138147110115e-05, "loss": 0.0541, "num_input_tokens_seen": 22531008, "step": 23585 }, { "epoch": 1.924300513908149, "grad_norm": 0.09627076238393784, "learning_rate": 4.8710253255126525e-05, "loss": 0.4115, "num_input_tokens_seen": 22535968, "step": 23590 }, { "epoch": 1.9247083775185578, "grad_norm": 0.2615886330604553, "learning_rate": 4.8709124558555145e-05, "loss": 0.0127, "num_input_tokens_seen": 22541568, "step": 23595 }, { "epoch": 1.9251162411289666, "grad_norm": 0.03993510454893112, "learning_rate": 4.8707995381409885e-05, "loss": 0.1099, "num_input_tokens_seen": 22546592, "step": 23600 }, { "epoch": 1.9255241047393752, "grad_norm": 0.09841050952672958, "learning_rate": 4.870686572371364e-05, "loss": 0.1291, "num_input_tokens_seen": 22551872, "step": 23605 }, { "epoch": 1.9259319683497838, "grad_norm": 1.3241647481918335, "learning_rate": 4.87057355854893e-05, "loss": 0.0203, "num_input_tokens_seen": 22556624, "step": 23610 }, { "epoch": 1.9263398319601925, "grad_norm": 0.2748284339904785, "learning_rate": 4.870460496675976e-05, "loss": 0.1248, "num_input_tokens_seen": 22561984, "step": 23615 }, { "epoch": 1.9267476955706013, "grad_norm": 0.10125359147787094, "learning_rate": 4.870347386754796e-05, "loss": 0.212, "num_input_tokens_seen": 22567168, "step": 23620 }, { "epoch": 1.92715555918101, "grad_norm": 0.24021388590335846, "learning_rate": 4.870234228787681e-05, "loss": 0.2197, "num_input_tokens_seen": 22572352, "step": 23625 }, { "epoch": 1.9275634227914185, "grad_norm": 0.6513469815254211, "learning_rate": 4.870121022776926e-05, "loss": 0.2931, "num_input_tokens_seen": 22577168, "step": 23630 }, { "epoch": 1.9279712864018272, "grad_norm": 0.03259258344769478, "learning_rate": 4.8700077687248254e-05, "loss": 0.0123, "num_input_tokens_seen": 22581312, "step": 23635 }, { "epoch": 1.928379150012236, "grad_norm": 0.5947966575622559, "learning_rate": 4.869894466633673e-05, "loss": 0.1627, "num_input_tokens_seen": 22585536, "step": 23640 }, { "epoch": 1.9287870136226446, "grad_norm": 0.8577629327774048, "learning_rate": 4.869781116505768e-05, "loss": 0.185, "num_input_tokens_seen": 22590032, "step": 23645 }, { "epoch": 1.9291948772330532, "grad_norm": 0.009982111863791943, "learning_rate": 4.8696677183434066e-05, "loss": 0.4201, "num_input_tokens_seen": 22595504, "step": 23650 }, { "epoch": 1.929602740843462, "grad_norm": 0.1766454428434372, "learning_rate": 4.869554272148887e-05, "loss": 0.0167, "num_input_tokens_seen": 22599632, "step": 23655 }, { "epoch": 1.9300106044538707, "grad_norm": 0.10045570880174637, "learning_rate": 4.86944077792451e-05, "loss": 0.4594, "num_input_tokens_seen": 22604144, "step": 23660 }, { "epoch": 1.9304184680642793, "grad_norm": 0.1256638765335083, "learning_rate": 4.869327235672574e-05, "loss": 0.3643, "num_input_tokens_seen": 22609216, "step": 23665 }, { "epoch": 1.9308263316746879, "grad_norm": 3.7162280082702637, "learning_rate": 4.869213645395383e-05, "loss": 0.0793, "num_input_tokens_seen": 22614176, "step": 23670 }, { "epoch": 1.9312341952850967, "grad_norm": 65.54222869873047, "learning_rate": 4.869100007095238e-05, "loss": 0.8198, "num_input_tokens_seen": 22618480, "step": 23675 }, { "epoch": 1.9316420588955054, "grad_norm": 0.13729417324066162, "learning_rate": 4.8689863207744414e-05, "loss": 0.8368, "num_input_tokens_seen": 22623184, "step": 23680 }, { "epoch": 1.932049922505914, "grad_norm": 0.26292353868484497, "learning_rate": 4.868872586435299e-05, "loss": 0.545, "num_input_tokens_seen": 22627472, "step": 23685 }, { "epoch": 1.9324577861163226, "grad_norm": 0.6666635274887085, "learning_rate": 4.868758804080117e-05, "loss": 0.0142, "num_input_tokens_seen": 22631872, "step": 23690 }, { "epoch": 1.9328656497267314, "grad_norm": 0.7617782354354858, "learning_rate": 4.8686449737111994e-05, "loss": 0.0063, "num_input_tokens_seen": 22637808, "step": 23695 }, { "epoch": 1.9332735133371401, "grad_norm": 6.776062965393066, "learning_rate": 4.868531095330855e-05, "loss": 0.3607, "num_input_tokens_seen": 22642896, "step": 23700 }, { "epoch": 1.9336813769475487, "grad_norm": 3.5529448986053467, "learning_rate": 4.8684171689413924e-05, "loss": 0.1597, "num_input_tokens_seen": 22646816, "step": 23705 }, { "epoch": 1.9340892405579573, "grad_norm": 0.29835134744644165, "learning_rate": 4.8683031945451196e-05, "loss": 0.0892, "num_input_tokens_seen": 22650640, "step": 23710 }, { "epoch": 1.934497104168366, "grad_norm": 2.2374041080474854, "learning_rate": 4.8681891721443475e-05, "loss": 0.1801, "num_input_tokens_seen": 22654832, "step": 23715 }, { "epoch": 1.9349049677787749, "grad_norm": 2.186007499694824, "learning_rate": 4.8680751017413875e-05, "loss": 0.3068, "num_input_tokens_seen": 22659424, "step": 23720 }, { "epoch": 1.9353128313891834, "grad_norm": 0.030481811612844467, "learning_rate": 4.867960983338551e-05, "loss": 0.0121, "num_input_tokens_seen": 22664352, "step": 23725 }, { "epoch": 1.935720694999592, "grad_norm": 119.256103515625, "learning_rate": 4.867846816938152e-05, "loss": 0.335, "num_input_tokens_seen": 22669408, "step": 23730 }, { "epoch": 1.9361285586100008, "grad_norm": 6.322437763214111, "learning_rate": 4.8677326025425036e-05, "loss": 0.2409, "num_input_tokens_seen": 22673392, "step": 23735 }, { "epoch": 1.9365364222204096, "grad_norm": 0.06606610119342804, "learning_rate": 4.8676183401539215e-05, "loss": 0.0085, "num_input_tokens_seen": 22677856, "step": 23740 }, { "epoch": 1.9369442858308181, "grad_norm": 0.49145910143852234, "learning_rate": 4.867504029774722e-05, "loss": 0.2993, "num_input_tokens_seen": 22682160, "step": 23745 }, { "epoch": 1.9373521494412267, "grad_norm": 0.13195732235908508, "learning_rate": 4.867389671407222e-05, "loss": 0.681, "num_input_tokens_seen": 22687024, "step": 23750 }, { "epoch": 1.9377600130516357, "grad_norm": 0.7496781349182129, "learning_rate": 4.867275265053739e-05, "loss": 0.3678, "num_input_tokens_seen": 22692000, "step": 23755 }, { "epoch": 1.9381678766620443, "grad_norm": 2.3350186347961426, "learning_rate": 4.867160810716592e-05, "loss": 1.6158, "num_input_tokens_seen": 22696720, "step": 23760 }, { "epoch": 1.9385757402724528, "grad_norm": 2.076051950454712, "learning_rate": 4.867046308398102e-05, "loss": 1.0159, "num_input_tokens_seen": 22701856, "step": 23765 }, { "epoch": 1.9389836038828616, "grad_norm": 0.1326284408569336, "learning_rate": 4.866931758100589e-05, "loss": 0.0096, "num_input_tokens_seen": 22707200, "step": 23770 }, { "epoch": 1.9393914674932704, "grad_norm": 0.08974526077508926, "learning_rate": 4.8668171598263745e-05, "loss": 0.0102, "num_input_tokens_seen": 22712416, "step": 23775 }, { "epoch": 1.939799331103679, "grad_norm": 40.96647644042969, "learning_rate": 4.8667025135777825e-05, "loss": 0.3487, "num_input_tokens_seen": 22717216, "step": 23780 }, { "epoch": 1.9402071947140875, "grad_norm": 0.16123954951763153, "learning_rate": 4.866587819357136e-05, "loss": 0.7313, "num_input_tokens_seen": 22721376, "step": 23785 }, { "epoch": 1.9406150583244963, "grad_norm": 0.012950468808412552, "learning_rate": 4.86647307716676e-05, "loss": 0.0542, "num_input_tokens_seen": 22726160, "step": 23790 }, { "epoch": 1.941022921934905, "grad_norm": 116.07926177978516, "learning_rate": 4.86635828700898e-05, "loss": 1.1264, "num_input_tokens_seen": 22730752, "step": 23795 }, { "epoch": 1.9414307855453137, "grad_norm": 0.2372996062040329, "learning_rate": 4.866243448886123e-05, "loss": 0.3514, "num_input_tokens_seen": 22735648, "step": 23800 }, { "epoch": 1.9418386491557222, "grad_norm": 0.005545728374272585, "learning_rate": 4.866128562800517e-05, "loss": 0.1655, "num_input_tokens_seen": 22739616, "step": 23805 }, { "epoch": 1.942246512766131, "grad_norm": 6.969454288482666, "learning_rate": 4.866013628754491e-05, "loss": 0.4522, "num_input_tokens_seen": 22744464, "step": 23810 }, { "epoch": 1.9426543763765398, "grad_norm": 24.679786682128906, "learning_rate": 4.865898646750374e-05, "loss": 0.5431, "num_input_tokens_seen": 22749504, "step": 23815 }, { "epoch": 1.9430622399869484, "grad_norm": 0.18998412787914276, "learning_rate": 4.8657836167904955e-05, "loss": 0.3367, "num_input_tokens_seen": 22755040, "step": 23820 }, { "epoch": 1.943470103597357, "grad_norm": 0.006452254951000214, "learning_rate": 4.86566853887719e-05, "loss": 0.005, "num_input_tokens_seen": 22758960, "step": 23825 }, { "epoch": 1.9438779672077657, "grad_norm": 0.11236965656280518, "learning_rate": 4.865553413012788e-05, "loss": 0.0087, "num_input_tokens_seen": 22762896, "step": 23830 }, { "epoch": 1.9442858308181745, "grad_norm": 0.3201749324798584, "learning_rate": 4.8654382391996227e-05, "loss": 0.4343, "num_input_tokens_seen": 22767488, "step": 23835 }, { "epoch": 1.944693694428583, "grad_norm": 0.0635901615023613, "learning_rate": 4.86532301744003e-05, "loss": 0.2258, "num_input_tokens_seen": 22772336, "step": 23840 }, { "epoch": 1.9451015580389917, "grad_norm": 2.397834300994873, "learning_rate": 4.865207747736345e-05, "loss": 0.1746, "num_input_tokens_seen": 22776656, "step": 23845 }, { "epoch": 1.9455094216494004, "grad_norm": 0.15072184801101685, "learning_rate": 4.865092430090904e-05, "loss": 0.3381, "num_input_tokens_seen": 22781376, "step": 23850 }, { "epoch": 1.9459172852598092, "grad_norm": 0.19558879733085632, "learning_rate": 4.8649770645060436e-05, "loss": 0.4973, "num_input_tokens_seen": 22786912, "step": 23855 }, { "epoch": 1.9463251488702178, "grad_norm": 0.20053589344024658, "learning_rate": 4.864861650984104e-05, "loss": 0.2724, "num_input_tokens_seen": 22791808, "step": 23860 }, { "epoch": 1.9467330124806264, "grad_norm": 0.3739273250102997, "learning_rate": 4.864746189527423e-05, "loss": 0.3295, "num_input_tokens_seen": 22796768, "step": 23865 }, { "epoch": 1.9471408760910351, "grad_norm": 0.2658173739910126, "learning_rate": 4.8646306801383426e-05, "loss": 0.1409, "num_input_tokens_seen": 22800944, "step": 23870 }, { "epoch": 1.947548739701444, "grad_norm": 0.01930372230708599, "learning_rate": 4.8645151228192023e-05, "loss": 0.3193, "num_input_tokens_seen": 22806048, "step": 23875 }, { "epoch": 1.9479566033118525, "grad_norm": 1.8206514120101929, "learning_rate": 4.864399517572345e-05, "loss": 0.2128, "num_input_tokens_seen": 22811200, "step": 23880 }, { "epoch": 1.948364466922261, "grad_norm": 0.05479765310883522, "learning_rate": 4.864283864400114e-05, "loss": 0.184, "num_input_tokens_seen": 22815936, "step": 23885 }, { "epoch": 1.9487723305326698, "grad_norm": 1.59300696849823, "learning_rate": 4.8641681633048545e-05, "loss": 0.1179, "num_input_tokens_seen": 22821200, "step": 23890 }, { "epoch": 1.9491801941430786, "grad_norm": 0.09040895849466324, "learning_rate": 4.864052414288911e-05, "loss": 0.191, "num_input_tokens_seen": 22825872, "step": 23895 }, { "epoch": 1.9495880577534872, "grad_norm": 0.1648513823747635, "learning_rate": 4.8639366173546284e-05, "loss": 0.0708, "num_input_tokens_seen": 22830624, "step": 23900 }, { "epoch": 1.9499959213638958, "grad_norm": 0.005762571934610605, "learning_rate": 4.8638207725043564e-05, "loss": 0.0421, "num_input_tokens_seen": 22836032, "step": 23905 }, { "epoch": 1.9504037849743046, "grad_norm": 11.16469955444336, "learning_rate": 4.863704879740442e-05, "loss": 0.0906, "num_input_tokens_seen": 22840096, "step": 23910 }, { "epoch": 1.9508116485847133, "grad_norm": 7.4564385414123535, "learning_rate": 4.863588939065232e-05, "loss": 0.3638, "num_input_tokens_seen": 22844368, "step": 23915 }, { "epoch": 1.951219512195122, "grad_norm": 0.11427345126867294, "learning_rate": 4.86347295048108e-05, "loss": 0.2221, "num_input_tokens_seen": 22849024, "step": 23920 }, { "epoch": 1.9516273758055305, "grad_norm": 0.28492268919944763, "learning_rate": 4.863356913990336e-05, "loss": 0.0181, "num_input_tokens_seen": 22854240, "step": 23925 }, { "epoch": 1.9520352394159393, "grad_norm": 169.6485595703125, "learning_rate": 4.8632408295953516e-05, "loss": 0.1656, "num_input_tokens_seen": 22859344, "step": 23930 }, { "epoch": 1.952443103026348, "grad_norm": 0.3745265603065491, "learning_rate": 4.863124697298479e-05, "loss": 0.5272, "num_input_tokens_seen": 22864304, "step": 23935 }, { "epoch": 1.9528509666367566, "grad_norm": 0.27107396721839905, "learning_rate": 4.863008517102073e-05, "loss": 0.0142, "num_input_tokens_seen": 22870128, "step": 23940 }, { "epoch": 1.9532588302471654, "grad_norm": 4.861766338348389, "learning_rate": 4.8628922890084886e-05, "loss": 0.1835, "num_input_tokens_seen": 22875040, "step": 23945 }, { "epoch": 1.9536666938575742, "grad_norm": 2.3147783279418945, "learning_rate": 4.8627760130200815e-05, "loss": 0.3387, "num_input_tokens_seen": 22880096, "step": 23950 }, { "epoch": 1.9540745574679828, "grad_norm": 4.077676296234131, "learning_rate": 4.862659689139209e-05, "loss": 0.1707, "num_input_tokens_seen": 22884816, "step": 23955 }, { "epoch": 1.9544824210783913, "grad_norm": 4.933477878570557, "learning_rate": 4.8625433173682284e-05, "loss": 0.1213, "num_input_tokens_seen": 22889200, "step": 23960 }, { "epoch": 1.9548902846888, "grad_norm": 0.08555510640144348, "learning_rate": 4.8624268977094985e-05, "loss": 0.0902, "num_input_tokens_seen": 22894176, "step": 23965 }, { "epoch": 1.955298148299209, "grad_norm": 0.13391029834747314, "learning_rate": 4.862310430165379e-05, "loss": 0.1729, "num_input_tokens_seen": 22898576, "step": 23970 }, { "epoch": 1.9557060119096175, "grad_norm": 0.022868376225233078, "learning_rate": 4.862193914738231e-05, "loss": 0.0101, "num_input_tokens_seen": 22903616, "step": 23975 }, { "epoch": 1.956113875520026, "grad_norm": 0.041733864694833755, "learning_rate": 4.862077351430417e-05, "loss": 0.0069, "num_input_tokens_seen": 22908832, "step": 23980 }, { "epoch": 1.9565217391304348, "grad_norm": 6.654261589050293, "learning_rate": 4.861960740244298e-05, "loss": 0.2952, "num_input_tokens_seen": 22913616, "step": 23985 }, { "epoch": 1.9569296027408436, "grad_norm": 0.15132401883602142, "learning_rate": 4.861844081182239e-05, "loss": 0.1492, "num_input_tokens_seen": 22918640, "step": 23990 }, { "epoch": 1.9573374663512522, "grad_norm": 0.02079053409397602, "learning_rate": 4.861727374246603e-05, "loss": 0.063, "num_input_tokens_seen": 22923904, "step": 23995 }, { "epoch": 1.9577453299616607, "grad_norm": 0.10535218566656113, "learning_rate": 4.861610619439757e-05, "loss": 0.0286, "num_input_tokens_seen": 22929136, "step": 24000 }, { "epoch": 1.9581531935720695, "grad_norm": 0.01677497662603855, "learning_rate": 4.861493816764068e-05, "loss": 0.0062, "num_input_tokens_seen": 22933936, "step": 24005 }, { "epoch": 1.9585610571824783, "grad_norm": 0.06013372540473938, "learning_rate": 4.861376966221902e-05, "loss": 0.0433, "num_input_tokens_seen": 22937568, "step": 24010 }, { "epoch": 1.9589689207928869, "grad_norm": 0.2119968682527542, "learning_rate": 4.861260067815629e-05, "loss": 0.3496, "num_input_tokens_seen": 22941584, "step": 24015 }, { "epoch": 1.9593767844032954, "grad_norm": 3.8660225868225098, "learning_rate": 4.8611431215476174e-05, "loss": 0.6012, "num_input_tokens_seen": 22946272, "step": 24020 }, { "epoch": 1.9597846480137042, "grad_norm": 0.024436170235276222, "learning_rate": 4.8610261274202384e-05, "loss": 0.1881, "num_input_tokens_seen": 22950752, "step": 24025 }, { "epoch": 1.960192511624113, "grad_norm": 0.0883982703089714, "learning_rate": 4.8609090854358625e-05, "loss": 0.1256, "num_input_tokens_seen": 22955568, "step": 24030 }, { "epoch": 1.9606003752345216, "grad_norm": 13.554927825927734, "learning_rate": 4.8607919955968626e-05, "loss": 0.2558, "num_input_tokens_seen": 22960336, "step": 24035 }, { "epoch": 1.9610082388449301, "grad_norm": 0.10345199704170227, "learning_rate": 4.8606748579056125e-05, "loss": 0.1871, "num_input_tokens_seen": 22964880, "step": 24040 }, { "epoch": 1.961416102455339, "grad_norm": 0.4930868446826935, "learning_rate": 4.860557672364486e-05, "loss": 0.1411, "num_input_tokens_seen": 22970080, "step": 24045 }, { "epoch": 1.9618239660657477, "grad_norm": 0.029057186096906662, "learning_rate": 4.860440438975859e-05, "loss": 0.3115, "num_input_tokens_seen": 22974528, "step": 24050 }, { "epoch": 1.9622318296761563, "grad_norm": 0.022444846108555794, "learning_rate": 4.860323157742107e-05, "loss": 0.184, "num_input_tokens_seen": 22979984, "step": 24055 }, { "epoch": 1.9626396932865648, "grad_norm": 11.869709968566895, "learning_rate": 4.860205828665607e-05, "loss": 0.47, "num_input_tokens_seen": 22984960, "step": 24060 }, { "epoch": 1.9630475568969736, "grad_norm": 0.29659736156463623, "learning_rate": 4.860088451748739e-05, "loss": 0.1673, "num_input_tokens_seen": 22990432, "step": 24065 }, { "epoch": 1.9634554205073824, "grad_norm": 0.14153200387954712, "learning_rate": 4.85997102699388e-05, "loss": 0.1341, "num_input_tokens_seen": 22995616, "step": 24070 }, { "epoch": 1.963863284117791, "grad_norm": 0.0416330024600029, "learning_rate": 4.8598535544034105e-05, "loss": 0.1424, "num_input_tokens_seen": 23000496, "step": 24075 }, { "epoch": 1.9642711477281996, "grad_norm": 0.8031759262084961, "learning_rate": 4.859736033979714e-05, "loss": 0.3029, "num_input_tokens_seen": 23005376, "step": 24080 }, { "epoch": 1.9646790113386083, "grad_norm": 0.05739419907331467, "learning_rate": 4.8596184657251695e-05, "loss": 0.0179, "num_input_tokens_seen": 23010656, "step": 24085 }, { "epoch": 1.9650868749490171, "grad_norm": 0.18506549298763275, "learning_rate": 4.859500849642161e-05, "loss": 0.047, "num_input_tokens_seen": 23015568, "step": 24090 }, { "epoch": 1.9654947385594257, "grad_norm": 0.06227894499897957, "learning_rate": 4.859383185733074e-05, "loss": 0.1366, "num_input_tokens_seen": 23020576, "step": 24095 }, { "epoch": 1.9659026021698343, "grad_norm": 0.26081064343452454, "learning_rate": 4.859265474000292e-05, "loss": 0.0453, "num_input_tokens_seen": 23024624, "step": 24100 }, { "epoch": 1.966310465780243, "grad_norm": 0.03548193722963333, "learning_rate": 4.859147714446201e-05, "loss": 0.3769, "num_input_tokens_seen": 23028736, "step": 24105 }, { "epoch": 1.9667183293906518, "grad_norm": 0.05278749018907547, "learning_rate": 4.8590299070731885e-05, "loss": 0.2565, "num_input_tokens_seen": 23033184, "step": 24110 }, { "epoch": 1.9671261930010604, "grad_norm": 9.12101936340332, "learning_rate": 4.8589120518836425e-05, "loss": 0.0801, "num_input_tokens_seen": 23037840, "step": 24115 }, { "epoch": 1.967534056611469, "grad_norm": 0.007626437582075596, "learning_rate": 4.8587941488799514e-05, "loss": 0.0106, "num_input_tokens_seen": 23043040, "step": 24120 }, { "epoch": 1.9679419202218777, "grad_norm": 0.10586193203926086, "learning_rate": 4.858676198064505e-05, "loss": 0.0956, "num_input_tokens_seen": 23047376, "step": 24125 }, { "epoch": 1.9683497838322865, "grad_norm": 0.023585820570588112, "learning_rate": 4.858558199439695e-05, "loss": 0.0056, "num_input_tokens_seen": 23053264, "step": 24130 }, { "epoch": 1.968757647442695, "grad_norm": 0.023519091308116913, "learning_rate": 4.858440153007912e-05, "loss": 0.1567, "num_input_tokens_seen": 23057984, "step": 24135 }, { "epoch": 1.969165511053104, "grad_norm": 0.07189549505710602, "learning_rate": 4.8583220587715485e-05, "loss": 0.0568, "num_input_tokens_seen": 23063056, "step": 24140 }, { "epoch": 1.9695733746635127, "grad_norm": 0.03150227293372154, "learning_rate": 4.858203916733e-05, "loss": 0.19, "num_input_tokens_seen": 23067920, "step": 24145 }, { "epoch": 1.9699812382739212, "grad_norm": 0.07991315424442291, "learning_rate": 4.85808572689466e-05, "loss": 0.0061, "num_input_tokens_seen": 23072416, "step": 24150 }, { "epoch": 1.9703891018843298, "grad_norm": 0.6321737766265869, "learning_rate": 4.8579674892589246e-05, "loss": 0.0107, "num_input_tokens_seen": 23076736, "step": 24155 }, { "epoch": 1.9707969654947386, "grad_norm": 0.011570381931960583, "learning_rate": 4.857849203828189e-05, "loss": 0.0091, "num_input_tokens_seen": 23082064, "step": 24160 }, { "epoch": 1.9712048291051474, "grad_norm": 0.14579978585243225, "learning_rate": 4.857730870604853e-05, "loss": 0.035, "num_input_tokens_seen": 23086544, "step": 24165 }, { "epoch": 1.971612692715556, "grad_norm": 0.03324228525161743, "learning_rate": 4.857612489591314e-05, "loss": 0.0063, "num_input_tokens_seen": 23091376, "step": 24170 }, { "epoch": 1.9720205563259645, "grad_norm": 14.60269546508789, "learning_rate": 4.857494060789971e-05, "loss": 0.2807, "num_input_tokens_seen": 23095712, "step": 24175 }, { "epoch": 1.9724284199363733, "grad_norm": 0.023919159546494484, "learning_rate": 4.857375584203226e-05, "loss": 0.2659, "num_input_tokens_seen": 23100704, "step": 24180 }, { "epoch": 1.972836283546782, "grad_norm": 0.006513310596346855, "learning_rate": 4.857257059833479e-05, "loss": 0.1551, "num_input_tokens_seen": 23105664, "step": 24185 }, { "epoch": 1.9732441471571907, "grad_norm": 11.923405647277832, "learning_rate": 4.857138487683134e-05, "loss": 0.9253, "num_input_tokens_seen": 23109872, "step": 24190 }, { "epoch": 1.9736520107675992, "grad_norm": 0.027699535712599754, "learning_rate": 4.857019867754592e-05, "loss": 0.0535, "num_input_tokens_seen": 23114800, "step": 24195 }, { "epoch": 1.974059874378008, "grad_norm": 2.6681573390960693, "learning_rate": 4.8569012000502604e-05, "loss": 0.3027, "num_input_tokens_seen": 23119792, "step": 24200 }, { "epoch": 1.9744677379884168, "grad_norm": 20.45591926574707, "learning_rate": 4.856782484572542e-05, "loss": 0.0532, "num_input_tokens_seen": 23124416, "step": 24205 }, { "epoch": 1.9748756015988254, "grad_norm": 4.246004104614258, "learning_rate": 4.856663721323844e-05, "loss": 0.4239, "num_input_tokens_seen": 23129056, "step": 24210 }, { "epoch": 1.975283465209234, "grad_norm": 0.06370952725410461, "learning_rate": 4.8565449103065744e-05, "loss": 0.08, "num_input_tokens_seen": 23134416, "step": 24215 }, { "epoch": 1.9756913288196427, "grad_norm": 0.07158427685499191, "learning_rate": 4.85642605152314e-05, "loss": 0.0212, "num_input_tokens_seen": 23138832, "step": 24220 }, { "epoch": 1.9760991924300515, "grad_norm": 0.867659330368042, "learning_rate": 4.8563071449759516e-05, "loss": 0.0752, "num_input_tokens_seen": 23143472, "step": 24225 }, { "epoch": 1.97650705604046, "grad_norm": 3.0353317260742188, "learning_rate": 4.8561881906674185e-05, "loss": 0.535, "num_input_tokens_seen": 23148784, "step": 24230 }, { "epoch": 1.9769149196508686, "grad_norm": 1.4016257524490356, "learning_rate": 4.856069188599952e-05, "loss": 0.1592, "num_input_tokens_seen": 23153696, "step": 24235 }, { "epoch": 1.9773227832612774, "grad_norm": 2.201167583465576, "learning_rate": 4.855950138775964e-05, "loss": 0.6096, "num_input_tokens_seen": 23157760, "step": 24240 }, { "epoch": 1.9777306468716862, "grad_norm": 2.8572685718536377, "learning_rate": 4.855831041197867e-05, "loss": 0.3272, "num_input_tokens_seen": 23162224, "step": 24245 }, { "epoch": 1.9781385104820948, "grad_norm": 0.17511793971061707, "learning_rate": 4.855711895868077e-05, "loss": 0.2589, "num_input_tokens_seen": 23167552, "step": 24250 }, { "epoch": 1.9785463740925033, "grad_norm": 0.13714852929115295, "learning_rate": 4.855592702789008e-05, "loss": 0.1803, "num_input_tokens_seen": 23171440, "step": 24255 }, { "epoch": 1.9789542377029121, "grad_norm": 5.185002326965332, "learning_rate": 4.8554734619630755e-05, "loss": 0.2805, "num_input_tokens_seen": 23176848, "step": 24260 }, { "epoch": 1.979362101313321, "grad_norm": 2.7738447189331055, "learning_rate": 4.8553541733926965e-05, "loss": 0.3052, "num_input_tokens_seen": 23182544, "step": 24265 }, { "epoch": 1.9797699649237295, "grad_norm": 0.4325920343399048, "learning_rate": 4.855234837080289e-05, "loss": 0.1656, "num_input_tokens_seen": 23187296, "step": 24270 }, { "epoch": 1.980177828534138, "grad_norm": 0.4356032907962799, "learning_rate": 4.855115453028273e-05, "loss": 0.195, "num_input_tokens_seen": 23192560, "step": 24275 }, { "epoch": 1.9805856921445468, "grad_norm": 0.6716234683990479, "learning_rate": 4.854996021239068e-05, "loss": 0.0716, "num_input_tokens_seen": 23196864, "step": 24280 }, { "epoch": 1.9809935557549556, "grad_norm": 0.49525904655456543, "learning_rate": 4.854876541715093e-05, "loss": 0.118, "num_input_tokens_seen": 23201680, "step": 24285 }, { "epoch": 1.9814014193653642, "grad_norm": 0.12152167409658432, "learning_rate": 4.854757014458772e-05, "loss": 0.0338, "num_input_tokens_seen": 23206416, "step": 24290 }, { "epoch": 1.9818092829757727, "grad_norm": 7.012792587280273, "learning_rate": 4.854637439472526e-05, "loss": 0.3636, "num_input_tokens_seen": 23210736, "step": 24295 }, { "epoch": 1.9822171465861815, "grad_norm": 0.9777708053588867, "learning_rate": 4.8545178167587804e-05, "loss": 0.1309, "num_input_tokens_seen": 23214944, "step": 24300 }, { "epoch": 1.9826250101965903, "grad_norm": 3.4536166191101074, "learning_rate": 4.854398146319959e-05, "loss": 0.1284, "num_input_tokens_seen": 23218992, "step": 24305 }, { "epoch": 1.9830328738069989, "grad_norm": 0.09226930886507034, "learning_rate": 4.854278428158488e-05, "loss": 0.246, "num_input_tokens_seen": 23223824, "step": 24310 }, { "epoch": 1.9834407374174077, "grad_norm": 0.06183145195245743, "learning_rate": 4.8541586622767935e-05, "loss": 0.0999, "num_input_tokens_seen": 23227936, "step": 24315 }, { "epoch": 1.9838486010278165, "grad_norm": 2.586249589920044, "learning_rate": 4.8540388486773024e-05, "loss": 0.076, "num_input_tokens_seen": 23233152, "step": 24320 }, { "epoch": 1.984256464638225, "grad_norm": 0.058588750660419464, "learning_rate": 4.853918987362445e-05, "loss": 0.0727, "num_input_tokens_seen": 23238384, "step": 24325 }, { "epoch": 1.9846643282486336, "grad_norm": 0.09374522417783737, "learning_rate": 4.853799078334649e-05, "loss": 0.0134, "num_input_tokens_seen": 23243008, "step": 24330 }, { "epoch": 1.9850721918590424, "grad_norm": 0.1587478220462799, "learning_rate": 4.853679121596347e-05, "loss": 0.1624, "num_input_tokens_seen": 23247648, "step": 24335 }, { "epoch": 1.9854800554694512, "grad_norm": 2.490100145339966, "learning_rate": 4.853559117149969e-05, "loss": 0.1715, "num_input_tokens_seen": 23251936, "step": 24340 }, { "epoch": 1.9858879190798597, "grad_norm": 13.968510627746582, "learning_rate": 4.853439064997948e-05, "loss": 0.2117, "num_input_tokens_seen": 23257280, "step": 24345 }, { "epoch": 1.9862957826902683, "grad_norm": 0.11087019741535187, "learning_rate": 4.853318965142717e-05, "loss": 0.0471, "num_input_tokens_seen": 23261984, "step": 24350 }, { "epoch": 1.986703646300677, "grad_norm": 6.928555488586426, "learning_rate": 4.85319881758671e-05, "loss": 0.2584, "num_input_tokens_seen": 23266192, "step": 24355 }, { "epoch": 1.9871115099110859, "grad_norm": 9.934289932250977, "learning_rate": 4.853078622332363e-05, "loss": 0.2409, "num_input_tokens_seen": 23270768, "step": 24360 }, { "epoch": 1.9875193735214944, "grad_norm": 0.06670035421848297, "learning_rate": 4.852958379382113e-05, "loss": 0.226, "num_input_tokens_seen": 23275648, "step": 24365 }, { "epoch": 1.987927237131903, "grad_norm": 22.775596618652344, "learning_rate": 4.8528380887383963e-05, "loss": 0.132, "num_input_tokens_seen": 23280576, "step": 24370 }, { "epoch": 1.9883351007423118, "grad_norm": 0.14651110768318176, "learning_rate": 4.8527177504036505e-05, "loss": 0.1103, "num_input_tokens_seen": 23284528, "step": 24375 }, { "epoch": 1.9887429643527206, "grad_norm": 0.07816580682992935, "learning_rate": 4.852597364380317e-05, "loss": 0.1017, "num_input_tokens_seen": 23289600, "step": 24380 }, { "epoch": 1.9891508279631291, "grad_norm": 3.671011209487915, "learning_rate": 4.852476930670833e-05, "loss": 0.288, "num_input_tokens_seen": 23294512, "step": 24385 }, { "epoch": 1.9895586915735377, "grad_norm": 43.74068069458008, "learning_rate": 4.8523564492776425e-05, "loss": 0.3615, "num_input_tokens_seen": 23299184, "step": 24390 }, { "epoch": 1.9899665551839465, "grad_norm": 0.8332518339157104, "learning_rate": 4.852235920203186e-05, "loss": 0.0184, "num_input_tokens_seen": 23304368, "step": 24395 }, { "epoch": 1.9903744187943553, "grad_norm": 0.09226670861244202, "learning_rate": 4.8521153434499066e-05, "loss": 0.0123, "num_input_tokens_seen": 23308608, "step": 24400 }, { "epoch": 1.9907822824047638, "grad_norm": 6.771517276763916, "learning_rate": 4.8519947190202486e-05, "loss": 0.2154, "num_input_tokens_seen": 23313056, "step": 24405 }, { "epoch": 1.9911901460151724, "grad_norm": 0.291431725025177, "learning_rate": 4.851874046916658e-05, "loss": 0.4324, "num_input_tokens_seen": 23317648, "step": 24410 }, { "epoch": 1.9915980096255812, "grad_norm": 11.448836326599121, "learning_rate": 4.8517533271415785e-05, "loss": 0.0578, "num_input_tokens_seen": 23322432, "step": 24415 }, { "epoch": 1.99200587323599, "grad_norm": 0.14007696509361267, "learning_rate": 4.8516325596974594e-05, "loss": 0.1215, "num_input_tokens_seen": 23327248, "step": 24420 }, { "epoch": 1.9924137368463986, "grad_norm": 5.167911052703857, "learning_rate": 4.8515117445867476e-05, "loss": 0.1921, "num_input_tokens_seen": 23332352, "step": 24425 }, { "epoch": 1.9928216004568071, "grad_norm": 0.3749054968357086, "learning_rate": 4.8513908818118913e-05, "loss": 0.0793, "num_input_tokens_seen": 23336064, "step": 24430 }, { "epoch": 1.993229464067216, "grad_norm": 0.4746359586715698, "learning_rate": 4.8512699713753415e-05, "loss": 0.1864, "num_input_tokens_seen": 23341264, "step": 24435 }, { "epoch": 1.9936373276776247, "grad_norm": 0.06856846064329147, "learning_rate": 4.851149013279548e-05, "loss": 0.1774, "num_input_tokens_seen": 23345744, "step": 24440 }, { "epoch": 1.9940451912880333, "grad_norm": 9.547185897827148, "learning_rate": 4.851028007526963e-05, "loss": 0.4623, "num_input_tokens_seen": 23350016, "step": 24445 }, { "epoch": 1.9944530548984418, "grad_norm": 3.5996592044830322, "learning_rate": 4.85090695412004e-05, "loss": 0.2834, "num_input_tokens_seen": 23354432, "step": 24450 }, { "epoch": 1.9948609185088506, "grad_norm": 271.8769226074219, "learning_rate": 4.850785853061231e-05, "loss": 0.0961, "num_input_tokens_seen": 23359440, "step": 24455 }, { "epoch": 1.9952687821192594, "grad_norm": 0.03222360461950302, "learning_rate": 4.850664704352993e-05, "loss": 0.0162, "num_input_tokens_seen": 23363536, "step": 24460 }, { "epoch": 1.995676645729668, "grad_norm": 5.7367472648620605, "learning_rate": 4.85054350799778e-05, "loss": 0.0457, "num_input_tokens_seen": 23367968, "step": 24465 }, { "epoch": 1.9960845093400765, "grad_norm": 0.03155086934566498, "learning_rate": 4.8504222639980476e-05, "loss": 0.2813, "num_input_tokens_seen": 23372336, "step": 24470 }, { "epoch": 1.9964923729504853, "grad_norm": 4.706175327301025, "learning_rate": 4.8503009723562565e-05, "loss": 0.2061, "num_input_tokens_seen": 23376800, "step": 24475 }, { "epoch": 1.996900236560894, "grad_norm": 0.027055934071540833, "learning_rate": 4.8501796330748625e-05, "loss": 0.1321, "num_input_tokens_seen": 23381344, "step": 24480 }, { "epoch": 1.9973081001713027, "grad_norm": 1.810434103012085, "learning_rate": 4.850058246156326e-05, "loss": 0.0279, "num_input_tokens_seen": 23386304, "step": 24485 }, { "epoch": 1.9977159637817112, "grad_norm": 1.2804923057556152, "learning_rate": 4.849936811603108e-05, "loss": 0.0122, "num_input_tokens_seen": 23390464, "step": 24490 }, { "epoch": 1.99812382739212, "grad_norm": 0.09343018382787704, "learning_rate": 4.849815329417668e-05, "loss": 0.0714, "num_input_tokens_seen": 23395504, "step": 24495 }, { "epoch": 1.9985316910025288, "grad_norm": 0.09900557994842529, "learning_rate": 4.84969379960247e-05, "loss": 0.3377, "num_input_tokens_seen": 23400576, "step": 24500 }, { "epoch": 1.9989395546129374, "grad_norm": 12.981274604797363, "learning_rate": 4.849572222159978e-05, "loss": 0.1289, "num_input_tokens_seen": 23405664, "step": 24505 }, { "epoch": 1.9993474182233462, "grad_norm": 24.74155044555664, "learning_rate": 4.8494505970926554e-05, "loss": 0.6645, "num_input_tokens_seen": 23410640, "step": 24510 }, { "epoch": 1.999755281833755, "grad_norm": 2.826537609100342, "learning_rate": 4.849328924402967e-05, "loss": 0.6581, "num_input_tokens_seen": 23415536, "step": 24515 }, { "epoch": 2.0001631454441635, "grad_norm": 71.74378204345703, "learning_rate": 4.84920720409338e-05, "loss": 0.2324, "num_input_tokens_seen": 23419920, "step": 24520 }, { "epoch": 2.0001631454441635, "eval_loss": 0.3016117513179779, "eval_runtime": 571.0289, "eval_samples_per_second": 4.772, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 23419920, "step": 24520 }, { "epoch": 2.000571009054572, "grad_norm": 50.7149658203125, "learning_rate": 4.8490854361663615e-05, "loss": 0.7849, "num_input_tokens_seen": 23423968, "step": 24525 }, { "epoch": 2.0009788726649806, "grad_norm": 30.0407772064209, "learning_rate": 4.848963620624379e-05, "loss": 0.6174, "num_input_tokens_seen": 23429520, "step": 24530 }, { "epoch": 2.0013867362753897, "grad_norm": 35.657958984375, "learning_rate": 4.848841757469902e-05, "loss": 0.8996, "num_input_tokens_seen": 23433280, "step": 24535 }, { "epoch": 2.001794599885798, "grad_norm": 64.884033203125, "learning_rate": 4.848719846705401e-05, "loss": 0.6907, "num_input_tokens_seen": 23438224, "step": 24540 }, { "epoch": 2.002202463496207, "grad_norm": 98.48120880126953, "learning_rate": 4.848597888333346e-05, "loss": 0.6528, "num_input_tokens_seen": 23443312, "step": 24545 }, { "epoch": 2.0026103271066154, "grad_norm": 5.259228706359863, "learning_rate": 4.8484758823562104e-05, "loss": 0.1963, "num_input_tokens_seen": 23447984, "step": 24550 }, { "epoch": 2.0030181907170244, "grad_norm": 49.706024169921875, "learning_rate": 4.848353828776466e-05, "loss": 0.3156, "num_input_tokens_seen": 23452192, "step": 24555 }, { "epoch": 2.003426054327433, "grad_norm": 25.563566207885742, "learning_rate": 4.8482317275965884e-05, "loss": 0.5629, "num_input_tokens_seen": 23457120, "step": 24560 }, { "epoch": 2.0038339179378415, "grad_norm": 4.5043768882751465, "learning_rate": 4.848109578819051e-05, "loss": 0.0897, "num_input_tokens_seen": 23460992, "step": 24565 }, { "epoch": 2.00424178154825, "grad_norm": 35.138023376464844, "learning_rate": 4.8479873824463296e-05, "loss": 0.1631, "num_input_tokens_seen": 23465616, "step": 24570 }, { "epoch": 2.004649645158659, "grad_norm": 0.5076770186424255, "learning_rate": 4.8478651384809026e-05, "loss": 0.2195, "num_input_tokens_seen": 23470304, "step": 24575 }, { "epoch": 2.0050575087690676, "grad_norm": 0.07958316802978516, "learning_rate": 4.847742846925246e-05, "loss": 0.0199, "num_input_tokens_seen": 23474464, "step": 24580 }, { "epoch": 2.005465372379476, "grad_norm": 22.6844539642334, "learning_rate": 4.84762050778184e-05, "loss": 0.4298, "num_input_tokens_seen": 23478608, "step": 24585 }, { "epoch": 2.0058732359898848, "grad_norm": 7.408966541290283, "learning_rate": 4.847498121053164e-05, "loss": 0.2848, "num_input_tokens_seen": 23482976, "step": 24590 }, { "epoch": 2.0062810996002938, "grad_norm": 7.74925422668457, "learning_rate": 4.847375686741699e-05, "loss": 1.4093, "num_input_tokens_seen": 23488112, "step": 24595 }, { "epoch": 2.0066889632107023, "grad_norm": 15.565169334411621, "learning_rate": 4.847253204849925e-05, "loss": 0.3128, "num_input_tokens_seen": 23493136, "step": 24600 }, { "epoch": 2.007096826821111, "grad_norm": 9.207194328308105, "learning_rate": 4.847130675380327e-05, "loss": 1.2181, "num_input_tokens_seen": 23497536, "step": 24605 }, { "epoch": 2.00750469043152, "grad_norm": 6.939385890960693, "learning_rate": 4.8470080983353875e-05, "loss": 0.6788, "num_input_tokens_seen": 23503056, "step": 24610 }, { "epoch": 2.0079125540419285, "grad_norm": 34.81532669067383, "learning_rate": 4.8468854737175905e-05, "loss": 0.3067, "num_input_tokens_seen": 23507856, "step": 24615 }, { "epoch": 2.008320417652337, "grad_norm": 49.84111404418945, "learning_rate": 4.846762801529423e-05, "loss": 0.2133, "num_input_tokens_seen": 23512976, "step": 24620 }, { "epoch": 2.0087282812627456, "grad_norm": 8.949475288391113, "learning_rate": 4.8466400817733706e-05, "loss": 0.5072, "num_input_tokens_seen": 23517680, "step": 24625 }, { "epoch": 2.0091361448731546, "grad_norm": 27.127246856689453, "learning_rate": 4.846517314451921e-05, "loss": 0.5111, "num_input_tokens_seen": 23521872, "step": 24630 }, { "epoch": 2.009544008483563, "grad_norm": 21.631763458251953, "learning_rate": 4.846394499567562e-05, "loss": 0.5053, "num_input_tokens_seen": 23526960, "step": 24635 }, { "epoch": 2.0099518720939717, "grad_norm": 3.7834372520446777, "learning_rate": 4.846271637122783e-05, "loss": 0.2722, "num_input_tokens_seen": 23531504, "step": 24640 }, { "epoch": 2.0103597357043803, "grad_norm": 0.9931376576423645, "learning_rate": 4.8461487271200754e-05, "loss": 0.3122, "num_input_tokens_seen": 23536608, "step": 24645 }, { "epoch": 2.0107675993147893, "grad_norm": 3.2926042079925537, "learning_rate": 4.8460257695619306e-05, "loss": 0.0833, "num_input_tokens_seen": 23541552, "step": 24650 }, { "epoch": 2.011175462925198, "grad_norm": 0.22169730067253113, "learning_rate": 4.84590276445084e-05, "loss": 0.0799, "num_input_tokens_seen": 23546448, "step": 24655 }, { "epoch": 2.0115833265356065, "grad_norm": 0.10045983642339706, "learning_rate": 4.845779711789297e-05, "loss": 0.0048, "num_input_tokens_seen": 23551216, "step": 24660 }, { "epoch": 2.011991190146015, "grad_norm": 0.026242608204483986, "learning_rate": 4.845656611579796e-05, "loss": 0.0067, "num_input_tokens_seen": 23555344, "step": 24665 }, { "epoch": 2.012399053756424, "grad_norm": 0.04278578236699104, "learning_rate": 4.845533463824833e-05, "loss": 0.3744, "num_input_tokens_seen": 23559840, "step": 24670 }, { "epoch": 2.0128069173668326, "grad_norm": 0.6601791381835938, "learning_rate": 4.845410268526902e-05, "loss": 0.1887, "num_input_tokens_seen": 23564096, "step": 24675 }, { "epoch": 2.013214780977241, "grad_norm": 0.1045774519443512, "learning_rate": 4.845287025688503e-05, "loss": 0.1988, "num_input_tokens_seen": 23568944, "step": 24680 }, { "epoch": 2.0136226445876497, "grad_norm": 0.10675550997257233, "learning_rate": 4.845163735312132e-05, "loss": 0.0866, "num_input_tokens_seen": 23574656, "step": 24685 }, { "epoch": 2.0140305081980587, "grad_norm": 0.08320948481559753, "learning_rate": 4.8450403974002875e-05, "loss": 0.0694, "num_input_tokens_seen": 23579296, "step": 24690 }, { "epoch": 2.0144383718084673, "grad_norm": 0.15131737291812897, "learning_rate": 4.844917011955472e-05, "loss": 0.3452, "num_input_tokens_seen": 23584176, "step": 24695 }, { "epoch": 2.014846235418876, "grad_norm": 0.056040722876787186, "learning_rate": 4.844793578980185e-05, "loss": 0.3105, "num_input_tokens_seen": 23588816, "step": 24700 }, { "epoch": 2.0152540990292844, "grad_norm": 0.716018795967102, "learning_rate": 4.844670098476928e-05, "loss": 0.0384, "num_input_tokens_seen": 23593632, "step": 24705 }, { "epoch": 2.0156619626396934, "grad_norm": 0.23043487966060638, "learning_rate": 4.844546570448204e-05, "loss": 0.1402, "num_input_tokens_seen": 23598400, "step": 24710 }, { "epoch": 2.016069826250102, "grad_norm": 0.4348513185977936, "learning_rate": 4.844422994896518e-05, "loss": 0.2407, "num_input_tokens_seen": 23602688, "step": 24715 }, { "epoch": 2.0164776898605106, "grad_norm": 141.3370361328125, "learning_rate": 4.844299371824374e-05, "loss": 0.2051, "num_input_tokens_seen": 23606784, "step": 24720 }, { "epoch": 2.016885553470919, "grad_norm": 1.2005020380020142, "learning_rate": 4.844175701234278e-05, "loss": 0.3583, "num_input_tokens_seen": 23611696, "step": 24725 }, { "epoch": 2.017293417081328, "grad_norm": 0.04967991262674332, "learning_rate": 4.844051983128736e-05, "loss": 0.0226, "num_input_tokens_seen": 23617744, "step": 24730 }, { "epoch": 2.0177012806917367, "grad_norm": 0.05990564450621605, "learning_rate": 4.8439282175102574e-05, "loss": 0.1581, "num_input_tokens_seen": 23622544, "step": 24735 }, { "epoch": 2.0181091443021453, "grad_norm": 10.248067855834961, "learning_rate": 4.843804404381349e-05, "loss": 0.0994, "num_input_tokens_seen": 23626496, "step": 24740 }, { "epoch": 2.018517007912554, "grad_norm": 12.051477432250977, "learning_rate": 4.8436805437445216e-05, "loss": 0.3237, "num_input_tokens_seen": 23630528, "step": 24745 }, { "epoch": 2.018924871522963, "grad_norm": 0.05493750795722008, "learning_rate": 4.843556635602286e-05, "loss": 0.1424, "num_input_tokens_seen": 23634560, "step": 24750 }, { "epoch": 2.0193327351333714, "grad_norm": 0.09476147592067719, "learning_rate": 4.843432679957152e-05, "loss": 0.0377, "num_input_tokens_seen": 23639440, "step": 24755 }, { "epoch": 2.01974059874378, "grad_norm": 0.06605695933103561, "learning_rate": 4.8433086768116335e-05, "loss": 0.007, "num_input_tokens_seen": 23644736, "step": 24760 }, { "epoch": 2.0201484623541885, "grad_norm": 0.03387988358736038, "learning_rate": 4.8431846261682445e-05, "loss": 0.1404, "num_input_tokens_seen": 23649056, "step": 24765 }, { "epoch": 2.0205563259645976, "grad_norm": 0.07098037004470825, "learning_rate": 4.843060528029499e-05, "loss": 0.1421, "num_input_tokens_seen": 23654640, "step": 24770 }, { "epoch": 2.020964189575006, "grad_norm": 0.469753623008728, "learning_rate": 4.8429363823979116e-05, "loss": 0.0134, "num_input_tokens_seen": 23658880, "step": 24775 }, { "epoch": 2.0213720531854147, "grad_norm": 0.10313884913921356, "learning_rate": 4.842812189275999e-05, "loss": 0.2077, "num_input_tokens_seen": 23663712, "step": 24780 }, { "epoch": 2.0217799167958237, "grad_norm": 0.05083267763257027, "learning_rate": 4.8426879486662805e-05, "loss": 0.0119, "num_input_tokens_seen": 23668672, "step": 24785 }, { "epoch": 2.0221877804062323, "grad_norm": 0.07225954532623291, "learning_rate": 4.842563660571271e-05, "loss": 0.2392, "num_input_tokens_seen": 23673744, "step": 24790 }, { "epoch": 2.022595644016641, "grad_norm": 0.08036407083272934, "learning_rate": 4.842439324993492e-05, "loss": 0.222, "num_input_tokens_seen": 23678944, "step": 24795 }, { "epoch": 2.0230035076270494, "grad_norm": 0.6351229548454285, "learning_rate": 4.8423149419354636e-05, "loss": 0.0186, "num_input_tokens_seen": 23683456, "step": 24800 }, { "epoch": 2.0234113712374584, "grad_norm": 0.013714337721467018, "learning_rate": 4.842190511399707e-05, "loss": 0.0109, "num_input_tokens_seen": 23687952, "step": 24805 }, { "epoch": 2.023819234847867, "grad_norm": 0.0328110046684742, "learning_rate": 4.842066033388743e-05, "loss": 0.2112, "num_input_tokens_seen": 23693488, "step": 24810 }, { "epoch": 2.0242270984582755, "grad_norm": 15.161493301391602, "learning_rate": 4.8419415079050956e-05, "loss": 0.4821, "num_input_tokens_seen": 23698464, "step": 24815 }, { "epoch": 2.024634962068684, "grad_norm": 19.73439598083496, "learning_rate": 4.84181693495129e-05, "loss": 0.11, "num_input_tokens_seen": 23703200, "step": 24820 }, { "epoch": 2.025042825679093, "grad_norm": 0.13588982820510864, "learning_rate": 4.84169231452985e-05, "loss": 0.2285, "num_input_tokens_seen": 23707520, "step": 24825 }, { "epoch": 2.0254506892895017, "grad_norm": 0.05946025997400284, "learning_rate": 4.8415676466433016e-05, "loss": 0.1806, "num_input_tokens_seen": 23712432, "step": 24830 }, { "epoch": 2.0258585528999102, "grad_norm": 3.2516698837280273, "learning_rate": 4.841442931294172e-05, "loss": 0.3168, "num_input_tokens_seen": 23717872, "step": 24835 }, { "epoch": 2.026266416510319, "grad_norm": 0.03949430212378502, "learning_rate": 4.841318168484988e-05, "loss": 0.006, "num_input_tokens_seen": 23722208, "step": 24840 }, { "epoch": 2.026674280120728, "grad_norm": 0.03517090156674385, "learning_rate": 4.8411933582182814e-05, "loss": 0.24, "num_input_tokens_seen": 23727152, "step": 24845 }, { "epoch": 2.0270821437311364, "grad_norm": 22.004268646240234, "learning_rate": 4.8410685004965794e-05, "loss": 0.409, "num_input_tokens_seen": 23731344, "step": 24850 }, { "epoch": 2.027490007341545, "grad_norm": 0.26780951023101807, "learning_rate": 4.840943595322414e-05, "loss": 0.0139, "num_input_tokens_seen": 23736816, "step": 24855 }, { "epoch": 2.0278978709519535, "grad_norm": 0.11067312210798264, "learning_rate": 4.8408186426983166e-05, "loss": 0.0219, "num_input_tokens_seen": 23742000, "step": 24860 }, { "epoch": 2.0283057345623625, "grad_norm": 14.581645011901855, "learning_rate": 4.84069364262682e-05, "loss": 0.3251, "num_input_tokens_seen": 23746080, "step": 24865 }, { "epoch": 2.028713598172771, "grad_norm": 5.381467819213867, "learning_rate": 4.8405685951104574e-05, "loss": 0.3267, "num_input_tokens_seen": 23750624, "step": 24870 }, { "epoch": 2.0291214617831796, "grad_norm": 2.518507480621338, "learning_rate": 4.840443500151764e-05, "loss": 0.0133, "num_input_tokens_seen": 23755424, "step": 24875 }, { "epoch": 2.029529325393588, "grad_norm": 0.06498337537050247, "learning_rate": 4.8403183577532764e-05, "loss": 0.0486, "num_input_tokens_seen": 23760816, "step": 24880 }, { "epoch": 2.0299371890039972, "grad_norm": 0.2556360960006714, "learning_rate": 4.8401931679175296e-05, "loss": 0.1141, "num_input_tokens_seen": 23765552, "step": 24885 }, { "epoch": 2.030345052614406, "grad_norm": 0.04630167409777641, "learning_rate": 4.840067930647061e-05, "loss": 0.2378, "num_input_tokens_seen": 23770400, "step": 24890 }, { "epoch": 2.0307529162248144, "grad_norm": 6.575644493103027, "learning_rate": 4.839942645944411e-05, "loss": 0.5503, "num_input_tokens_seen": 23775248, "step": 24895 }, { "epoch": 2.031160779835223, "grad_norm": 10.996109962463379, "learning_rate": 4.839817313812117e-05, "loss": 0.1345, "num_input_tokens_seen": 23780352, "step": 24900 }, { "epoch": 2.031568643445632, "grad_norm": 310.2623291015625, "learning_rate": 4.8396919342527204e-05, "loss": 0.9324, "num_input_tokens_seen": 23785472, "step": 24905 }, { "epoch": 2.0319765070560405, "grad_norm": 3.091881513595581, "learning_rate": 4.839566507268762e-05, "loss": 0.5211, "num_input_tokens_seen": 23790016, "step": 24910 }, { "epoch": 2.032384370666449, "grad_norm": 0.3147661089897156, "learning_rate": 4.839441032862786e-05, "loss": 0.3583, "num_input_tokens_seen": 23794688, "step": 24915 }, { "epoch": 2.0327922342768576, "grad_norm": 26.934051513671875, "learning_rate": 4.839315511037333e-05, "loss": 0.183, "num_input_tokens_seen": 23799328, "step": 24920 }, { "epoch": 2.0332000978872666, "grad_norm": 181.47471618652344, "learning_rate": 4.839189941794949e-05, "loss": 0.6251, "num_input_tokens_seen": 23804880, "step": 24925 }, { "epoch": 2.033607961497675, "grad_norm": 63.67110061645508, "learning_rate": 4.8390643251381796e-05, "loss": 0.3796, "num_input_tokens_seen": 23810128, "step": 24930 }, { "epoch": 2.0340158251080838, "grad_norm": 67.32887268066406, "learning_rate": 4.8389386610695694e-05, "loss": 0.1353, "num_input_tokens_seen": 23815072, "step": 24935 }, { "epoch": 2.0344236887184923, "grad_norm": 0.0875755250453949, "learning_rate": 4.838812949591667e-05, "loss": 0.1104, "num_input_tokens_seen": 23819984, "step": 24940 }, { "epoch": 2.0348315523289013, "grad_norm": 0.24799767136573792, "learning_rate": 4.83868719070702e-05, "loss": 0.0192, "num_input_tokens_seen": 23824448, "step": 24945 }, { "epoch": 2.03523941593931, "grad_norm": 0.08793839812278748, "learning_rate": 4.838561384418177e-05, "loss": 0.0049, "num_input_tokens_seen": 23829536, "step": 24950 }, { "epoch": 2.0356472795497185, "grad_norm": 24.00656509399414, "learning_rate": 4.838435530727688e-05, "loss": 0.3037, "num_input_tokens_seen": 23835056, "step": 24955 }, { "epoch": 2.0360551431601275, "grad_norm": 2.422484874725342, "learning_rate": 4.8383096296381046e-05, "loss": 0.0748, "num_input_tokens_seen": 23839504, "step": 24960 }, { "epoch": 2.036463006770536, "grad_norm": 0.47911855578422546, "learning_rate": 4.83818368115198e-05, "loss": 0.0126, "num_input_tokens_seen": 23844272, "step": 24965 }, { "epoch": 2.0368708703809446, "grad_norm": 8.263184547424316, "learning_rate": 4.838057685271864e-05, "loss": 0.2544, "num_input_tokens_seen": 23849408, "step": 24970 }, { "epoch": 2.037278733991353, "grad_norm": 29.93241310119629, "learning_rate": 4.8379316420003127e-05, "loss": 0.2072, "num_input_tokens_seen": 23853824, "step": 24975 }, { "epoch": 2.037686597601762, "grad_norm": 3.3878369331359863, "learning_rate": 4.837805551339881e-05, "loss": 0.1142, "num_input_tokens_seen": 23858784, "step": 24980 }, { "epoch": 2.0380944612121707, "grad_norm": 46.88351058959961, "learning_rate": 4.837679413293124e-05, "loss": 0.2818, "num_input_tokens_seen": 23863584, "step": 24985 }, { "epoch": 2.0385023248225793, "grad_norm": 4.523138523101807, "learning_rate": 4.837553227862599e-05, "loss": 0.2131, "num_input_tokens_seen": 23867696, "step": 24990 }, { "epoch": 2.038910188432988, "grad_norm": 0.048251427710056305, "learning_rate": 4.8374269950508624e-05, "loss": 0.1853, "num_input_tokens_seen": 23872560, "step": 24995 }, { "epoch": 2.039318052043397, "grad_norm": 0.005529146175831556, "learning_rate": 4.8373007148604746e-05, "loss": 0.25, "num_input_tokens_seen": 23876896, "step": 25000 }, { "epoch": 2.0397259156538055, "grad_norm": 0.3959486484527588, "learning_rate": 4.837174387293994e-05, "loss": 0.183, "num_input_tokens_seen": 23882448, "step": 25005 }, { "epoch": 2.040133779264214, "grad_norm": 0.1503230780363083, "learning_rate": 4.8370480123539815e-05, "loss": 0.0115, "num_input_tokens_seen": 23887952, "step": 25010 }, { "epoch": 2.0405416428746226, "grad_norm": 0.017012113705277443, "learning_rate": 4.836921590043e-05, "loss": 0.0026, "num_input_tokens_seen": 23893184, "step": 25015 }, { "epoch": 2.0409495064850316, "grad_norm": 4.68880558013916, "learning_rate": 4.83679512036361e-05, "loss": 0.2775, "num_input_tokens_seen": 23897152, "step": 25020 }, { "epoch": 2.04135737009544, "grad_norm": 0.005600145552307367, "learning_rate": 4.836668603318376e-05, "loss": 0.0166, "num_input_tokens_seen": 23902208, "step": 25025 }, { "epoch": 2.0417652337058487, "grad_norm": 0.016593247652053833, "learning_rate": 4.836542038909863e-05, "loss": 0.1034, "num_input_tokens_seen": 23907552, "step": 25030 }, { "epoch": 2.0421730973162573, "grad_norm": 0.055074773728847504, "learning_rate": 4.8364154271406345e-05, "loss": 0.1382, "num_input_tokens_seen": 23912496, "step": 25035 }, { "epoch": 2.0425809609266663, "grad_norm": 0.01013170089572668, "learning_rate": 4.836288768013259e-05, "loss": 0.0199, "num_input_tokens_seen": 23917376, "step": 25040 }, { "epoch": 2.042988824537075, "grad_norm": 0.03878692165017128, "learning_rate": 4.8361620615303025e-05, "loss": 0.0063, "num_input_tokens_seen": 23922480, "step": 25045 }, { "epoch": 2.0433966881474834, "grad_norm": 1.060469150543213, "learning_rate": 4.836035307694333e-05, "loss": 0.0108, "num_input_tokens_seen": 23926992, "step": 25050 }, { "epoch": 2.043804551757892, "grad_norm": 0.026749368757009506, "learning_rate": 4.8359085065079217e-05, "loss": 0.203, "num_input_tokens_seen": 23932432, "step": 25055 }, { "epoch": 2.044212415368301, "grad_norm": 0.6375629305839539, "learning_rate": 4.835781657973637e-05, "loss": 0.0032, "num_input_tokens_seen": 23937168, "step": 25060 }, { "epoch": 2.0446202789787096, "grad_norm": 0.05938968062400818, "learning_rate": 4.8356547620940506e-05, "loss": 0.0058, "num_input_tokens_seen": 23942480, "step": 25065 }, { "epoch": 2.045028142589118, "grad_norm": 0.00949053093791008, "learning_rate": 4.835527818871735e-05, "loss": 0.059, "num_input_tokens_seen": 23946784, "step": 25070 }, { "epoch": 2.0454360061995267, "grad_norm": 4.088181018829346, "learning_rate": 4.835400828309263e-05, "loss": 0.2472, "num_input_tokens_seen": 23950912, "step": 25075 }, { "epoch": 2.0458438698099357, "grad_norm": 0.012481654062867165, "learning_rate": 4.8352737904092085e-05, "loss": 0.0383, "num_input_tokens_seen": 23956336, "step": 25080 }, { "epoch": 2.0462517334203443, "grad_norm": 0.06498339027166367, "learning_rate": 4.8351467051741464e-05, "loss": 0.0039, "num_input_tokens_seen": 23961232, "step": 25085 }, { "epoch": 2.046659597030753, "grad_norm": 0.026172451674938202, "learning_rate": 4.835019572606653e-05, "loss": 0.2901, "num_input_tokens_seen": 23965872, "step": 25090 }, { "epoch": 2.0470674606411614, "grad_norm": 0.1499294489622116, "learning_rate": 4.834892392709305e-05, "loss": 0.3262, "num_input_tokens_seen": 23970496, "step": 25095 }, { "epoch": 2.0474753242515704, "grad_norm": 0.008562332019209862, "learning_rate": 4.83476516548468e-05, "loss": 0.2905, "num_input_tokens_seen": 23974864, "step": 25100 }, { "epoch": 2.047883187861979, "grad_norm": 0.012095939368009567, "learning_rate": 4.8346378909353574e-05, "loss": 0.2196, "num_input_tokens_seen": 23978864, "step": 25105 }, { "epoch": 2.0482910514723875, "grad_norm": 0.07373745739459991, "learning_rate": 4.834510569063917e-05, "loss": 0.0081, "num_input_tokens_seen": 23983680, "step": 25110 }, { "epoch": 2.048698915082796, "grad_norm": 0.16054195165634155, "learning_rate": 4.8343831998729387e-05, "loss": 0.1781, "num_input_tokens_seen": 23988192, "step": 25115 }, { "epoch": 2.049106778693205, "grad_norm": 12.450736999511719, "learning_rate": 4.834255783365006e-05, "loss": 0.0421, "num_input_tokens_seen": 23993024, "step": 25120 }, { "epoch": 2.0495146423036137, "grad_norm": 92.50951385498047, "learning_rate": 4.834128319542699e-05, "loss": 0.1958, "num_input_tokens_seen": 23998032, "step": 25125 }, { "epoch": 2.0499225059140223, "grad_norm": 0.09192927181720734, "learning_rate": 4.834000808408604e-05, "loss": 0.0991, "num_input_tokens_seen": 24002256, "step": 25130 }, { "epoch": 2.050330369524431, "grad_norm": 0.015442688018083572, "learning_rate": 4.833873249965304e-05, "loss": 0.112, "num_input_tokens_seen": 24006752, "step": 25135 }, { "epoch": 2.05073823313484, "grad_norm": 0.04067885875701904, "learning_rate": 4.833745644215385e-05, "loss": 0.2415, "num_input_tokens_seen": 24012176, "step": 25140 }, { "epoch": 2.0511460967452484, "grad_norm": 50.68641662597656, "learning_rate": 4.833617991161434e-05, "loss": 0.2425, "num_input_tokens_seen": 24016880, "step": 25145 }, { "epoch": 2.051553960355657, "grad_norm": 11.523482322692871, "learning_rate": 4.833490290806037e-05, "loss": 0.6508, "num_input_tokens_seen": 24021552, "step": 25150 }, { "epoch": 2.0519618239660655, "grad_norm": 0.009712295606732368, "learning_rate": 4.8333625431517834e-05, "loss": 0.0404, "num_input_tokens_seen": 24026288, "step": 25155 }, { "epoch": 2.0523696875764745, "grad_norm": 2.624189615249634, "learning_rate": 4.833234748201263e-05, "loss": 0.1969, "num_input_tokens_seen": 24031904, "step": 25160 }, { "epoch": 2.052777551186883, "grad_norm": 6.5962958335876465, "learning_rate": 4.833106905957066e-05, "loss": 0.3817, "num_input_tokens_seen": 24036944, "step": 25165 }, { "epoch": 2.0531854147972917, "grad_norm": 0.5884950757026672, "learning_rate": 4.8329790164217834e-05, "loss": 0.0113, "num_input_tokens_seen": 24041056, "step": 25170 }, { "epoch": 2.0535932784077007, "grad_norm": 0.2612951397895813, "learning_rate": 4.8328510795980066e-05, "loss": 0.1773, "num_input_tokens_seen": 24045264, "step": 25175 }, { "epoch": 2.0540011420181092, "grad_norm": 0.020171742886304855, "learning_rate": 4.8327230954883304e-05, "loss": 0.0081, "num_input_tokens_seen": 24050176, "step": 25180 }, { "epoch": 2.054409005628518, "grad_norm": 0.015022549778223038, "learning_rate": 4.8325950640953485e-05, "loss": 0.1199, "num_input_tokens_seen": 24055088, "step": 25185 }, { "epoch": 2.0548168692389264, "grad_norm": 1.932240605354309, "learning_rate": 4.8324669854216556e-05, "loss": 0.0187, "num_input_tokens_seen": 24059680, "step": 25190 }, { "epoch": 2.0552247328493354, "grad_norm": 4.573541164398193, "learning_rate": 4.832338859469847e-05, "loss": 0.0943, "num_input_tokens_seen": 24064864, "step": 25195 }, { "epoch": 2.055632596459744, "grad_norm": 0.032822564244270325, "learning_rate": 4.832210686242522e-05, "loss": 0.0735, "num_input_tokens_seen": 24069056, "step": 25200 }, { "epoch": 2.0560404600701525, "grad_norm": 3.267603635787964, "learning_rate": 4.832082465742277e-05, "loss": 0.2477, "num_input_tokens_seen": 24073360, "step": 25205 }, { "epoch": 2.056448323680561, "grad_norm": 0.012280723080039024, "learning_rate": 4.831954197971712e-05, "loss": 0.1599, "num_input_tokens_seen": 24077728, "step": 25210 }, { "epoch": 2.05685618729097, "grad_norm": 0.014177239499986172, "learning_rate": 4.831825882933425e-05, "loss": 0.0252, "num_input_tokens_seen": 24082864, "step": 25215 }, { "epoch": 2.0572640509013786, "grad_norm": 15.422536849975586, "learning_rate": 4.831697520630019e-05, "loss": 0.5079, "num_input_tokens_seen": 24088208, "step": 25220 }, { "epoch": 2.057671914511787, "grad_norm": 1.8359787464141846, "learning_rate": 4.831569111064095e-05, "loss": 0.0652, "num_input_tokens_seen": 24093728, "step": 25225 }, { "epoch": 2.058079778122196, "grad_norm": 0.017181897535920143, "learning_rate": 4.831440654238256e-05, "loss": 0.134, "num_input_tokens_seen": 24098848, "step": 25230 }, { "epoch": 2.058487641732605, "grad_norm": 21.000473022460938, "learning_rate": 4.831312150155106e-05, "loss": 0.058, "num_input_tokens_seen": 24103232, "step": 25235 }, { "epoch": 2.0588955053430134, "grad_norm": 10.617144584655762, "learning_rate": 4.8311835988172485e-05, "loss": 0.0088, "num_input_tokens_seen": 24107216, "step": 25240 }, { "epoch": 2.059303368953422, "grad_norm": 6.8132710456848145, "learning_rate": 4.83105500022729e-05, "loss": 0.2916, "num_input_tokens_seen": 24111200, "step": 25245 }, { "epoch": 2.0597112325638305, "grad_norm": 0.02077132649719715, "learning_rate": 4.830926354387838e-05, "loss": 0.0707, "num_input_tokens_seen": 24115616, "step": 25250 }, { "epoch": 2.0601190961742395, "grad_norm": 0.03122609853744507, "learning_rate": 4.830797661301499e-05, "loss": 0.0111, "num_input_tokens_seen": 24121072, "step": 25255 }, { "epoch": 2.060526959784648, "grad_norm": 0.08760177344083786, "learning_rate": 4.8306689209708816e-05, "loss": 0.2878, "num_input_tokens_seen": 24126080, "step": 25260 }, { "epoch": 2.0609348233950566, "grad_norm": 0.23835547268390656, "learning_rate": 4.8305401333985954e-05, "loss": 0.0111, "num_input_tokens_seen": 24131040, "step": 25265 }, { "epoch": 2.061342687005465, "grad_norm": 1.9163711071014404, "learning_rate": 4.830411298587251e-05, "loss": 0.0172, "num_input_tokens_seen": 24135568, "step": 25270 }, { "epoch": 2.061750550615874, "grad_norm": 0.009001325815916061, "learning_rate": 4.83028241653946e-05, "loss": 0.0027, "num_input_tokens_seen": 24140832, "step": 25275 }, { "epoch": 2.0621584142262828, "grad_norm": 0.16822898387908936, "learning_rate": 4.830153487257835e-05, "loss": 0.1755, "num_input_tokens_seen": 24146192, "step": 25280 }, { "epoch": 2.0625662778366913, "grad_norm": 0.6540961265563965, "learning_rate": 4.830024510744988e-05, "loss": 0.0476, "num_input_tokens_seen": 24151232, "step": 25285 }, { "epoch": 2.0629741414471, "grad_norm": 0.015522710978984833, "learning_rate": 4.8298954870035354e-05, "loss": 0.0044, "num_input_tokens_seen": 24156704, "step": 25290 }, { "epoch": 2.063382005057509, "grad_norm": 0.023515790700912476, "learning_rate": 4.829766416036091e-05, "loss": 0.273, "num_input_tokens_seen": 24161744, "step": 25295 }, { "epoch": 2.0637898686679175, "grad_norm": 0.058460820466279984, "learning_rate": 4.829637297845271e-05, "loss": 0.0216, "num_input_tokens_seen": 24165872, "step": 25300 }, { "epoch": 2.064197732278326, "grad_norm": 0.01184915378689766, "learning_rate": 4.8295081324336924e-05, "loss": 0.0329, "num_input_tokens_seen": 24170384, "step": 25305 }, { "epoch": 2.0646055958887346, "grad_norm": 14.34364128112793, "learning_rate": 4.829378919803975e-05, "loss": 0.1546, "num_input_tokens_seen": 24175552, "step": 25310 }, { "epoch": 2.0650134594991436, "grad_norm": 0.015213635750114918, "learning_rate": 4.829249659958736e-05, "loss": 0.1846, "num_input_tokens_seen": 24180816, "step": 25315 }, { "epoch": 2.065421323109552, "grad_norm": 0.5224880576133728, "learning_rate": 4.829120352900596e-05, "loss": 0.2824, "num_input_tokens_seen": 24186496, "step": 25320 }, { "epoch": 2.0658291867199607, "grad_norm": 0.030890189111232758, "learning_rate": 4.828990998632177e-05, "loss": 0.5273, "num_input_tokens_seen": 24191888, "step": 25325 }, { "epoch": 2.0662370503303693, "grad_norm": 19.11206817626953, "learning_rate": 4.828861597156099e-05, "loss": 0.3766, "num_input_tokens_seen": 24196992, "step": 25330 }, { "epoch": 2.0666449139407783, "grad_norm": 0.034033481031656265, "learning_rate": 4.828732148474987e-05, "loss": 0.0078, "num_input_tokens_seen": 24202304, "step": 25335 }, { "epoch": 2.067052777551187, "grad_norm": 4.770650863647461, "learning_rate": 4.828602652591463e-05, "loss": 0.0651, "num_input_tokens_seen": 24206272, "step": 25340 }, { "epoch": 2.0674606411615954, "grad_norm": 0.10629868507385254, "learning_rate": 4.828473109508153e-05, "loss": 0.0735, "num_input_tokens_seen": 24210144, "step": 25345 }, { "epoch": 2.0678685047720045, "grad_norm": 0.08378735184669495, "learning_rate": 4.8283435192276826e-05, "loss": 0.0064, "num_input_tokens_seen": 24214704, "step": 25350 }, { "epoch": 2.068276368382413, "grad_norm": 0.018982136622071266, "learning_rate": 4.828213881752679e-05, "loss": 0.0178, "num_input_tokens_seen": 24218640, "step": 25355 }, { "epoch": 2.0686842319928216, "grad_norm": 0.011200751177966595, "learning_rate": 4.8280841970857684e-05, "loss": 0.0068, "num_input_tokens_seen": 24223328, "step": 25360 }, { "epoch": 2.06909209560323, "grad_norm": 0.04787253960967064, "learning_rate": 4.827954465229581e-05, "loss": 0.2563, "num_input_tokens_seen": 24228704, "step": 25365 }, { "epoch": 2.069499959213639, "grad_norm": 0.01224744226783514, "learning_rate": 4.8278246861867456e-05, "loss": 0.0062, "num_input_tokens_seen": 24233344, "step": 25370 }, { "epoch": 2.0699078228240477, "grad_norm": 0.019890036433935165, "learning_rate": 4.8276948599598926e-05, "loss": 0.0074, "num_input_tokens_seen": 24238144, "step": 25375 }, { "epoch": 2.0703156864344563, "grad_norm": 5.136507034301758, "learning_rate": 4.827564986551655e-05, "loss": 0.2964, "num_input_tokens_seen": 24243072, "step": 25380 }, { "epoch": 2.070723550044865, "grad_norm": 2.0163285732269287, "learning_rate": 4.827435065964663e-05, "loss": 0.1874, "num_input_tokens_seen": 24248080, "step": 25385 }, { "epoch": 2.071131413655274, "grad_norm": 0.04353555291891098, "learning_rate": 4.8273050982015515e-05, "loss": 0.216, "num_input_tokens_seen": 24253168, "step": 25390 }, { "epoch": 2.0715392772656824, "grad_norm": 0.07509731501340866, "learning_rate": 4.8271750832649546e-05, "loss": 0.0096, "num_input_tokens_seen": 24257888, "step": 25395 }, { "epoch": 2.071947140876091, "grad_norm": 19.25855827331543, "learning_rate": 4.827045021157508e-05, "loss": 0.1798, "num_input_tokens_seen": 24263424, "step": 25400 }, { "epoch": 2.0723550044864996, "grad_norm": 0.025963235646486282, "learning_rate": 4.826914911881848e-05, "loss": 0.0213, "num_input_tokens_seen": 24268304, "step": 25405 }, { "epoch": 2.0727628680969086, "grad_norm": 0.012140985578298569, "learning_rate": 4.82678475544061e-05, "loss": 0.3748, "num_input_tokens_seen": 24273296, "step": 25410 }, { "epoch": 2.073170731707317, "grad_norm": 5.830567836761475, "learning_rate": 4.826654551836435e-05, "loss": 0.2638, "num_input_tokens_seen": 24278640, "step": 25415 }, { "epoch": 2.0735785953177257, "grad_norm": 1.1072882413864136, "learning_rate": 4.82652430107196e-05, "loss": 0.0147, "num_input_tokens_seen": 24283312, "step": 25420 }, { "epoch": 2.0739864589281343, "grad_norm": 2.191166877746582, "learning_rate": 4.826394003149827e-05, "loss": 0.0398, "num_input_tokens_seen": 24288368, "step": 25425 }, { "epoch": 2.0743943225385433, "grad_norm": 0.03051869012415409, "learning_rate": 4.826263658072675e-05, "loss": 0.0519, "num_input_tokens_seen": 24293264, "step": 25430 }, { "epoch": 2.074802186148952, "grad_norm": 4.215068817138672, "learning_rate": 4.826133265843148e-05, "loss": 0.013, "num_input_tokens_seen": 24298096, "step": 25435 }, { "epoch": 2.0752100497593604, "grad_norm": 0.031449537724256516, "learning_rate": 4.826002826463888e-05, "loss": 0.358, "num_input_tokens_seen": 24303392, "step": 25440 }, { "epoch": 2.075617913369769, "grad_norm": 0.04265950247645378, "learning_rate": 4.825872339937539e-05, "loss": 0.1453, "num_input_tokens_seen": 24308240, "step": 25445 }, { "epoch": 2.076025776980178, "grad_norm": 0.9207162857055664, "learning_rate": 4.825741806266746e-05, "loss": 0.0364, "num_input_tokens_seen": 24312240, "step": 25450 }, { "epoch": 2.0764336405905865, "grad_norm": 0.018080443143844604, "learning_rate": 4.8256112254541546e-05, "loss": 0.0061, "num_input_tokens_seen": 24317872, "step": 25455 }, { "epoch": 2.076841504200995, "grad_norm": 0.2917940020561218, "learning_rate": 4.825480597502412e-05, "loss": 0.1967, "num_input_tokens_seen": 24323232, "step": 25460 }, { "epoch": 2.0772493678114037, "grad_norm": 0.010251388885080814, "learning_rate": 4.825349922414166e-05, "loss": 0.1571, "num_input_tokens_seen": 24327408, "step": 25465 }, { "epoch": 2.0776572314218127, "grad_norm": 0.025470612570643425, "learning_rate": 4.8252192001920654e-05, "loss": 0.0048, "num_input_tokens_seen": 24332384, "step": 25470 }, { "epoch": 2.0780650950322213, "grad_norm": 0.031329743564128876, "learning_rate": 4.825088430838759e-05, "loss": 0.2328, "num_input_tokens_seen": 24335904, "step": 25475 }, { "epoch": 2.07847295864263, "grad_norm": 0.00623495876789093, "learning_rate": 4.824957614356899e-05, "loss": 0.0058, "num_input_tokens_seen": 24340400, "step": 25480 }, { "epoch": 2.0788808222530384, "grad_norm": 0.06766900420188904, "learning_rate": 4.8248267507491354e-05, "loss": 0.106, "num_input_tokens_seen": 24346256, "step": 25485 }, { "epoch": 2.0792886858634474, "grad_norm": 0.08758305013179779, "learning_rate": 4.8246958400181217e-05, "loss": 0.2951, "num_input_tokens_seen": 24350896, "step": 25490 }, { "epoch": 2.079696549473856, "grad_norm": 4.542603492736816, "learning_rate": 4.824564882166511e-05, "loss": 0.2084, "num_input_tokens_seen": 24356656, "step": 25495 }, { "epoch": 2.0801044130842645, "grad_norm": 0.012428478337824345, "learning_rate": 4.824433877196958e-05, "loss": 0.005, "num_input_tokens_seen": 24361568, "step": 25500 }, { "epoch": 2.080512276694673, "grad_norm": 0.02077493444085121, "learning_rate": 4.824302825112118e-05, "loss": 0.1388, "num_input_tokens_seen": 24366256, "step": 25505 }, { "epoch": 2.080920140305082, "grad_norm": 1.721594214439392, "learning_rate": 4.8241717259146467e-05, "loss": 0.2052, "num_input_tokens_seen": 24371472, "step": 25510 }, { "epoch": 2.0813280039154907, "grad_norm": 0.03245551884174347, "learning_rate": 4.824040579607203e-05, "loss": 0.0063, "num_input_tokens_seen": 24376048, "step": 25515 }, { "epoch": 2.0817358675258992, "grad_norm": 2.4898641109466553, "learning_rate": 4.823909386192444e-05, "loss": 0.1152, "num_input_tokens_seen": 24380720, "step": 25520 }, { "epoch": 2.0821437311363082, "grad_norm": 0.023732192814350128, "learning_rate": 4.82377814567303e-05, "loss": 0.1572, "num_input_tokens_seen": 24385568, "step": 25525 }, { "epoch": 2.082551594746717, "grad_norm": 0.0175319891422987, "learning_rate": 4.823646858051619e-05, "loss": 0.2319, "num_input_tokens_seen": 24390240, "step": 25530 }, { "epoch": 2.0829594583571254, "grad_norm": 0.020779229700565338, "learning_rate": 4.8235155233308746e-05, "loss": 0.2336, "num_input_tokens_seen": 24394944, "step": 25535 }, { "epoch": 2.083367321967534, "grad_norm": 1.7514867782592773, "learning_rate": 4.823384141513457e-05, "loss": 0.3339, "num_input_tokens_seen": 24399792, "step": 25540 }, { "epoch": 2.083775185577943, "grad_norm": 0.5507997274398804, "learning_rate": 4.823252712602031e-05, "loss": 0.1981, "num_input_tokens_seen": 24404400, "step": 25545 }, { "epoch": 2.0841830491883515, "grad_norm": 0.07161165028810501, "learning_rate": 4.8231212365992595e-05, "loss": 0.1832, "num_input_tokens_seen": 24409568, "step": 25550 }, { "epoch": 2.08459091279876, "grad_norm": 0.1643887311220169, "learning_rate": 4.822989713507808e-05, "loss": 0.1197, "num_input_tokens_seen": 24414432, "step": 25555 }, { "epoch": 2.0849987764091686, "grad_norm": 8.671548843383789, "learning_rate": 4.822858143330341e-05, "loss": 0.2989, "num_input_tokens_seen": 24419456, "step": 25560 }, { "epoch": 2.0854066400195777, "grad_norm": 0.48569828271865845, "learning_rate": 4.8227265260695276e-05, "loss": 0.1376, "num_input_tokens_seen": 24424352, "step": 25565 }, { "epoch": 2.085814503629986, "grad_norm": 21.02585220336914, "learning_rate": 4.8225948617280336e-05, "loss": 0.1863, "num_input_tokens_seen": 24429200, "step": 25570 }, { "epoch": 2.086222367240395, "grad_norm": 3.8027210235595703, "learning_rate": 4.8224631503085296e-05, "loss": 0.1784, "num_input_tokens_seen": 24434352, "step": 25575 }, { "epoch": 2.0866302308508033, "grad_norm": 18.28254508972168, "learning_rate": 4.8223313918136835e-05, "loss": 0.2193, "num_input_tokens_seen": 24439280, "step": 25580 }, { "epoch": 2.0870380944612124, "grad_norm": 47.57618713378906, "learning_rate": 4.822199586246168e-05, "loss": 0.4717, "num_input_tokens_seen": 24443840, "step": 25585 }, { "epoch": 2.087445958071621, "grad_norm": 5.35614538192749, "learning_rate": 4.822067733608653e-05, "loss": 0.5837, "num_input_tokens_seen": 24448352, "step": 25590 }, { "epoch": 2.0878538216820295, "grad_norm": 2.9866044521331787, "learning_rate": 4.8219358339038117e-05, "loss": 0.1865, "num_input_tokens_seen": 24452816, "step": 25595 }, { "epoch": 2.088261685292438, "grad_norm": 0.2809295058250427, "learning_rate": 4.8218038871343174e-05, "loss": 0.3015, "num_input_tokens_seen": 24457504, "step": 25600 }, { "epoch": 2.088669548902847, "grad_norm": 0.21721965074539185, "learning_rate": 4.821671893302845e-05, "loss": 0.285, "num_input_tokens_seen": 24462240, "step": 25605 }, { "epoch": 2.0890774125132556, "grad_norm": 0.07280023396015167, "learning_rate": 4.821539852412071e-05, "loss": 0.0112, "num_input_tokens_seen": 24465984, "step": 25610 }, { "epoch": 2.089485276123664, "grad_norm": 0.9166554808616638, "learning_rate": 4.8214077644646696e-05, "loss": 0.103, "num_input_tokens_seen": 24470688, "step": 25615 }, { "epoch": 2.0898931397340728, "grad_norm": 0.52897047996521, "learning_rate": 4.8212756294633194e-05, "loss": 0.3742, "num_input_tokens_seen": 24475904, "step": 25620 }, { "epoch": 2.0903010033444818, "grad_norm": 0.07429308444261551, "learning_rate": 4.8211434474106984e-05, "loss": 0.0095, "num_input_tokens_seen": 24480688, "step": 25625 }, { "epoch": 2.0907088669548903, "grad_norm": 14.560340881347656, "learning_rate": 4.8210112183094866e-05, "loss": 0.0709, "num_input_tokens_seen": 24484752, "step": 25630 }, { "epoch": 2.091116730565299, "grad_norm": 14.689225196838379, "learning_rate": 4.8208789421623634e-05, "loss": 0.391, "num_input_tokens_seen": 24489120, "step": 25635 }, { "epoch": 2.0915245941757075, "grad_norm": 1.0011759996414185, "learning_rate": 4.8207466189720095e-05, "loss": 0.6719, "num_input_tokens_seen": 24493712, "step": 25640 }, { "epoch": 2.0919324577861165, "grad_norm": 1.0167300701141357, "learning_rate": 4.820614248741109e-05, "loss": 0.0701, "num_input_tokens_seen": 24498608, "step": 25645 }, { "epoch": 2.092340321396525, "grad_norm": 0.0651492178440094, "learning_rate": 4.820481831472343e-05, "loss": 0.0138, "num_input_tokens_seen": 24503552, "step": 25650 }, { "epoch": 2.0927481850069336, "grad_norm": 0.07783706486225128, "learning_rate": 4.820349367168396e-05, "loss": 0.0096, "num_input_tokens_seen": 24508848, "step": 25655 }, { "epoch": 2.093156048617342, "grad_norm": 0.05250956863164902, "learning_rate": 4.820216855831954e-05, "loss": 0.193, "num_input_tokens_seen": 24514128, "step": 25660 }, { "epoch": 2.093563912227751, "grad_norm": 3.099134922027588, "learning_rate": 4.8200842974657024e-05, "loss": 0.378, "num_input_tokens_seen": 24518576, "step": 25665 }, { "epoch": 2.0939717758381597, "grad_norm": 0.09632598608732224, "learning_rate": 4.819951692072328e-05, "loss": 0.0908, "num_input_tokens_seen": 24523520, "step": 25670 }, { "epoch": 2.0943796394485683, "grad_norm": 2.941028594970703, "learning_rate": 4.819819039654518e-05, "loss": 0.1419, "num_input_tokens_seen": 24528096, "step": 25675 }, { "epoch": 2.094787503058977, "grad_norm": 2.3735082149505615, "learning_rate": 4.819686340214962e-05, "loss": 0.0242, "num_input_tokens_seen": 24532240, "step": 25680 }, { "epoch": 2.095195366669386, "grad_norm": 2.209867000579834, "learning_rate": 4.8195535937563495e-05, "loss": 0.1947, "num_input_tokens_seen": 24536608, "step": 25685 }, { "epoch": 2.0956032302797944, "grad_norm": 1.6045663356781006, "learning_rate": 4.819420800281371e-05, "loss": 0.4106, "num_input_tokens_seen": 24540944, "step": 25690 }, { "epoch": 2.096011093890203, "grad_norm": 0.693105161190033, "learning_rate": 4.819287959792719e-05, "loss": 0.021, "num_input_tokens_seen": 24545744, "step": 25695 }, { "epoch": 2.096418957500612, "grad_norm": 1.433130145072937, "learning_rate": 4.8191550722930855e-05, "loss": 0.0296, "num_input_tokens_seen": 24550432, "step": 25700 }, { "epoch": 2.0968268211110206, "grad_norm": 0.0654308944940567, "learning_rate": 4.819022137785164e-05, "loss": 0.05, "num_input_tokens_seen": 24555232, "step": 25705 }, { "epoch": 2.097234684721429, "grad_norm": 0.1939372718334198, "learning_rate": 4.81888915627165e-05, "loss": 0.0224, "num_input_tokens_seen": 24560368, "step": 25710 }, { "epoch": 2.0976425483318377, "grad_norm": 0.07089453935623169, "learning_rate": 4.8187561277552374e-05, "loss": 0.0117, "num_input_tokens_seen": 24565056, "step": 25715 }, { "epoch": 2.0980504119422463, "grad_norm": 0.16074702143669128, "learning_rate": 4.818623052238623e-05, "loss": 0.0524, "num_input_tokens_seen": 24569984, "step": 25720 }, { "epoch": 2.0984582755526553, "grad_norm": 0.04173910990357399, "learning_rate": 4.818489929724505e-05, "loss": 0.1268, "num_input_tokens_seen": 24574544, "step": 25725 }, { "epoch": 2.098866139163064, "grad_norm": 0.06944132596254349, "learning_rate": 4.818356760215581e-05, "loss": 0.009, "num_input_tokens_seen": 24579056, "step": 25730 }, { "epoch": 2.0992740027734724, "grad_norm": 0.037863101810216904, "learning_rate": 4.8182235437145515e-05, "loss": 0.005, "num_input_tokens_seen": 24583552, "step": 25735 }, { "epoch": 2.0996818663838814, "grad_norm": 0.3375462293624878, "learning_rate": 4.8180902802241144e-05, "loss": 0.0236, "num_input_tokens_seen": 24589376, "step": 25740 }, { "epoch": 2.10008972999429, "grad_norm": 0.024410123005509377, "learning_rate": 4.817956969746974e-05, "loss": 0.073, "num_input_tokens_seen": 24593856, "step": 25745 }, { "epoch": 2.1004975936046986, "grad_norm": 0.027386590838432312, "learning_rate": 4.81782361228583e-05, "loss": 0.2185, "num_input_tokens_seen": 24599040, "step": 25750 }, { "epoch": 2.100905457215107, "grad_norm": 0.05507495254278183, "learning_rate": 4.8176902078433863e-05, "loss": 0.3077, "num_input_tokens_seen": 24604320, "step": 25755 }, { "epoch": 2.101313320825516, "grad_norm": 0.037476614117622375, "learning_rate": 4.817556756422347e-05, "loss": 0.0496, "num_input_tokens_seen": 24609408, "step": 25760 }, { "epoch": 2.1017211844359247, "grad_norm": 2.341533899307251, "learning_rate": 4.8174232580254165e-05, "loss": 0.1834, "num_input_tokens_seen": 24615056, "step": 25765 }, { "epoch": 2.1021290480463333, "grad_norm": 0.10574530810117722, "learning_rate": 4.817289712655302e-05, "loss": 0.1587, "num_input_tokens_seen": 24619600, "step": 25770 }, { "epoch": 2.102536911656742, "grad_norm": 4.458616733551025, "learning_rate": 4.817156120314709e-05, "loss": 0.2692, "num_input_tokens_seen": 24624880, "step": 25775 }, { "epoch": 2.102944775267151, "grad_norm": 0.25975528359413147, "learning_rate": 4.817022481006347e-05, "loss": 0.1762, "num_input_tokens_seen": 24629216, "step": 25780 }, { "epoch": 2.1033526388775594, "grad_norm": 0.08733263611793518, "learning_rate": 4.8168887947329234e-05, "loss": 0.0137, "num_input_tokens_seen": 24634304, "step": 25785 }, { "epoch": 2.103760502487968, "grad_norm": 1.460226058959961, "learning_rate": 4.816755061497148e-05, "loss": 0.2148, "num_input_tokens_seen": 24638896, "step": 25790 }, { "epoch": 2.1041683660983765, "grad_norm": 0.23193244636058807, "learning_rate": 4.8166212813017316e-05, "loss": 0.0683, "num_input_tokens_seen": 24643776, "step": 25795 }, { "epoch": 2.1045762297087856, "grad_norm": 0.04015899449586868, "learning_rate": 4.816487454149388e-05, "loss": 0.1124, "num_input_tokens_seen": 24648704, "step": 25800 }, { "epoch": 2.104984093319194, "grad_norm": 10.288278579711914, "learning_rate": 4.816353580042827e-05, "loss": 0.068, "num_input_tokens_seen": 24653328, "step": 25805 }, { "epoch": 2.1053919569296027, "grad_norm": 1.441536545753479, "learning_rate": 4.816219658984763e-05, "loss": 0.0215, "num_input_tokens_seen": 24657936, "step": 25810 }, { "epoch": 2.1057998205400112, "grad_norm": 0.04184285178780556, "learning_rate": 4.816085690977911e-05, "loss": 0.1521, "num_input_tokens_seen": 24663120, "step": 25815 }, { "epoch": 2.1062076841504203, "grad_norm": 0.1455812305212021, "learning_rate": 4.815951676024986e-05, "loss": 0.086, "num_input_tokens_seen": 24667824, "step": 25820 }, { "epoch": 2.106615547760829, "grad_norm": 0.038160327821969986, "learning_rate": 4.815817614128705e-05, "loss": 0.2192, "num_input_tokens_seen": 24673072, "step": 25825 }, { "epoch": 2.1070234113712374, "grad_norm": 0.032113298773765564, "learning_rate": 4.815683505291785e-05, "loss": 0.2215, "num_input_tokens_seen": 24678816, "step": 25830 }, { "epoch": 2.107431274981646, "grad_norm": 0.11412207782268524, "learning_rate": 4.8155493495169434e-05, "loss": 0.0056, "num_input_tokens_seen": 24684208, "step": 25835 }, { "epoch": 2.107839138592055, "grad_norm": 3.551504135131836, "learning_rate": 4.815415146806902e-05, "loss": 0.2391, "num_input_tokens_seen": 24688464, "step": 25840 }, { "epoch": 2.1082470022024635, "grad_norm": 0.09580264240503311, "learning_rate": 4.815280897164378e-05, "loss": 0.1677, "num_input_tokens_seen": 24692960, "step": 25845 }, { "epoch": 2.108654865812872, "grad_norm": 0.11160502582788467, "learning_rate": 4.8151466005920945e-05, "loss": 0.0107, "num_input_tokens_seen": 24697872, "step": 25850 }, { "epoch": 2.1090627294232807, "grad_norm": 0.20375807583332062, "learning_rate": 4.815012257092774e-05, "loss": 0.0358, "num_input_tokens_seen": 24703232, "step": 25855 }, { "epoch": 2.1094705930336897, "grad_norm": 0.041965387761592865, "learning_rate": 4.8148778666691375e-05, "loss": 0.006, "num_input_tokens_seen": 24707600, "step": 25860 }, { "epoch": 2.1098784566440982, "grad_norm": 14.056000709533691, "learning_rate": 4.8147434293239105e-05, "loss": 0.5384, "num_input_tokens_seen": 24713152, "step": 25865 }, { "epoch": 2.110286320254507, "grad_norm": 20.97041130065918, "learning_rate": 4.814608945059818e-05, "loss": 0.1359, "num_input_tokens_seen": 24717520, "step": 25870 }, { "epoch": 2.1106941838649154, "grad_norm": 0.009153781458735466, "learning_rate": 4.814474413879586e-05, "loss": 0.042, "num_input_tokens_seen": 24722176, "step": 25875 }, { "epoch": 2.1111020474753244, "grad_norm": 0.11848244816064835, "learning_rate": 4.814339835785942e-05, "loss": 0.0093, "num_input_tokens_seen": 24727808, "step": 25880 }, { "epoch": 2.111509911085733, "grad_norm": 0.04379556328058243, "learning_rate": 4.8142052107816117e-05, "loss": 0.0136, "num_input_tokens_seen": 24732160, "step": 25885 }, { "epoch": 2.1119177746961415, "grad_norm": 0.12934766709804535, "learning_rate": 4.8140705388693253e-05, "loss": 0.0417, "num_input_tokens_seen": 24736368, "step": 25890 }, { "epoch": 2.11232563830655, "grad_norm": 3.33184814453125, "learning_rate": 4.8139358200518124e-05, "loss": 0.4131, "num_input_tokens_seen": 24741504, "step": 25895 }, { "epoch": 2.112733501916959, "grad_norm": 0.03894490748643875, "learning_rate": 4.8138010543318035e-05, "loss": 0.246, "num_input_tokens_seen": 24746240, "step": 25900 }, { "epoch": 2.1131413655273676, "grad_norm": 0.06669792532920837, "learning_rate": 4.81366624171203e-05, "loss": 0.017, "num_input_tokens_seen": 24751120, "step": 25905 }, { "epoch": 2.113549229137776, "grad_norm": 0.03137153014540672, "learning_rate": 4.813531382195226e-05, "loss": 0.1154, "num_input_tokens_seen": 24754944, "step": 25910 }, { "epoch": 2.113957092748185, "grad_norm": 0.04856027662754059, "learning_rate": 4.813396475784124e-05, "loss": 0.2271, "num_input_tokens_seen": 24759776, "step": 25915 }, { "epoch": 2.114364956358594, "grad_norm": 0.22958047688007355, "learning_rate": 4.813261522481458e-05, "loss": 0.0247, "num_input_tokens_seen": 24764496, "step": 25920 }, { "epoch": 2.1147728199690023, "grad_norm": 0.04083048179745674, "learning_rate": 4.8131265222899636e-05, "loss": 0.231, "num_input_tokens_seen": 24769264, "step": 25925 }, { "epoch": 2.115180683579411, "grad_norm": 0.09069249033927917, "learning_rate": 4.812991475212378e-05, "loss": 0.1483, "num_input_tokens_seen": 24774624, "step": 25930 }, { "epoch": 2.11558854718982, "grad_norm": 0.054145339876413345, "learning_rate": 4.812856381251438e-05, "loss": 0.3162, "num_input_tokens_seen": 24778208, "step": 25935 }, { "epoch": 2.1159964108002285, "grad_norm": 0.02802988886833191, "learning_rate": 4.8127212404098814e-05, "loss": 0.0272, "num_input_tokens_seen": 24783376, "step": 25940 }, { "epoch": 2.116404274410637, "grad_norm": 7.945977210998535, "learning_rate": 4.8125860526904485e-05, "loss": 0.2552, "num_input_tokens_seen": 24789040, "step": 25945 }, { "epoch": 2.1168121380210456, "grad_norm": 10.580611228942871, "learning_rate": 4.812450818095879e-05, "loss": 0.319, "num_input_tokens_seen": 24794480, "step": 25950 }, { "epoch": 2.1172200016314546, "grad_norm": 0.28901660442352295, "learning_rate": 4.812315536628914e-05, "loss": 0.1906, "num_input_tokens_seen": 24798320, "step": 25955 }, { "epoch": 2.117627865241863, "grad_norm": 1.8089964389801025, "learning_rate": 4.812180208292295e-05, "loss": 0.4102, "num_input_tokens_seen": 24803584, "step": 25960 }, { "epoch": 2.1180357288522718, "grad_norm": 0.8415789604187012, "learning_rate": 4.812044833088767e-05, "loss": 0.1299, "num_input_tokens_seen": 24809136, "step": 25965 }, { "epoch": 2.1184435924626803, "grad_norm": 2.205092668533325, "learning_rate": 4.811909411021072e-05, "loss": 0.2402, "num_input_tokens_seen": 24814000, "step": 25970 }, { "epoch": 2.1188514560730893, "grad_norm": 7.3795881271362305, "learning_rate": 4.811773942091955e-05, "loss": 0.2231, "num_input_tokens_seen": 24818656, "step": 25975 }, { "epoch": 2.119259319683498, "grad_norm": 0.36538514494895935, "learning_rate": 4.811638426304163e-05, "loss": 0.0277, "num_input_tokens_seen": 24823344, "step": 25980 }, { "epoch": 2.1196671832939065, "grad_norm": 0.08911757916212082, "learning_rate": 4.811502863660442e-05, "loss": 0.1124, "num_input_tokens_seen": 24828000, "step": 25985 }, { "epoch": 2.120075046904315, "grad_norm": 0.20254595577716827, "learning_rate": 4.811367254163541e-05, "loss": 0.1946, "num_input_tokens_seen": 24833152, "step": 25990 }, { "epoch": 2.120482910514724, "grad_norm": 0.2175568789243698, "learning_rate": 4.8112315978162075e-05, "loss": 0.0157, "num_input_tokens_seen": 24837312, "step": 25995 }, { "epoch": 2.1208907741251326, "grad_norm": 0.05471799522638321, "learning_rate": 4.811095894621191e-05, "loss": 0.1088, "num_input_tokens_seen": 24842816, "step": 26000 }, { "epoch": 2.121298637735541, "grad_norm": 0.07076893746852875, "learning_rate": 4.810960144581244e-05, "loss": 0.4182, "num_input_tokens_seen": 24847968, "step": 26005 }, { "epoch": 2.1217065013459497, "grad_norm": 2.8807969093322754, "learning_rate": 4.810824347699117e-05, "loss": 0.1994, "num_input_tokens_seen": 24852480, "step": 26010 }, { "epoch": 2.1221143649563587, "grad_norm": 0.029236219823360443, "learning_rate": 4.8106885039775614e-05, "loss": 0.1423, "num_input_tokens_seen": 24856320, "step": 26015 }, { "epoch": 2.1225222285667673, "grad_norm": 2.3513522148132324, "learning_rate": 4.8105526134193316e-05, "loss": 0.2925, "num_input_tokens_seen": 24861168, "step": 26020 }, { "epoch": 2.122930092177176, "grad_norm": 0.8212064504623413, "learning_rate": 4.810416676027183e-05, "loss": 0.3223, "num_input_tokens_seen": 24865184, "step": 26025 }, { "epoch": 2.1233379557875844, "grad_norm": 0.03516511619091034, "learning_rate": 4.810280691803869e-05, "loss": 0.0351, "num_input_tokens_seen": 24870016, "step": 26030 }, { "epoch": 2.1237458193979935, "grad_norm": 8.771669387817383, "learning_rate": 4.8101446607521485e-05, "loss": 0.1478, "num_input_tokens_seen": 24875376, "step": 26035 }, { "epoch": 2.124153683008402, "grad_norm": 0.030295565724372864, "learning_rate": 4.810008582874776e-05, "loss": 0.022, "num_input_tokens_seen": 24879952, "step": 26040 }, { "epoch": 2.1245615466188106, "grad_norm": 5.37205171585083, "learning_rate": 4.8098724581745115e-05, "loss": 0.0204, "num_input_tokens_seen": 24884672, "step": 26045 }, { "epoch": 2.124969410229219, "grad_norm": 1.889144778251648, "learning_rate": 4.809736286654114e-05, "loss": 0.0298, "num_input_tokens_seen": 24890192, "step": 26050 }, { "epoch": 2.125377273839628, "grad_norm": 16.026636123657227, "learning_rate": 4.8096000683163434e-05, "loss": 0.0567, "num_input_tokens_seen": 24894896, "step": 26055 }, { "epoch": 2.1257851374500367, "grad_norm": 0.02317199297249317, "learning_rate": 4.809463803163961e-05, "loss": 0.0741, "num_input_tokens_seen": 24899632, "step": 26060 }, { "epoch": 2.1261930010604453, "grad_norm": 0.07930915802717209, "learning_rate": 4.809327491199729e-05, "loss": 0.2133, "num_input_tokens_seen": 24904608, "step": 26065 }, { "epoch": 2.126600864670854, "grad_norm": 3.953927993774414, "learning_rate": 4.80919113242641e-05, "loss": 0.282, "num_input_tokens_seen": 24909856, "step": 26070 }, { "epoch": 2.127008728281263, "grad_norm": 0.043135661631822586, "learning_rate": 4.809054726846767e-05, "loss": 0.0071, "num_input_tokens_seen": 24914192, "step": 26075 }, { "epoch": 2.1274165918916714, "grad_norm": 1.513779878616333, "learning_rate": 4.808918274463566e-05, "loss": 0.0266, "num_input_tokens_seen": 24918816, "step": 26080 }, { "epoch": 2.12782445550208, "grad_norm": 0.037764664739370346, "learning_rate": 4.8087817752795725e-05, "loss": 0.0031, "num_input_tokens_seen": 24923360, "step": 26085 }, { "epoch": 2.128232319112489, "grad_norm": 18.334104537963867, "learning_rate": 4.808645229297555e-05, "loss": 0.2535, "num_input_tokens_seen": 24927936, "step": 26090 }, { "epoch": 2.1286401827228976, "grad_norm": 0.41752052307128906, "learning_rate": 4.808508636520278e-05, "loss": 0.456, "num_input_tokens_seen": 24933680, "step": 26095 }, { "epoch": 2.129048046333306, "grad_norm": 0.019207438454031944, "learning_rate": 4.808371996950513e-05, "loss": 0.012, "num_input_tokens_seen": 24938288, "step": 26100 }, { "epoch": 2.1294559099437147, "grad_norm": 0.04847477376461029, "learning_rate": 4.808235310591028e-05, "loss": 0.0046, "num_input_tokens_seen": 24943200, "step": 26105 }, { "epoch": 2.1298637735541237, "grad_norm": 0.2870122492313385, "learning_rate": 4.808098577444594e-05, "loss": 0.3189, "num_input_tokens_seen": 24947472, "step": 26110 }, { "epoch": 2.1302716371645323, "grad_norm": 0.02011515013873577, "learning_rate": 4.8079617975139836e-05, "loss": 0.1361, "num_input_tokens_seen": 24951712, "step": 26115 }, { "epoch": 2.130679500774941, "grad_norm": 29.152687072753906, "learning_rate": 4.807824970801967e-05, "loss": 0.2325, "num_input_tokens_seen": 24956736, "step": 26120 }, { "epoch": 2.1310873643853494, "grad_norm": 11.635758399963379, "learning_rate": 4.807688097311319e-05, "loss": 0.3588, "num_input_tokens_seen": 24960544, "step": 26125 }, { "epoch": 2.1314952279957584, "grad_norm": 0.045162566006183624, "learning_rate": 4.8075511770448146e-05, "loss": 0.0153, "num_input_tokens_seen": 24965872, "step": 26130 }, { "epoch": 2.131903091606167, "grad_norm": 0.012690935283899307, "learning_rate": 4.807414210005228e-05, "loss": 0.01, "num_input_tokens_seen": 24970752, "step": 26135 }, { "epoch": 2.1323109552165755, "grad_norm": 0.07898436486721039, "learning_rate": 4.807277196195336e-05, "loss": 0.3766, "num_input_tokens_seen": 24975744, "step": 26140 }, { "epoch": 2.132718818826984, "grad_norm": 0.04392454773187637, "learning_rate": 4.8071401356179155e-05, "loss": 0.4423, "num_input_tokens_seen": 24981456, "step": 26145 }, { "epoch": 2.133126682437393, "grad_norm": 0.5498069524765015, "learning_rate": 4.807003028275745e-05, "loss": 0.0108, "num_input_tokens_seen": 24986128, "step": 26150 }, { "epoch": 2.1335345460478017, "grad_norm": 2.2015671730041504, "learning_rate": 4.806865874171603e-05, "loss": 0.4713, "num_input_tokens_seen": 24989616, "step": 26155 }, { "epoch": 2.1339424096582102, "grad_norm": 1.722583532333374, "learning_rate": 4.8067286733082704e-05, "loss": 0.0196, "num_input_tokens_seen": 24994064, "step": 26160 }, { "epoch": 2.134350273268619, "grad_norm": 0.05517955496907234, "learning_rate": 4.806591425688528e-05, "loss": 0.0184, "num_input_tokens_seen": 24998912, "step": 26165 }, { "epoch": 2.134758136879028, "grad_norm": 0.08328275382518768, "learning_rate": 4.806454131315157e-05, "loss": 0.1505, "num_input_tokens_seen": 25003408, "step": 26170 }, { "epoch": 2.1351660004894364, "grad_norm": 3.664259672164917, "learning_rate": 4.8063167901909414e-05, "loss": 0.1153, "num_input_tokens_seen": 25008208, "step": 26175 }, { "epoch": 2.135573864099845, "grad_norm": 0.5896320939064026, "learning_rate": 4.806179402318664e-05, "loss": 0.0909, "num_input_tokens_seen": 25013424, "step": 26180 }, { "epoch": 2.1359817277102535, "grad_norm": 6.9222025871276855, "learning_rate": 4.80604196770111e-05, "loss": 0.2236, "num_input_tokens_seen": 25018080, "step": 26185 }, { "epoch": 2.1363895913206625, "grad_norm": 0.08260128647089005, "learning_rate": 4.805904486341065e-05, "loss": 0.2708, "num_input_tokens_seen": 25022080, "step": 26190 }, { "epoch": 2.136797454931071, "grad_norm": 1.651188850402832, "learning_rate": 4.805766958241317e-05, "loss": 0.1982, "num_input_tokens_seen": 25026912, "step": 26195 }, { "epoch": 2.1372053185414797, "grad_norm": 0.2922479212284088, "learning_rate": 4.805629383404651e-05, "loss": 0.0357, "num_input_tokens_seen": 25032400, "step": 26200 }, { "epoch": 2.1376131821518882, "grad_norm": 5.478261947631836, "learning_rate": 4.805491761833858e-05, "loss": 0.1777, "num_input_tokens_seen": 25037280, "step": 26205 }, { "epoch": 2.1380210457622972, "grad_norm": 0.8356432318687439, "learning_rate": 4.8053540935317266e-05, "loss": 0.3598, "num_input_tokens_seen": 25042048, "step": 26210 }, { "epoch": 2.138428909372706, "grad_norm": 0.12923559546470642, "learning_rate": 4.805216378501047e-05, "loss": 0.0157, "num_input_tokens_seen": 25047568, "step": 26215 }, { "epoch": 2.1388367729831144, "grad_norm": 0.019375070929527283, "learning_rate": 4.8050786167446114e-05, "loss": 0.2464, "num_input_tokens_seen": 25052576, "step": 26220 }, { "epoch": 2.139244636593523, "grad_norm": 0.034970320761203766, "learning_rate": 4.804940808265212e-05, "loss": 0.295, "num_input_tokens_seen": 25057488, "step": 26225 }, { "epoch": 2.139652500203932, "grad_norm": 0.049153175204992294, "learning_rate": 4.8048029530656415e-05, "loss": 0.0135, "num_input_tokens_seen": 25061376, "step": 26230 }, { "epoch": 2.1400603638143405, "grad_norm": 0.03673525154590607, "learning_rate": 4.804665051148695e-05, "loss": 0.0102, "num_input_tokens_seen": 25066080, "step": 26235 }, { "epoch": 2.140468227424749, "grad_norm": 0.4218893051147461, "learning_rate": 4.804527102517166e-05, "loss": 0.0159, "num_input_tokens_seen": 25070256, "step": 26240 }, { "epoch": 2.1408760910351576, "grad_norm": 0.07948824018239975, "learning_rate": 4.8043891071738525e-05, "loss": 0.1666, "num_input_tokens_seen": 25075632, "step": 26245 }, { "epoch": 2.1412839546455666, "grad_norm": 0.04794817045331001, "learning_rate": 4.804251065121551e-05, "loss": 0.0176, "num_input_tokens_seen": 25079504, "step": 26250 }, { "epoch": 2.141691818255975, "grad_norm": 0.011392096057534218, "learning_rate": 4.8041129763630596e-05, "loss": 0.2827, "num_input_tokens_seen": 25084784, "step": 26255 }, { "epoch": 2.1420996818663838, "grad_norm": 0.26945731043815613, "learning_rate": 4.803974840901178e-05, "loss": 0.1471, "num_input_tokens_seen": 25090304, "step": 26260 }, { "epoch": 2.142507545476793, "grad_norm": 3.326585531234741, "learning_rate": 4.803836658738704e-05, "loss": 0.3814, "num_input_tokens_seen": 25094800, "step": 26265 }, { "epoch": 2.1429154090872014, "grad_norm": 0.027505408972501755, "learning_rate": 4.8036984298784405e-05, "loss": 0.0738, "num_input_tokens_seen": 25098864, "step": 26270 }, { "epoch": 2.14332327269761, "grad_norm": 2.1176748275756836, "learning_rate": 4.803560154323188e-05, "loss": 0.3428, "num_input_tokens_seen": 25103488, "step": 26275 }, { "epoch": 2.1437311363080185, "grad_norm": 3.2360360622406006, "learning_rate": 4.803421832075751e-05, "loss": 0.3102, "num_input_tokens_seen": 25107600, "step": 26280 }, { "epoch": 2.144138999918427, "grad_norm": 0.046245887875556946, "learning_rate": 4.803283463138932e-05, "loss": 0.0206, "num_input_tokens_seen": 25112592, "step": 26285 }, { "epoch": 2.144546863528836, "grad_norm": 1.1788992881774902, "learning_rate": 4.803145047515536e-05, "loss": 0.0426, "num_input_tokens_seen": 25117504, "step": 26290 }, { "epoch": 2.1449547271392446, "grad_norm": 0.03396105766296387, "learning_rate": 4.8030065852083687e-05, "loss": 0.1099, "num_input_tokens_seen": 25121424, "step": 26295 }, { "epoch": 2.145362590749653, "grad_norm": 5.821334362030029, "learning_rate": 4.802868076220236e-05, "loss": 0.2476, "num_input_tokens_seen": 25126192, "step": 26300 }, { "epoch": 2.145770454360062, "grad_norm": 20.289079666137695, "learning_rate": 4.802729520553947e-05, "loss": 0.1349, "num_input_tokens_seen": 25131152, "step": 26305 }, { "epoch": 2.1461783179704708, "grad_norm": 0.16670851409435272, "learning_rate": 4.8025909182123076e-05, "loss": 0.4144, "num_input_tokens_seen": 25136400, "step": 26310 }, { "epoch": 2.1465861815808793, "grad_norm": 0.044316086918115616, "learning_rate": 4.80245226919813e-05, "loss": 0.3272, "num_input_tokens_seen": 25140864, "step": 26315 }, { "epoch": 2.146994045191288, "grad_norm": 4.354907512664795, "learning_rate": 4.802313573514222e-05, "loss": 0.2824, "num_input_tokens_seen": 25145744, "step": 26320 }, { "epoch": 2.147401908801697, "grad_norm": 1.7356832027435303, "learning_rate": 4.802174831163397e-05, "loss": 0.1847, "num_input_tokens_seen": 25150496, "step": 26325 }, { "epoch": 2.1478097724121055, "grad_norm": 2.0203263759613037, "learning_rate": 4.802036042148467e-05, "loss": 0.1823, "num_input_tokens_seen": 25155248, "step": 26330 }, { "epoch": 2.148217636022514, "grad_norm": 0.05254269018769264, "learning_rate": 4.801897206472244e-05, "loss": 0.2641, "num_input_tokens_seen": 25159696, "step": 26335 }, { "epoch": 2.1486254996329226, "grad_norm": 3.070366859436035, "learning_rate": 4.801758324137543e-05, "loss": 0.0961, "num_input_tokens_seen": 25163776, "step": 26340 }, { "epoch": 2.1490333632433316, "grad_norm": 19.31319236755371, "learning_rate": 4.8016193951471775e-05, "loss": 0.2069, "num_input_tokens_seen": 25168240, "step": 26345 }, { "epoch": 2.14944122685374, "grad_norm": 1.7089074850082397, "learning_rate": 4.801480419503967e-05, "loss": 0.4467, "num_input_tokens_seen": 25173360, "step": 26350 }, { "epoch": 2.1498490904641487, "grad_norm": 3.5154662132263184, "learning_rate": 4.801341397210725e-05, "loss": 0.1853, "num_input_tokens_seen": 25177888, "step": 26355 }, { "epoch": 2.1502569540745573, "grad_norm": 0.21770642697811127, "learning_rate": 4.801202328270271e-05, "loss": 0.1517, "num_input_tokens_seen": 25183120, "step": 26360 }, { "epoch": 2.1506648176849663, "grad_norm": 0.24250428378582, "learning_rate": 4.801063212685424e-05, "loss": 0.0613, "num_input_tokens_seen": 25188768, "step": 26365 }, { "epoch": 2.151072681295375, "grad_norm": 1.1384565830230713, "learning_rate": 4.8009240504590026e-05, "loss": 0.1006, "num_input_tokens_seen": 25193952, "step": 26370 }, { "epoch": 2.1514805449057834, "grad_norm": 0.04915512353181839, "learning_rate": 4.8007848415938296e-05, "loss": 0.3943, "num_input_tokens_seen": 25197968, "step": 26375 }, { "epoch": 2.151888408516192, "grad_norm": 0.0768040269613266, "learning_rate": 4.800645586092725e-05, "loss": 0.0417, "num_input_tokens_seen": 25202896, "step": 26380 }, { "epoch": 2.152296272126601, "grad_norm": 0.14603520929813385, "learning_rate": 4.800506283958511e-05, "loss": 0.0388, "num_input_tokens_seen": 25207824, "step": 26385 }, { "epoch": 2.1527041357370096, "grad_norm": 0.06773725152015686, "learning_rate": 4.8003669351940134e-05, "loss": 0.1526, "num_input_tokens_seen": 25213056, "step": 26390 }, { "epoch": 2.153111999347418, "grad_norm": 0.24283196032047272, "learning_rate": 4.800227539802055e-05, "loss": 0.1071, "num_input_tokens_seen": 25218192, "step": 26395 }, { "epoch": 2.1535198629578267, "grad_norm": 0.2728044390678406, "learning_rate": 4.8000880977854626e-05, "loss": 0.0231, "num_input_tokens_seen": 25222416, "step": 26400 }, { "epoch": 2.1539277265682357, "grad_norm": 0.021994290873408318, "learning_rate": 4.799948609147061e-05, "loss": 0.0118, "num_input_tokens_seen": 25227856, "step": 26405 }, { "epoch": 2.1543355901786443, "grad_norm": 0.7188562154769897, "learning_rate": 4.799809073889678e-05, "loss": 0.1474, "num_input_tokens_seen": 25233024, "step": 26410 }, { "epoch": 2.154743453789053, "grad_norm": 0.07368774712085724, "learning_rate": 4.799669492016143e-05, "loss": 0.1438, "num_input_tokens_seen": 25237248, "step": 26415 }, { "epoch": 2.1551513173994614, "grad_norm": 0.04425887018442154, "learning_rate": 4.799529863529285e-05, "loss": 0.0088, "num_input_tokens_seen": 25240944, "step": 26420 }, { "epoch": 2.1555591810098704, "grad_norm": 2.0566823482513428, "learning_rate": 4.799390188431933e-05, "loss": 0.3906, "num_input_tokens_seen": 25245584, "step": 26425 }, { "epoch": 2.155967044620279, "grad_norm": 0.07919786870479584, "learning_rate": 4.799250466726919e-05, "loss": 0.1332, "num_input_tokens_seen": 25250240, "step": 26430 }, { "epoch": 2.1563749082306876, "grad_norm": 1.4466549158096313, "learning_rate": 4.799110698417075e-05, "loss": 0.1823, "num_input_tokens_seen": 25254816, "step": 26435 }, { "epoch": 2.1567827718410966, "grad_norm": 0.5833666920661926, "learning_rate": 4.798970883505235e-05, "loss": 0.0386, "num_input_tokens_seen": 25259456, "step": 26440 }, { "epoch": 2.157190635451505, "grad_norm": 0.07536981999874115, "learning_rate": 4.798831021994231e-05, "loss": 0.0655, "num_input_tokens_seen": 25264992, "step": 26445 }, { "epoch": 2.1575984990619137, "grad_norm": 0.04145844653248787, "learning_rate": 4.7986911138869e-05, "loss": 0.0176, "num_input_tokens_seen": 25270080, "step": 26450 }, { "epoch": 2.1580063626723223, "grad_norm": 0.09660609811544418, "learning_rate": 4.798551159186077e-05, "loss": 0.013, "num_input_tokens_seen": 25275328, "step": 26455 }, { "epoch": 2.158414226282731, "grad_norm": 0.022574637085199356, "learning_rate": 4.798411157894598e-05, "loss": 0.1827, "num_input_tokens_seen": 25279744, "step": 26460 }, { "epoch": 2.15882208989314, "grad_norm": 0.028926178812980652, "learning_rate": 4.7982711100153014e-05, "loss": 0.2576, "num_input_tokens_seen": 25284464, "step": 26465 }, { "epoch": 2.1592299535035484, "grad_norm": 0.057642240077257156, "learning_rate": 4.798131015551026e-05, "loss": 0.1666, "num_input_tokens_seen": 25289520, "step": 26470 }, { "epoch": 2.159637817113957, "grad_norm": 3.8136236667633057, "learning_rate": 4.7979908745046114e-05, "loss": 0.4445, "num_input_tokens_seen": 25293632, "step": 26475 }, { "epoch": 2.160045680724366, "grad_norm": 0.010141192935407162, "learning_rate": 4.7978506868788986e-05, "loss": 0.1278, "num_input_tokens_seen": 25298592, "step": 26480 }, { "epoch": 2.1604535443347745, "grad_norm": 11.666606903076172, "learning_rate": 4.7977104526767285e-05, "loss": 0.3856, "num_input_tokens_seen": 25304112, "step": 26485 }, { "epoch": 2.160861407945183, "grad_norm": 1.1113682985305786, "learning_rate": 4.7975701719009445e-05, "loss": 0.109, "num_input_tokens_seen": 25308864, "step": 26490 }, { "epoch": 2.1612692715555917, "grad_norm": 0.4342355728149414, "learning_rate": 4.7974298445543884e-05, "loss": 0.1335, "num_input_tokens_seen": 25313968, "step": 26495 }, { "epoch": 2.1616771351660007, "grad_norm": 12.590950012207031, "learning_rate": 4.7972894706399064e-05, "loss": 0.0607, "num_input_tokens_seen": 25319264, "step": 26500 }, { "epoch": 2.1620849987764093, "grad_norm": 0.05768829584121704, "learning_rate": 4.797149050160343e-05, "loss": 0.162, "num_input_tokens_seen": 25323248, "step": 26505 }, { "epoch": 2.162492862386818, "grad_norm": 1.3219002485275269, "learning_rate": 4.797008583118544e-05, "loss": 0.1519, "num_input_tokens_seen": 25328576, "step": 26510 }, { "epoch": 2.1629007259972264, "grad_norm": 0.14448212087154388, "learning_rate": 4.796868069517357e-05, "loss": 0.2209, "num_input_tokens_seen": 25333200, "step": 26515 }, { "epoch": 2.1633085896076354, "grad_norm": 0.04510354995727539, "learning_rate": 4.796727509359631e-05, "loss": 0.1011, "num_input_tokens_seen": 25337296, "step": 26520 }, { "epoch": 2.163716453218044, "grad_norm": 0.4036400616168976, "learning_rate": 4.7965869026482136e-05, "loss": 0.0221, "num_input_tokens_seen": 25341936, "step": 26525 }, { "epoch": 2.1641243168284525, "grad_norm": 10.050850868225098, "learning_rate": 4.7964462493859565e-05, "loss": 0.18, "num_input_tokens_seen": 25347056, "step": 26530 }, { "epoch": 2.164532180438861, "grad_norm": 0.19676409661769867, "learning_rate": 4.7963055495757086e-05, "loss": 0.2312, "num_input_tokens_seen": 25352288, "step": 26535 }, { "epoch": 2.16494004404927, "grad_norm": 1.4586130380630493, "learning_rate": 4.7961648032203233e-05, "loss": 0.2072, "num_input_tokens_seen": 25357024, "step": 26540 }, { "epoch": 2.1653479076596787, "grad_norm": 0.046240199357271194, "learning_rate": 4.7960240103226535e-05, "loss": 0.013, "num_input_tokens_seen": 25360768, "step": 26545 }, { "epoch": 2.1657557712700872, "grad_norm": 0.31842783093452454, "learning_rate": 4.7958831708855525e-05, "loss": 0.2508, "num_input_tokens_seen": 25365552, "step": 26550 }, { "epoch": 2.166163634880496, "grad_norm": 0.24372613430023193, "learning_rate": 4.795742284911875e-05, "loss": 0.181, "num_input_tokens_seen": 25370256, "step": 26555 }, { "epoch": 2.166571498490905, "grad_norm": 0.11967752128839493, "learning_rate": 4.795601352404477e-05, "loss": 0.186, "num_input_tokens_seen": 25374256, "step": 26560 }, { "epoch": 2.1669793621013134, "grad_norm": 0.02202526107430458, "learning_rate": 4.795460373366215e-05, "loss": 0.0109, "num_input_tokens_seen": 25379120, "step": 26565 }, { "epoch": 2.167387225711722, "grad_norm": 8.680800437927246, "learning_rate": 4.795319347799947e-05, "loss": 0.1475, "num_input_tokens_seen": 25383936, "step": 26570 }, { "epoch": 2.1677950893221305, "grad_norm": 1.5269306898117065, "learning_rate": 4.795178275708531e-05, "loss": 0.3293, "num_input_tokens_seen": 25388592, "step": 26575 }, { "epoch": 2.1682029529325395, "grad_norm": 0.026616068556904793, "learning_rate": 4.795037157094826e-05, "loss": 0.2034, "num_input_tokens_seen": 25392832, "step": 26580 }, { "epoch": 2.168610816542948, "grad_norm": 0.042957205325365067, "learning_rate": 4.7948959919616934e-05, "loss": 0.1084, "num_input_tokens_seen": 25398288, "step": 26585 }, { "epoch": 2.1690186801533566, "grad_norm": 0.3184005320072174, "learning_rate": 4.7947547803119944e-05, "loss": 0.3319, "num_input_tokens_seen": 25402656, "step": 26590 }, { "epoch": 2.169426543763765, "grad_norm": 0.07560407370328903, "learning_rate": 4.794613522148591e-05, "loss": 0.0832, "num_input_tokens_seen": 25407712, "step": 26595 }, { "epoch": 2.169834407374174, "grad_norm": 0.23935264348983765, "learning_rate": 4.7944722174743475e-05, "loss": 0.0138, "num_input_tokens_seen": 25412896, "step": 26600 }, { "epoch": 2.1702422709845828, "grad_norm": 0.1299019753932953, "learning_rate": 4.7943308662921264e-05, "loss": 0.0851, "num_input_tokens_seen": 25417472, "step": 26605 }, { "epoch": 2.1706501345949913, "grad_norm": 0.3235940635204315, "learning_rate": 4.7941894686047925e-05, "loss": 0.1491, "num_input_tokens_seen": 25422368, "step": 26610 }, { "epoch": 2.1710579982054, "grad_norm": 0.07593920081853867, "learning_rate": 4.794048024415214e-05, "loss": 0.0302, "num_input_tokens_seen": 25427264, "step": 26615 }, { "epoch": 2.171465861815809, "grad_norm": 0.14969848096370697, "learning_rate": 4.793906533726257e-05, "loss": 0.0348, "num_input_tokens_seen": 25432624, "step": 26620 }, { "epoch": 2.1718737254262175, "grad_norm": 0.09531870484352112, "learning_rate": 4.793764996540789e-05, "loss": 0.0225, "num_input_tokens_seen": 25437344, "step": 26625 }, { "epoch": 2.172281589036626, "grad_norm": 0.05401831865310669, "learning_rate": 4.7936234128616786e-05, "loss": 0.0121, "num_input_tokens_seen": 25441968, "step": 26630 }, { "epoch": 2.1726894526470346, "grad_norm": 0.20290745794773102, "learning_rate": 4.793481782691797e-05, "loss": 0.0096, "num_input_tokens_seen": 25446144, "step": 26635 }, { "epoch": 2.1730973162574436, "grad_norm": 0.004114404786378145, "learning_rate": 4.793340106034015e-05, "loss": 0.1408, "num_input_tokens_seen": 25450544, "step": 26640 }, { "epoch": 2.173505179867852, "grad_norm": 0.062459077686071396, "learning_rate": 4.793198382891202e-05, "loss": 0.0034, "num_input_tokens_seen": 25454672, "step": 26645 }, { "epoch": 2.1739130434782608, "grad_norm": 0.16730757057666779, "learning_rate": 4.7930566132662326e-05, "loss": 0.1922, "num_input_tokens_seen": 25458416, "step": 26650 }, { "epoch": 2.1743209070886698, "grad_norm": 0.0039013258647173643, "learning_rate": 4.79291479716198e-05, "loss": 0.0768, "num_input_tokens_seen": 25462000, "step": 26655 }, { "epoch": 2.1747287706990783, "grad_norm": 0.09098342806100845, "learning_rate": 4.792772934581319e-05, "loss": 0.0027, "num_input_tokens_seen": 25466800, "step": 26660 }, { "epoch": 2.175136634309487, "grad_norm": 0.03621818497776985, "learning_rate": 4.792631025527125e-05, "loss": 0.2835, "num_input_tokens_seen": 25472160, "step": 26665 }, { "epoch": 2.1755444979198955, "grad_norm": 4.042669773101807, "learning_rate": 4.792489070002274e-05, "loss": 0.0194, "num_input_tokens_seen": 25476528, "step": 26670 }, { "epoch": 2.1759523615303045, "grad_norm": 6.180492401123047, "learning_rate": 4.792347068009644e-05, "loss": 0.0486, "num_input_tokens_seen": 25481104, "step": 26675 }, { "epoch": 2.176360225140713, "grad_norm": 0.013506336137652397, "learning_rate": 4.7922050195521127e-05, "loss": 0.0444, "num_input_tokens_seen": 25486080, "step": 26680 }, { "epoch": 2.1767680887511216, "grad_norm": 2.0044944286346436, "learning_rate": 4.79206292463256e-05, "loss": 0.4786, "num_input_tokens_seen": 25490848, "step": 26685 }, { "epoch": 2.17717595236153, "grad_norm": 0.008998855948448181, "learning_rate": 4.791920783253866e-05, "loss": 0.0037, "num_input_tokens_seen": 25495360, "step": 26690 }, { "epoch": 2.177583815971939, "grad_norm": 0.09128636121749878, "learning_rate": 4.791778595418911e-05, "loss": 0.0085, "num_input_tokens_seen": 25499456, "step": 26695 }, { "epoch": 2.1779916795823477, "grad_norm": 0.014479203149676323, "learning_rate": 4.791636361130578e-05, "loss": 0.0291, "num_input_tokens_seen": 25503616, "step": 26700 }, { "epoch": 2.1783995431927563, "grad_norm": 1.7570981979370117, "learning_rate": 4.791494080391749e-05, "loss": 0.1533, "num_input_tokens_seen": 25508368, "step": 26705 }, { "epoch": 2.178807406803165, "grad_norm": 0.24053998291492462, "learning_rate": 4.79135175320531e-05, "loss": 0.0161, "num_input_tokens_seen": 25512736, "step": 26710 }, { "epoch": 2.179215270413574, "grad_norm": 0.088060662150383, "learning_rate": 4.791209379574144e-05, "loss": 0.1456, "num_input_tokens_seen": 25517520, "step": 26715 }, { "epoch": 2.1796231340239824, "grad_norm": 0.06382130831480026, "learning_rate": 4.791066959501138e-05, "loss": 0.1322, "num_input_tokens_seen": 25521264, "step": 26720 }, { "epoch": 2.180030997634391, "grad_norm": 2.588364601135254, "learning_rate": 4.7909244929891776e-05, "loss": 0.0212, "num_input_tokens_seen": 25525792, "step": 26725 }, { "epoch": 2.1804388612447996, "grad_norm": 0.3478199243545532, "learning_rate": 4.7907819800411516e-05, "loss": 0.3266, "num_input_tokens_seen": 25530624, "step": 26730 }, { "epoch": 2.1808467248552086, "grad_norm": 11.397383689880371, "learning_rate": 4.7906394206599484e-05, "loss": 0.6466, "num_input_tokens_seen": 25536288, "step": 26735 }, { "epoch": 2.181254588465617, "grad_norm": 0.032053545117378235, "learning_rate": 4.7904968148484566e-05, "loss": 0.259, "num_input_tokens_seen": 25541328, "step": 26740 }, { "epoch": 2.1816624520760257, "grad_norm": 13.15073299407959, "learning_rate": 4.7903541626095684e-05, "loss": 0.2693, "num_input_tokens_seen": 25545840, "step": 26745 }, { "epoch": 2.1820703156864343, "grad_norm": 0.10274283587932587, "learning_rate": 4.790211463946174e-05, "loss": 0.0633, "num_input_tokens_seen": 25551104, "step": 26750 }, { "epoch": 2.1824781792968433, "grad_norm": 0.22986459732055664, "learning_rate": 4.790068718861167e-05, "loss": 0.014, "num_input_tokens_seen": 25554880, "step": 26755 }, { "epoch": 2.182886042907252, "grad_norm": 0.3326258361339569, "learning_rate": 4.78992592735744e-05, "loss": 0.3556, "num_input_tokens_seen": 25560064, "step": 26760 }, { "epoch": 2.1832939065176604, "grad_norm": 0.2511982023715973, "learning_rate": 4.789783089437887e-05, "loss": 0.1583, "num_input_tokens_seen": 25564080, "step": 26765 }, { "epoch": 2.183701770128069, "grad_norm": 0.10200077295303345, "learning_rate": 4.789640205105405e-05, "loss": 0.0107, "num_input_tokens_seen": 25568368, "step": 26770 }, { "epoch": 2.184109633738478, "grad_norm": 0.06718770414590836, "learning_rate": 4.789497274362888e-05, "loss": 0.0878, "num_input_tokens_seen": 25571696, "step": 26775 }, { "epoch": 2.1845174973488866, "grad_norm": 22.19113540649414, "learning_rate": 4.789354297213234e-05, "loss": 0.1433, "num_input_tokens_seen": 25576704, "step": 26780 }, { "epoch": 2.184925360959295, "grad_norm": 0.6955705881118774, "learning_rate": 4.789211273659341e-05, "loss": 0.1971, "num_input_tokens_seen": 25581664, "step": 26785 }, { "epoch": 2.1853332245697037, "grad_norm": 0.10251027345657349, "learning_rate": 4.789068203704109e-05, "loss": 0.1485, "num_input_tokens_seen": 25586336, "step": 26790 }, { "epoch": 2.1857410881801127, "grad_norm": 4.067751407623291, "learning_rate": 4.7889250873504366e-05, "loss": 0.1287, "num_input_tokens_seen": 25591168, "step": 26795 }, { "epoch": 2.1861489517905213, "grad_norm": 9.123823165893555, "learning_rate": 4.7887819246012254e-05, "loss": 0.37, "num_input_tokens_seen": 25595328, "step": 26800 }, { "epoch": 2.18655681540093, "grad_norm": 4.5705037117004395, "learning_rate": 4.788638715459378e-05, "loss": 0.3785, "num_input_tokens_seen": 25600112, "step": 26805 }, { "epoch": 2.1869646790113384, "grad_norm": 0.09331485629081726, "learning_rate": 4.788495459927795e-05, "loss": 0.1332, "num_input_tokens_seen": 25604896, "step": 26810 }, { "epoch": 2.1873725426217474, "grad_norm": 0.37922394275665283, "learning_rate": 4.788352158009383e-05, "loss": 0.0282, "num_input_tokens_seen": 25610512, "step": 26815 }, { "epoch": 2.187780406232156, "grad_norm": 0.05055265128612518, "learning_rate": 4.7882088097070444e-05, "loss": 0.1341, "num_input_tokens_seen": 25615584, "step": 26820 }, { "epoch": 2.1881882698425645, "grad_norm": 0.037240542471408844, "learning_rate": 4.788065415023685e-05, "loss": 0.2516, "num_input_tokens_seen": 25620720, "step": 26825 }, { "epoch": 2.1885961334529735, "grad_norm": 0.053026773035526276, "learning_rate": 4.787921973962213e-05, "loss": 0.0149, "num_input_tokens_seen": 25624992, "step": 26830 }, { "epoch": 2.189003997063382, "grad_norm": 0.22059965133666992, "learning_rate": 4.787778486525534e-05, "loss": 0.0137, "num_input_tokens_seen": 25628880, "step": 26835 }, { "epoch": 2.1894118606737907, "grad_norm": 0.11230645328760147, "learning_rate": 4.7876349527165566e-05, "loss": 0.1626, "num_input_tokens_seen": 25634000, "step": 26840 }, { "epoch": 2.1898197242841992, "grad_norm": 6.068425178527832, "learning_rate": 4.787491372538192e-05, "loss": 0.4499, "num_input_tokens_seen": 25638528, "step": 26845 }, { "epoch": 2.190227587894608, "grad_norm": 0.06213563680648804, "learning_rate": 4.787347745993349e-05, "loss": 0.0055, "num_input_tokens_seen": 25643248, "step": 26850 }, { "epoch": 2.190635451505017, "grad_norm": 0.012299483641982079, "learning_rate": 4.787204073084939e-05, "loss": 0.2354, "num_input_tokens_seen": 25647456, "step": 26855 }, { "epoch": 2.1910433151154254, "grad_norm": 8.192375183105469, "learning_rate": 4.7870603538158734e-05, "loss": 0.0746, "num_input_tokens_seen": 25652080, "step": 26860 }, { "epoch": 2.191451178725834, "grad_norm": 0.2537151873111725, "learning_rate": 4.7869165881890674e-05, "loss": 0.0077, "num_input_tokens_seen": 25656208, "step": 26865 }, { "epoch": 2.191859042336243, "grad_norm": 7.874162197113037, "learning_rate": 4.786772776207434e-05, "loss": 0.1592, "num_input_tokens_seen": 25660880, "step": 26870 }, { "epoch": 2.1922669059466515, "grad_norm": 11.239322662353516, "learning_rate": 4.7866289178738876e-05, "loss": 0.4527, "num_input_tokens_seen": 25665616, "step": 26875 }, { "epoch": 2.19267476955706, "grad_norm": 0.009005576372146606, "learning_rate": 4.7864850131913455e-05, "loss": 0.2221, "num_input_tokens_seen": 25669824, "step": 26880 }, { "epoch": 2.1930826331674687, "grad_norm": 3.785733222961426, "learning_rate": 4.786341062162723e-05, "loss": 0.1972, "num_input_tokens_seen": 25674832, "step": 26885 }, { "epoch": 2.1934904967778777, "grad_norm": 15.135139465332031, "learning_rate": 4.786197064790938e-05, "loss": 0.1535, "num_input_tokens_seen": 25679904, "step": 26890 }, { "epoch": 2.1938983603882862, "grad_norm": 0.11776744574308395, "learning_rate": 4.786053021078911e-05, "loss": 0.0469, "num_input_tokens_seen": 25685024, "step": 26895 }, { "epoch": 2.194306223998695, "grad_norm": 0.4508489668369293, "learning_rate": 4.78590893102956e-05, "loss": 0.0365, "num_input_tokens_seen": 25690160, "step": 26900 }, { "epoch": 2.1947140876091034, "grad_norm": 0.7983827590942383, "learning_rate": 4.785764794645807e-05, "loss": 0.1615, "num_input_tokens_seen": 25695072, "step": 26905 }, { "epoch": 2.1951219512195124, "grad_norm": 3.5672895908355713, "learning_rate": 4.785620611930573e-05, "loss": 0.2017, "num_input_tokens_seen": 25700352, "step": 26910 }, { "epoch": 2.195529814829921, "grad_norm": 0.23804135620594025, "learning_rate": 4.78547638288678e-05, "loss": 0.0167, "num_input_tokens_seen": 25705312, "step": 26915 }, { "epoch": 2.1959376784403295, "grad_norm": 0.02174876630306244, "learning_rate": 4.785332107517352e-05, "loss": 0.0539, "num_input_tokens_seen": 25710112, "step": 26920 }, { "epoch": 2.196345542050738, "grad_norm": 4.424301624298096, "learning_rate": 4.7851877858252135e-05, "loss": 0.3958, "num_input_tokens_seen": 25714336, "step": 26925 }, { "epoch": 2.196753405661147, "grad_norm": 2.5964369773864746, "learning_rate": 4.785043417813289e-05, "loss": 0.244, "num_input_tokens_seen": 25719712, "step": 26930 }, { "epoch": 2.1971612692715556, "grad_norm": 0.687418520450592, "learning_rate": 4.784899003484506e-05, "loss": 0.0055, "num_input_tokens_seen": 25724848, "step": 26935 }, { "epoch": 2.197569132881964, "grad_norm": 0.048053111881017685, "learning_rate": 4.78475454284179e-05, "loss": 0.0122, "num_input_tokens_seen": 25729376, "step": 26940 }, { "epoch": 2.1979769964923728, "grad_norm": 0.12525229156017303, "learning_rate": 4.784610035888072e-05, "loss": 0.1032, "num_input_tokens_seen": 25733504, "step": 26945 }, { "epoch": 2.198384860102782, "grad_norm": 0.023667892441153526, "learning_rate": 4.784465482626278e-05, "loss": 0.183, "num_input_tokens_seen": 25737328, "step": 26950 }, { "epoch": 2.1987927237131903, "grad_norm": 0.014486460946500301, "learning_rate": 4.78432088305934e-05, "loss": 0.0148, "num_input_tokens_seen": 25742352, "step": 26955 }, { "epoch": 2.199200587323599, "grad_norm": 3.964844226837158, "learning_rate": 4.784176237190189e-05, "loss": 0.0109, "num_input_tokens_seen": 25747632, "step": 26960 }, { "epoch": 2.1996084509340075, "grad_norm": 0.09677034616470337, "learning_rate": 4.784031545021755e-05, "loss": 0.0032, "num_input_tokens_seen": 25752208, "step": 26965 }, { "epoch": 2.2000163145444165, "grad_norm": 0.034014683216810226, "learning_rate": 4.783886806556973e-05, "loss": 0.4854, "num_input_tokens_seen": 25756880, "step": 26970 }, { "epoch": 2.200424178154825, "grad_norm": 0.2512630522251129, "learning_rate": 4.7837420217987756e-05, "loss": 0.0452, "num_input_tokens_seen": 25760912, "step": 26975 }, { "epoch": 2.2008320417652336, "grad_norm": 0.7313953042030334, "learning_rate": 4.783597190750098e-05, "loss": 0.0119, "num_input_tokens_seen": 25765424, "step": 26980 }, { "epoch": 2.201239905375642, "grad_norm": 0.2827792763710022, "learning_rate": 4.7834523134138754e-05, "loss": 0.0401, "num_input_tokens_seen": 25769552, "step": 26985 }, { "epoch": 2.201647768986051, "grad_norm": 2.1316440105438232, "learning_rate": 4.7833073897930446e-05, "loss": 0.0588, "num_input_tokens_seen": 25774576, "step": 26990 }, { "epoch": 2.2020556325964598, "grad_norm": 17.30994415283203, "learning_rate": 4.783162419890544e-05, "loss": 0.344, "num_input_tokens_seen": 25779136, "step": 26995 }, { "epoch": 2.2024634962068683, "grad_norm": 0.006275056395679712, "learning_rate": 4.78301740370931e-05, "loss": 0.0995, "num_input_tokens_seen": 25784320, "step": 27000 }, { "epoch": 2.2028713598172773, "grad_norm": 4.676844120025635, "learning_rate": 4.782872341252285e-05, "loss": 0.1432, "num_input_tokens_seen": 25788560, "step": 27005 }, { "epoch": 2.203279223427686, "grad_norm": 1.192429542541504, "learning_rate": 4.782727232522406e-05, "loss": 0.0087, "num_input_tokens_seen": 25793536, "step": 27010 }, { "epoch": 2.2036870870380945, "grad_norm": 0.010725549422204494, "learning_rate": 4.782582077522617e-05, "loss": 0.0063, "num_input_tokens_seen": 25797760, "step": 27015 }, { "epoch": 2.204094950648503, "grad_norm": 0.11543057858943939, "learning_rate": 4.782436876255859e-05, "loss": 0.0026, "num_input_tokens_seen": 25802336, "step": 27020 }, { "epoch": 2.2045028142589116, "grad_norm": 0.5159903764724731, "learning_rate": 4.782291628725075e-05, "loss": 0.1752, "num_input_tokens_seen": 25807680, "step": 27025 }, { "epoch": 2.2049106778693206, "grad_norm": 0.05995262786746025, "learning_rate": 4.78214633493321e-05, "loss": 0.1059, "num_input_tokens_seen": 25812336, "step": 27030 }, { "epoch": 2.205318541479729, "grad_norm": 1.1074398756027222, "learning_rate": 4.782000994883209e-05, "loss": 0.2492, "num_input_tokens_seen": 25816688, "step": 27035 }, { "epoch": 2.2057264050901377, "grad_norm": 0.23853260278701782, "learning_rate": 4.781855608578016e-05, "loss": 0.1874, "num_input_tokens_seen": 25821408, "step": 27040 }, { "epoch": 2.2061342687005467, "grad_norm": 0.7424235939979553, "learning_rate": 4.7817101760205805e-05, "loss": 0.0163, "num_input_tokens_seen": 25826192, "step": 27045 }, { "epoch": 2.2065421323109553, "grad_norm": 0.2567664384841919, "learning_rate": 4.7815646972138496e-05, "loss": 0.2399, "num_input_tokens_seen": 25830560, "step": 27050 }, { "epoch": 2.206949995921364, "grad_norm": 0.010339165106415749, "learning_rate": 4.781419172160771e-05, "loss": 0.1681, "num_input_tokens_seen": 25834800, "step": 27055 }, { "epoch": 2.2073578595317724, "grad_norm": 0.013127947226166725, "learning_rate": 4.7812736008642964e-05, "loss": 0.0171, "num_input_tokens_seen": 25839920, "step": 27060 }, { "epoch": 2.2077657231421814, "grad_norm": 0.19549408555030823, "learning_rate": 4.781127983327374e-05, "loss": 0.2742, "num_input_tokens_seen": 25844000, "step": 27065 }, { "epoch": 2.20817358675259, "grad_norm": 2.883146047592163, "learning_rate": 4.7809823195529574e-05, "loss": 0.0818, "num_input_tokens_seen": 25848320, "step": 27070 }, { "epoch": 2.2085814503629986, "grad_norm": 0.07154054939746857, "learning_rate": 4.780836609543998e-05, "loss": 0.0031, "num_input_tokens_seen": 25853216, "step": 27075 }, { "epoch": 2.208989313973407, "grad_norm": 0.04872383549809456, "learning_rate": 4.78069085330345e-05, "loss": 0.24, "num_input_tokens_seen": 25858048, "step": 27080 }, { "epoch": 2.209397177583816, "grad_norm": 0.18277060985565186, "learning_rate": 4.7805450508342674e-05, "loss": 0.4179, "num_input_tokens_seen": 25862256, "step": 27085 }, { "epoch": 2.2098050411942247, "grad_norm": 0.344319611787796, "learning_rate": 4.7803992021394056e-05, "loss": 0.0143, "num_input_tokens_seen": 25866800, "step": 27090 }, { "epoch": 2.2102129048046333, "grad_norm": 2.1532793045043945, "learning_rate": 4.780253307221821e-05, "loss": 0.4843, "num_input_tokens_seen": 25871888, "step": 27095 }, { "epoch": 2.210620768415042, "grad_norm": 0.5844927430152893, "learning_rate": 4.780107366084471e-05, "loss": 0.4019, "num_input_tokens_seen": 25876432, "step": 27100 }, { "epoch": 2.211028632025451, "grad_norm": 0.16232994198799133, "learning_rate": 4.779961378730313e-05, "loss": 0.3939, "num_input_tokens_seen": 25880416, "step": 27105 }, { "epoch": 2.2114364956358594, "grad_norm": 0.1794864982366562, "learning_rate": 4.779815345162307e-05, "loss": 0.0164, "num_input_tokens_seen": 25885392, "step": 27110 }, { "epoch": 2.211844359246268, "grad_norm": 0.06064187362790108, "learning_rate": 4.779669265383413e-05, "loss": 0.0125, "num_input_tokens_seen": 25890400, "step": 27115 }, { "epoch": 2.2122522228566766, "grad_norm": 0.07229284942150116, "learning_rate": 4.779523139396592e-05, "loss": 0.014, "num_input_tokens_seen": 25894960, "step": 27120 }, { "epoch": 2.2126600864670856, "grad_norm": 0.037604961544275284, "learning_rate": 4.779376967204805e-05, "loss": 0.1774, "num_input_tokens_seen": 25899328, "step": 27125 }, { "epoch": 2.213067950077494, "grad_norm": 1.7605798244476318, "learning_rate": 4.7792307488110155e-05, "loss": 0.384, "num_input_tokens_seen": 25904496, "step": 27130 }, { "epoch": 2.2134758136879027, "grad_norm": 1.27112877368927, "learning_rate": 4.779084484218187e-05, "loss": 0.0208, "num_input_tokens_seen": 25908752, "step": 27135 }, { "epoch": 2.2138836772983113, "grad_norm": 0.02481752075254917, "learning_rate": 4.778938173429285e-05, "loss": 0.0143, "num_input_tokens_seen": 25913792, "step": 27140 }, { "epoch": 2.2142915409087203, "grad_norm": 16.98518943786621, "learning_rate": 4.778791816447274e-05, "loss": 0.1635, "num_input_tokens_seen": 25918304, "step": 27145 }, { "epoch": 2.214699404519129, "grad_norm": 3.4024159908294678, "learning_rate": 4.778645413275121e-05, "loss": 0.1635, "num_input_tokens_seen": 25922128, "step": 27150 }, { "epoch": 2.2151072681295374, "grad_norm": 0.21173550188541412, "learning_rate": 4.778498963915795e-05, "loss": 0.1576, "num_input_tokens_seen": 25926528, "step": 27155 }, { "epoch": 2.215515131739946, "grad_norm": 0.42402803897857666, "learning_rate": 4.778352468372262e-05, "loss": 0.0114, "num_input_tokens_seen": 25932112, "step": 27160 }, { "epoch": 2.215922995350355, "grad_norm": 0.08629408478736877, "learning_rate": 4.778205926647493e-05, "loss": 0.0052, "num_input_tokens_seen": 25936240, "step": 27165 }, { "epoch": 2.2163308589607635, "grad_norm": 0.039971206337213516, "learning_rate": 4.778059338744458e-05, "loss": 0.0086, "num_input_tokens_seen": 25940784, "step": 27170 }, { "epoch": 2.216738722571172, "grad_norm": 0.010442032478749752, "learning_rate": 4.777912704666129e-05, "loss": 0.0079, "num_input_tokens_seen": 25945712, "step": 27175 }, { "epoch": 2.217146586181581, "grad_norm": 0.11787881702184677, "learning_rate": 4.777766024415476e-05, "loss": 0.6089, "num_input_tokens_seen": 25950224, "step": 27180 }, { "epoch": 2.2175544497919897, "grad_norm": 0.14464154839515686, "learning_rate": 4.7776192979954734e-05, "loss": 0.139, "num_input_tokens_seen": 25954784, "step": 27185 }, { "epoch": 2.2179623134023982, "grad_norm": 5.017892837524414, "learning_rate": 4.777472525409097e-05, "loss": 0.2178, "num_input_tokens_seen": 25960656, "step": 27190 }, { "epoch": 2.218370177012807, "grad_norm": 28.813859939575195, "learning_rate": 4.777325706659319e-05, "loss": 0.2553, "num_input_tokens_seen": 25965504, "step": 27195 }, { "epoch": 2.2187780406232154, "grad_norm": 20.932924270629883, "learning_rate": 4.777178841749117e-05, "loss": 0.1104, "num_input_tokens_seen": 25969408, "step": 27200 }, { "epoch": 2.2191859042336244, "grad_norm": 0.051041100174188614, "learning_rate": 4.7770319306814684e-05, "loss": 0.2917, "num_input_tokens_seen": 25973264, "step": 27205 }, { "epoch": 2.219593767844033, "grad_norm": 0.12705442309379578, "learning_rate": 4.776884973459349e-05, "loss": 0.5592, "num_input_tokens_seen": 25977696, "step": 27210 }, { "epoch": 2.2200016314544415, "grad_norm": 0.11473017930984497, "learning_rate": 4.776737970085739e-05, "loss": 0.1258, "num_input_tokens_seen": 25982544, "step": 27215 }, { "epoch": 2.2204094950648505, "grad_norm": 0.0765887200832367, "learning_rate": 4.7765909205636175e-05, "loss": 0.0092, "num_input_tokens_seen": 25987312, "step": 27220 }, { "epoch": 2.220817358675259, "grad_norm": 0.09068723767995834, "learning_rate": 4.7764438248959656e-05, "loss": 0.0439, "num_input_tokens_seen": 25991520, "step": 27225 }, { "epoch": 2.2212252222856677, "grad_norm": 0.057350043207407, "learning_rate": 4.776296683085765e-05, "loss": 0.1676, "num_input_tokens_seen": 25995872, "step": 27230 }, { "epoch": 2.221633085896076, "grad_norm": 0.04852407053112984, "learning_rate": 4.776149495135997e-05, "loss": 0.1305, "num_input_tokens_seen": 26001536, "step": 27235 }, { "epoch": 2.2220409495064852, "grad_norm": 0.11777294427156448, "learning_rate": 4.776002261049647e-05, "loss": 0.4765, "num_input_tokens_seen": 26007216, "step": 27240 }, { "epoch": 2.222448813116894, "grad_norm": 0.022617582231760025, "learning_rate": 4.775854980829698e-05, "loss": 0.1221, "num_input_tokens_seen": 26012048, "step": 27245 }, { "epoch": 2.2228566767273024, "grad_norm": 4.559422969818115, "learning_rate": 4.7757076544791356e-05, "loss": 0.1077, "num_input_tokens_seen": 26016032, "step": 27250 }, { "epoch": 2.223264540337711, "grad_norm": 0.03930630162358284, "learning_rate": 4.775560282000946e-05, "loss": 0.013, "num_input_tokens_seen": 26021328, "step": 27255 }, { "epoch": 2.22367240394812, "grad_norm": 4.50154447555542, "learning_rate": 4.775412863398116e-05, "loss": 0.1823, "num_input_tokens_seen": 26025584, "step": 27260 }, { "epoch": 2.2240802675585285, "grad_norm": 9.204034805297852, "learning_rate": 4.7752653986736354e-05, "loss": 0.0343, "num_input_tokens_seen": 26030960, "step": 27265 }, { "epoch": 2.224488131168937, "grad_norm": 0.017077933996915817, "learning_rate": 4.775117887830491e-05, "loss": 0.0555, "num_input_tokens_seen": 26034992, "step": 27270 }, { "epoch": 2.2248959947793456, "grad_norm": 0.07165905833244324, "learning_rate": 4.774970330871674e-05, "loss": 0.2855, "num_input_tokens_seen": 26039776, "step": 27275 }, { "epoch": 2.2253038583897546, "grad_norm": 2.2915215492248535, "learning_rate": 4.774822727800175e-05, "loss": 0.3895, "num_input_tokens_seen": 26044000, "step": 27280 }, { "epoch": 2.225711722000163, "grad_norm": 16.195425033569336, "learning_rate": 4.774675078618986e-05, "loss": 0.1112, "num_input_tokens_seen": 26049264, "step": 27285 }, { "epoch": 2.2261195856105718, "grad_norm": 0.03232457488775253, "learning_rate": 4.7745273833311e-05, "loss": 0.0102, "num_input_tokens_seen": 26053584, "step": 27290 }, { "epoch": 2.2265274492209803, "grad_norm": 0.02422778122127056, "learning_rate": 4.77437964193951e-05, "loss": 0.1815, "num_input_tokens_seen": 26058336, "step": 27295 }, { "epoch": 2.2269353128313893, "grad_norm": 0.05636953189969063, "learning_rate": 4.774231854447211e-05, "loss": 0.2883, "num_input_tokens_seen": 26063488, "step": 27300 }, { "epoch": 2.227343176441798, "grad_norm": 0.037190891802310944, "learning_rate": 4.774084020857199e-05, "loss": 0.0098, "num_input_tokens_seen": 26068736, "step": 27305 }, { "epoch": 2.2277510400522065, "grad_norm": 0.06437427550554276, "learning_rate": 4.77393614117247e-05, "loss": 0.0358, "num_input_tokens_seen": 26073568, "step": 27310 }, { "epoch": 2.228158903662615, "grad_norm": 0.035413146018981934, "learning_rate": 4.7737882153960225e-05, "loss": 0.1365, "num_input_tokens_seen": 26078512, "step": 27315 }, { "epoch": 2.228566767273024, "grad_norm": 0.048347581177949905, "learning_rate": 4.7736402435308525e-05, "loss": 0.3146, "num_input_tokens_seen": 26082832, "step": 27320 }, { "epoch": 2.2289746308834326, "grad_norm": 0.12440314143896103, "learning_rate": 4.7734922255799616e-05, "loss": 0.0954, "num_input_tokens_seen": 26087152, "step": 27325 }, { "epoch": 2.229382494493841, "grad_norm": 0.05302411690354347, "learning_rate": 4.7733441615463496e-05, "loss": 0.0159, "num_input_tokens_seen": 26093264, "step": 27330 }, { "epoch": 2.2297903581042497, "grad_norm": 0.274842768907547, "learning_rate": 4.7731960514330176e-05, "loss": 0.1047, "num_input_tokens_seen": 26098720, "step": 27335 }, { "epoch": 2.2301982217146588, "grad_norm": 0.09515618532896042, "learning_rate": 4.773047895242968e-05, "loss": 0.1483, "num_input_tokens_seen": 26103328, "step": 27340 }, { "epoch": 2.2306060853250673, "grad_norm": 0.1616646647453308, "learning_rate": 4.772899692979202e-05, "loss": 0.0176, "num_input_tokens_seen": 26107984, "step": 27345 }, { "epoch": 2.231013948935476, "grad_norm": 0.3031795620918274, "learning_rate": 4.772751444644726e-05, "loss": 0.0798, "num_input_tokens_seen": 26113424, "step": 27350 }, { "epoch": 2.2314218125458845, "grad_norm": 0.33190760016441345, "learning_rate": 4.772603150242544e-05, "loss": 0.4013, "num_input_tokens_seen": 26118160, "step": 27355 }, { "epoch": 2.2318296761562935, "grad_norm": 0.25538548827171326, "learning_rate": 4.772454809775662e-05, "loss": 0.2502, "num_input_tokens_seen": 26122480, "step": 27360 }, { "epoch": 2.232237539766702, "grad_norm": 0.018562784418463707, "learning_rate": 4.7723064232470854e-05, "loss": 0.467, "num_input_tokens_seen": 26127408, "step": 27365 }, { "epoch": 2.2326454033771106, "grad_norm": 0.03612007573246956, "learning_rate": 4.772157990659824e-05, "loss": 0.0251, "num_input_tokens_seen": 26132992, "step": 27370 }, { "epoch": 2.233053266987519, "grad_norm": 0.06563729792833328, "learning_rate": 4.772009512016886e-05, "loss": 0.0067, "num_input_tokens_seen": 26138320, "step": 27375 }, { "epoch": 2.233461130597928, "grad_norm": 0.021850822493433952, "learning_rate": 4.7718609873212796e-05, "loss": 0.2428, "num_input_tokens_seen": 26143024, "step": 27380 }, { "epoch": 2.2338689942083367, "grad_norm": 0.03541411831974983, "learning_rate": 4.771712416576018e-05, "loss": 0.5522, "num_input_tokens_seen": 26147536, "step": 27385 }, { "epoch": 2.2342768578187453, "grad_norm": 0.9150498509407043, "learning_rate": 4.7715637997841094e-05, "loss": 0.233, "num_input_tokens_seen": 26152784, "step": 27390 }, { "epoch": 2.2346847214291543, "grad_norm": 0.051794689148664474, "learning_rate": 4.771415136948568e-05, "loss": 0.0104, "num_input_tokens_seen": 26158144, "step": 27395 }, { "epoch": 2.235092585039563, "grad_norm": 0.06459685415029526, "learning_rate": 4.7712664280724076e-05, "loss": 0.3013, "num_input_tokens_seen": 26162784, "step": 27400 }, { "epoch": 2.2355004486499714, "grad_norm": 28.217395782470703, "learning_rate": 4.7711176731586415e-05, "loss": 0.2448, "num_input_tokens_seen": 26167200, "step": 27405 }, { "epoch": 2.23590831226038, "grad_norm": 0.23219667375087738, "learning_rate": 4.770968872210286e-05, "loss": 0.0188, "num_input_tokens_seen": 26171296, "step": 27410 }, { "epoch": 2.236316175870789, "grad_norm": 2.554456949234009, "learning_rate": 4.770820025230356e-05, "loss": 0.1042, "num_input_tokens_seen": 26176160, "step": 27415 }, { "epoch": 2.2367240394811976, "grad_norm": 4.248425006866455, "learning_rate": 4.770671132221869e-05, "loss": 0.0172, "num_input_tokens_seen": 26181440, "step": 27420 }, { "epoch": 2.237131903091606, "grad_norm": 2.989360809326172, "learning_rate": 4.7705221931878424e-05, "loss": 0.1726, "num_input_tokens_seen": 26185712, "step": 27425 }, { "epoch": 2.2375397667020147, "grad_norm": 0.4561746418476105, "learning_rate": 4.770373208131296e-05, "loss": 0.0099, "num_input_tokens_seen": 26190992, "step": 27430 }, { "epoch": 2.2379476303124237, "grad_norm": 0.24772056937217712, "learning_rate": 4.77022417705525e-05, "loss": 0.0184, "num_input_tokens_seen": 26194784, "step": 27435 }, { "epoch": 2.2383554939228323, "grad_norm": 0.0679837167263031, "learning_rate": 4.770075099962724e-05, "loss": 0.1571, "num_input_tokens_seen": 26199200, "step": 27440 }, { "epoch": 2.238763357533241, "grad_norm": 7.684980392456055, "learning_rate": 4.769925976856741e-05, "loss": 0.4105, "num_input_tokens_seen": 26204384, "step": 27445 }, { "epoch": 2.2391712211436494, "grad_norm": 0.09725990891456604, "learning_rate": 4.769776807740322e-05, "loss": 0.3981, "num_input_tokens_seen": 26209008, "step": 27450 }, { "epoch": 2.2395790847540584, "grad_norm": 0.12058129906654358, "learning_rate": 4.7696275926164924e-05, "loss": 0.4555, "num_input_tokens_seen": 26213472, "step": 27455 }, { "epoch": 2.239986948364467, "grad_norm": 0.11763578653335571, "learning_rate": 4.769478331488275e-05, "loss": 0.3143, "num_input_tokens_seen": 26218288, "step": 27460 }, { "epoch": 2.2403948119748756, "grad_norm": 0.053706731647253036, "learning_rate": 4.7693290243586976e-05, "loss": 0.0257, "num_input_tokens_seen": 26223168, "step": 27465 }, { "epoch": 2.240802675585284, "grad_norm": 3.8663570880889893, "learning_rate": 4.769179671230784e-05, "loss": 0.1621, "num_input_tokens_seen": 26227984, "step": 27470 }, { "epoch": 2.241210539195693, "grad_norm": 8.331260681152344, "learning_rate": 4.769030272107563e-05, "loss": 0.259, "num_input_tokens_seen": 26231600, "step": 27475 }, { "epoch": 2.2416184028061017, "grad_norm": 0.40381279587745667, "learning_rate": 4.768880826992063e-05, "loss": 0.1449, "num_input_tokens_seen": 26236608, "step": 27480 }, { "epoch": 2.2420262664165103, "grad_norm": 2.2492499351501465, "learning_rate": 4.7687313358873124e-05, "loss": 0.329, "num_input_tokens_seen": 26241968, "step": 27485 }, { "epoch": 2.242434130026919, "grad_norm": 6.821743011474609, "learning_rate": 4.768581798796342e-05, "loss": 0.1653, "num_input_tokens_seen": 26247360, "step": 27490 }, { "epoch": 2.242841993637328, "grad_norm": 0.1772637665271759, "learning_rate": 4.768432215722182e-05, "loss": 0.0242, "num_input_tokens_seen": 26251712, "step": 27495 }, { "epoch": 2.2432498572477364, "grad_norm": 0.1501043736934662, "learning_rate": 4.768282586667865e-05, "loss": 0.0161, "num_input_tokens_seen": 26257216, "step": 27500 }, { "epoch": 2.243657720858145, "grad_norm": 12.228683471679688, "learning_rate": 4.7681329116364234e-05, "loss": 0.1917, "num_input_tokens_seen": 26261712, "step": 27505 }, { "epoch": 2.2440655844685535, "grad_norm": 0.5939521789550781, "learning_rate": 4.7679831906308925e-05, "loss": 0.3388, "num_input_tokens_seen": 26266448, "step": 27510 }, { "epoch": 2.2444734480789625, "grad_norm": 0.09547565877437592, "learning_rate": 4.767833423654305e-05, "loss": 0.0138, "num_input_tokens_seen": 26271744, "step": 27515 }, { "epoch": 2.244881311689371, "grad_norm": 0.27293848991394043, "learning_rate": 4.767683610709699e-05, "loss": 0.052, "num_input_tokens_seen": 26277136, "step": 27520 }, { "epoch": 2.2452891752997797, "grad_norm": 0.12126509100198746, "learning_rate": 4.767533751800108e-05, "loss": 0.1053, "num_input_tokens_seen": 26281232, "step": 27525 }, { "epoch": 2.2456970389101882, "grad_norm": 0.3490872383117676, "learning_rate": 4.767383846928573e-05, "loss": 0.1196, "num_input_tokens_seen": 26285488, "step": 27530 }, { "epoch": 2.2461049025205972, "grad_norm": 0.016229718923568726, "learning_rate": 4.76723389609813e-05, "loss": 0.1772, "num_input_tokens_seen": 26289408, "step": 27535 }, { "epoch": 2.246512766131006, "grad_norm": 0.6739904284477234, "learning_rate": 4.76708389931182e-05, "loss": 0.1336, "num_input_tokens_seen": 26294352, "step": 27540 }, { "epoch": 2.2469206297414144, "grad_norm": 0.014386815950274467, "learning_rate": 4.7669338565726826e-05, "loss": 0.0296, "num_input_tokens_seen": 26299728, "step": 27545 }, { "epoch": 2.247328493351823, "grad_norm": 0.04458429291844368, "learning_rate": 4.766783767883759e-05, "loss": 0.0204, "num_input_tokens_seen": 26304432, "step": 27550 }, { "epoch": 2.247736356962232, "grad_norm": 0.2265152931213379, "learning_rate": 4.766633633248091e-05, "loss": 0.1874, "num_input_tokens_seen": 26309408, "step": 27555 }, { "epoch": 2.2481442205726405, "grad_norm": 0.1634470671415329, "learning_rate": 4.766483452668723e-05, "loss": 0.0158, "num_input_tokens_seen": 26313328, "step": 27560 }, { "epoch": 2.248552084183049, "grad_norm": 0.04868458956480026, "learning_rate": 4.766333226148699e-05, "loss": 0.4807, "num_input_tokens_seen": 26317936, "step": 27565 }, { "epoch": 2.248959947793458, "grad_norm": 8.062925338745117, "learning_rate": 4.766182953691063e-05, "loss": 0.524, "num_input_tokens_seen": 26322304, "step": 27570 }, { "epoch": 2.2493678114038667, "grad_norm": 0.054495904594659805, "learning_rate": 4.766032635298861e-05, "loss": 0.146, "num_input_tokens_seen": 26325936, "step": 27575 }, { "epoch": 2.2497756750142752, "grad_norm": 22.28544807434082, "learning_rate": 4.76588227097514e-05, "loss": 0.7162, "num_input_tokens_seen": 26330928, "step": 27580 }, { "epoch": 2.250183538624684, "grad_norm": 29.09794807434082, "learning_rate": 4.76573186072295e-05, "loss": 0.4028, "num_input_tokens_seen": 26335552, "step": 27585 }, { "epoch": 2.2505914022350924, "grad_norm": 14.86910629272461, "learning_rate": 4.765581404545337e-05, "loss": 0.3774, "num_input_tokens_seen": 26339376, "step": 27590 }, { "epoch": 2.2509992658455014, "grad_norm": 2.3993852138519287, "learning_rate": 4.7654309024453516e-05, "loss": 0.352, "num_input_tokens_seen": 26344640, "step": 27595 }, { "epoch": 2.25140712945591, "grad_norm": 11.504310607910156, "learning_rate": 4.765280354426045e-05, "loss": 0.392, "num_input_tokens_seen": 26349488, "step": 27600 }, { "epoch": 2.2518149930663185, "grad_norm": 8.93612289428711, "learning_rate": 4.765129760490467e-05, "loss": 0.3872, "num_input_tokens_seen": 26353728, "step": 27605 }, { "epoch": 2.2522228566767275, "grad_norm": 1.7405719757080078, "learning_rate": 4.764979120641672e-05, "loss": 0.289, "num_input_tokens_seen": 26358048, "step": 27610 }, { "epoch": 2.252630720287136, "grad_norm": 4.05859375, "learning_rate": 4.764828434882713e-05, "loss": 0.3848, "num_input_tokens_seen": 26362672, "step": 27615 }, { "epoch": 2.2530385838975446, "grad_norm": 3.267404079437256, "learning_rate": 4.764677703216643e-05, "loss": 1.1431, "num_input_tokens_seen": 26367024, "step": 27620 }, { "epoch": 2.253446447507953, "grad_norm": 5.326168060302734, "learning_rate": 4.7645269256465196e-05, "loss": 0.536, "num_input_tokens_seen": 26371968, "step": 27625 }, { "epoch": 2.253854311118362, "grad_norm": 9.54736042022705, "learning_rate": 4.764376102175397e-05, "loss": 0.4805, "num_input_tokens_seen": 26376064, "step": 27630 }, { "epoch": 2.2542621747287708, "grad_norm": 0.9158545136451721, "learning_rate": 4.764225232806333e-05, "loss": 0.4018, "num_input_tokens_seen": 26381712, "step": 27635 }, { "epoch": 2.2546700383391793, "grad_norm": 6.6275739669799805, "learning_rate": 4.7640743175423854e-05, "loss": 0.4862, "num_input_tokens_seen": 26386400, "step": 27640 }, { "epoch": 2.255077901949588, "grad_norm": 3.8288931846618652, "learning_rate": 4.763923356386614e-05, "loss": 0.4178, "num_input_tokens_seen": 26390896, "step": 27645 }, { "epoch": 2.255485765559997, "grad_norm": 1.7803932428359985, "learning_rate": 4.763772349342077e-05, "loss": 0.4445, "num_input_tokens_seen": 26395888, "step": 27650 }, { "epoch": 2.2558936291704055, "grad_norm": 0.707194447517395, "learning_rate": 4.7636212964118375e-05, "loss": 0.3794, "num_input_tokens_seen": 26400496, "step": 27655 }, { "epoch": 2.256301492780814, "grad_norm": 0.7705725431442261, "learning_rate": 4.763470197598956e-05, "loss": 0.316, "num_input_tokens_seen": 26404992, "step": 27660 }, { "epoch": 2.2567093563912226, "grad_norm": 1.2621961832046509, "learning_rate": 4.7633190529064954e-05, "loss": 0.25, "num_input_tokens_seen": 26409808, "step": 27665 }, { "epoch": 2.2571172200016316, "grad_norm": 0.9004406929016113, "learning_rate": 4.76316786233752e-05, "loss": 0.691, "num_input_tokens_seen": 26415024, "step": 27670 }, { "epoch": 2.25752508361204, "grad_norm": 0.6713990569114685, "learning_rate": 4.7630166258950925e-05, "loss": 0.3662, "num_input_tokens_seen": 26418992, "step": 27675 }, { "epoch": 2.2579329472224487, "grad_norm": 0.4796737730503082, "learning_rate": 4.7628653435822804e-05, "loss": 0.302, "num_input_tokens_seen": 26422928, "step": 27680 }, { "epoch": 2.2583408108328573, "grad_norm": 1.2167620658874512, "learning_rate": 4.762714015402149e-05, "loss": 0.337, "num_input_tokens_seen": 26428112, "step": 27685 }, { "epoch": 2.2587486744432663, "grad_norm": 0.5453066229820251, "learning_rate": 4.7625626413577665e-05, "loss": 0.331, "num_input_tokens_seen": 26433712, "step": 27690 }, { "epoch": 2.259156538053675, "grad_norm": 0.6171814203262329, "learning_rate": 4.7624112214522e-05, "loss": 0.2963, "num_input_tokens_seen": 26438320, "step": 27695 }, { "epoch": 2.2595644016640835, "grad_norm": 0.5894769430160522, "learning_rate": 4.76225975568852e-05, "loss": 0.4399, "num_input_tokens_seen": 26442000, "step": 27700 }, { "epoch": 2.259972265274492, "grad_norm": 0.7552899122238159, "learning_rate": 4.762108244069795e-05, "loss": 0.368, "num_input_tokens_seen": 26446560, "step": 27705 }, { "epoch": 2.260380128884901, "grad_norm": 1.6057932376861572, "learning_rate": 4.7619566865990975e-05, "loss": 0.3487, "num_input_tokens_seen": 26451584, "step": 27710 }, { "epoch": 2.2607879924953096, "grad_norm": 0.6780328750610352, "learning_rate": 4.7618050832794995e-05, "loss": 0.3889, "num_input_tokens_seen": 26457392, "step": 27715 }, { "epoch": 2.261195856105718, "grad_norm": 1.1787285804748535, "learning_rate": 4.761653434114074e-05, "loss": 0.4142, "num_input_tokens_seen": 26461680, "step": 27720 }, { "epoch": 2.2616037197161267, "grad_norm": 3.6499533653259277, "learning_rate": 4.7615017391058934e-05, "loss": 0.33, "num_input_tokens_seen": 26465856, "step": 27725 }, { "epoch": 2.2620115833265357, "grad_norm": 0.371311217546463, "learning_rate": 4.761349998258035e-05, "loss": 0.2616, "num_input_tokens_seen": 26471216, "step": 27730 }, { "epoch": 2.2624194469369443, "grad_norm": 1.1820610761642456, "learning_rate": 4.761198211573571e-05, "loss": 0.6515, "num_input_tokens_seen": 26475536, "step": 27735 }, { "epoch": 2.262827310547353, "grad_norm": 1.010707139968872, "learning_rate": 4.761046379055581e-05, "loss": 0.3818, "num_input_tokens_seen": 26479696, "step": 27740 }, { "epoch": 2.263235174157762, "grad_norm": 0.32275107502937317, "learning_rate": 4.7608945007071425e-05, "loss": 0.3352, "num_input_tokens_seen": 26484128, "step": 27745 }, { "epoch": 2.2636430377681704, "grad_norm": 0.48924747109413147, "learning_rate": 4.760742576531332e-05, "loss": 0.3249, "num_input_tokens_seen": 26488896, "step": 27750 }, { "epoch": 2.264050901378579, "grad_norm": 1.08083176612854, "learning_rate": 4.760590606531231e-05, "loss": 0.4185, "num_input_tokens_seen": 26493328, "step": 27755 }, { "epoch": 2.2644587649889876, "grad_norm": 0.34378376603126526, "learning_rate": 4.760438590709918e-05, "loss": 0.3418, "num_input_tokens_seen": 26498672, "step": 27760 }, { "epoch": 2.264866628599396, "grad_norm": 1.6399481296539307, "learning_rate": 4.7602865290704766e-05, "loss": 0.3799, "num_input_tokens_seen": 26503072, "step": 27765 }, { "epoch": 2.265274492209805, "grad_norm": 0.7412872314453125, "learning_rate": 4.760134421615987e-05, "loss": 0.3541, "num_input_tokens_seen": 26507440, "step": 27770 }, { "epoch": 2.2656823558202137, "grad_norm": 1.7057181596755981, "learning_rate": 4.759982268349533e-05, "loss": 0.3459, "num_input_tokens_seen": 26511936, "step": 27775 }, { "epoch": 2.2660902194306223, "grad_norm": 2.2249794006347656, "learning_rate": 4.759830069274199e-05, "loss": 0.4049, "num_input_tokens_seen": 26516864, "step": 27780 }, { "epoch": 2.2664980830410313, "grad_norm": 7.759976863861084, "learning_rate": 4.7596778243930694e-05, "loss": 0.5344, "num_input_tokens_seen": 26521696, "step": 27785 }, { "epoch": 2.26690594665144, "grad_norm": 15.827629089355469, "learning_rate": 4.7595255337092304e-05, "loss": 0.158, "num_input_tokens_seen": 26525376, "step": 27790 }, { "epoch": 2.2673138102618484, "grad_norm": 6.487016677856445, "learning_rate": 4.759373197225769e-05, "loss": 1.5546, "num_input_tokens_seen": 26530224, "step": 27795 }, { "epoch": 2.267721673872257, "grad_norm": 1.1563581228256226, "learning_rate": 4.759220814945774e-05, "loss": 0.351, "num_input_tokens_seen": 26534656, "step": 27800 }, { "epoch": 2.2681295374826655, "grad_norm": 1.0349787473678589, "learning_rate": 4.759068386872332e-05, "loss": 0.3094, "num_input_tokens_seen": 26539088, "step": 27805 }, { "epoch": 2.2685374010930746, "grad_norm": 0.38489970564842224, "learning_rate": 4.758915913008534e-05, "loss": 0.346, "num_input_tokens_seen": 26543200, "step": 27810 }, { "epoch": 2.268945264703483, "grad_norm": 0.3661995232105255, "learning_rate": 4.7587633933574705e-05, "loss": 0.3704, "num_input_tokens_seen": 26548480, "step": 27815 }, { "epoch": 2.2693531283138917, "grad_norm": 0.3808702230453491, "learning_rate": 4.758610827922233e-05, "loss": 0.3429, "num_input_tokens_seen": 26552864, "step": 27820 }, { "epoch": 2.2697609919243007, "grad_norm": 0.46088486909866333, "learning_rate": 4.758458216705913e-05, "loss": 0.2851, "num_input_tokens_seen": 26556848, "step": 27825 }, { "epoch": 2.2701688555347093, "grad_norm": 0.5573521852493286, "learning_rate": 4.758305559711605e-05, "loss": 0.4393, "num_input_tokens_seen": 26561296, "step": 27830 }, { "epoch": 2.270576719145118, "grad_norm": 0.28931868076324463, "learning_rate": 4.7581528569424026e-05, "loss": 0.3616, "num_input_tokens_seen": 26565984, "step": 27835 }, { "epoch": 2.2709845827555264, "grad_norm": 0.2714945375919342, "learning_rate": 4.758000108401402e-05, "loss": 0.3364, "num_input_tokens_seen": 26570816, "step": 27840 }, { "epoch": 2.2713924463659354, "grad_norm": 0.2544206380844116, "learning_rate": 4.757847314091698e-05, "loss": 0.3456, "num_input_tokens_seen": 26574464, "step": 27845 }, { "epoch": 2.271800309976344, "grad_norm": 0.2874172329902649, "learning_rate": 4.757694474016389e-05, "loss": 0.4085, "num_input_tokens_seen": 26579088, "step": 27850 }, { "epoch": 2.2722081735867525, "grad_norm": 0.1779792457818985, "learning_rate": 4.7575415881785726e-05, "loss": 0.3545, "num_input_tokens_seen": 26583584, "step": 27855 }, { "epoch": 2.272616037197161, "grad_norm": 0.2477790117263794, "learning_rate": 4.7573886565813466e-05, "loss": 0.3792, "num_input_tokens_seen": 26588208, "step": 27860 }, { "epoch": 2.27302390080757, "grad_norm": 0.28015777468681335, "learning_rate": 4.757235679227812e-05, "loss": 0.342, "num_input_tokens_seen": 26592128, "step": 27865 }, { "epoch": 2.2734317644179787, "grad_norm": 0.5607934594154358, "learning_rate": 4.75708265612107e-05, "loss": 0.4077, "num_input_tokens_seen": 26596880, "step": 27870 }, { "epoch": 2.2738396280283872, "grad_norm": 0.48627081513404846, "learning_rate": 4.756929587264221e-05, "loss": 0.3876, "num_input_tokens_seen": 26602320, "step": 27875 }, { "epoch": 2.274247491638796, "grad_norm": 0.1673145443201065, "learning_rate": 4.7567764726603684e-05, "loss": 0.3388, "num_input_tokens_seen": 26607184, "step": 27880 }, { "epoch": 2.274655355249205, "grad_norm": 0.6420775651931763, "learning_rate": 4.7566233123126155e-05, "loss": 0.4539, "num_input_tokens_seen": 26611696, "step": 27885 }, { "epoch": 2.2750632188596134, "grad_norm": 0.346208393573761, "learning_rate": 4.756470106224067e-05, "loss": 0.2814, "num_input_tokens_seen": 26616352, "step": 27890 }, { "epoch": 2.275471082470022, "grad_norm": 0.31507501006126404, "learning_rate": 4.7563168543978285e-05, "loss": 0.3729, "num_input_tokens_seen": 26621552, "step": 27895 }, { "epoch": 2.2758789460804305, "grad_norm": 0.43071794509887695, "learning_rate": 4.756163556837006e-05, "loss": 0.3327, "num_input_tokens_seen": 26626352, "step": 27900 }, { "epoch": 2.2762868096908395, "grad_norm": 0.4060749411582947, "learning_rate": 4.756010213544707e-05, "loss": 0.3348, "num_input_tokens_seen": 26631504, "step": 27905 }, { "epoch": 2.276694673301248, "grad_norm": 0.37160351872444153, "learning_rate": 4.75585682452404e-05, "loss": 0.3103, "num_input_tokens_seen": 26636688, "step": 27910 }, { "epoch": 2.2771025369116566, "grad_norm": 0.17674343287944794, "learning_rate": 4.755703389778113e-05, "loss": 0.3131, "num_input_tokens_seen": 26642064, "step": 27915 }, { "epoch": 2.2775104005220657, "grad_norm": 0.3319889307022095, "learning_rate": 4.755549909310037e-05, "loss": 0.3836, "num_input_tokens_seen": 26647200, "step": 27920 }, { "epoch": 2.2779182641324742, "grad_norm": 0.45296454429626465, "learning_rate": 4.755396383122923e-05, "loss": 0.418, "num_input_tokens_seen": 26652192, "step": 27925 }, { "epoch": 2.278326127742883, "grad_norm": 0.2825334072113037, "learning_rate": 4.755242811219883e-05, "loss": 0.3704, "num_input_tokens_seen": 26657472, "step": 27930 }, { "epoch": 2.2787339913532914, "grad_norm": 0.34038788080215454, "learning_rate": 4.755089193604029e-05, "loss": 0.3521, "num_input_tokens_seen": 26662192, "step": 27935 }, { "epoch": 2.2791418549637, "grad_norm": 0.6231538653373718, "learning_rate": 4.754935530278475e-05, "loss": 0.3543, "num_input_tokens_seen": 26666816, "step": 27940 }, { "epoch": 2.279549718574109, "grad_norm": 0.25158536434173584, "learning_rate": 4.754781821246336e-05, "loss": 0.3957, "num_input_tokens_seen": 26671200, "step": 27945 }, { "epoch": 2.2799575821845175, "grad_norm": 0.502467155456543, "learning_rate": 4.7546280665107286e-05, "loss": 0.3498, "num_input_tokens_seen": 26676144, "step": 27950 }, { "epoch": 2.280365445794926, "grad_norm": 0.21100863814353943, "learning_rate": 4.7544742660747675e-05, "loss": 0.3249, "num_input_tokens_seen": 26680592, "step": 27955 }, { "epoch": 2.280773309405335, "grad_norm": 0.3931499123573303, "learning_rate": 4.754320419941571e-05, "loss": 0.3525, "num_input_tokens_seen": 26685200, "step": 27960 }, { "epoch": 2.2811811730157436, "grad_norm": 0.7576189041137695, "learning_rate": 4.754166528114258e-05, "loss": 0.3522, "num_input_tokens_seen": 26689280, "step": 27965 }, { "epoch": 2.281589036626152, "grad_norm": 0.3566700220108032, "learning_rate": 4.754012590595947e-05, "loss": 0.3244, "num_input_tokens_seen": 26694864, "step": 27970 }, { "epoch": 2.2819969002365608, "grad_norm": 0.2422059029340744, "learning_rate": 4.753858607389758e-05, "loss": 0.3645, "num_input_tokens_seen": 26699488, "step": 27975 }, { "epoch": 2.2824047638469693, "grad_norm": 0.2187841236591339, "learning_rate": 4.753704578498814e-05, "loss": 0.3163, "num_input_tokens_seen": 26704752, "step": 27980 }, { "epoch": 2.2828126274573783, "grad_norm": 0.34655267000198364, "learning_rate": 4.753550503926235e-05, "loss": 0.3137, "num_input_tokens_seen": 26710112, "step": 27985 }, { "epoch": 2.283220491067787, "grad_norm": 0.42118531465530396, "learning_rate": 4.753396383675145e-05, "loss": 0.4489, "num_input_tokens_seen": 26715088, "step": 27990 }, { "epoch": 2.2836283546781955, "grad_norm": 1.922918438911438, "learning_rate": 4.753242217748669e-05, "loss": 0.3403, "num_input_tokens_seen": 26719728, "step": 27995 }, { "epoch": 2.2840362182886045, "grad_norm": 0.5712107419967651, "learning_rate": 4.7530880061499296e-05, "loss": 0.374, "num_input_tokens_seen": 26724640, "step": 28000 }, { "epoch": 2.284444081899013, "grad_norm": 1.006052017211914, "learning_rate": 4.752933748882053e-05, "loss": 0.3457, "num_input_tokens_seen": 26729824, "step": 28005 }, { "epoch": 2.2848519455094216, "grad_norm": 1.4740546941757202, "learning_rate": 4.7527794459481675e-05, "loss": 0.2265, "num_input_tokens_seen": 26734992, "step": 28010 }, { "epoch": 2.28525980911983, "grad_norm": 1.0234767198562622, "learning_rate": 4.7526250973513997e-05, "loss": 0.2038, "num_input_tokens_seen": 26740064, "step": 28015 }, { "epoch": 2.285667672730239, "grad_norm": 0.44699975848197937, "learning_rate": 4.752470703094879e-05, "loss": 0.1538, "num_input_tokens_seen": 26744384, "step": 28020 }, { "epoch": 2.2860755363406478, "grad_norm": 0.07700645923614502, "learning_rate": 4.7523162631817334e-05, "loss": 0.0867, "num_input_tokens_seen": 26749088, "step": 28025 }, { "epoch": 2.2864833999510563, "grad_norm": 0.25929608941078186, "learning_rate": 4.752161777615095e-05, "loss": 0.0464, "num_input_tokens_seen": 26753296, "step": 28030 }, { "epoch": 2.286891263561465, "grad_norm": 3.161167860031128, "learning_rate": 4.7520072463980934e-05, "loss": 0.2728, "num_input_tokens_seen": 26758608, "step": 28035 }, { "epoch": 2.287299127171874, "grad_norm": 0.08739661425352097, "learning_rate": 4.751852669533863e-05, "loss": 0.0571, "num_input_tokens_seen": 26763792, "step": 28040 }, { "epoch": 2.2877069907822825, "grad_norm": 6.0549187660217285, "learning_rate": 4.751698047025534e-05, "loss": 0.4744, "num_input_tokens_seen": 26767760, "step": 28045 }, { "epoch": 2.288114854392691, "grad_norm": 0.0473918691277504, "learning_rate": 4.751543378876244e-05, "loss": 0.2579, "num_input_tokens_seen": 26772320, "step": 28050 }, { "epoch": 2.2885227180030996, "grad_norm": 0.2504652738571167, "learning_rate": 4.751388665089127e-05, "loss": 0.112, "num_input_tokens_seen": 26776592, "step": 28055 }, { "epoch": 2.2889305816135086, "grad_norm": 0.46163639426231384, "learning_rate": 4.7512339056673175e-05, "loss": 0.0624, "num_input_tokens_seen": 26781168, "step": 28060 }, { "epoch": 2.289338445223917, "grad_norm": 0.17916038632392883, "learning_rate": 4.751079100613953e-05, "loss": 0.0169, "num_input_tokens_seen": 26786432, "step": 28065 }, { "epoch": 2.2897463088343257, "grad_norm": 0.11312180757522583, "learning_rate": 4.750924249932172e-05, "loss": 0.0242, "num_input_tokens_seen": 26791600, "step": 28070 }, { "epoch": 2.2901541724447343, "grad_norm": 6.317052841186523, "learning_rate": 4.750769353625113e-05, "loss": 0.0919, "num_input_tokens_seen": 26796448, "step": 28075 }, { "epoch": 2.2905620360551433, "grad_norm": 0.13861478865146637, "learning_rate": 4.750614411695916e-05, "loss": 0.2271, "num_input_tokens_seen": 26800384, "step": 28080 }, { "epoch": 2.290969899665552, "grad_norm": 0.13168299198150635, "learning_rate": 4.7504594241477204e-05, "loss": 0.0129, "num_input_tokens_seen": 26804272, "step": 28085 }, { "epoch": 2.2913777632759604, "grad_norm": 3.0532450675964355, "learning_rate": 4.7503043909836686e-05, "loss": 0.2084, "num_input_tokens_seen": 26809152, "step": 28090 }, { "epoch": 2.2917856268863694, "grad_norm": 7.45871114730835, "learning_rate": 4.7501493122069035e-05, "loss": 0.0714, "num_input_tokens_seen": 26813744, "step": 28095 }, { "epoch": 2.292193490496778, "grad_norm": 11.625211715698242, "learning_rate": 4.749994187820568e-05, "loss": 0.19, "num_input_tokens_seen": 26818992, "step": 28100 }, { "epoch": 2.2926013541071866, "grad_norm": 0.0911925658583641, "learning_rate": 4.7498390178278065e-05, "loss": 0.2409, "num_input_tokens_seen": 26823600, "step": 28105 }, { "epoch": 2.293009217717595, "grad_norm": 0.03277252987027168, "learning_rate": 4.749683802231764e-05, "loss": 0.0176, "num_input_tokens_seen": 26828560, "step": 28110 }, { "epoch": 2.2934170813280037, "grad_norm": 0.3217898905277252, "learning_rate": 4.749528541035586e-05, "loss": 0.0508, "num_input_tokens_seen": 26834000, "step": 28115 }, { "epoch": 2.2938249449384127, "grad_norm": 0.09347990900278091, "learning_rate": 4.749373234242421e-05, "loss": 0.0121, "num_input_tokens_seen": 26838528, "step": 28120 }, { "epoch": 2.2942328085488213, "grad_norm": 0.02313876338303089, "learning_rate": 4.749217881855417e-05, "loss": 0.2353, "num_input_tokens_seen": 26843184, "step": 28125 }, { "epoch": 2.29464067215923, "grad_norm": 0.023566482588648796, "learning_rate": 4.749062483877721e-05, "loss": 0.0047, "num_input_tokens_seen": 26847776, "step": 28130 }, { "epoch": 2.295048535769639, "grad_norm": 0.026681717485189438, "learning_rate": 4.7489070403124844e-05, "loss": 0.1066, "num_input_tokens_seen": 26852640, "step": 28135 }, { "epoch": 2.2954563993800474, "grad_norm": 0.058097194880247116, "learning_rate": 4.748751551162859e-05, "loss": 0.0104, "num_input_tokens_seen": 26856304, "step": 28140 }, { "epoch": 2.295864262990456, "grad_norm": 0.0068854340352118015, "learning_rate": 4.7485960164319936e-05, "loss": 0.0053, "num_input_tokens_seen": 26861440, "step": 28145 }, { "epoch": 2.2962721266008645, "grad_norm": 1.3960152864456177, "learning_rate": 4.7484404361230434e-05, "loss": 0.0099, "num_input_tokens_seen": 26866688, "step": 28150 }, { "epoch": 2.296679990211273, "grad_norm": 0.017110472545027733, "learning_rate": 4.748284810239161e-05, "loss": 0.0044, "num_input_tokens_seen": 26871280, "step": 28155 }, { "epoch": 2.297087853821682, "grad_norm": 0.016008038073778152, "learning_rate": 4.7481291387835005e-05, "loss": 0.9599, "num_input_tokens_seen": 26877136, "step": 28160 }, { "epoch": 2.2974957174320907, "grad_norm": 2.2850358486175537, "learning_rate": 4.747973421759218e-05, "loss": 0.4427, "num_input_tokens_seen": 26881024, "step": 28165 }, { "epoch": 2.2979035810424993, "grad_norm": 30.13489532470703, "learning_rate": 4.747817659169469e-05, "loss": 0.3871, "num_input_tokens_seen": 26885888, "step": 28170 }, { "epoch": 2.2983114446529083, "grad_norm": 1.457541823387146, "learning_rate": 4.747661851017411e-05, "loss": 0.3107, "num_input_tokens_seen": 26891424, "step": 28175 }, { "epoch": 2.298719308263317, "grad_norm": 0.025018611922860146, "learning_rate": 4.747505997306203e-05, "loss": 0.0443, "num_input_tokens_seen": 26896304, "step": 28180 }, { "epoch": 2.2991271718737254, "grad_norm": 3.042642831802368, "learning_rate": 4.747350098039003e-05, "loss": 0.3914, "num_input_tokens_seen": 26901536, "step": 28185 }, { "epoch": 2.299535035484134, "grad_norm": 0.04425269365310669, "learning_rate": 4.747194153218971e-05, "loss": 0.477, "num_input_tokens_seen": 26905920, "step": 28190 }, { "epoch": 2.299942899094543, "grad_norm": 5.860499382019043, "learning_rate": 4.74703816284927e-05, "loss": 0.2036, "num_input_tokens_seen": 26910368, "step": 28195 }, { "epoch": 2.3003507627049515, "grad_norm": 0.3169127106666565, "learning_rate": 4.7468821269330584e-05, "loss": 0.1378, "num_input_tokens_seen": 26915536, "step": 28200 }, { "epoch": 2.30075862631536, "grad_norm": 14.7432279586792, "learning_rate": 4.746726045473502e-05, "loss": 0.2017, "num_input_tokens_seen": 26919888, "step": 28205 }, { "epoch": 2.3011664899257687, "grad_norm": 1.5381733179092407, "learning_rate": 4.7465699184737634e-05, "loss": 0.0867, "num_input_tokens_seen": 26924960, "step": 28210 }, { "epoch": 2.3015743535361777, "grad_norm": 0.7563758492469788, "learning_rate": 4.746413745937007e-05, "loss": 0.2304, "num_input_tokens_seen": 26929376, "step": 28215 }, { "epoch": 2.3019822171465862, "grad_norm": 0.3085099458694458, "learning_rate": 4.746257527866399e-05, "loss": 2.1382, "num_input_tokens_seen": 26933808, "step": 28220 }, { "epoch": 2.302390080756995, "grad_norm": 7.372525691986084, "learning_rate": 4.7461012642651044e-05, "loss": 1.6808, "num_input_tokens_seen": 26938480, "step": 28225 }, { "epoch": 2.3027979443674034, "grad_norm": 0.22268518805503845, "learning_rate": 4.745944955136292e-05, "loss": 2.4899, "num_input_tokens_seen": 26943760, "step": 28230 }, { "epoch": 2.3032058079778124, "grad_norm": 18.730785369873047, "learning_rate": 4.74578860048313e-05, "loss": 2.9304, "num_input_tokens_seen": 26948784, "step": 28235 }, { "epoch": 2.303613671588221, "grad_norm": 1.054883599281311, "learning_rate": 4.7456322003087864e-05, "loss": 1.114, "num_input_tokens_seen": 26953248, "step": 28240 }, { "epoch": 2.3040215351986295, "grad_norm": 33.12737274169922, "learning_rate": 4.745475754616433e-05, "loss": 1.2955, "num_input_tokens_seen": 26958080, "step": 28245 }, { "epoch": 2.304429398809038, "grad_norm": 3.448042869567871, "learning_rate": 4.74531926340924e-05, "loss": 0.6223, "num_input_tokens_seen": 26962624, "step": 28250 }, { "epoch": 2.304837262419447, "grad_norm": 9.207509994506836, "learning_rate": 4.74516272669038e-05, "loss": 1.5011, "num_input_tokens_seen": 26967440, "step": 28255 }, { "epoch": 2.3052451260298557, "grad_norm": 125.39014434814453, "learning_rate": 4.7450061444630255e-05, "loss": 1.4384, "num_input_tokens_seen": 26971952, "step": 28260 }, { "epoch": 2.305652989640264, "grad_norm": 47.41389465332031, "learning_rate": 4.7448495167303506e-05, "loss": 1.9725, "num_input_tokens_seen": 26976080, "step": 28265 }, { "epoch": 2.306060853250673, "grad_norm": 30.190643310546875, "learning_rate": 4.744692843495529e-05, "loss": 1.4179, "num_input_tokens_seen": 26981680, "step": 28270 }, { "epoch": 2.306468716861082, "grad_norm": 127.14710998535156, "learning_rate": 4.744536124761739e-05, "loss": 2.6497, "num_input_tokens_seen": 26985904, "step": 28275 }, { "epoch": 2.3068765804714904, "grad_norm": 36.85126876831055, "learning_rate": 4.744379360532154e-05, "loss": 2.2219, "num_input_tokens_seen": 26990640, "step": 28280 }, { "epoch": 2.307284444081899, "grad_norm": 47.44306182861328, "learning_rate": 4.744222550809954e-05, "loss": 2.0653, "num_input_tokens_seen": 26994848, "step": 28285 }, { "epoch": 2.3076923076923075, "grad_norm": 35.61507034301758, "learning_rate": 4.744065695598315e-05, "loss": 2.3589, "num_input_tokens_seen": 26999904, "step": 28290 }, { "epoch": 2.3081001713027165, "grad_norm": 9.018013954162598, "learning_rate": 4.74390879490042e-05, "loss": 1.1382, "num_input_tokens_seen": 27005552, "step": 28295 }, { "epoch": 2.308508034913125, "grad_norm": 8.378966331481934, "learning_rate": 4.7437518487194456e-05, "loss": 0.6505, "num_input_tokens_seen": 27010560, "step": 28300 }, { "epoch": 2.3089158985235336, "grad_norm": 13.21341609954834, "learning_rate": 4.7435948570585756e-05, "loss": 0.7281, "num_input_tokens_seen": 27014976, "step": 28305 }, { "epoch": 2.3093237621339426, "grad_norm": 10.138057708740234, "learning_rate": 4.7434378199209905e-05, "loss": 1.22, "num_input_tokens_seen": 27019472, "step": 28310 }, { "epoch": 2.309731625744351, "grad_norm": 5.391341686248779, "learning_rate": 4.7432807373098745e-05, "loss": 0.9041, "num_input_tokens_seen": 27024048, "step": 28315 }, { "epoch": 2.3101394893547598, "grad_norm": 16.977453231811523, "learning_rate": 4.743123609228411e-05, "loss": 1.0424, "num_input_tokens_seen": 27028560, "step": 28320 }, { "epoch": 2.3105473529651683, "grad_norm": 8.557515144348145, "learning_rate": 4.7429664356797856e-05, "loss": 0.9478, "num_input_tokens_seen": 27033088, "step": 28325 }, { "epoch": 2.310955216575577, "grad_norm": 7.626985549926758, "learning_rate": 4.742809216667183e-05, "loss": 0.6621, "num_input_tokens_seen": 27038544, "step": 28330 }, { "epoch": 2.311363080185986, "grad_norm": 2.6094164848327637, "learning_rate": 4.74265195219379e-05, "loss": 0.5422, "num_input_tokens_seen": 27043312, "step": 28335 }, { "epoch": 2.3117709437963945, "grad_norm": 7.798403263092041, "learning_rate": 4.742494642262796e-05, "loss": 0.42, "num_input_tokens_seen": 27047936, "step": 28340 }, { "epoch": 2.312178807406803, "grad_norm": 4.193296432495117, "learning_rate": 4.7423372868773885e-05, "loss": 0.4373, "num_input_tokens_seen": 27052240, "step": 28345 }, { "epoch": 2.312586671017212, "grad_norm": 3.561596155166626, "learning_rate": 4.7421798860407565e-05, "loss": 0.4754, "num_input_tokens_seen": 27057824, "step": 28350 }, { "epoch": 2.3129945346276206, "grad_norm": 8.091367721557617, "learning_rate": 4.742022439756092e-05, "loss": 0.558, "num_input_tokens_seen": 27063984, "step": 28355 }, { "epoch": 2.313402398238029, "grad_norm": 2.503453254699707, "learning_rate": 4.741864948026584e-05, "loss": 0.6177, "num_input_tokens_seen": 27068384, "step": 28360 }, { "epoch": 2.3138102618484377, "grad_norm": 1.4849485158920288, "learning_rate": 4.741707410855427e-05, "loss": 0.3526, "num_input_tokens_seen": 27073488, "step": 28365 }, { "epoch": 2.3142181254588468, "grad_norm": 1.324851155281067, "learning_rate": 4.741549828245813e-05, "loss": 0.3389, "num_input_tokens_seen": 27077936, "step": 28370 }, { "epoch": 2.3146259890692553, "grad_norm": 2.9509665966033936, "learning_rate": 4.741392200200936e-05, "loss": 0.5737, "num_input_tokens_seen": 27082656, "step": 28375 }, { "epoch": 2.315033852679664, "grad_norm": 24.977737426757812, "learning_rate": 4.741234526723992e-05, "loss": 0.7521, "num_input_tokens_seen": 27088512, "step": 28380 }, { "epoch": 2.3154417162900724, "grad_norm": 3.4479780197143555, "learning_rate": 4.741076807818177e-05, "loss": 0.481, "num_input_tokens_seen": 27093936, "step": 28385 }, { "epoch": 2.3158495799004815, "grad_norm": 2.8641557693481445, "learning_rate": 4.740919043486687e-05, "loss": 0.859, "num_input_tokens_seen": 27098544, "step": 28390 }, { "epoch": 2.31625744351089, "grad_norm": 4.517947673797607, "learning_rate": 4.74076123373272e-05, "loss": 0.5321, "num_input_tokens_seen": 27104160, "step": 28395 }, { "epoch": 2.3166653071212986, "grad_norm": 2.9872450828552246, "learning_rate": 4.740603378559475e-05, "loss": 0.4601, "num_input_tokens_seen": 27108336, "step": 28400 }, { "epoch": 2.317073170731707, "grad_norm": 4.093958377838135, "learning_rate": 4.7404454779701516e-05, "loss": 0.6549, "num_input_tokens_seen": 27113568, "step": 28405 }, { "epoch": 2.317481034342116, "grad_norm": 14.381176948547363, "learning_rate": 4.740287531967951e-05, "loss": 0.497, "num_input_tokens_seen": 27118640, "step": 28410 }, { "epoch": 2.3178888979525247, "grad_norm": 3.847795009613037, "learning_rate": 4.740129540556073e-05, "loss": 0.4538, "num_input_tokens_seen": 27123344, "step": 28415 }, { "epoch": 2.3182967615629333, "grad_norm": 1.2499969005584717, "learning_rate": 4.739971503737722e-05, "loss": 0.5612, "num_input_tokens_seen": 27128720, "step": 28420 }, { "epoch": 2.318704625173342, "grad_norm": 7.470185279846191, "learning_rate": 4.7398134215161e-05, "loss": 0.5383, "num_input_tokens_seen": 27134048, "step": 28425 }, { "epoch": 2.319112488783751, "grad_norm": 5.698076248168945, "learning_rate": 4.7396552938944114e-05, "loss": 0.6885, "num_input_tokens_seen": 27139408, "step": 28430 }, { "epoch": 2.3195203523941594, "grad_norm": 7.58546257019043, "learning_rate": 4.739497120875862e-05, "loss": 0.351, "num_input_tokens_seen": 27144288, "step": 28435 }, { "epoch": 2.319928216004568, "grad_norm": 3.4048550128936768, "learning_rate": 4.739338902463658e-05, "loss": 0.3928, "num_input_tokens_seen": 27149920, "step": 28440 }, { "epoch": 2.3203360796149766, "grad_norm": 2.0437686443328857, "learning_rate": 4.739180638661006e-05, "loss": 0.3308, "num_input_tokens_seen": 27154752, "step": 28445 }, { "epoch": 2.3207439432253856, "grad_norm": 4.940476417541504, "learning_rate": 4.7390223294711137e-05, "loss": 0.4371, "num_input_tokens_seen": 27159088, "step": 28450 }, { "epoch": 2.321151806835794, "grad_norm": 3.0412657260894775, "learning_rate": 4.73886397489719e-05, "loss": 0.595, "num_input_tokens_seen": 27163552, "step": 28455 }, { "epoch": 2.3215596704462027, "grad_norm": 2.1013243198394775, "learning_rate": 4.7387055749424444e-05, "loss": 0.3838, "num_input_tokens_seen": 27168432, "step": 28460 }, { "epoch": 2.3219675340566113, "grad_norm": 3.302447557449341, "learning_rate": 4.7385471296100895e-05, "loss": 0.3718, "num_input_tokens_seen": 27173392, "step": 28465 }, { "epoch": 2.3223753976670203, "grad_norm": 3.3646693229675293, "learning_rate": 4.7383886389033336e-05, "loss": 0.6405, "num_input_tokens_seen": 27177856, "step": 28470 }, { "epoch": 2.322783261277429, "grad_norm": 1.1881645917892456, "learning_rate": 4.738230102825393e-05, "loss": 0.3975, "num_input_tokens_seen": 27182464, "step": 28475 }, { "epoch": 2.3231911248878374, "grad_norm": 2.0628812313079834, "learning_rate": 4.738071521379478e-05, "loss": 0.5268, "num_input_tokens_seen": 27187264, "step": 28480 }, { "epoch": 2.3235989884982464, "grad_norm": 2.8092305660247803, "learning_rate": 4.737912894568805e-05, "loss": 0.4003, "num_input_tokens_seen": 27192096, "step": 28485 }, { "epoch": 2.324006852108655, "grad_norm": 0.5492669343948364, "learning_rate": 4.737754222396588e-05, "loss": 0.2547, "num_input_tokens_seen": 27196400, "step": 28490 }, { "epoch": 2.3244147157190636, "grad_norm": 0.19747301936149597, "learning_rate": 4.737595504866045e-05, "loss": 0.8836, "num_input_tokens_seen": 27201760, "step": 28495 }, { "epoch": 2.324822579329472, "grad_norm": 5.766193389892578, "learning_rate": 4.73743674198039e-05, "loss": 1.6408, "num_input_tokens_seen": 27207184, "step": 28500 }, { "epoch": 2.3252304429398807, "grad_norm": 2.834143877029419, "learning_rate": 4.737277933742844e-05, "loss": 0.1789, "num_input_tokens_seen": 27212064, "step": 28505 }, { "epoch": 2.3256383065502897, "grad_norm": 3.5719666481018066, "learning_rate": 4.7371190801566256e-05, "loss": 0.515, "num_input_tokens_seen": 27216704, "step": 28510 }, { "epoch": 2.3260461701606983, "grad_norm": 5.069714546203613, "learning_rate": 4.736960181224953e-05, "loss": 0.427, "num_input_tokens_seen": 27221584, "step": 28515 }, { "epoch": 2.326454033771107, "grad_norm": 1.6657123565673828, "learning_rate": 4.736801236951049e-05, "loss": 0.4985, "num_input_tokens_seen": 27226288, "step": 28520 }, { "epoch": 2.326861897381516, "grad_norm": 3.5547173023223877, "learning_rate": 4.736642247338134e-05, "loss": 0.3312, "num_input_tokens_seen": 27229792, "step": 28525 }, { "epoch": 2.3272697609919244, "grad_norm": 4.508675575256348, "learning_rate": 4.736483212389431e-05, "loss": 0.4299, "num_input_tokens_seen": 27234096, "step": 28530 }, { "epoch": 2.327677624602333, "grad_norm": 2.4987590312957764, "learning_rate": 4.736324132108163e-05, "loss": 0.5074, "num_input_tokens_seen": 27239536, "step": 28535 }, { "epoch": 2.3280854882127415, "grad_norm": 1.7891896963119507, "learning_rate": 4.736165006497556e-05, "loss": 0.3194, "num_input_tokens_seen": 27243936, "step": 28540 }, { "epoch": 2.32849335182315, "grad_norm": 2.7524755001068115, "learning_rate": 4.7360058355608344e-05, "loss": 0.3452, "num_input_tokens_seen": 27248560, "step": 28545 }, { "epoch": 2.328901215433559, "grad_norm": 3.6941637992858887, "learning_rate": 4.735846619301224e-05, "loss": 0.3229, "num_input_tokens_seen": 27253312, "step": 28550 }, { "epoch": 2.3293090790439677, "grad_norm": 0.6462761759757996, "learning_rate": 4.735687357721954e-05, "loss": 0.3631, "num_input_tokens_seen": 27257792, "step": 28555 }, { "epoch": 2.3297169426543762, "grad_norm": 3.5480940341949463, "learning_rate": 4.73552805082625e-05, "loss": 0.5131, "num_input_tokens_seen": 27263280, "step": 28560 }, { "epoch": 2.3301248062647852, "grad_norm": 2.22894287109375, "learning_rate": 4.735368698617343e-05, "loss": 0.3645, "num_input_tokens_seen": 27268000, "step": 28565 }, { "epoch": 2.330532669875194, "grad_norm": 3.0764217376708984, "learning_rate": 4.735209301098462e-05, "loss": 0.4023, "num_input_tokens_seen": 27273136, "step": 28570 }, { "epoch": 2.3309405334856024, "grad_norm": 1.0818270444869995, "learning_rate": 4.7350498582728386e-05, "loss": 0.4411, "num_input_tokens_seen": 27277760, "step": 28575 }, { "epoch": 2.331348397096011, "grad_norm": 2.8649823665618896, "learning_rate": 4.734890370143704e-05, "loss": 0.3892, "num_input_tokens_seen": 27283744, "step": 28580 }, { "epoch": 2.33175626070642, "grad_norm": 2.6664724349975586, "learning_rate": 4.7347308367142916e-05, "loss": 0.4196, "num_input_tokens_seen": 27289120, "step": 28585 }, { "epoch": 2.3321641243168285, "grad_norm": 3.2821128368377686, "learning_rate": 4.7345712579878345e-05, "loss": 0.3913, "num_input_tokens_seen": 27294608, "step": 28590 }, { "epoch": 2.332571987927237, "grad_norm": 1.0443097352981567, "learning_rate": 4.734411633967567e-05, "loss": 0.3687, "num_input_tokens_seen": 27298944, "step": 28595 }, { "epoch": 2.3329798515376456, "grad_norm": 1.9721004962921143, "learning_rate": 4.734251964656726e-05, "loss": 0.3759, "num_input_tokens_seen": 27302576, "step": 28600 }, { "epoch": 2.3333877151480547, "grad_norm": 0.9344648122787476, "learning_rate": 4.734092250058547e-05, "loss": 0.3599, "num_input_tokens_seen": 27306384, "step": 28605 }, { "epoch": 2.333795578758463, "grad_norm": 1.6655571460723877, "learning_rate": 4.7339324901762675e-05, "loss": 0.498, "num_input_tokens_seen": 27311984, "step": 28610 }, { "epoch": 2.334203442368872, "grad_norm": 1.1035127639770508, "learning_rate": 4.733772685013125e-05, "loss": 0.3531, "num_input_tokens_seen": 27316112, "step": 28615 }, { "epoch": 2.3346113059792803, "grad_norm": 1.852054238319397, "learning_rate": 4.73361283457236e-05, "loss": 0.4472, "num_input_tokens_seen": 27320320, "step": 28620 }, { "epoch": 2.3350191695896894, "grad_norm": 4.721146106719971, "learning_rate": 4.7334529388572114e-05, "loss": 0.2651, "num_input_tokens_seen": 27324800, "step": 28625 }, { "epoch": 2.335427033200098, "grad_norm": 1.8544949293136597, "learning_rate": 4.733292997870921e-05, "loss": 0.4355, "num_input_tokens_seen": 27328704, "step": 28630 }, { "epoch": 2.3358348968105065, "grad_norm": 0.7184544801712036, "learning_rate": 4.73313301161673e-05, "loss": 0.316, "num_input_tokens_seen": 27333536, "step": 28635 }, { "epoch": 2.336242760420915, "grad_norm": 2.183450698852539, "learning_rate": 4.7329729800978825e-05, "loss": 0.3393, "num_input_tokens_seen": 27339136, "step": 28640 }, { "epoch": 2.336650624031324, "grad_norm": 4.3275017738342285, "learning_rate": 4.732812903317622e-05, "loss": 0.6788, "num_input_tokens_seen": 27343984, "step": 28645 }, { "epoch": 2.3370584876417326, "grad_norm": 1.6724551916122437, "learning_rate": 4.7326527812791914e-05, "loss": 0.3639, "num_input_tokens_seen": 27348496, "step": 28650 }, { "epoch": 2.337466351252141, "grad_norm": 1.9587758779525757, "learning_rate": 4.732492613985838e-05, "loss": 0.452, "num_input_tokens_seen": 27352880, "step": 28655 }, { "epoch": 2.33787421486255, "grad_norm": 2.198573112487793, "learning_rate": 4.7323324014408085e-05, "loss": 0.5811, "num_input_tokens_seen": 27357280, "step": 28660 }, { "epoch": 2.3382820784729588, "grad_norm": 2.2537941932678223, "learning_rate": 4.73217214364735e-05, "loss": 0.324, "num_input_tokens_seen": 27362432, "step": 28665 }, { "epoch": 2.3386899420833673, "grad_norm": 1.9112334251403809, "learning_rate": 4.73201184060871e-05, "loss": 0.4068, "num_input_tokens_seen": 27367904, "step": 28670 }, { "epoch": 2.339097805693776, "grad_norm": 1.6553081274032593, "learning_rate": 4.7318514923281385e-05, "loss": 0.3387, "num_input_tokens_seen": 27372240, "step": 28675 }, { "epoch": 2.3395056693041845, "grad_norm": 0.9698335528373718, "learning_rate": 4.731691098808886e-05, "loss": 0.4549, "num_input_tokens_seen": 27376448, "step": 28680 }, { "epoch": 2.3399135329145935, "grad_norm": 4.6796722412109375, "learning_rate": 4.731530660054203e-05, "loss": 0.347, "num_input_tokens_seen": 27381408, "step": 28685 }, { "epoch": 2.340321396525002, "grad_norm": 1.0968072414398193, "learning_rate": 4.7313701760673415e-05, "loss": 0.3851, "num_input_tokens_seen": 27386080, "step": 28690 }, { "epoch": 2.3407292601354106, "grad_norm": 1.8422882556915283, "learning_rate": 4.731209646851555e-05, "loss": 0.328, "num_input_tokens_seen": 27390624, "step": 28695 }, { "epoch": 2.3411371237458196, "grad_norm": 1.6865944862365723, "learning_rate": 4.7310490724100975e-05, "loss": 0.3299, "num_input_tokens_seen": 27395232, "step": 28700 }, { "epoch": 2.341544987356228, "grad_norm": 1.7628896236419678, "learning_rate": 4.730888452746223e-05, "loss": 0.5934, "num_input_tokens_seen": 27399968, "step": 28705 }, { "epoch": 2.3419528509666367, "grad_norm": 2.541259765625, "learning_rate": 4.7307277878631875e-05, "loss": 0.4368, "num_input_tokens_seen": 27404800, "step": 28710 }, { "epoch": 2.3423607145770453, "grad_norm": 5.689680099487305, "learning_rate": 4.730567077764247e-05, "loss": 0.461, "num_input_tokens_seen": 27410208, "step": 28715 }, { "epoch": 2.342768578187454, "grad_norm": 10.124933242797852, "learning_rate": 4.73040632245266e-05, "loss": 0.6178, "num_input_tokens_seen": 27415344, "step": 28720 }, { "epoch": 2.343176441797863, "grad_norm": 1.36561918258667, "learning_rate": 4.730245521931685e-05, "loss": 0.4757, "num_input_tokens_seen": 27420752, "step": 28725 }, { "epoch": 2.3435843054082715, "grad_norm": 1.7588019371032715, "learning_rate": 4.73008467620458e-05, "loss": 0.3861, "num_input_tokens_seen": 27426368, "step": 28730 }, { "epoch": 2.34399216901868, "grad_norm": 1.4837493896484375, "learning_rate": 4.729923785274607e-05, "loss": 0.3421, "num_input_tokens_seen": 27431008, "step": 28735 }, { "epoch": 2.344400032629089, "grad_norm": 2.517026901245117, "learning_rate": 4.7297628491450274e-05, "loss": 0.3834, "num_input_tokens_seen": 27435616, "step": 28740 }, { "epoch": 2.3448078962394976, "grad_norm": 1.697903037071228, "learning_rate": 4.729601867819101e-05, "loss": 0.5196, "num_input_tokens_seen": 27440640, "step": 28745 }, { "epoch": 2.345215759849906, "grad_norm": 0.6572988629341125, "learning_rate": 4.7294408413000926e-05, "loss": 0.4002, "num_input_tokens_seen": 27446272, "step": 28750 }, { "epoch": 2.3456236234603147, "grad_norm": 1.5121082067489624, "learning_rate": 4.7292797695912655e-05, "loss": 0.2873, "num_input_tokens_seen": 27451488, "step": 28755 }, { "epoch": 2.3460314870707237, "grad_norm": 1.2023121118545532, "learning_rate": 4.729118652695885e-05, "loss": 0.5383, "num_input_tokens_seen": 27456704, "step": 28760 }, { "epoch": 2.3464393506811323, "grad_norm": 1.4078272581100464, "learning_rate": 4.728957490617216e-05, "loss": 0.4373, "num_input_tokens_seen": 27461712, "step": 28765 }, { "epoch": 2.346847214291541, "grad_norm": 2.1991941928863525, "learning_rate": 4.7287962833585264e-05, "loss": 0.4096, "num_input_tokens_seen": 27466448, "step": 28770 }, { "epoch": 2.3472550779019494, "grad_norm": 3.382045030593872, "learning_rate": 4.728635030923083e-05, "loss": 0.3517, "num_input_tokens_seen": 27471888, "step": 28775 }, { "epoch": 2.3476629415123584, "grad_norm": 1.509277105331421, "learning_rate": 4.728473733314155e-05, "loss": 0.3842, "num_input_tokens_seen": 27476352, "step": 28780 }, { "epoch": 2.348070805122767, "grad_norm": 1.1626368761062622, "learning_rate": 4.7283123905350106e-05, "loss": 0.3568, "num_input_tokens_seen": 27481120, "step": 28785 }, { "epoch": 2.3484786687331756, "grad_norm": 1.5038453340530396, "learning_rate": 4.7281510025889215e-05, "loss": 0.3697, "num_input_tokens_seen": 27485040, "step": 28790 }, { "epoch": 2.348886532343584, "grad_norm": 3.1148383617401123, "learning_rate": 4.7279895694791576e-05, "loss": 0.3954, "num_input_tokens_seen": 27489344, "step": 28795 }, { "epoch": 2.349294395953993, "grad_norm": 2.4164137840270996, "learning_rate": 4.7278280912089926e-05, "loss": 0.3739, "num_input_tokens_seen": 27494304, "step": 28800 }, { "epoch": 2.3497022595644017, "grad_norm": 2.5094873905181885, "learning_rate": 4.7276665677816986e-05, "loss": 0.3733, "num_input_tokens_seen": 27498816, "step": 28805 }, { "epoch": 2.3501101231748103, "grad_norm": 2.8320720195770264, "learning_rate": 4.727504999200551e-05, "loss": 0.4168, "num_input_tokens_seen": 27503216, "step": 28810 }, { "epoch": 2.350517986785219, "grad_norm": 1.8590342998504639, "learning_rate": 4.727343385468822e-05, "loss": 0.3641, "num_input_tokens_seen": 27507280, "step": 28815 }, { "epoch": 2.350925850395628, "grad_norm": 2.2166037559509277, "learning_rate": 4.727181726589789e-05, "loss": 0.3667, "num_input_tokens_seen": 27511920, "step": 28820 }, { "epoch": 2.3513337140060364, "grad_norm": 2.733600378036499, "learning_rate": 4.727020022566729e-05, "loss": 0.373, "num_input_tokens_seen": 27517552, "step": 28825 }, { "epoch": 2.351741577616445, "grad_norm": 1.0672500133514404, "learning_rate": 4.72685827340292e-05, "loss": 0.3648, "num_input_tokens_seen": 27522096, "step": 28830 }, { "epoch": 2.352149441226854, "grad_norm": 0.9775806665420532, "learning_rate": 4.72669647910164e-05, "loss": 0.3151, "num_input_tokens_seen": 27527424, "step": 28835 }, { "epoch": 2.3525573048372626, "grad_norm": 1.198368787765503, "learning_rate": 4.726534639666169e-05, "loss": 0.3901, "num_input_tokens_seen": 27532416, "step": 28840 }, { "epoch": 2.352965168447671, "grad_norm": 2.260205030441284, "learning_rate": 4.726372755099786e-05, "loss": 0.3784, "num_input_tokens_seen": 27536416, "step": 28845 }, { "epoch": 2.3533730320580797, "grad_norm": 1.0289981365203857, "learning_rate": 4.726210825405774e-05, "loss": 0.3875, "num_input_tokens_seen": 27541296, "step": 28850 }, { "epoch": 2.3537808956684882, "grad_norm": 1.5806689262390137, "learning_rate": 4.726048850587414e-05, "loss": 0.3777, "num_input_tokens_seen": 27546320, "step": 28855 }, { "epoch": 2.3541887592788973, "grad_norm": 0.5995783805847168, "learning_rate": 4.72588683064799e-05, "loss": 0.3662, "num_input_tokens_seen": 27551216, "step": 28860 }, { "epoch": 2.354596622889306, "grad_norm": 0.9686960577964783, "learning_rate": 4.725724765590786e-05, "loss": 0.355, "num_input_tokens_seen": 27556224, "step": 28865 }, { "epoch": 2.3550044864997144, "grad_norm": 3.203622817993164, "learning_rate": 4.7255626554190855e-05, "loss": 0.4684, "num_input_tokens_seen": 27560224, "step": 28870 }, { "epoch": 2.3554123501101234, "grad_norm": 2.3587753772735596, "learning_rate": 4.725400500136177e-05, "loss": 0.4032, "num_input_tokens_seen": 27565392, "step": 28875 }, { "epoch": 2.355820213720532, "grad_norm": 0.8829838633537292, "learning_rate": 4.7252382997453455e-05, "loss": 0.2349, "num_input_tokens_seen": 27570864, "step": 28880 }, { "epoch": 2.3562280773309405, "grad_norm": 15.030755996704102, "learning_rate": 4.7250760542498794e-05, "loss": 0.5989, "num_input_tokens_seen": 27574832, "step": 28885 }, { "epoch": 2.356635940941349, "grad_norm": 1.3641427755355835, "learning_rate": 4.724913763653067e-05, "loss": 0.7545, "num_input_tokens_seen": 27579744, "step": 28890 }, { "epoch": 2.3570438045517577, "grad_norm": 2.204160451889038, "learning_rate": 4.7247514279581984e-05, "loss": 0.4924, "num_input_tokens_seen": 27583776, "step": 28895 }, { "epoch": 2.3574516681621667, "grad_norm": 1.9631761312484741, "learning_rate": 4.7245890471685625e-05, "loss": 0.3467, "num_input_tokens_seen": 27588112, "step": 28900 }, { "epoch": 2.3578595317725752, "grad_norm": 0.8446919322013855, "learning_rate": 4.7244266212874536e-05, "loss": 0.4526, "num_input_tokens_seen": 27593664, "step": 28905 }, { "epoch": 2.358267395382984, "grad_norm": 3.7974085807800293, "learning_rate": 4.7242641503181616e-05, "loss": 0.4234, "num_input_tokens_seen": 27598576, "step": 28910 }, { "epoch": 2.358675258993393, "grad_norm": 1.773295521736145, "learning_rate": 4.72410163426398e-05, "loss": 0.4149, "num_input_tokens_seen": 27604112, "step": 28915 }, { "epoch": 2.3590831226038014, "grad_norm": 0.6471622586250305, "learning_rate": 4.723939073128204e-05, "loss": 0.3626, "num_input_tokens_seen": 27609088, "step": 28920 }, { "epoch": 2.35949098621421, "grad_norm": 2.5250844955444336, "learning_rate": 4.7237764669141275e-05, "loss": 0.358, "num_input_tokens_seen": 27614016, "step": 28925 }, { "epoch": 2.3598988498246185, "grad_norm": 2.698881149291992, "learning_rate": 4.723613815625048e-05, "loss": 0.3443, "num_input_tokens_seen": 27619136, "step": 28930 }, { "epoch": 2.3603067134350275, "grad_norm": 14.325428009033203, "learning_rate": 4.7234511192642594e-05, "loss": 0.3448, "num_input_tokens_seen": 27624064, "step": 28935 }, { "epoch": 2.360714577045436, "grad_norm": 4.335909843444824, "learning_rate": 4.723288377835063e-05, "loss": 0.5529, "num_input_tokens_seen": 27628704, "step": 28940 }, { "epoch": 2.3611224406558446, "grad_norm": 1.59428870677948, "learning_rate": 4.723125591340755e-05, "loss": 0.3451, "num_input_tokens_seen": 27632656, "step": 28945 }, { "epoch": 2.361530304266253, "grad_norm": 1.8336610794067383, "learning_rate": 4.7229627597846364e-05, "loss": 0.332, "num_input_tokens_seen": 27637408, "step": 28950 }, { "epoch": 2.361938167876662, "grad_norm": 1.0046536922454834, "learning_rate": 4.722799883170007e-05, "loss": 0.3634, "num_input_tokens_seen": 27642560, "step": 28955 }, { "epoch": 2.362346031487071, "grad_norm": 0.6493282914161682, "learning_rate": 4.722636961500169e-05, "loss": 0.3429, "num_input_tokens_seen": 27647520, "step": 28960 }, { "epoch": 2.3627538950974794, "grad_norm": 1.9241607189178467, "learning_rate": 4.722473994778424e-05, "loss": 0.3683, "num_input_tokens_seen": 27652528, "step": 28965 }, { "epoch": 2.363161758707888, "grad_norm": 1.1447497606277466, "learning_rate": 4.722310983008076e-05, "loss": 0.3863, "num_input_tokens_seen": 27657328, "step": 28970 }, { "epoch": 2.363569622318297, "grad_norm": 2.0382089614868164, "learning_rate": 4.7221479261924276e-05, "loss": 0.3451, "num_input_tokens_seen": 27660880, "step": 28975 }, { "epoch": 2.3639774859287055, "grad_norm": 1.878413200378418, "learning_rate": 4.721984824334785e-05, "loss": 0.347, "num_input_tokens_seen": 27666016, "step": 28980 }, { "epoch": 2.364385349539114, "grad_norm": 0.8941237330436707, "learning_rate": 4.721821677438455e-05, "loss": 0.3978, "num_input_tokens_seen": 27670656, "step": 28985 }, { "epoch": 2.3647932131495226, "grad_norm": 1.5799695253372192, "learning_rate": 4.7216584855067435e-05, "loss": 0.397, "num_input_tokens_seen": 27674960, "step": 28990 }, { "epoch": 2.3652010767599316, "grad_norm": 0.8828758001327515, "learning_rate": 4.7214952485429584e-05, "loss": 0.3302, "num_input_tokens_seen": 27679872, "step": 28995 }, { "epoch": 2.36560894037034, "grad_norm": 1.9830774068832397, "learning_rate": 4.7213319665504086e-05, "loss": 0.4261, "num_input_tokens_seen": 27684544, "step": 29000 }, { "epoch": 2.3660168039807488, "grad_norm": 0.7998315691947937, "learning_rate": 4.721168639532404e-05, "loss": 0.371, "num_input_tokens_seen": 27689104, "step": 29005 }, { "epoch": 2.3664246675911573, "grad_norm": 0.6333718299865723, "learning_rate": 4.7210052674922547e-05, "loss": 0.3766, "num_input_tokens_seen": 27693296, "step": 29010 }, { "epoch": 2.3668325312015663, "grad_norm": 2.0026772022247314, "learning_rate": 4.7208418504332723e-05, "loss": 0.3408, "num_input_tokens_seen": 27697712, "step": 29015 }, { "epoch": 2.367240394811975, "grad_norm": 4.347435474395752, "learning_rate": 4.7206783883587694e-05, "loss": 0.4248, "num_input_tokens_seen": 27702048, "step": 29020 }, { "epoch": 2.3676482584223835, "grad_norm": 1.1296448707580566, "learning_rate": 4.720514881272059e-05, "loss": 0.3509, "num_input_tokens_seen": 27706816, "step": 29025 }, { "epoch": 2.368056122032792, "grad_norm": 1.1717729568481445, "learning_rate": 4.720351329176456e-05, "loss": 0.2192, "num_input_tokens_seen": 27712704, "step": 29030 }, { "epoch": 2.368463985643201, "grad_norm": 0.9008482694625854, "learning_rate": 4.7201877320752746e-05, "loss": 0.6265, "num_input_tokens_seen": 27717152, "step": 29035 }, { "epoch": 2.3688718492536096, "grad_norm": 0.5699442625045776, "learning_rate": 4.720024089971832e-05, "loss": 0.3125, "num_input_tokens_seen": 27721824, "step": 29040 }, { "epoch": 2.369279712864018, "grad_norm": 0.93217533826828, "learning_rate": 4.719860402869444e-05, "loss": 0.6415, "num_input_tokens_seen": 27725888, "step": 29045 }, { "epoch": 2.369687576474427, "grad_norm": 0.973426103591919, "learning_rate": 4.7196966707714285e-05, "loss": 0.5353, "num_input_tokens_seen": 27730992, "step": 29050 }, { "epoch": 2.3700954400848357, "grad_norm": 2.412641763687134, "learning_rate": 4.719532893681104e-05, "loss": 0.3701, "num_input_tokens_seen": 27735120, "step": 29055 }, { "epoch": 2.3705033036952443, "grad_norm": 2.590909481048584, "learning_rate": 4.7193690716017924e-05, "loss": 0.4012, "num_input_tokens_seen": 27739264, "step": 29060 }, { "epoch": 2.370911167305653, "grad_norm": 2.807037353515625, "learning_rate": 4.7192052045368115e-05, "loss": 0.378, "num_input_tokens_seen": 27743536, "step": 29065 }, { "epoch": 2.3713190309160614, "grad_norm": 0.659206748008728, "learning_rate": 4.719041292489485e-05, "loss": 0.3938, "num_input_tokens_seen": 27747664, "step": 29070 }, { "epoch": 2.3717268945264705, "grad_norm": 1.0529899597167969, "learning_rate": 4.718877335463135e-05, "loss": 0.3421, "num_input_tokens_seen": 27751760, "step": 29075 }, { "epoch": 2.372134758136879, "grad_norm": 1.2910970449447632, "learning_rate": 4.7187133334610836e-05, "loss": 0.3498, "num_input_tokens_seen": 27757296, "step": 29080 }, { "epoch": 2.3725426217472876, "grad_norm": 1.8266884088516235, "learning_rate": 4.718549286486655e-05, "loss": 0.3324, "num_input_tokens_seen": 27761520, "step": 29085 }, { "epoch": 2.3729504853576966, "grad_norm": 2.7046968936920166, "learning_rate": 4.718385194543176e-05, "loss": 0.408, "num_input_tokens_seen": 27766256, "step": 29090 }, { "epoch": 2.373358348968105, "grad_norm": 6.3958210945129395, "learning_rate": 4.7182210576339714e-05, "loss": 0.3907, "num_input_tokens_seen": 27771392, "step": 29095 }, { "epoch": 2.3737662125785137, "grad_norm": 1.2431056499481201, "learning_rate": 4.7180568757623686e-05, "loss": 0.2588, "num_input_tokens_seen": 27776080, "step": 29100 }, { "epoch": 2.3741740761889223, "grad_norm": 3.7354507446289062, "learning_rate": 4.7178926489316955e-05, "loss": 0.7709, "num_input_tokens_seen": 27781152, "step": 29105 }, { "epoch": 2.374581939799331, "grad_norm": 2.6032028198242188, "learning_rate": 4.717728377145281e-05, "loss": 0.695, "num_input_tokens_seen": 27785600, "step": 29110 }, { "epoch": 2.37498980340974, "grad_norm": 2.3265538215637207, "learning_rate": 4.717564060406454e-05, "loss": 0.3532, "num_input_tokens_seen": 27790912, "step": 29115 }, { "epoch": 2.3753976670201484, "grad_norm": 1.476619005203247, "learning_rate": 4.717399698718547e-05, "loss": 0.316, "num_input_tokens_seen": 27795264, "step": 29120 }, { "epoch": 2.375805530630557, "grad_norm": 4.006190299987793, "learning_rate": 4.717235292084889e-05, "loss": 0.4002, "num_input_tokens_seen": 27800320, "step": 29125 }, { "epoch": 2.376213394240966, "grad_norm": 1.7788928747177124, "learning_rate": 4.717070840508815e-05, "loss": 0.4074, "num_input_tokens_seen": 27805264, "step": 29130 }, { "epoch": 2.3766212578513746, "grad_norm": 1.2401230335235596, "learning_rate": 4.716906343993656e-05, "loss": 0.3914, "num_input_tokens_seen": 27809872, "step": 29135 }, { "epoch": 2.377029121461783, "grad_norm": 1.6391867399215698, "learning_rate": 4.7167418025427476e-05, "loss": 0.3794, "num_input_tokens_seen": 27813792, "step": 29140 }, { "epoch": 2.3774369850721917, "grad_norm": 2.1762290000915527, "learning_rate": 4.7165772161594254e-05, "loss": 0.3474, "num_input_tokens_seen": 27819216, "step": 29145 }, { "epoch": 2.3778448486826007, "grad_norm": 3.068554162979126, "learning_rate": 4.716412584847025e-05, "loss": 0.3767, "num_input_tokens_seen": 27824272, "step": 29150 }, { "epoch": 2.3782527122930093, "grad_norm": 1.2320003509521484, "learning_rate": 4.716247908608883e-05, "loss": 0.3658, "num_input_tokens_seen": 27828736, "step": 29155 }, { "epoch": 2.378660575903418, "grad_norm": 3.080510377883911, "learning_rate": 4.7160831874483377e-05, "loss": 0.3424, "num_input_tokens_seen": 27834304, "step": 29160 }, { "epoch": 2.3790684395138264, "grad_norm": 2.901520013809204, "learning_rate": 4.715918421368728e-05, "loss": 0.3575, "num_input_tokens_seen": 27839888, "step": 29165 }, { "epoch": 2.3794763031242354, "grad_norm": 1.3271936178207397, "learning_rate": 4.715753610373393e-05, "loss": 0.4475, "num_input_tokens_seen": 27844560, "step": 29170 }, { "epoch": 2.379884166734644, "grad_norm": 2.298140048980713, "learning_rate": 4.715588754465674e-05, "loss": 0.34, "num_input_tokens_seen": 27849392, "step": 29175 }, { "epoch": 2.3802920303450525, "grad_norm": 1.7440128326416016, "learning_rate": 4.715423853648913e-05, "loss": 0.3307, "num_input_tokens_seen": 27854432, "step": 29180 }, { "epoch": 2.380699893955461, "grad_norm": 2.0207390785217285, "learning_rate": 4.7152589079264506e-05, "loss": 0.3199, "num_input_tokens_seen": 27859888, "step": 29185 }, { "epoch": 2.38110775756587, "grad_norm": 1.3298593759536743, "learning_rate": 4.715093917301633e-05, "loss": 0.2999, "num_input_tokens_seen": 27865168, "step": 29190 }, { "epoch": 2.3815156211762787, "grad_norm": 1.3780385255813599, "learning_rate": 4.714928881777802e-05, "loss": 0.3417, "num_input_tokens_seen": 27870224, "step": 29195 }, { "epoch": 2.3819234847866873, "grad_norm": 3.1808295249938965, "learning_rate": 4.714763801358304e-05, "loss": 0.4817, "num_input_tokens_seen": 27875776, "step": 29200 }, { "epoch": 2.382331348397096, "grad_norm": 0.6348661780357361, "learning_rate": 4.7145986760464846e-05, "loss": 0.4252, "num_input_tokens_seen": 27880144, "step": 29205 }, { "epoch": 2.382739212007505, "grad_norm": 1.8342777490615845, "learning_rate": 4.714433505845691e-05, "loss": 0.3457, "num_input_tokens_seen": 27884192, "step": 29210 }, { "epoch": 2.3831470756179134, "grad_norm": 4.536027431488037, "learning_rate": 4.7142682907592716e-05, "loss": 0.4629, "num_input_tokens_seen": 27889440, "step": 29215 }, { "epoch": 2.383554939228322, "grad_norm": 1.201826572418213, "learning_rate": 4.7141030307905757e-05, "loss": 0.3377, "num_input_tokens_seen": 27892992, "step": 29220 }, { "epoch": 2.383962802838731, "grad_norm": 1.1327929496765137, "learning_rate": 4.7139377259429504e-05, "loss": 0.3876, "num_input_tokens_seen": 27897680, "step": 29225 }, { "epoch": 2.3843706664491395, "grad_norm": 1.4377332925796509, "learning_rate": 4.713772376219749e-05, "loss": 0.3943, "num_input_tokens_seen": 27901568, "step": 29230 }, { "epoch": 2.384778530059548, "grad_norm": 1.3220583200454712, "learning_rate": 4.7136069816243224e-05, "loss": 0.4016, "num_input_tokens_seen": 27906608, "step": 29235 }, { "epoch": 2.3851863936699567, "grad_norm": 3.03230619430542, "learning_rate": 4.713441542160023e-05, "loss": 0.6034, "num_input_tokens_seen": 27911232, "step": 29240 }, { "epoch": 2.3855942572803652, "grad_norm": 1.8928585052490234, "learning_rate": 4.7132760578302035e-05, "loss": 0.4044, "num_input_tokens_seen": 27916240, "step": 29245 }, { "epoch": 2.3860021208907742, "grad_norm": 0.9272463917732239, "learning_rate": 4.713110528638219e-05, "loss": 0.3883, "num_input_tokens_seen": 27921216, "step": 29250 }, { "epoch": 2.386409984501183, "grad_norm": 2.17204213142395, "learning_rate": 4.712944954587425e-05, "loss": 0.3811, "num_input_tokens_seen": 27926352, "step": 29255 }, { "epoch": 2.3868178481115914, "grad_norm": 2.1140027046203613, "learning_rate": 4.712779335681177e-05, "loss": 0.3701, "num_input_tokens_seen": 27931488, "step": 29260 }, { "epoch": 2.3872257117220004, "grad_norm": 1.2470635175704956, "learning_rate": 4.7126136719228314e-05, "loss": 0.2663, "num_input_tokens_seen": 27935520, "step": 29265 }, { "epoch": 2.387633575332409, "grad_norm": 1.7395424842834473, "learning_rate": 4.712447963315747e-05, "loss": 0.4388, "num_input_tokens_seen": 27940592, "step": 29270 }, { "epoch": 2.3880414389428175, "grad_norm": 1.2660696506500244, "learning_rate": 4.7122822098632835e-05, "loss": 0.417, "num_input_tokens_seen": 27944880, "step": 29275 }, { "epoch": 2.388449302553226, "grad_norm": 2.4238715171813965, "learning_rate": 4.712116411568799e-05, "loss": 0.3558, "num_input_tokens_seen": 27950560, "step": 29280 }, { "epoch": 2.3888571661636346, "grad_norm": 0.6933827996253967, "learning_rate": 4.7119505684356545e-05, "loss": 0.3552, "num_input_tokens_seen": 27955328, "step": 29285 }, { "epoch": 2.3892650297740436, "grad_norm": 0.9801732897758484, "learning_rate": 4.711784680467212e-05, "loss": 0.3407, "num_input_tokens_seen": 27960384, "step": 29290 }, { "epoch": 2.389672893384452, "grad_norm": 1.8611924648284912, "learning_rate": 4.711618747666834e-05, "loss": 0.3594, "num_input_tokens_seen": 27964896, "step": 29295 }, { "epoch": 2.3900807569948608, "grad_norm": 3.8605613708496094, "learning_rate": 4.711452770037884e-05, "loss": 0.4121, "num_input_tokens_seen": 27970016, "step": 29300 }, { "epoch": 2.39048862060527, "grad_norm": 1.4366209506988525, "learning_rate": 4.7112867475837256e-05, "loss": 0.325, "num_input_tokens_seen": 27974720, "step": 29305 }, { "epoch": 2.3908964842156784, "grad_norm": 1.749492883682251, "learning_rate": 4.711120680307724e-05, "loss": 0.4642, "num_input_tokens_seen": 27980192, "step": 29310 }, { "epoch": 2.391304347826087, "grad_norm": 1.8421374559402466, "learning_rate": 4.7109545682132475e-05, "loss": 0.4782, "num_input_tokens_seen": 27984864, "step": 29315 }, { "epoch": 2.3917122114364955, "grad_norm": 0.7071225643157959, "learning_rate": 4.710788411303661e-05, "loss": 0.3488, "num_input_tokens_seen": 27989936, "step": 29320 }, { "epoch": 2.3921200750469045, "grad_norm": 2.0544326305389404, "learning_rate": 4.710622209582331e-05, "loss": 0.3832, "num_input_tokens_seen": 27995104, "step": 29325 }, { "epoch": 2.392527938657313, "grad_norm": 1.627354621887207, "learning_rate": 4.71045596305263e-05, "loss": 0.3701, "num_input_tokens_seen": 27999824, "step": 29330 }, { "epoch": 2.3929358022677216, "grad_norm": 2.2350118160247803, "learning_rate": 4.7102896717179254e-05, "loss": 0.3819, "num_input_tokens_seen": 28005344, "step": 29335 }, { "epoch": 2.39334366587813, "grad_norm": 1.5895570516586304, "learning_rate": 4.710123335581588e-05, "loss": 0.3772, "num_input_tokens_seen": 28010704, "step": 29340 }, { "epoch": 2.393751529488539, "grad_norm": 1.0783185958862305, "learning_rate": 4.70995695464699e-05, "loss": 0.309, "num_input_tokens_seen": 28015520, "step": 29345 }, { "epoch": 2.3941593930989478, "grad_norm": 1.0149027109146118, "learning_rate": 4.709790528917504e-05, "loss": 0.3125, "num_input_tokens_seen": 28021040, "step": 29350 }, { "epoch": 2.3945672567093563, "grad_norm": 0.957542359828949, "learning_rate": 4.7096240583965026e-05, "loss": 0.4721, "num_input_tokens_seen": 28024960, "step": 29355 }, { "epoch": 2.394975120319765, "grad_norm": 0.7554223537445068, "learning_rate": 4.70945754308736e-05, "loss": 0.3295, "num_input_tokens_seen": 28029952, "step": 29360 }, { "epoch": 2.395382983930174, "grad_norm": 1.8476231098175049, "learning_rate": 4.7092909829934526e-05, "loss": 0.3454, "num_input_tokens_seen": 28035344, "step": 29365 }, { "epoch": 2.3957908475405825, "grad_norm": 1.7942792177200317, "learning_rate": 4.709124378118155e-05, "loss": 0.3215, "num_input_tokens_seen": 28039904, "step": 29370 }, { "epoch": 2.396198711150991, "grad_norm": 2.5757765769958496, "learning_rate": 4.708957728464846e-05, "loss": 0.4631, "num_input_tokens_seen": 28044176, "step": 29375 }, { "epoch": 2.3966065747613996, "grad_norm": 1.6300911903381348, "learning_rate": 4.708791034036901e-05, "loss": 0.2135, "num_input_tokens_seen": 28049984, "step": 29380 }, { "epoch": 2.3970144383718086, "grad_norm": 3.0889759063720703, "learning_rate": 4.7086242948377015e-05, "loss": 0.3309, "num_input_tokens_seen": 28054832, "step": 29385 }, { "epoch": 2.397422301982217, "grad_norm": 3.8537182807922363, "learning_rate": 4.708457510870626e-05, "loss": 0.6103, "num_input_tokens_seen": 28059248, "step": 29390 }, { "epoch": 2.3978301655926257, "grad_norm": 0.7959989309310913, "learning_rate": 4.708290682139055e-05, "loss": 0.3165, "num_input_tokens_seen": 28063776, "step": 29395 }, { "epoch": 2.3982380292030347, "grad_norm": 2.7688515186309814, "learning_rate": 4.70812380864637e-05, "loss": 0.4326, "num_input_tokens_seen": 28068784, "step": 29400 }, { "epoch": 2.3986458928134433, "grad_norm": 3.4403653144836426, "learning_rate": 4.707956890395954e-05, "loss": 0.5747, "num_input_tokens_seen": 28072816, "step": 29405 }, { "epoch": 2.399053756423852, "grad_norm": 2.0268843173980713, "learning_rate": 4.70778992739119e-05, "loss": 0.3826, "num_input_tokens_seen": 28077648, "step": 29410 }, { "epoch": 2.3994616200342604, "grad_norm": 2.02563214302063, "learning_rate": 4.707622919635462e-05, "loss": 0.3306, "num_input_tokens_seen": 28082528, "step": 29415 }, { "epoch": 2.399869483644669, "grad_norm": 0.9074587225914001, "learning_rate": 4.7074558671321556e-05, "loss": 0.3299, "num_input_tokens_seen": 28087824, "step": 29420 }, { "epoch": 2.400277347255078, "grad_norm": 1.5492023229599, "learning_rate": 4.7072887698846565e-05, "loss": 0.3182, "num_input_tokens_seen": 28092512, "step": 29425 }, { "epoch": 2.4006852108654866, "grad_norm": 2.0951459407806396, "learning_rate": 4.707121627896353e-05, "loss": 0.4679, "num_input_tokens_seen": 28096896, "step": 29430 }, { "epoch": 2.401093074475895, "grad_norm": 18.497045516967773, "learning_rate": 4.7069544411706314e-05, "loss": 0.4133, "num_input_tokens_seen": 28101184, "step": 29435 }, { "epoch": 2.401500938086304, "grad_norm": 1.8714622259140015, "learning_rate": 4.706787209710881e-05, "loss": 0.3729, "num_input_tokens_seen": 28106064, "step": 29440 }, { "epoch": 2.4019088016967127, "grad_norm": 2.5894808769226074, "learning_rate": 4.706619933520492e-05, "loss": 0.3937, "num_input_tokens_seen": 28111408, "step": 29445 }, { "epoch": 2.4023166653071213, "grad_norm": 1.113842487335205, "learning_rate": 4.706452612602854e-05, "loss": 0.3346, "num_input_tokens_seen": 28116368, "step": 29450 }, { "epoch": 2.40272452891753, "grad_norm": 1.670330286026001, "learning_rate": 4.7062852469613594e-05, "loss": 0.3675, "num_input_tokens_seen": 28121648, "step": 29455 }, { "epoch": 2.4031323925279384, "grad_norm": 1.1887366771697998, "learning_rate": 4.7061178365993994e-05, "loss": 0.3109, "num_input_tokens_seen": 28126032, "step": 29460 }, { "epoch": 2.4035402561383474, "grad_norm": 1.1174899339675903, "learning_rate": 4.70595038152037e-05, "loss": 0.402, "num_input_tokens_seen": 28131216, "step": 29465 }, { "epoch": 2.403948119748756, "grad_norm": 1.6511973142623901, "learning_rate": 4.705782881727663e-05, "loss": 0.4414, "num_input_tokens_seen": 28134960, "step": 29470 }, { "epoch": 2.4043559833591646, "grad_norm": 2.644491195678711, "learning_rate": 4.705615337224674e-05, "loss": 0.4147, "num_input_tokens_seen": 28139856, "step": 29475 }, { "epoch": 2.4047638469695736, "grad_norm": 1.9316233396530151, "learning_rate": 4.705447748014799e-05, "loss": 0.4608, "num_input_tokens_seen": 28145264, "step": 29480 }, { "epoch": 2.405171710579982, "grad_norm": 1.146047592163086, "learning_rate": 4.705280114101436e-05, "loss": 0.2941, "num_input_tokens_seen": 28149216, "step": 29485 }, { "epoch": 2.4055795741903907, "grad_norm": 0.5837685465812683, "learning_rate": 4.7051124354879815e-05, "loss": 0.4312, "num_input_tokens_seen": 28154048, "step": 29490 }, { "epoch": 2.4059874378007993, "grad_norm": 4.269050598144531, "learning_rate": 4.704944712177835e-05, "loss": 0.7521, "num_input_tokens_seen": 28158720, "step": 29495 }, { "epoch": 2.4063953014112083, "grad_norm": 0.8849445581436157, "learning_rate": 4.7047769441743974e-05, "loss": 0.3184, "num_input_tokens_seen": 28164032, "step": 29500 }, { "epoch": 2.406803165021617, "grad_norm": 1.535975456237793, "learning_rate": 4.704609131481067e-05, "loss": 0.4864, "num_input_tokens_seen": 28168848, "step": 29505 }, { "epoch": 2.4072110286320254, "grad_norm": 1.3045457601547241, "learning_rate": 4.704441274101247e-05, "loss": 0.4615, "num_input_tokens_seen": 28173456, "step": 29510 }, { "epoch": 2.407618892242434, "grad_norm": 1.8144208192825317, "learning_rate": 4.704273372038338e-05, "loss": 0.369, "num_input_tokens_seen": 28178064, "step": 29515 }, { "epoch": 2.408026755852843, "grad_norm": 0.6286497712135315, "learning_rate": 4.704105425295745e-05, "loss": 0.3406, "num_input_tokens_seen": 28182048, "step": 29520 }, { "epoch": 2.4084346194632515, "grad_norm": 1.8619134426116943, "learning_rate": 4.703937433876872e-05, "loss": 0.3514, "num_input_tokens_seen": 28187088, "step": 29525 }, { "epoch": 2.40884248307366, "grad_norm": 1.165338158607483, "learning_rate": 4.703769397785123e-05, "loss": 0.3501, "num_input_tokens_seen": 28192352, "step": 29530 }, { "epoch": 2.4092503466840687, "grad_norm": 1.9401352405548096, "learning_rate": 4.7036013170239044e-05, "loss": 0.3472, "num_input_tokens_seen": 28196544, "step": 29535 }, { "epoch": 2.4096582102944777, "grad_norm": 1.9496445655822754, "learning_rate": 4.7034331915966246e-05, "loss": 0.3906, "num_input_tokens_seen": 28201184, "step": 29540 }, { "epoch": 2.4100660739048863, "grad_norm": 3.3350625038146973, "learning_rate": 4.70326502150669e-05, "loss": 0.3553, "num_input_tokens_seen": 28205648, "step": 29545 }, { "epoch": 2.410473937515295, "grad_norm": 3.627192497253418, "learning_rate": 4.7030968067575086e-05, "loss": 0.3991, "num_input_tokens_seen": 28211024, "step": 29550 }, { "epoch": 2.4108818011257034, "grad_norm": 2.62198543548584, "learning_rate": 4.702928547352492e-05, "loss": 0.3781, "num_input_tokens_seen": 28215680, "step": 29555 }, { "epoch": 2.4112896647361124, "grad_norm": 1.303411602973938, "learning_rate": 4.70276024329505e-05, "loss": 0.3776, "num_input_tokens_seen": 28221632, "step": 29560 }, { "epoch": 2.411697528346521, "grad_norm": 1.5925863981246948, "learning_rate": 4.7025918945885937e-05, "loss": 0.4239, "num_input_tokens_seen": 28226688, "step": 29565 }, { "epoch": 2.4121053919569295, "grad_norm": 0.7228617668151855, "learning_rate": 4.702423501236535e-05, "loss": 0.3439, "num_input_tokens_seen": 28231184, "step": 29570 }, { "epoch": 2.412513255567338, "grad_norm": 0.5927014350891113, "learning_rate": 4.702255063242289e-05, "loss": 0.413, "num_input_tokens_seen": 28235952, "step": 29575 }, { "epoch": 2.412921119177747, "grad_norm": 0.652498185634613, "learning_rate": 4.702086580609268e-05, "loss": 0.3444, "num_input_tokens_seen": 28240496, "step": 29580 }, { "epoch": 2.4133289827881557, "grad_norm": 1.6031908988952637, "learning_rate": 4.701918053340887e-05, "loss": 0.3784, "num_input_tokens_seen": 28244416, "step": 29585 }, { "epoch": 2.4137368463985642, "grad_norm": 2.465116262435913, "learning_rate": 4.7017494814405637e-05, "loss": 0.4644, "num_input_tokens_seen": 28248688, "step": 29590 }, { "epoch": 2.414144710008973, "grad_norm": 1.0800952911376953, "learning_rate": 4.7015808649117146e-05, "loss": 0.3907, "num_input_tokens_seen": 28253664, "step": 29595 }, { "epoch": 2.414552573619382, "grad_norm": 0.7302694320678711, "learning_rate": 4.7014122037577554e-05, "loss": 0.35, "num_input_tokens_seen": 28258816, "step": 29600 }, { "epoch": 2.4149604372297904, "grad_norm": 2.0256731510162354, "learning_rate": 4.7012434979821066e-05, "loss": 0.3648, "num_input_tokens_seen": 28263312, "step": 29605 }, { "epoch": 2.415368300840199, "grad_norm": 1.0347222089767456, "learning_rate": 4.7010747475881876e-05, "loss": 0.3381, "num_input_tokens_seen": 28268080, "step": 29610 }, { "epoch": 2.415776164450608, "grad_norm": 3.4928972721099854, "learning_rate": 4.700905952579419e-05, "loss": 0.5204, "num_input_tokens_seen": 28272688, "step": 29615 }, { "epoch": 2.4161840280610165, "grad_norm": 1.1081490516662598, "learning_rate": 4.700737112959222e-05, "loss": 0.4161, "num_input_tokens_seen": 28277392, "step": 29620 }, { "epoch": 2.416591891671425, "grad_norm": 1.116594910621643, "learning_rate": 4.700568228731019e-05, "loss": 0.2867, "num_input_tokens_seen": 28281840, "step": 29625 }, { "epoch": 2.4169997552818336, "grad_norm": 0.9560410380363464, "learning_rate": 4.700399299898233e-05, "loss": 0.4549, "num_input_tokens_seen": 28287520, "step": 29630 }, { "epoch": 2.417407618892242, "grad_norm": 2.0029449462890625, "learning_rate": 4.700230326464288e-05, "loss": 0.3625, "num_input_tokens_seen": 28291792, "step": 29635 }, { "epoch": 2.417815482502651, "grad_norm": 1.0642073154449463, "learning_rate": 4.700061308432609e-05, "loss": 0.4006, "num_input_tokens_seen": 28296336, "step": 29640 }, { "epoch": 2.41822334611306, "grad_norm": 1.9092590808868408, "learning_rate": 4.699892245806622e-05, "loss": 0.3459, "num_input_tokens_seen": 28301408, "step": 29645 }, { "epoch": 2.4186312097234683, "grad_norm": 0.8277136087417603, "learning_rate": 4.699723138589755e-05, "loss": 0.3595, "num_input_tokens_seen": 28306192, "step": 29650 }, { "epoch": 2.4190390733338774, "grad_norm": 1.6076241731643677, "learning_rate": 4.699553986785433e-05, "loss": 0.3689, "num_input_tokens_seen": 28309472, "step": 29655 }, { "epoch": 2.419446936944286, "grad_norm": 0.6670352220535278, "learning_rate": 4.699384790397088e-05, "loss": 0.3789, "num_input_tokens_seen": 28314096, "step": 29660 }, { "epoch": 2.4198548005546945, "grad_norm": 2.1098804473876953, "learning_rate": 4.699215549428147e-05, "loss": 0.376, "num_input_tokens_seen": 28319696, "step": 29665 }, { "epoch": 2.420262664165103, "grad_norm": 2.3105130195617676, "learning_rate": 4.6990462638820424e-05, "loss": 0.3836, "num_input_tokens_seen": 28324192, "step": 29670 }, { "epoch": 2.420670527775512, "grad_norm": 1.7935336828231812, "learning_rate": 4.698876933762203e-05, "loss": 0.3563, "num_input_tokens_seen": 28329488, "step": 29675 }, { "epoch": 2.4210783913859206, "grad_norm": 4.095205783843994, "learning_rate": 4.698707559072063e-05, "loss": 0.557, "num_input_tokens_seen": 28334032, "step": 29680 }, { "epoch": 2.421486254996329, "grad_norm": 1.832040786743164, "learning_rate": 4.6985381398150555e-05, "loss": 0.4225, "num_input_tokens_seen": 28338432, "step": 29685 }, { "epoch": 2.4218941186067378, "grad_norm": 1.0751372575759888, "learning_rate": 4.698368675994613e-05, "loss": 0.4315, "num_input_tokens_seen": 28343760, "step": 29690 }, { "epoch": 2.4223019822171468, "grad_norm": 1.3769868612289429, "learning_rate": 4.698199167614173e-05, "loss": 0.3851, "num_input_tokens_seen": 28348624, "step": 29695 }, { "epoch": 2.4227098458275553, "grad_norm": 2.089691638946533, "learning_rate": 4.69802961467717e-05, "loss": 0.3325, "num_input_tokens_seen": 28352352, "step": 29700 }, { "epoch": 2.423117709437964, "grad_norm": 2.0392026901245117, "learning_rate": 4.697860017187039e-05, "loss": 0.4005, "num_input_tokens_seen": 28357264, "step": 29705 }, { "epoch": 2.4235255730483725, "grad_norm": 1.7437283992767334, "learning_rate": 4.697690375147221e-05, "loss": 0.4009, "num_input_tokens_seen": 28361472, "step": 29710 }, { "epoch": 2.4239334366587815, "grad_norm": 2.7413218021392822, "learning_rate": 4.6975206885611524e-05, "loss": 0.4053, "num_input_tokens_seen": 28365904, "step": 29715 }, { "epoch": 2.42434130026919, "grad_norm": 2.9993462562561035, "learning_rate": 4.6973509574322726e-05, "loss": 0.433, "num_input_tokens_seen": 28370384, "step": 29720 }, { "epoch": 2.4247491638795986, "grad_norm": 1.4096167087554932, "learning_rate": 4.697181181764023e-05, "loss": 0.4186, "num_input_tokens_seen": 28376032, "step": 29725 }, { "epoch": 2.425157027490007, "grad_norm": 0.5487095713615417, "learning_rate": 4.6970113615598455e-05, "loss": 0.3598, "num_input_tokens_seen": 28380880, "step": 29730 }, { "epoch": 2.425564891100416, "grad_norm": 1.5158116817474365, "learning_rate": 4.69684149682318e-05, "loss": 0.3438, "num_input_tokens_seen": 28385728, "step": 29735 }, { "epoch": 2.4259727547108247, "grad_norm": 0.7956606149673462, "learning_rate": 4.696671587557472e-05, "loss": 0.2253, "num_input_tokens_seen": 28390800, "step": 29740 }, { "epoch": 2.4263806183212333, "grad_norm": 5.431715965270996, "learning_rate": 4.696501633766163e-05, "loss": 0.6535, "num_input_tokens_seen": 28395728, "step": 29745 }, { "epoch": 2.426788481931642, "grad_norm": 6.2769975662231445, "learning_rate": 4.6963316354527e-05, "loss": 0.7452, "num_input_tokens_seen": 28400384, "step": 29750 }, { "epoch": 2.427196345542051, "grad_norm": 4.462843418121338, "learning_rate": 4.696161592620528e-05, "loss": 0.9294, "num_input_tokens_seen": 28405920, "step": 29755 }, { "epoch": 2.4276042091524594, "grad_norm": 2.523724317550659, "learning_rate": 4.6959915052730944e-05, "loss": 0.5291, "num_input_tokens_seen": 28410752, "step": 29760 }, { "epoch": 2.428012072762868, "grad_norm": 1.1849656105041504, "learning_rate": 4.695821373413845e-05, "loss": 0.2333, "num_input_tokens_seen": 28416688, "step": 29765 }, { "epoch": 2.4284199363732766, "grad_norm": 2.1825616359710693, "learning_rate": 4.695651197046229e-05, "loss": 0.4617, "num_input_tokens_seen": 28421776, "step": 29770 }, { "epoch": 2.4288277999836856, "grad_norm": 1.7010278701782227, "learning_rate": 4.695480976173697e-05, "loss": 0.3457, "num_input_tokens_seen": 28426608, "step": 29775 }, { "epoch": 2.429235663594094, "grad_norm": 1.132675051689148, "learning_rate": 4.6953107107996994e-05, "loss": 0.374, "num_input_tokens_seen": 28431312, "step": 29780 }, { "epoch": 2.4296435272045027, "grad_norm": 1.8422436714172363, "learning_rate": 4.695140400927686e-05, "loss": 0.3725, "num_input_tokens_seen": 28435520, "step": 29785 }, { "epoch": 2.4300513908149117, "grad_norm": 1.403135061264038, "learning_rate": 4.6949700465611086e-05, "loss": 0.3684, "num_input_tokens_seen": 28440288, "step": 29790 }, { "epoch": 2.4304592544253203, "grad_norm": 0.41771388053894043, "learning_rate": 4.694799647703422e-05, "loss": 0.4616, "num_input_tokens_seen": 28444608, "step": 29795 }, { "epoch": 2.430867118035729, "grad_norm": 1.7504769563674927, "learning_rate": 4.694629204358078e-05, "loss": 0.345, "num_input_tokens_seen": 28449600, "step": 29800 }, { "epoch": 2.4312749816461374, "grad_norm": 0.6348248720169067, "learning_rate": 4.6944587165285336e-05, "loss": 0.3724, "num_input_tokens_seen": 28454320, "step": 29805 }, { "epoch": 2.431682845256546, "grad_norm": 0.7745915055274963, "learning_rate": 4.694288184218243e-05, "loss": 0.3885, "num_input_tokens_seen": 28459328, "step": 29810 }, { "epoch": 2.432090708866955, "grad_norm": 1.4860349893569946, "learning_rate": 4.6941176074306634e-05, "loss": 0.3354, "num_input_tokens_seen": 28464784, "step": 29815 }, { "epoch": 2.4324985724773636, "grad_norm": 0.8663619756698608, "learning_rate": 4.693946986169253e-05, "loss": 0.3867, "num_input_tokens_seen": 28470672, "step": 29820 }, { "epoch": 2.432906436087772, "grad_norm": 0.6000568866729736, "learning_rate": 4.6937763204374686e-05, "loss": 0.3274, "num_input_tokens_seen": 28475248, "step": 29825 }, { "epoch": 2.433314299698181, "grad_norm": 1.0805304050445557, "learning_rate": 4.69360561023877e-05, "loss": 0.249, "num_input_tokens_seen": 28480304, "step": 29830 }, { "epoch": 2.4337221633085897, "grad_norm": 0.7595944404602051, "learning_rate": 4.693434855576619e-05, "loss": 0.5978, "num_input_tokens_seen": 28485072, "step": 29835 }, { "epoch": 2.4341300269189983, "grad_norm": 1.3347264528274536, "learning_rate": 4.693264056454475e-05, "loss": 0.3201, "num_input_tokens_seen": 28489296, "step": 29840 }, { "epoch": 2.434537890529407, "grad_norm": 0.6294893622398376, "learning_rate": 4.6930932128758e-05, "loss": 0.5191, "num_input_tokens_seen": 28493728, "step": 29845 }, { "epoch": 2.4349457541398154, "grad_norm": 1.884907603263855, "learning_rate": 4.692922324844058e-05, "loss": 0.375, "num_input_tokens_seen": 28499344, "step": 29850 }, { "epoch": 2.4353536177502244, "grad_norm": 1.415459156036377, "learning_rate": 4.6927513923627124e-05, "loss": 0.3657, "num_input_tokens_seen": 28503984, "step": 29855 }, { "epoch": 2.435761481360633, "grad_norm": 1.7169264554977417, "learning_rate": 4.692580415435228e-05, "loss": 0.3641, "num_input_tokens_seen": 28509216, "step": 29860 }, { "epoch": 2.4361693449710415, "grad_norm": 1.3114022016525269, "learning_rate": 4.6924093940650696e-05, "loss": 0.3412, "num_input_tokens_seen": 28514272, "step": 29865 }, { "epoch": 2.4365772085814505, "grad_norm": 2.19968843460083, "learning_rate": 4.692238328255705e-05, "loss": 0.4422, "num_input_tokens_seen": 28519408, "step": 29870 }, { "epoch": 2.436985072191859, "grad_norm": 1.4044301509857178, "learning_rate": 4.692067218010601e-05, "loss": 0.4111, "num_input_tokens_seen": 28523376, "step": 29875 }, { "epoch": 2.4373929358022677, "grad_norm": 1.6018760204315186, "learning_rate": 4.6918960633332254e-05, "loss": 0.319, "num_input_tokens_seen": 28528208, "step": 29880 }, { "epoch": 2.4378007994126762, "grad_norm": 2.55727481842041, "learning_rate": 4.691724864227048e-05, "loss": 0.401, "num_input_tokens_seen": 28532736, "step": 29885 }, { "epoch": 2.4382086630230853, "grad_norm": 0.7177672386169434, "learning_rate": 4.69155362069554e-05, "loss": 0.3968, "num_input_tokens_seen": 28537120, "step": 29890 }, { "epoch": 2.438616526633494, "grad_norm": 1.5382546186447144, "learning_rate": 4.6913823327421705e-05, "loss": 0.3628, "num_input_tokens_seen": 28541808, "step": 29895 }, { "epoch": 2.4390243902439024, "grad_norm": 2.0129268169403076, "learning_rate": 4.6912110003704124e-05, "loss": 0.3593, "num_input_tokens_seen": 28546864, "step": 29900 }, { "epoch": 2.439432253854311, "grad_norm": 0.7624809741973877, "learning_rate": 4.6910396235837384e-05, "loss": 0.2836, "num_input_tokens_seen": 28551408, "step": 29905 }, { "epoch": 2.43984011746472, "grad_norm": 1.1053928136825562, "learning_rate": 4.690868202385622e-05, "loss": 0.2708, "num_input_tokens_seen": 28555664, "step": 29910 }, { "epoch": 2.4402479810751285, "grad_norm": 1.8433693647384644, "learning_rate": 4.690696736779539e-05, "loss": 0.5089, "num_input_tokens_seen": 28560560, "step": 29915 }, { "epoch": 2.440655844685537, "grad_norm": 0.528070330619812, "learning_rate": 4.6905252267689627e-05, "loss": 0.3939, "num_input_tokens_seen": 28565008, "step": 29920 }, { "epoch": 2.4410637082959457, "grad_norm": 0.8221251964569092, "learning_rate": 4.6903536723573715e-05, "loss": 0.3377, "num_input_tokens_seen": 28570368, "step": 29925 }, { "epoch": 2.4414715719063547, "grad_norm": 1.9809237718582153, "learning_rate": 4.690182073548243e-05, "loss": 0.521, "num_input_tokens_seen": 28574816, "step": 29930 }, { "epoch": 2.4418794355167632, "grad_norm": 0.4487707018852234, "learning_rate": 4.690010430345053e-05, "loss": 0.3786, "num_input_tokens_seen": 28579648, "step": 29935 }, { "epoch": 2.442287299127172, "grad_norm": 0.791289746761322, "learning_rate": 4.689838742751283e-05, "loss": 0.3462, "num_input_tokens_seen": 28585488, "step": 29940 }, { "epoch": 2.4426951627375804, "grad_norm": 2.982543468475342, "learning_rate": 4.689667010770412e-05, "loss": 0.4461, "num_input_tokens_seen": 28590384, "step": 29945 }, { "epoch": 2.4431030263479894, "grad_norm": 0.834794819355011, "learning_rate": 4.689495234405921e-05, "loss": 0.1887, "num_input_tokens_seen": 28595264, "step": 29950 }, { "epoch": 2.443510889958398, "grad_norm": 2.007580518722534, "learning_rate": 4.689323413661292e-05, "loss": 0.5999, "num_input_tokens_seen": 28599696, "step": 29955 }, { "epoch": 2.4439187535688065, "grad_norm": 0.8150277733802795, "learning_rate": 4.689151548540008e-05, "loss": 0.27, "num_input_tokens_seen": 28604784, "step": 29960 }, { "epoch": 2.4443266171792155, "grad_norm": 2.545240640640259, "learning_rate": 4.688979639045552e-05, "loss": 0.5202, "num_input_tokens_seen": 28609712, "step": 29965 }, { "epoch": 2.444734480789624, "grad_norm": 1.379062533378601, "learning_rate": 4.688807685181409e-05, "loss": 0.4169, "num_input_tokens_seen": 28615360, "step": 29970 }, { "epoch": 2.4451423444000326, "grad_norm": 1.4017633199691772, "learning_rate": 4.688635686951065e-05, "loss": 0.3118, "num_input_tokens_seen": 28620256, "step": 29975 }, { "epoch": 2.445550208010441, "grad_norm": 2.612675428390503, "learning_rate": 4.688463644358005e-05, "loss": 0.393, "num_input_tokens_seen": 28624448, "step": 29980 }, { "epoch": 2.4459580716208498, "grad_norm": 2.8418636322021484, "learning_rate": 4.6882915574057165e-05, "loss": 0.3456, "num_input_tokens_seen": 28629376, "step": 29985 }, { "epoch": 2.446365935231259, "grad_norm": 0.830124020576477, "learning_rate": 4.688119426097689e-05, "loss": 0.3385, "num_input_tokens_seen": 28633920, "step": 29990 }, { "epoch": 2.4467737988416673, "grad_norm": 1.8798434734344482, "learning_rate": 4.68794725043741e-05, "loss": 0.3413, "num_input_tokens_seen": 28638560, "step": 29995 }, { "epoch": 2.447181662452076, "grad_norm": 2.160435914993286, "learning_rate": 4.6877750304283695e-05, "loss": 0.3658, "num_input_tokens_seen": 28643296, "step": 30000 }, { "epoch": 2.447589526062485, "grad_norm": 1.366809368133545, "learning_rate": 4.687602766074059e-05, "loss": 0.3816, "num_input_tokens_seen": 28647296, "step": 30005 }, { "epoch": 2.4479973896728935, "grad_norm": 0.6613263487815857, "learning_rate": 4.68743045737797e-05, "loss": 0.3483, "num_input_tokens_seen": 28651504, "step": 30010 }, { "epoch": 2.448405253283302, "grad_norm": 0.42862051725387573, "learning_rate": 4.6872581043435955e-05, "loss": 0.2926, "num_input_tokens_seen": 28655328, "step": 30015 }, { "epoch": 2.4488131168937106, "grad_norm": 1.1017411947250366, "learning_rate": 4.687085706974428e-05, "loss": 0.3269, "num_input_tokens_seen": 28659552, "step": 30020 }, { "epoch": 2.449220980504119, "grad_norm": 1.530449390411377, "learning_rate": 4.686913265273963e-05, "loss": 0.5637, "num_input_tokens_seen": 28663856, "step": 30025 }, { "epoch": 2.449628844114528, "grad_norm": 1.8919847011566162, "learning_rate": 4.686740779245696e-05, "loss": 0.4582, "num_input_tokens_seen": 28669216, "step": 30030 }, { "epoch": 2.4500367077249368, "grad_norm": 1.8313971757888794, "learning_rate": 4.686568248893122e-05, "loss": 0.4214, "num_input_tokens_seen": 28673792, "step": 30035 }, { "epoch": 2.4504445713353453, "grad_norm": 0.9849539995193481, "learning_rate": 4.6863956742197384e-05, "loss": 0.3824, "num_input_tokens_seen": 28677712, "step": 30040 }, { "epoch": 2.4508524349457543, "grad_norm": 1.6115254163742065, "learning_rate": 4.6862230552290434e-05, "loss": 0.3514, "num_input_tokens_seen": 28682464, "step": 30045 }, { "epoch": 2.451260298556163, "grad_norm": 1.8706213235855103, "learning_rate": 4.686050391924537e-05, "loss": 0.3675, "num_input_tokens_seen": 28687040, "step": 30050 }, { "epoch": 2.4516681621665715, "grad_norm": 1.5851765871047974, "learning_rate": 4.685877684309717e-05, "loss": 0.3264, "num_input_tokens_seen": 28691664, "step": 30055 }, { "epoch": 2.45207602577698, "grad_norm": 1.1333556175231934, "learning_rate": 4.6857049323880865e-05, "loss": 0.2992, "num_input_tokens_seen": 28696192, "step": 30060 }, { "epoch": 2.452483889387389, "grad_norm": 2.7745437622070312, "learning_rate": 4.685532136163144e-05, "loss": 0.429, "num_input_tokens_seen": 28700704, "step": 30065 }, { "epoch": 2.4528917529977976, "grad_norm": 0.792400598526001, "learning_rate": 4.685359295638395e-05, "loss": 0.3791, "num_input_tokens_seen": 28705792, "step": 30070 }, { "epoch": 2.453299616608206, "grad_norm": 0.8947849869728088, "learning_rate": 4.685186410817342e-05, "loss": 0.4581, "num_input_tokens_seen": 28710576, "step": 30075 }, { "epoch": 2.4537074802186147, "grad_norm": 0.7765345573425293, "learning_rate": 4.685013481703488e-05, "loss": 0.4657, "num_input_tokens_seen": 28714704, "step": 30080 }, { "epoch": 2.4541153438290237, "grad_norm": 5.109099388122559, "learning_rate": 4.68484050830034e-05, "loss": 0.6181, "num_input_tokens_seen": 28719472, "step": 30085 }, { "epoch": 2.4545232074394323, "grad_norm": 1.1496466398239136, "learning_rate": 4.684667490611403e-05, "loss": 0.3828, "num_input_tokens_seen": 28724320, "step": 30090 }, { "epoch": 2.454931071049841, "grad_norm": 1.4461358785629272, "learning_rate": 4.6844944286401846e-05, "loss": 0.3951, "num_input_tokens_seen": 28729776, "step": 30095 }, { "epoch": 2.4553389346602494, "grad_norm": 1.270340085029602, "learning_rate": 4.684321322390192e-05, "loss": 0.3613, "num_input_tokens_seen": 28734368, "step": 30100 }, { "epoch": 2.4557467982706584, "grad_norm": 1.4856703281402588, "learning_rate": 4.684148171864934e-05, "loss": 0.34, "num_input_tokens_seen": 28739152, "step": 30105 }, { "epoch": 2.456154661881067, "grad_norm": 2.7026925086975098, "learning_rate": 4.683974977067921e-05, "loss": 0.4213, "num_input_tokens_seen": 28744816, "step": 30110 }, { "epoch": 2.4565625254914756, "grad_norm": 1.0310460329055786, "learning_rate": 4.683801738002663e-05, "loss": 0.384, "num_input_tokens_seen": 28749344, "step": 30115 }, { "epoch": 2.456970389101884, "grad_norm": 0.612137496471405, "learning_rate": 4.683628454672673e-05, "loss": 0.3808, "num_input_tokens_seen": 28753888, "step": 30120 }, { "epoch": 2.457378252712293, "grad_norm": 1.9276657104492188, "learning_rate": 4.683455127081461e-05, "loss": 0.3809, "num_input_tokens_seen": 28758128, "step": 30125 }, { "epoch": 2.4577861163227017, "grad_norm": 1.0590740442276, "learning_rate": 4.6832817552325414e-05, "loss": 0.3822, "num_input_tokens_seen": 28762768, "step": 30130 }, { "epoch": 2.4581939799331103, "grad_norm": 1.5856455564498901, "learning_rate": 4.683108339129428e-05, "loss": 0.3603, "num_input_tokens_seen": 28767760, "step": 30135 }, { "epoch": 2.4586018435435193, "grad_norm": 1.416285514831543, "learning_rate": 4.682934878775637e-05, "loss": 0.3523, "num_input_tokens_seen": 28773424, "step": 30140 }, { "epoch": 2.459009707153928, "grad_norm": 0.9440029859542847, "learning_rate": 4.6827613741746834e-05, "loss": 0.3596, "num_input_tokens_seen": 28778528, "step": 30145 }, { "epoch": 2.4594175707643364, "grad_norm": 1.351157307624817, "learning_rate": 4.6825878253300834e-05, "loss": 0.3122, "num_input_tokens_seen": 28783872, "step": 30150 }, { "epoch": 2.459825434374745, "grad_norm": 1.053822636604309, "learning_rate": 4.682414232245357e-05, "loss": 0.221, "num_input_tokens_seen": 28788144, "step": 30155 }, { "epoch": 2.4602332979851536, "grad_norm": 3.5363247394561768, "learning_rate": 4.6822405949240206e-05, "loss": 0.4625, "num_input_tokens_seen": 28793376, "step": 30160 }, { "epoch": 2.4606411615955626, "grad_norm": 0.6017493605613708, "learning_rate": 4.682066913369595e-05, "loss": 0.3635, "num_input_tokens_seen": 28798208, "step": 30165 }, { "epoch": 2.461049025205971, "grad_norm": 0.7466460466384888, "learning_rate": 4.6818931875856e-05, "loss": 0.643, "num_input_tokens_seen": 28802176, "step": 30170 }, { "epoch": 2.4614568888163797, "grad_norm": 2.5564651489257812, "learning_rate": 4.6817194175755576e-05, "loss": 0.4144, "num_input_tokens_seen": 28806240, "step": 30175 }, { "epoch": 2.4618647524267887, "grad_norm": 1.3025423288345337, "learning_rate": 4.6815456033429896e-05, "loss": 0.3848, "num_input_tokens_seen": 28810320, "step": 30180 }, { "epoch": 2.4622726160371973, "grad_norm": 1.0557712316513062, "learning_rate": 4.681371744891418e-05, "loss": 0.3729, "num_input_tokens_seen": 28815488, "step": 30185 }, { "epoch": 2.462680479647606, "grad_norm": 2.12272047996521, "learning_rate": 4.681197842224369e-05, "loss": 0.5215, "num_input_tokens_seen": 28820608, "step": 30190 }, { "epoch": 2.4630883432580144, "grad_norm": 0.5235434174537659, "learning_rate": 4.681023895345368e-05, "loss": 0.4303, "num_input_tokens_seen": 28824768, "step": 30195 }, { "epoch": 2.463496206868423, "grad_norm": 1.0769668817520142, "learning_rate": 4.680849904257938e-05, "loss": 0.345, "num_input_tokens_seen": 28829840, "step": 30200 }, { "epoch": 2.463904070478832, "grad_norm": 1.532691240310669, "learning_rate": 4.6806758689656075e-05, "loss": 0.4381, "num_input_tokens_seen": 28835392, "step": 30205 }, { "epoch": 2.4643119340892405, "grad_norm": 0.9708526134490967, "learning_rate": 4.680501789471903e-05, "loss": 0.3738, "num_input_tokens_seen": 28839328, "step": 30210 }, { "epoch": 2.464719797699649, "grad_norm": 1.0011838674545288, "learning_rate": 4.680327665780354e-05, "loss": 0.3104, "num_input_tokens_seen": 28843248, "step": 30215 }, { "epoch": 2.465127661310058, "grad_norm": 0.9091519117355347, "learning_rate": 4.680153497894491e-05, "loss": 0.4122, "num_input_tokens_seen": 28847712, "step": 30220 }, { "epoch": 2.4655355249204667, "grad_norm": 2.128985643386841, "learning_rate": 4.679979285817841e-05, "loss": 0.4003, "num_input_tokens_seen": 28852224, "step": 30225 }, { "epoch": 2.4659433885308752, "grad_norm": 0.6625474691390991, "learning_rate": 4.679805029553939e-05, "loss": 0.3438, "num_input_tokens_seen": 28856032, "step": 30230 }, { "epoch": 2.466351252141284, "grad_norm": 0.8834848999977112, "learning_rate": 4.679630729106314e-05, "loss": 0.3393, "num_input_tokens_seen": 28861504, "step": 30235 }, { "epoch": 2.466759115751693, "grad_norm": 0.8108643889427185, "learning_rate": 4.679456384478501e-05, "loss": 0.3632, "num_input_tokens_seen": 28866448, "step": 30240 }, { "epoch": 2.4671669793621014, "grad_norm": 0.4681849181652069, "learning_rate": 4.679281995674033e-05, "loss": 0.3638, "num_input_tokens_seen": 28870960, "step": 30245 }, { "epoch": 2.46757484297251, "grad_norm": 0.9501895308494568, "learning_rate": 4.679107562696445e-05, "loss": 0.369, "num_input_tokens_seen": 28876144, "step": 30250 }, { "epoch": 2.4679827065829185, "grad_norm": 0.9228675961494446, "learning_rate": 4.678933085549272e-05, "loss": 0.4053, "num_input_tokens_seen": 28880800, "step": 30255 }, { "epoch": 2.4683905701933275, "grad_norm": 2.4048585891723633, "learning_rate": 4.678758564236052e-05, "loss": 0.3525, "num_input_tokens_seen": 28886032, "step": 30260 }, { "epoch": 2.468798433803736, "grad_norm": 1.7647334337234497, "learning_rate": 4.678583998760322e-05, "loss": 0.3507, "num_input_tokens_seen": 28890208, "step": 30265 }, { "epoch": 2.4692062974141447, "grad_norm": 1.880079746246338, "learning_rate": 4.67840938912562e-05, "loss": 0.4357, "num_input_tokens_seen": 28894976, "step": 30270 }, { "epoch": 2.4696141610245532, "grad_norm": 0.9614523649215698, "learning_rate": 4.678234735335485e-05, "loss": 0.362, "num_input_tokens_seen": 28899696, "step": 30275 }, { "epoch": 2.4700220246349622, "grad_norm": 0.6670905947685242, "learning_rate": 4.678060037393457e-05, "loss": 0.3832, "num_input_tokens_seen": 28904832, "step": 30280 }, { "epoch": 2.470429888245371, "grad_norm": 1.309388279914856, "learning_rate": 4.677885295303078e-05, "loss": 0.346, "num_input_tokens_seen": 28909792, "step": 30285 }, { "epoch": 2.4708377518557794, "grad_norm": 1.142414927482605, "learning_rate": 4.67771050906789e-05, "loss": 0.3805, "num_input_tokens_seen": 28913888, "step": 30290 }, { "epoch": 2.471245615466188, "grad_norm": 0.9806681275367737, "learning_rate": 4.677535678691435e-05, "loss": 0.3214, "num_input_tokens_seen": 28918432, "step": 30295 }, { "epoch": 2.471653479076597, "grad_norm": 1.2609925270080566, "learning_rate": 4.6773608041772566e-05, "loss": 0.4144, "num_input_tokens_seen": 28922720, "step": 30300 }, { "epoch": 2.4720613426870055, "grad_norm": 0.7492448091506958, "learning_rate": 4.6771858855289e-05, "loss": 0.3767, "num_input_tokens_seen": 28926928, "step": 30305 }, { "epoch": 2.472469206297414, "grad_norm": 0.7929328083992004, "learning_rate": 4.677010922749912e-05, "loss": 0.3335, "num_input_tokens_seen": 28931504, "step": 30310 }, { "epoch": 2.4728770699078226, "grad_norm": 1.7248022556304932, "learning_rate": 4.676835915843836e-05, "loss": 0.3701, "num_input_tokens_seen": 28936624, "step": 30315 }, { "epoch": 2.4732849335182316, "grad_norm": 0.7507015466690063, "learning_rate": 4.676660864814222e-05, "loss": 0.3714, "num_input_tokens_seen": 28941744, "step": 30320 }, { "epoch": 2.47369279712864, "grad_norm": 1.2714972496032715, "learning_rate": 4.6764857696646157e-05, "loss": 0.3359, "num_input_tokens_seen": 28946560, "step": 30325 }, { "epoch": 2.4741006607390488, "grad_norm": 0.9104769825935364, "learning_rate": 4.6763106303985694e-05, "loss": 0.2548, "num_input_tokens_seen": 28950624, "step": 30330 }, { "epoch": 2.4745085243494573, "grad_norm": 0.5410036444664001, "learning_rate": 4.676135447019631e-05, "loss": 0.3604, "num_input_tokens_seen": 28955856, "step": 30335 }, { "epoch": 2.4749163879598663, "grad_norm": 1.526128888130188, "learning_rate": 4.675960219531351e-05, "loss": 0.7223, "num_input_tokens_seen": 28960272, "step": 30340 }, { "epoch": 2.475324251570275, "grad_norm": 0.9412264227867126, "learning_rate": 4.675784947937283e-05, "loss": 0.5361, "num_input_tokens_seen": 28965888, "step": 30345 }, { "epoch": 2.4757321151806835, "grad_norm": 0.6498560905456543, "learning_rate": 4.675609632240978e-05, "loss": 0.4519, "num_input_tokens_seen": 28970000, "step": 30350 }, { "epoch": 2.4761399787910925, "grad_norm": 0.987108051776886, "learning_rate": 4.67543427244599e-05, "loss": 0.3085, "num_input_tokens_seen": 28975264, "step": 30355 }, { "epoch": 2.476547842401501, "grad_norm": 0.9683979153633118, "learning_rate": 4.6752588685558744e-05, "loss": 0.5643, "num_input_tokens_seen": 28980416, "step": 30360 }, { "epoch": 2.4769557060119096, "grad_norm": 2.335568904876709, "learning_rate": 4.6750834205741856e-05, "loss": 0.6574, "num_input_tokens_seen": 28985552, "step": 30365 }, { "epoch": 2.477363569622318, "grad_norm": 2.3624541759490967, "learning_rate": 4.674907928504481e-05, "loss": 0.372, "num_input_tokens_seen": 28989840, "step": 30370 }, { "epoch": 2.4777714332327267, "grad_norm": 0.667497456073761, "learning_rate": 4.674732392350315e-05, "loss": 0.3441, "num_input_tokens_seen": 28995120, "step": 30375 }, { "epoch": 2.4781792968431358, "grad_norm": 0.8270882368087769, "learning_rate": 4.6745568121152485e-05, "loss": 0.4402, "num_input_tokens_seen": 29000272, "step": 30380 }, { "epoch": 2.4785871604535443, "grad_norm": 0.5697739720344543, "learning_rate": 4.67438118780284e-05, "loss": 0.3526, "num_input_tokens_seen": 29005344, "step": 30385 }, { "epoch": 2.478995024063953, "grad_norm": 0.422943115234375, "learning_rate": 4.674205519416648e-05, "loss": 0.3554, "num_input_tokens_seen": 29010176, "step": 30390 }, { "epoch": 2.479402887674362, "grad_norm": 1.3892396688461304, "learning_rate": 4.674029806960234e-05, "loss": 0.3461, "num_input_tokens_seen": 29014816, "step": 30395 }, { "epoch": 2.4798107512847705, "grad_norm": 0.5708857178688049, "learning_rate": 4.67385405043716e-05, "loss": 0.3784, "num_input_tokens_seen": 29018704, "step": 30400 }, { "epoch": 2.480218614895179, "grad_norm": 0.6972352266311646, "learning_rate": 4.673678249850988e-05, "loss": 0.3569, "num_input_tokens_seen": 29023616, "step": 30405 }, { "epoch": 2.4806264785055876, "grad_norm": 0.6334515810012817, "learning_rate": 4.673502405205281e-05, "loss": 0.3426, "num_input_tokens_seen": 29028192, "step": 30410 }, { "epoch": 2.4810343421159966, "grad_norm": 0.5067357420921326, "learning_rate": 4.673326516503604e-05, "loss": 0.3491, "num_input_tokens_seen": 29032144, "step": 30415 }, { "epoch": 2.481442205726405, "grad_norm": 1.0534226894378662, "learning_rate": 4.6731505837495226e-05, "loss": 0.332, "num_input_tokens_seen": 29037680, "step": 30420 }, { "epoch": 2.4818500693368137, "grad_norm": 0.6543540358543396, "learning_rate": 4.672974606946602e-05, "loss": 0.4854, "num_input_tokens_seen": 29042768, "step": 30425 }, { "epoch": 2.4822579329472223, "grad_norm": 1.5308741331100464, "learning_rate": 4.672798586098409e-05, "loss": 0.4073, "num_input_tokens_seen": 29046944, "step": 30430 }, { "epoch": 2.4826657965576313, "grad_norm": 1.4524940252304077, "learning_rate": 4.6726225212085125e-05, "loss": 0.3562, "num_input_tokens_seen": 29051600, "step": 30435 }, { "epoch": 2.48307366016804, "grad_norm": 0.813118577003479, "learning_rate": 4.672446412280481e-05, "loss": 0.2927, "num_input_tokens_seen": 29056576, "step": 30440 }, { "epoch": 2.4834815237784484, "grad_norm": 1.6635844707489014, "learning_rate": 4.672270259317884e-05, "loss": 0.4837, "num_input_tokens_seen": 29061152, "step": 30445 }, { "epoch": 2.483889387388857, "grad_norm": 0.683069109916687, "learning_rate": 4.672094062324291e-05, "loss": 0.6059, "num_input_tokens_seen": 29066368, "step": 30450 }, { "epoch": 2.484297250999266, "grad_norm": 1.8548502922058105, "learning_rate": 4.671917821303274e-05, "loss": 0.4435, "num_input_tokens_seen": 29070560, "step": 30455 }, { "epoch": 2.4847051146096746, "grad_norm": 0.7146656513214111, "learning_rate": 4.671741536258406e-05, "loss": 0.3151, "num_input_tokens_seen": 29074832, "step": 30460 }, { "epoch": 2.485112978220083, "grad_norm": 1.7217093706130981, "learning_rate": 4.6715652071932605e-05, "loss": 0.3861, "num_input_tokens_seen": 29078704, "step": 30465 }, { "epoch": 2.4855208418304917, "grad_norm": 1.3780485391616821, "learning_rate": 4.671388834111411e-05, "loss": 0.3518, "num_input_tokens_seen": 29083360, "step": 30470 }, { "epoch": 2.4859287054409007, "grad_norm": 0.9951026439666748, "learning_rate": 4.671212417016432e-05, "loss": 0.3602, "num_input_tokens_seen": 29088496, "step": 30475 }, { "epoch": 2.4863365690513093, "grad_norm": 0.6081136465072632, "learning_rate": 4.6710359559119e-05, "loss": 0.3665, "num_input_tokens_seen": 29093616, "step": 30480 }, { "epoch": 2.486744432661718, "grad_norm": 0.8009328246116638, "learning_rate": 4.6708594508013925e-05, "loss": 0.3302, "num_input_tokens_seen": 29098384, "step": 30485 }, { "epoch": 2.4871522962721264, "grad_norm": 1.0796153545379639, "learning_rate": 4.6706829016884854e-05, "loss": 0.292, "num_input_tokens_seen": 29102800, "step": 30490 }, { "epoch": 2.4875601598825354, "grad_norm": 1.183529257774353, "learning_rate": 4.670506308576759e-05, "loss": 0.3412, "num_input_tokens_seen": 29107792, "step": 30495 }, { "epoch": 2.487968023492944, "grad_norm": 2.0036332607269287, "learning_rate": 4.670329671469791e-05, "loss": 0.4111, "num_input_tokens_seen": 29112320, "step": 30500 }, { "epoch": 2.4883758871033526, "grad_norm": 2.3751063346862793, "learning_rate": 4.670152990371164e-05, "loss": 0.4706, "num_input_tokens_seen": 29116576, "step": 30505 }, { "epoch": 2.488783750713761, "grad_norm": 0.9211511611938477, "learning_rate": 4.669976265284457e-05, "loss": 0.3631, "num_input_tokens_seen": 29121168, "step": 30510 }, { "epoch": 2.48919161432417, "grad_norm": 1.7024966478347778, "learning_rate": 4.6697994962132544e-05, "loss": 0.3445, "num_input_tokens_seen": 29125712, "step": 30515 }, { "epoch": 2.4895994779345787, "grad_norm": 0.7753375768661499, "learning_rate": 4.669622683161137e-05, "loss": 0.2505, "num_input_tokens_seen": 29130864, "step": 30520 }, { "epoch": 2.4900073415449873, "grad_norm": 1.303728461265564, "learning_rate": 4.66944582613169e-05, "loss": 0.3549, "num_input_tokens_seen": 29135776, "step": 30525 }, { "epoch": 2.4904152051553963, "grad_norm": 2.391937494277954, "learning_rate": 4.669268925128498e-05, "loss": 0.4274, "num_input_tokens_seen": 29139616, "step": 30530 }, { "epoch": 2.490823068765805, "grad_norm": 2.3970437049865723, "learning_rate": 4.669091980155146e-05, "loss": 0.3849, "num_input_tokens_seen": 29143840, "step": 30535 }, { "epoch": 2.4912309323762134, "grad_norm": 0.5422207117080688, "learning_rate": 4.668914991215222e-05, "loss": 0.4081, "num_input_tokens_seen": 29149280, "step": 30540 }, { "epoch": 2.491638795986622, "grad_norm": 0.5488798022270203, "learning_rate": 4.668737958312313e-05, "loss": 0.367, "num_input_tokens_seen": 29154768, "step": 30545 }, { "epoch": 2.4920466595970305, "grad_norm": 1.527907133102417, "learning_rate": 4.668560881450007e-05, "loss": 0.3802, "num_input_tokens_seen": 29158896, "step": 30550 }, { "epoch": 2.4924545232074395, "grad_norm": 1.2693430185317993, "learning_rate": 4.6683837606318926e-05, "loss": 0.3774, "num_input_tokens_seen": 29163776, "step": 30555 }, { "epoch": 2.492862386817848, "grad_norm": 1.8121840953826904, "learning_rate": 4.6682065958615615e-05, "loss": 0.397, "num_input_tokens_seen": 29168832, "step": 30560 }, { "epoch": 2.4932702504282567, "grad_norm": 0.5518991947174072, "learning_rate": 4.668029387142604e-05, "loss": 0.3585, "num_input_tokens_seen": 29174272, "step": 30565 }, { "epoch": 2.4936781140386657, "grad_norm": 0.5359957814216614, "learning_rate": 4.667852134478612e-05, "loss": 0.3311, "num_input_tokens_seen": 29178960, "step": 30570 }, { "epoch": 2.4940859776490742, "grad_norm": 1.2156023979187012, "learning_rate": 4.667674837873178e-05, "loss": 0.3151, "num_input_tokens_seen": 29183968, "step": 30575 }, { "epoch": 2.494493841259483, "grad_norm": 2.1419708728790283, "learning_rate": 4.667497497329897e-05, "loss": 0.301, "num_input_tokens_seen": 29188688, "step": 30580 }, { "epoch": 2.4949017048698914, "grad_norm": 0.8370288610458374, "learning_rate": 4.6673201128523624e-05, "loss": 0.4855, "num_input_tokens_seen": 29193232, "step": 30585 }, { "epoch": 2.4953095684803, "grad_norm": 0.47938916087150574, "learning_rate": 4.66714268444417e-05, "loss": 0.3549, "num_input_tokens_seen": 29198128, "step": 30590 }, { "epoch": 2.495717432090709, "grad_norm": 1.4233548641204834, "learning_rate": 4.666965212108916e-05, "loss": 0.3972, "num_input_tokens_seen": 29203568, "step": 30595 }, { "epoch": 2.4961252957011175, "grad_norm": 0.8475708365440369, "learning_rate": 4.666787695850198e-05, "loss": 0.3567, "num_input_tokens_seen": 29207840, "step": 30600 }, { "epoch": 2.496533159311526, "grad_norm": 1.0040130615234375, "learning_rate": 4.666610135671614e-05, "loss": 0.3165, "num_input_tokens_seen": 29211536, "step": 30605 }, { "epoch": 2.496941022921935, "grad_norm": 0.8436099290847778, "learning_rate": 4.666432531576763e-05, "loss": 0.373, "num_input_tokens_seen": 29216416, "step": 30610 }, { "epoch": 2.4973488865323437, "grad_norm": 1.8923978805541992, "learning_rate": 4.6662548835692464e-05, "loss": 0.5209, "num_input_tokens_seen": 29221264, "step": 30615 }, { "epoch": 2.4977567501427522, "grad_norm": 1.3620080947875977, "learning_rate": 4.666077191652663e-05, "loss": 0.3343, "num_input_tokens_seen": 29225376, "step": 30620 }, { "epoch": 2.498164613753161, "grad_norm": 1.1115353107452393, "learning_rate": 4.665899455830615e-05, "loss": 0.3072, "num_input_tokens_seen": 29230944, "step": 30625 }, { "epoch": 2.49857247736357, "grad_norm": 1.4479039907455444, "learning_rate": 4.665721676106706e-05, "loss": 0.3493, "num_input_tokens_seen": 29235216, "step": 30630 }, { "epoch": 2.4989803409739784, "grad_norm": 2.036426305770874, "learning_rate": 4.665543852484538e-05, "loss": 0.3643, "num_input_tokens_seen": 29240496, "step": 30635 }, { "epoch": 2.499388204584387, "grad_norm": 1.2442235946655273, "learning_rate": 4.665365984967718e-05, "loss": 0.5395, "num_input_tokens_seen": 29244928, "step": 30640 }, { "epoch": 2.4997960681947955, "grad_norm": 0.7728883624076843, "learning_rate": 4.6651880735598484e-05, "loss": 0.4437, "num_input_tokens_seen": 29249888, "step": 30645 }, { "epoch": 2.5002039318052045, "grad_norm": 0.6015539765357971, "learning_rate": 4.665010118264537e-05, "loss": 0.277, "num_input_tokens_seen": 29254768, "step": 30650 }, { "epoch": 2.5002039318052045, "eval_loss": 0.35679835081100464, "eval_runtime": 571.092, "eval_samples_per_second": 4.772, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 29254768, "step": 30650 }, { "epoch": 2.500611795415613, "grad_norm": 0.9870917201042175, "learning_rate": 4.664832119085389e-05, "loss": 0.3245, "num_input_tokens_seen": 29259440, "step": 30655 }, { "epoch": 2.5010196590260216, "grad_norm": 0.9279305338859558, "learning_rate": 4.664654076026016e-05, "loss": 0.3145, "num_input_tokens_seen": 29264640, "step": 30660 }, { "epoch": 2.5014275226364306, "grad_norm": 0.5772296190261841, "learning_rate": 4.6644759890900234e-05, "loss": 0.3884, "num_input_tokens_seen": 29269808, "step": 30665 }, { "epoch": 2.501835386246839, "grad_norm": 0.7808502316474915, "learning_rate": 4.664297858281023e-05, "loss": 0.3419, "num_input_tokens_seen": 29275232, "step": 30670 }, { "epoch": 2.5022432498572478, "grad_norm": 0.5323622226715088, "learning_rate": 4.664119683602624e-05, "loss": 0.3712, "num_input_tokens_seen": 29279680, "step": 30675 }, { "epoch": 2.5026511134676563, "grad_norm": 1.3688836097717285, "learning_rate": 4.663941465058439e-05, "loss": 0.3329, "num_input_tokens_seen": 29284560, "step": 30680 }, { "epoch": 2.503058977078065, "grad_norm": 0.46607518196105957, "learning_rate": 4.6637632026520794e-05, "loss": 0.3519, "num_input_tokens_seen": 29289344, "step": 30685 }, { "epoch": 2.503466840688474, "grad_norm": 1.4294071197509766, "learning_rate": 4.66358489638716e-05, "loss": 0.3401, "num_input_tokens_seen": 29295472, "step": 30690 }, { "epoch": 2.5038747042988825, "grad_norm": 0.45314666628837585, "learning_rate": 4.6634065462672935e-05, "loss": 0.3612, "num_input_tokens_seen": 29299216, "step": 30695 }, { "epoch": 2.504282567909291, "grad_norm": 1.7070022821426392, "learning_rate": 4.6632281522960966e-05, "loss": 0.3561, "num_input_tokens_seen": 29304528, "step": 30700 }, { "epoch": 2.5046904315197, "grad_norm": 1.0470813512802124, "learning_rate": 4.6630497144771834e-05, "loss": 0.3357, "num_input_tokens_seen": 29309216, "step": 30705 }, { "epoch": 2.5050982951301086, "grad_norm": 0.5228263139724731, "learning_rate": 4.662871232814171e-05, "loss": 0.4017, "num_input_tokens_seen": 29314064, "step": 30710 }, { "epoch": 2.505506158740517, "grad_norm": 0.5930534601211548, "learning_rate": 4.662692707310678e-05, "loss": 0.4022, "num_input_tokens_seen": 29318832, "step": 30715 }, { "epoch": 2.5059140223509258, "grad_norm": 1.6511341333389282, "learning_rate": 4.662514137970324e-05, "loss": 0.3477, "num_input_tokens_seen": 29323952, "step": 30720 }, { "epoch": 2.5063218859613343, "grad_norm": 1.2011253833770752, "learning_rate": 4.662335524796727e-05, "loss": 0.342, "num_input_tokens_seen": 29329504, "step": 30725 }, { "epoch": 2.5067297495717433, "grad_norm": 0.7146235704421997, "learning_rate": 4.662156867793507e-05, "loss": 0.3046, "num_input_tokens_seen": 29334480, "step": 30730 }, { "epoch": 2.507137613182152, "grad_norm": 1.4439349174499512, "learning_rate": 4.661978166964287e-05, "loss": 0.3597, "num_input_tokens_seen": 29339280, "step": 30735 }, { "epoch": 2.5075454767925605, "grad_norm": 1.086675763130188, "learning_rate": 4.661799422312687e-05, "loss": 0.3208, "num_input_tokens_seen": 29343856, "step": 30740 }, { "epoch": 2.5079533404029695, "grad_norm": 0.4237827658653259, "learning_rate": 4.661620633842332e-05, "loss": 0.4443, "num_input_tokens_seen": 29348128, "step": 30745 }, { "epoch": 2.508361204013378, "grad_norm": 0.4581088125705719, "learning_rate": 4.6614418015568453e-05, "loss": 0.3439, "num_input_tokens_seen": 29353520, "step": 30750 }, { "epoch": 2.5087690676237866, "grad_norm": 0.5595696568489075, "learning_rate": 4.6612629254598515e-05, "loss": 0.3711, "num_input_tokens_seen": 29358144, "step": 30755 }, { "epoch": 2.509176931234195, "grad_norm": 0.5551779270172119, "learning_rate": 4.661084005554977e-05, "loss": 0.3566, "num_input_tokens_seen": 29363488, "step": 30760 }, { "epoch": 2.5095847948446037, "grad_norm": 1.381872534751892, "learning_rate": 4.660905041845848e-05, "loss": 0.3845, "num_input_tokens_seen": 29367984, "step": 30765 }, { "epoch": 2.5099926584550127, "grad_norm": 1.1566264629364014, "learning_rate": 4.660726034336091e-05, "loss": 0.3668, "num_input_tokens_seen": 29372720, "step": 30770 }, { "epoch": 2.5104005220654213, "grad_norm": 0.8972855806350708, "learning_rate": 4.660546983029337e-05, "loss": 0.2733, "num_input_tokens_seen": 29376944, "step": 30775 }, { "epoch": 2.51080838567583, "grad_norm": 0.6869382858276367, "learning_rate": 4.660367887929213e-05, "loss": 0.4688, "num_input_tokens_seen": 29381792, "step": 30780 }, { "epoch": 2.511216249286239, "grad_norm": 0.4620748460292816, "learning_rate": 4.6601887490393495e-05, "loss": 0.3331, "num_input_tokens_seen": 29386608, "step": 30785 }, { "epoch": 2.5116241128966474, "grad_norm": 0.8664785027503967, "learning_rate": 4.660009566363379e-05, "loss": 0.3836, "num_input_tokens_seen": 29390528, "step": 30790 }, { "epoch": 2.512031976507056, "grad_norm": 0.3615110218524933, "learning_rate": 4.6598303399049316e-05, "loss": 0.3354, "num_input_tokens_seen": 29394880, "step": 30795 }, { "epoch": 2.5124398401174646, "grad_norm": 2.0397863388061523, "learning_rate": 4.659651069667641e-05, "loss": 0.346, "num_input_tokens_seen": 29399696, "step": 30800 }, { "epoch": 2.512847703727873, "grad_norm": 1.6828936338424683, "learning_rate": 4.6594717556551403e-05, "loss": 0.4677, "num_input_tokens_seen": 29403568, "step": 30805 }, { "epoch": 2.513255567338282, "grad_norm": 0.5228832364082336, "learning_rate": 4.659292397871066e-05, "loss": 0.3574, "num_input_tokens_seen": 29407264, "step": 30810 }, { "epoch": 2.5136634309486907, "grad_norm": 0.5392197966575623, "learning_rate": 4.6591129963190514e-05, "loss": 0.3526, "num_input_tokens_seen": 29412640, "step": 30815 }, { "epoch": 2.5140712945590993, "grad_norm": 1.955814242362976, "learning_rate": 4.6589335510027344e-05, "loss": 0.3763, "num_input_tokens_seen": 29418240, "step": 30820 }, { "epoch": 2.5144791581695083, "grad_norm": 1.303195834159851, "learning_rate": 4.658754061925752e-05, "loss": 0.4068, "num_input_tokens_seen": 29422624, "step": 30825 }, { "epoch": 2.514887021779917, "grad_norm": 1.126769781112671, "learning_rate": 4.6585745290917416e-05, "loss": 0.3157, "num_input_tokens_seen": 29427504, "step": 30830 }, { "epoch": 2.5152948853903254, "grad_norm": 0.9440277218818665, "learning_rate": 4.658394952504342e-05, "loss": 0.3889, "num_input_tokens_seen": 29432784, "step": 30835 }, { "epoch": 2.5157027490007344, "grad_norm": 1.2608094215393066, "learning_rate": 4.658215332167195e-05, "loss": 0.368, "num_input_tokens_seen": 29438464, "step": 30840 }, { "epoch": 2.516110612611143, "grad_norm": 1.1556190252304077, "learning_rate": 4.6580356680839396e-05, "loss": 0.336, "num_input_tokens_seen": 29443504, "step": 30845 }, { "epoch": 2.5165184762215516, "grad_norm": 1.8300081491470337, "learning_rate": 4.657855960258219e-05, "loss": 0.3363, "num_input_tokens_seen": 29448688, "step": 30850 }, { "epoch": 2.51692633983196, "grad_norm": 2.231492042541504, "learning_rate": 4.6576762086936745e-05, "loss": 0.3838, "num_input_tokens_seen": 29453616, "step": 30855 }, { "epoch": 2.5173342034423687, "grad_norm": 0.654732882976532, "learning_rate": 4.6574964133939506e-05, "loss": 0.301, "num_input_tokens_seen": 29458240, "step": 30860 }, { "epoch": 2.5177420670527777, "grad_norm": 0.38857972621917725, "learning_rate": 4.65731657436269e-05, "loss": 0.355, "num_input_tokens_seen": 29462480, "step": 30865 }, { "epoch": 2.5181499306631863, "grad_norm": 0.4730302691459656, "learning_rate": 4.65713669160354e-05, "loss": 0.3656, "num_input_tokens_seen": 29467504, "step": 30870 }, { "epoch": 2.518557794273595, "grad_norm": 1.25946044921875, "learning_rate": 4.656956765120146e-05, "loss": 0.3818, "num_input_tokens_seen": 29472016, "step": 30875 }, { "epoch": 2.518965657884004, "grad_norm": 1.089106559753418, "learning_rate": 4.656776794916155e-05, "loss": 0.3092, "num_input_tokens_seen": 29476896, "step": 30880 }, { "epoch": 2.5193735214944124, "grad_norm": 0.6063532829284668, "learning_rate": 4.656596780995214e-05, "loss": 0.4183, "num_input_tokens_seen": 29481872, "step": 30885 }, { "epoch": 2.519781385104821, "grad_norm": 0.7265346646308899, "learning_rate": 4.6564167233609736e-05, "loss": 0.1434, "num_input_tokens_seen": 29486560, "step": 30890 }, { "epoch": 2.5201892487152295, "grad_norm": 1.878048062324524, "learning_rate": 4.656236622017082e-05, "loss": 0.4233, "num_input_tokens_seen": 29491824, "step": 30895 }, { "epoch": 2.520597112325638, "grad_norm": 1.5178911685943604, "learning_rate": 4.65605647696719e-05, "loss": 0.5462, "num_input_tokens_seen": 29497040, "step": 30900 }, { "epoch": 2.521004975936047, "grad_norm": 0.939087450504303, "learning_rate": 4.65587628821495e-05, "loss": 0.4897, "num_input_tokens_seen": 29501456, "step": 30905 }, { "epoch": 2.5214128395464557, "grad_norm": 0.47001615166664124, "learning_rate": 4.6556960557640135e-05, "loss": 0.3494, "num_input_tokens_seen": 29506352, "step": 30910 }, { "epoch": 2.5218207031568642, "grad_norm": 1.1767752170562744, "learning_rate": 4.6555157796180335e-05, "loss": 0.3448, "num_input_tokens_seen": 29511296, "step": 30915 }, { "epoch": 2.5222285667672733, "grad_norm": 1.3619359731674194, "learning_rate": 4.655335459780665e-05, "loss": 0.3689, "num_input_tokens_seen": 29514944, "step": 30920 }, { "epoch": 2.522636430377682, "grad_norm": 0.5589463710784912, "learning_rate": 4.655155096255563e-05, "loss": 0.3753, "num_input_tokens_seen": 29519136, "step": 30925 }, { "epoch": 2.5230442939880904, "grad_norm": 1.5393606424331665, "learning_rate": 4.654974689046383e-05, "loss": 0.4353, "num_input_tokens_seen": 29523888, "step": 30930 }, { "epoch": 2.523452157598499, "grad_norm": 1.1458191871643066, "learning_rate": 4.654794238156781e-05, "loss": 0.3549, "num_input_tokens_seen": 29528192, "step": 30935 }, { "epoch": 2.5238600212089075, "grad_norm": 2.006917715072632, "learning_rate": 4.654613743590416e-05, "loss": 0.4195, "num_input_tokens_seen": 29533488, "step": 30940 }, { "epoch": 2.5242678848193165, "grad_norm": 0.8335155248641968, "learning_rate": 4.654433205350945e-05, "loss": 0.3625, "num_input_tokens_seen": 29537920, "step": 30945 }, { "epoch": 2.524675748429725, "grad_norm": 0.7027888298034668, "learning_rate": 4.65425262344203e-05, "loss": 0.2177, "num_input_tokens_seen": 29542944, "step": 30950 }, { "epoch": 2.5250836120401337, "grad_norm": 0.6756237745285034, "learning_rate": 4.6540719978673284e-05, "loss": 0.4684, "num_input_tokens_seen": 29547968, "step": 30955 }, { "epoch": 2.5254914756505427, "grad_norm": 0.82230144739151, "learning_rate": 4.653891328630503e-05, "loss": 0.3149, "num_input_tokens_seen": 29552512, "step": 30960 }, { "epoch": 2.5258993392609512, "grad_norm": 0.35966557264328003, "learning_rate": 4.653710615735216e-05, "loss": 0.4038, "num_input_tokens_seen": 29557568, "step": 30965 }, { "epoch": 2.52630720287136, "grad_norm": 0.39711156487464905, "learning_rate": 4.65352985918513e-05, "loss": 0.3775, "num_input_tokens_seen": 29561984, "step": 30970 }, { "epoch": 2.5267150664817684, "grad_norm": 0.43877002596855164, "learning_rate": 4.653349058983909e-05, "loss": 0.342, "num_input_tokens_seen": 29567456, "step": 30975 }, { "epoch": 2.527122930092177, "grad_norm": 1.0856448411941528, "learning_rate": 4.653168215135216e-05, "loss": 0.3569, "num_input_tokens_seen": 29571840, "step": 30980 }, { "epoch": 2.527530793702586, "grad_norm": 1.2968906164169312, "learning_rate": 4.6529873276427206e-05, "loss": 0.363, "num_input_tokens_seen": 29577376, "step": 30985 }, { "epoch": 2.5279386573129945, "grad_norm": 1.0953155755996704, "learning_rate": 4.6528063965100854e-05, "loss": 0.3292, "num_input_tokens_seen": 29581664, "step": 30990 }, { "epoch": 2.528346520923403, "grad_norm": 1.8048683404922485, "learning_rate": 4.65262542174098e-05, "loss": 0.4103, "num_input_tokens_seen": 29586496, "step": 30995 }, { "epoch": 2.528754384533812, "grad_norm": 0.6189730167388916, "learning_rate": 4.6524444033390715e-05, "loss": 0.4441, "num_input_tokens_seen": 29591648, "step": 31000 }, { "epoch": 2.5291622481442206, "grad_norm": 1.474046230316162, "learning_rate": 4.6522633413080294e-05, "loss": 0.3589, "num_input_tokens_seen": 29595856, "step": 31005 }, { "epoch": 2.529570111754629, "grad_norm": 0.3915228545665741, "learning_rate": 4.652082235651525e-05, "loss": 0.3831, "num_input_tokens_seen": 29601712, "step": 31010 }, { "epoch": 2.5299779753650378, "grad_norm": 4.37326717376709, "learning_rate": 4.651901086373227e-05, "loss": 0.4123, "num_input_tokens_seen": 29605856, "step": 31015 }, { "epoch": 2.5303858389754463, "grad_norm": 1.2067526578903198, "learning_rate": 4.651719893476809e-05, "loss": 0.3352, "num_input_tokens_seen": 29611008, "step": 31020 }, { "epoch": 2.5307937025858553, "grad_norm": 0.47090059518814087, "learning_rate": 4.6515386569659425e-05, "loss": 0.3314, "num_input_tokens_seen": 29615760, "step": 31025 }, { "epoch": 2.531201566196264, "grad_norm": 0.7880954146385193, "learning_rate": 4.651357376844302e-05, "loss": 0.3077, "num_input_tokens_seen": 29620592, "step": 31030 }, { "epoch": 2.5316094298066725, "grad_norm": 0.7395082116127014, "learning_rate": 4.6511760531155626e-05, "loss": 0.3824, "num_input_tokens_seen": 29625488, "step": 31035 }, { "epoch": 2.5320172934170815, "grad_norm": 0.4110948145389557, "learning_rate": 4.650994685783397e-05, "loss": 0.38, "num_input_tokens_seen": 29629808, "step": 31040 }, { "epoch": 2.53242515702749, "grad_norm": 1.3149797916412354, "learning_rate": 4.650813274851484e-05, "loss": 0.37, "num_input_tokens_seen": 29635072, "step": 31045 }, { "epoch": 2.5328330206378986, "grad_norm": 0.783872663974762, "learning_rate": 4.6506318203235e-05, "loss": 0.3562, "num_input_tokens_seen": 29640256, "step": 31050 }, { "epoch": 2.5332408842483076, "grad_norm": 0.39866766333580017, "learning_rate": 4.6504503222031225e-05, "loss": 0.3627, "num_input_tokens_seen": 29645584, "step": 31055 }, { "epoch": 2.533648747858716, "grad_norm": 1.2998552322387695, "learning_rate": 4.650268780494032e-05, "loss": 0.3323, "num_input_tokens_seen": 29649872, "step": 31060 }, { "epoch": 2.5340566114691248, "grad_norm": 1.5949485301971436, "learning_rate": 4.650087195199905e-05, "loss": 0.3809, "num_input_tokens_seen": 29654960, "step": 31065 }, { "epoch": 2.5344644750795333, "grad_norm": 0.46189168095588684, "learning_rate": 4.6499055663244254e-05, "loss": 0.2964, "num_input_tokens_seen": 29659184, "step": 31070 }, { "epoch": 2.534872338689942, "grad_norm": 0.5801199674606323, "learning_rate": 4.649723893871273e-05, "loss": 0.3717, "num_input_tokens_seen": 29664496, "step": 31075 }, { "epoch": 2.535280202300351, "grad_norm": 1.3940330743789673, "learning_rate": 4.64954217784413e-05, "loss": 0.3404, "num_input_tokens_seen": 29668864, "step": 31080 }, { "epoch": 2.5356880659107595, "grad_norm": 0.36143597960472107, "learning_rate": 4.6493604182466824e-05, "loss": 0.3697, "num_input_tokens_seen": 29674160, "step": 31085 }, { "epoch": 2.536095929521168, "grad_norm": 0.61603182554245, "learning_rate": 4.649178615082611e-05, "loss": 0.298, "num_input_tokens_seen": 29678544, "step": 31090 }, { "epoch": 2.536503793131577, "grad_norm": 0.6727569699287415, "learning_rate": 4.6489967683556036e-05, "loss": 0.4031, "num_input_tokens_seen": 29683520, "step": 31095 }, { "epoch": 2.5369116567419856, "grad_norm": 1.4020673036575317, "learning_rate": 4.648814878069344e-05, "loss": 0.337, "num_input_tokens_seen": 29688800, "step": 31100 }, { "epoch": 2.537319520352394, "grad_norm": 1.1844874620437622, "learning_rate": 4.64863294422752e-05, "loss": 0.3876, "num_input_tokens_seen": 29693168, "step": 31105 }, { "epoch": 2.5377273839628027, "grad_norm": 0.2294560819864273, "learning_rate": 4.648450966833819e-05, "loss": 0.3611, "num_input_tokens_seen": 29697728, "step": 31110 }, { "epoch": 2.5381352475732113, "grad_norm": 0.4945456087589264, "learning_rate": 4.64826894589193e-05, "loss": 0.4169, "num_input_tokens_seen": 29701360, "step": 31115 }, { "epoch": 2.5385431111836203, "grad_norm": 1.6759953498840332, "learning_rate": 4.6480868814055424e-05, "loss": 0.3991, "num_input_tokens_seen": 29706240, "step": 31120 }, { "epoch": 2.538950974794029, "grad_norm": 0.46133893728256226, "learning_rate": 4.6479047733783465e-05, "loss": 0.4217, "num_input_tokens_seen": 29710896, "step": 31125 }, { "epoch": 2.5393588384044374, "grad_norm": 1.2394784688949585, "learning_rate": 4.647722621814033e-05, "loss": 0.3484, "num_input_tokens_seen": 29715840, "step": 31130 }, { "epoch": 2.5397667020148464, "grad_norm": 1.5027753114700317, "learning_rate": 4.647540426716295e-05, "loss": 0.3626, "num_input_tokens_seen": 29721184, "step": 31135 }, { "epoch": 2.540174565625255, "grad_norm": 0.5252405405044556, "learning_rate": 4.6473581880888254e-05, "loss": 0.3309, "num_input_tokens_seen": 29726336, "step": 31140 }, { "epoch": 2.5405824292356636, "grad_norm": 0.9299936890602112, "learning_rate": 4.6471759059353176e-05, "loss": 0.3538, "num_input_tokens_seen": 29730896, "step": 31145 }, { "epoch": 2.540990292846072, "grad_norm": 0.2655770182609558, "learning_rate": 4.646993580259467e-05, "loss": 0.3678, "num_input_tokens_seen": 29735600, "step": 31150 }, { "epoch": 2.5413981564564807, "grad_norm": 0.2784149944782257, "learning_rate": 4.646811211064968e-05, "loss": 0.3638, "num_input_tokens_seen": 29740688, "step": 31155 }, { "epoch": 2.5418060200668897, "grad_norm": 1.435174584388733, "learning_rate": 4.646628798355518e-05, "loss": 0.4078, "num_input_tokens_seen": 29745760, "step": 31160 }, { "epoch": 2.5422138836772983, "grad_norm": 1.1781082153320312, "learning_rate": 4.6464463421348145e-05, "loss": 0.3132, "num_input_tokens_seen": 29750656, "step": 31165 }, { "epoch": 2.542621747287707, "grad_norm": 1.7293697595596313, "learning_rate": 4.646263842406556e-05, "loss": 0.3555, "num_input_tokens_seen": 29755392, "step": 31170 }, { "epoch": 2.543029610898116, "grad_norm": 0.8752374053001404, "learning_rate": 4.646081299174442e-05, "loss": 0.3512, "num_input_tokens_seen": 29760256, "step": 31175 }, { "epoch": 2.5434374745085244, "grad_norm": 0.7010870575904846, "learning_rate": 4.6458987124421715e-05, "loss": 0.3328, "num_input_tokens_seen": 29765072, "step": 31180 }, { "epoch": 2.543845338118933, "grad_norm": 0.6186928749084473, "learning_rate": 4.645716082213446e-05, "loss": 0.2585, "num_input_tokens_seen": 29769024, "step": 31185 }, { "epoch": 2.5442532017293416, "grad_norm": 0.8806648254394531, "learning_rate": 4.645533408491966e-05, "loss": 0.3572, "num_input_tokens_seen": 29772832, "step": 31190 }, { "epoch": 2.54466106533975, "grad_norm": 0.8186424970626831, "learning_rate": 4.6453506912814374e-05, "loss": 0.3705, "num_input_tokens_seen": 29777968, "step": 31195 }, { "epoch": 2.545068928950159, "grad_norm": 0.3409968912601471, "learning_rate": 4.645167930585561e-05, "loss": 0.3282, "num_input_tokens_seen": 29782928, "step": 31200 }, { "epoch": 2.5454767925605677, "grad_norm": 0.33026012778282166, "learning_rate": 4.644985126408042e-05, "loss": 0.3678, "num_input_tokens_seen": 29788016, "step": 31205 }, { "epoch": 2.5458846561709763, "grad_norm": 0.34195318818092346, "learning_rate": 4.644802278752587e-05, "loss": 0.3744, "num_input_tokens_seen": 29792672, "step": 31210 }, { "epoch": 2.5462925197813853, "grad_norm": 1.0462974309921265, "learning_rate": 4.6446193876229004e-05, "loss": 0.3319, "num_input_tokens_seen": 29797696, "step": 31215 }, { "epoch": 2.546700383391794, "grad_norm": 1.1171112060546875, "learning_rate": 4.6444364530226904e-05, "loss": 0.3718, "num_input_tokens_seen": 29802624, "step": 31220 }, { "epoch": 2.5471082470022024, "grad_norm": 0.35277479887008667, "learning_rate": 4.644253474955664e-05, "loss": 0.3465, "num_input_tokens_seen": 29807360, "step": 31225 }, { "epoch": 2.5475161106126114, "grad_norm": 0.514585018157959, "learning_rate": 4.644070453425532e-05, "loss": 0.3615, "num_input_tokens_seen": 29811904, "step": 31230 }, { "epoch": 2.54792397422302, "grad_norm": 0.5498365163803101, "learning_rate": 4.6438873884360015e-05, "loss": 0.3351, "num_input_tokens_seen": 29816336, "step": 31235 }, { "epoch": 2.5483318378334285, "grad_norm": 0.3358137905597687, "learning_rate": 4.643704279990786e-05, "loss": 0.3567, "num_input_tokens_seen": 29821664, "step": 31240 }, { "epoch": 2.548739701443837, "grad_norm": 1.248163104057312, "learning_rate": 4.6435211280935955e-05, "loss": 0.3349, "num_input_tokens_seen": 29825632, "step": 31245 }, { "epoch": 2.5491475650542457, "grad_norm": 0.8862439393997192, "learning_rate": 4.643337932748142e-05, "loss": 0.3911, "num_input_tokens_seen": 29830000, "step": 31250 }, { "epoch": 2.5495554286646547, "grad_norm": 0.38755863904953003, "learning_rate": 4.64315469395814e-05, "loss": 0.373, "num_input_tokens_seen": 29834000, "step": 31255 }, { "epoch": 2.5499632922750632, "grad_norm": 1.2764629125595093, "learning_rate": 4.642971411727302e-05, "loss": 0.3524, "num_input_tokens_seen": 29838384, "step": 31260 }, { "epoch": 2.550371155885472, "grad_norm": 1.4428881406784058, "learning_rate": 4.6427880860593445e-05, "loss": 0.3638, "num_input_tokens_seen": 29842192, "step": 31265 }, { "epoch": 2.550779019495881, "grad_norm": 1.0570930242538452, "learning_rate": 4.6426047169579826e-05, "loss": 0.3178, "num_input_tokens_seen": 29846864, "step": 31270 }, { "epoch": 2.5511868831062894, "grad_norm": 1.3164474964141846, "learning_rate": 4.642421304426934e-05, "loss": 0.4317, "num_input_tokens_seen": 29852176, "step": 31275 }, { "epoch": 2.551594746716698, "grad_norm": 1.3599339723587036, "learning_rate": 4.6422378484699165e-05, "loss": 0.3686, "num_input_tokens_seen": 29856208, "step": 31280 }, { "epoch": 2.5520026103271065, "grad_norm": 0.4231151342391968, "learning_rate": 4.6420543490906474e-05, "loss": 0.2825, "num_input_tokens_seen": 29860784, "step": 31285 }, { "epoch": 2.552410473937515, "grad_norm": 1.2418506145477295, "learning_rate": 4.6418708062928475e-05, "loss": 0.4037, "num_input_tokens_seen": 29866064, "step": 31290 }, { "epoch": 2.552818337547924, "grad_norm": 1.580367088317871, "learning_rate": 4.6416872200802356e-05, "loss": 0.5359, "num_input_tokens_seen": 29870880, "step": 31295 }, { "epoch": 2.5532262011583327, "grad_norm": 0.478094220161438, "learning_rate": 4.641503590456535e-05, "loss": 0.3463, "num_input_tokens_seen": 29875616, "step": 31300 }, { "epoch": 2.553634064768741, "grad_norm": 1.1041991710662842, "learning_rate": 4.641319917425465e-05, "loss": 0.3323, "num_input_tokens_seen": 29880816, "step": 31305 }, { "epoch": 2.5540419283791502, "grad_norm": 0.3946877419948578, "learning_rate": 4.641136200990752e-05, "loss": 0.3897, "num_input_tokens_seen": 29885440, "step": 31310 }, { "epoch": 2.554449791989559, "grad_norm": 1.169679045677185, "learning_rate": 4.640952441156117e-05, "loss": 0.3734, "num_input_tokens_seen": 29889760, "step": 31315 }, { "epoch": 2.5548576555999674, "grad_norm": 0.606501042842865, "learning_rate": 4.640768637925286e-05, "loss": 0.3383, "num_input_tokens_seen": 29895248, "step": 31320 }, { "epoch": 2.555265519210376, "grad_norm": 0.5550414323806763, "learning_rate": 4.640584791301984e-05, "loss": 0.3403, "num_input_tokens_seen": 29899840, "step": 31325 }, { "epoch": 2.5556733828207845, "grad_norm": 1.4348194599151611, "learning_rate": 4.640400901289937e-05, "loss": 0.4192, "num_input_tokens_seen": 29905344, "step": 31330 }, { "epoch": 2.5560812464311935, "grad_norm": 0.9919034242630005, "learning_rate": 4.640216967892875e-05, "loss": 0.3983, "num_input_tokens_seen": 29909952, "step": 31335 }, { "epoch": 2.556489110041602, "grad_norm": 0.2591734528541565, "learning_rate": 4.640032991114524e-05, "loss": 0.3655, "num_input_tokens_seen": 29914848, "step": 31340 }, { "epoch": 2.5568969736520106, "grad_norm": 0.9544237852096558, "learning_rate": 4.6398489709586134e-05, "loss": 0.3446, "num_input_tokens_seen": 29918976, "step": 31345 }, { "epoch": 2.5573048372624196, "grad_norm": 0.40901726484298706, "learning_rate": 4.6396649074288736e-05, "loss": 0.3796, "num_input_tokens_seen": 29923664, "step": 31350 }, { "epoch": 2.557712700872828, "grad_norm": 1.0522934198379517, "learning_rate": 4.639480800529035e-05, "loss": 0.3297, "num_input_tokens_seen": 29928656, "step": 31355 }, { "epoch": 2.5581205644832368, "grad_norm": 1.210254430770874, "learning_rate": 4.63929665026283e-05, "loss": 0.3656, "num_input_tokens_seen": 29932416, "step": 31360 }, { "epoch": 2.5585284280936453, "grad_norm": 0.45534688234329224, "learning_rate": 4.6391124566339906e-05, "loss": 0.3567, "num_input_tokens_seen": 29936848, "step": 31365 }, { "epoch": 2.558936291704054, "grad_norm": 0.41149261593818665, "learning_rate": 4.638928219646251e-05, "loss": 0.3122, "num_input_tokens_seen": 29941632, "step": 31370 }, { "epoch": 2.559344155314463, "grad_norm": 1.6546696424484253, "learning_rate": 4.6387439393033454e-05, "loss": 0.3797, "num_input_tokens_seen": 29946992, "step": 31375 }, { "epoch": 2.5597520189248715, "grad_norm": 0.5060490369796753, "learning_rate": 4.638559615609009e-05, "loss": 0.383, "num_input_tokens_seen": 29951984, "step": 31380 }, { "epoch": 2.56015988253528, "grad_norm": 1.295609474182129, "learning_rate": 4.6383752485669776e-05, "loss": 0.3831, "num_input_tokens_seen": 29957088, "step": 31385 }, { "epoch": 2.560567746145689, "grad_norm": 1.6887791156768799, "learning_rate": 4.6381908381809885e-05, "loss": 0.3982, "num_input_tokens_seen": 29960896, "step": 31390 }, { "epoch": 2.5609756097560976, "grad_norm": 0.7954886555671692, "learning_rate": 4.63800638445478e-05, "loss": 0.3425, "num_input_tokens_seen": 29966480, "step": 31395 }, { "epoch": 2.561383473366506, "grad_norm": 0.5700341463088989, "learning_rate": 4.63782188739209e-05, "loss": 0.2226, "num_input_tokens_seen": 29970624, "step": 31400 }, { "epoch": 2.561791336976915, "grad_norm": 0.43476995825767517, "learning_rate": 4.637637346996659e-05, "loss": 0.247, "num_input_tokens_seen": 29975632, "step": 31405 }, { "epoch": 2.5621992005873238, "grad_norm": 1.454344391822815, "learning_rate": 4.6374527632722283e-05, "loss": 0.1885, "num_input_tokens_seen": 29980640, "step": 31410 }, { "epoch": 2.5626070641977323, "grad_norm": 2.9602274894714355, "learning_rate": 4.637268136222537e-05, "loss": 0.4769, "num_input_tokens_seen": 29986096, "step": 31415 }, { "epoch": 2.563014927808141, "grad_norm": 0.4154972434043884, "learning_rate": 4.6370834658513296e-05, "loss": 0.3534, "num_input_tokens_seen": 29991776, "step": 31420 }, { "epoch": 2.5634227914185495, "grad_norm": 0.6085678935050964, "learning_rate": 4.636898752162348e-05, "loss": 0.338, "num_input_tokens_seen": 29996976, "step": 31425 }, { "epoch": 2.5638306550289585, "grad_norm": 0.7624154090881348, "learning_rate": 4.636713995159336e-05, "loss": 0.334, "num_input_tokens_seen": 30001680, "step": 31430 }, { "epoch": 2.564238518639367, "grad_norm": 0.5054769515991211, "learning_rate": 4.636529194846041e-05, "loss": 0.3934, "num_input_tokens_seen": 30006320, "step": 31435 }, { "epoch": 2.5646463822497756, "grad_norm": 0.30936822295188904, "learning_rate": 4.636344351226206e-05, "loss": 0.3716, "num_input_tokens_seen": 30011152, "step": 31440 }, { "epoch": 2.5650542458601846, "grad_norm": 1.389784574508667, "learning_rate": 4.6361594643035786e-05, "loss": 0.3573, "num_input_tokens_seen": 30015328, "step": 31445 }, { "epoch": 2.565462109470593, "grad_norm": 0.49559956789016724, "learning_rate": 4.6359745340819074e-05, "loss": 0.2796, "num_input_tokens_seen": 30020304, "step": 31450 }, { "epoch": 2.5658699730810017, "grad_norm": 1.9687840938568115, "learning_rate": 4.635789560564939e-05, "loss": 0.5358, "num_input_tokens_seen": 30024608, "step": 31455 }, { "epoch": 2.5662778366914103, "grad_norm": 0.7563689947128296, "learning_rate": 4.635604543756425e-05, "loss": 0.3046, "num_input_tokens_seen": 30028928, "step": 31460 }, { "epoch": 2.566685700301819, "grad_norm": 1.3118343353271484, "learning_rate": 4.635419483660114e-05, "loss": 0.377, "num_input_tokens_seen": 30033520, "step": 31465 }, { "epoch": 2.567093563912228, "grad_norm": 1.1647855043411255, "learning_rate": 4.635234380279757e-05, "loss": 0.3628, "num_input_tokens_seen": 30037312, "step": 31470 }, { "epoch": 2.5675014275226364, "grad_norm": 1.4917473793029785, "learning_rate": 4.6350492336191064e-05, "loss": 0.3479, "num_input_tokens_seen": 30041456, "step": 31475 }, { "epoch": 2.567909291133045, "grad_norm": 0.8597146272659302, "learning_rate": 4.6348640436819155e-05, "loss": 0.3432, "num_input_tokens_seen": 30045808, "step": 31480 }, { "epoch": 2.568317154743454, "grad_norm": 2.1089298725128174, "learning_rate": 4.6346788104719376e-05, "loss": 0.4572, "num_input_tokens_seen": 30050752, "step": 31485 }, { "epoch": 2.5687250183538626, "grad_norm": 0.6011217832565308, "learning_rate": 4.634493533992926e-05, "loss": 0.3675, "num_input_tokens_seen": 30055376, "step": 31490 }, { "epoch": 2.569132881964271, "grad_norm": 0.3315947651863098, "learning_rate": 4.6343082142486396e-05, "loss": 0.3384, "num_input_tokens_seen": 30059728, "step": 31495 }, { "epoch": 2.5695407455746797, "grad_norm": 1.074804663658142, "learning_rate": 4.6341228512428315e-05, "loss": 0.3835, "num_input_tokens_seen": 30064944, "step": 31500 }, { "epoch": 2.5699486091850883, "grad_norm": 0.18990282714366913, "learning_rate": 4.6339374449792595e-05, "loss": 0.3712, "num_input_tokens_seen": 30069440, "step": 31505 }, { "epoch": 2.5703564727954973, "grad_norm": 1.1678515672683716, "learning_rate": 4.633751995461683e-05, "loss": 0.3164, "num_input_tokens_seen": 30073200, "step": 31510 }, { "epoch": 2.570764336405906, "grad_norm": 0.8149935007095337, "learning_rate": 4.6335665026938604e-05, "loss": 0.3203, "num_input_tokens_seen": 30078000, "step": 31515 }, { "epoch": 2.5711722000163144, "grad_norm": 0.8038355112075806, "learning_rate": 4.633380966679551e-05, "loss": 0.4271, "num_input_tokens_seen": 30083072, "step": 31520 }, { "epoch": 2.5715800636267234, "grad_norm": 1.2919152975082397, "learning_rate": 4.633195387422516e-05, "loss": 0.3714, "num_input_tokens_seen": 30088320, "step": 31525 }, { "epoch": 2.571987927237132, "grad_norm": 1.6954554319381714, "learning_rate": 4.633009764926517e-05, "loss": 0.4187, "num_input_tokens_seen": 30092976, "step": 31530 }, { "epoch": 2.5723957908475406, "grad_norm": 0.3931121230125427, "learning_rate": 4.632824099195317e-05, "loss": 0.365, "num_input_tokens_seen": 30097216, "step": 31535 }, { "epoch": 2.572803654457949, "grad_norm": 1.0493600368499756, "learning_rate": 4.6326383902326775e-05, "loss": 0.3663, "num_input_tokens_seen": 30102000, "step": 31540 }, { "epoch": 2.5732115180683577, "grad_norm": 0.43701305985450745, "learning_rate": 4.632452638042365e-05, "loss": 0.3951, "num_input_tokens_seen": 30106960, "step": 31545 }, { "epoch": 2.5736193816787667, "grad_norm": 0.5234586596488953, "learning_rate": 4.6322668426281435e-05, "loss": 0.3561, "num_input_tokens_seen": 30111216, "step": 31550 }, { "epoch": 2.5740272452891753, "grad_norm": 2.361391305923462, "learning_rate": 4.632081003993779e-05, "loss": 0.3665, "num_input_tokens_seen": 30116640, "step": 31555 }, { "epoch": 2.574435108899584, "grad_norm": 1.218581199645996, "learning_rate": 4.6318951221430385e-05, "loss": 0.4097, "num_input_tokens_seen": 30122336, "step": 31560 }, { "epoch": 2.574842972509993, "grad_norm": 1.185030221939087, "learning_rate": 4.63170919707969e-05, "loss": 0.3693, "num_input_tokens_seen": 30128048, "step": 31565 }, { "epoch": 2.5752508361204014, "grad_norm": 0.932896614074707, "learning_rate": 4.631523228807502e-05, "loss": 0.3492, "num_input_tokens_seen": 30133376, "step": 31570 }, { "epoch": 2.57565869973081, "grad_norm": 0.5028437972068787, "learning_rate": 4.6313372173302426e-05, "loss": 0.3478, "num_input_tokens_seen": 30138608, "step": 31575 }, { "epoch": 2.576066563341219, "grad_norm": 0.4143017530441284, "learning_rate": 4.6311511626516846e-05, "loss": 0.4425, "num_input_tokens_seen": 30143760, "step": 31580 }, { "epoch": 2.5764744269516275, "grad_norm": 0.17431236803531647, "learning_rate": 4.630965064775599e-05, "loss": 0.389, "num_input_tokens_seen": 30148128, "step": 31585 }, { "epoch": 2.576882290562036, "grad_norm": 1.0912619829177856, "learning_rate": 4.630778923705755e-05, "loss": 0.2892, "num_input_tokens_seen": 30153264, "step": 31590 }, { "epoch": 2.5772901541724447, "grad_norm": 1.4703706502914429, "learning_rate": 4.6305927394459293e-05, "loss": 0.3131, "num_input_tokens_seen": 30158544, "step": 31595 }, { "epoch": 2.5776980177828532, "grad_norm": 0.6104198098182678, "learning_rate": 4.630406511999893e-05, "loss": 0.3965, "num_input_tokens_seen": 30163280, "step": 31600 }, { "epoch": 2.5781058813932622, "grad_norm": 1.166229009628296, "learning_rate": 4.6302202413714226e-05, "loss": 0.3296, "num_input_tokens_seen": 30167488, "step": 31605 }, { "epoch": 2.578513745003671, "grad_norm": 0.8391190767288208, "learning_rate": 4.630033927564293e-05, "loss": 0.464, "num_input_tokens_seen": 30172336, "step": 31610 }, { "epoch": 2.5789216086140794, "grad_norm": 0.4243624806404114, "learning_rate": 4.6298475705822806e-05, "loss": 0.423, "num_input_tokens_seen": 30177456, "step": 31615 }, { "epoch": 2.5793294722244884, "grad_norm": 1.2189106941223145, "learning_rate": 4.6296611704291636e-05, "loss": 0.3745, "num_input_tokens_seen": 30182128, "step": 31620 }, { "epoch": 2.579737335834897, "grad_norm": 1.3648861646652222, "learning_rate": 4.6294747271087194e-05, "loss": 0.3729, "num_input_tokens_seen": 30186752, "step": 31625 }, { "epoch": 2.5801451994453055, "grad_norm": 0.7270709872245789, "learning_rate": 4.629288240624727e-05, "loss": 0.3999, "num_input_tokens_seen": 30191344, "step": 31630 }, { "epoch": 2.580553063055714, "grad_norm": 1.2641422748565674, "learning_rate": 4.6291017109809664e-05, "loss": 0.3343, "num_input_tokens_seen": 30196064, "step": 31635 }, { "epoch": 2.5809609266661226, "grad_norm": 0.9379634857177734, "learning_rate": 4.628915138181219e-05, "loss": 0.3357, "num_input_tokens_seen": 30200656, "step": 31640 }, { "epoch": 2.5813687902765317, "grad_norm": 0.9306723475456238, "learning_rate": 4.6287285222292676e-05, "loss": 0.6151, "num_input_tokens_seen": 30204864, "step": 31645 }, { "epoch": 2.58177665388694, "grad_norm": 0.7447792291641235, "learning_rate": 4.6285418631288924e-05, "loss": 0.2709, "num_input_tokens_seen": 30210512, "step": 31650 }, { "epoch": 2.582184517497349, "grad_norm": 1.413760781288147, "learning_rate": 4.628355160883878e-05, "loss": 0.4607, "num_input_tokens_seen": 30214864, "step": 31655 }, { "epoch": 2.582592381107758, "grad_norm": 1.1044714450836182, "learning_rate": 4.62816841549801e-05, "loss": 0.3806, "num_input_tokens_seen": 30219504, "step": 31660 }, { "epoch": 2.5830002447181664, "grad_norm": 1.2394121885299683, "learning_rate": 4.6279816269750714e-05, "loss": 0.3476, "num_input_tokens_seen": 30224032, "step": 31665 }, { "epoch": 2.583408108328575, "grad_norm": 1.0895520448684692, "learning_rate": 4.627794795318849e-05, "loss": 0.3449, "num_input_tokens_seen": 30229232, "step": 31670 }, { "epoch": 2.5838159719389835, "grad_norm": 0.3199597895145416, "learning_rate": 4.627607920533132e-05, "loss": 0.3579, "num_input_tokens_seen": 30233504, "step": 31675 }, { "epoch": 2.584223835549392, "grad_norm": 0.6759722828865051, "learning_rate": 4.627421002621706e-05, "loss": 0.3544, "num_input_tokens_seen": 30238272, "step": 31680 }, { "epoch": 2.584631699159801, "grad_norm": 1.539398193359375, "learning_rate": 4.62723404158836e-05, "loss": 0.3809, "num_input_tokens_seen": 30243440, "step": 31685 }, { "epoch": 2.5850395627702096, "grad_norm": 0.3550167679786682, "learning_rate": 4.627047037436883e-05, "loss": 0.3659, "num_input_tokens_seen": 30248736, "step": 31690 }, { "epoch": 2.585447426380618, "grad_norm": 0.9820678234100342, "learning_rate": 4.6268599901710676e-05, "loss": 0.3113, "num_input_tokens_seen": 30253312, "step": 31695 }, { "epoch": 2.585855289991027, "grad_norm": 0.5726163983345032, "learning_rate": 4.626672899794704e-05, "loss": 0.3604, "num_input_tokens_seen": 30258384, "step": 31700 }, { "epoch": 2.5862631536014358, "grad_norm": 1.483860969543457, "learning_rate": 4.626485766311584e-05, "loss": 0.4595, "num_input_tokens_seen": 30262384, "step": 31705 }, { "epoch": 2.5866710172118443, "grad_norm": 1.067789077758789, "learning_rate": 4.626298589725501e-05, "loss": 0.3331, "num_input_tokens_seen": 30267680, "step": 31710 }, { "epoch": 2.587078880822253, "grad_norm": 0.17316977679729462, "learning_rate": 4.626111370040248e-05, "loss": 0.3734, "num_input_tokens_seen": 30271776, "step": 31715 }, { "epoch": 2.5874867444326615, "grad_norm": 1.2231004238128662, "learning_rate": 4.6259241072596224e-05, "loss": 0.3871, "num_input_tokens_seen": 30276176, "step": 31720 }, { "epoch": 2.5878946080430705, "grad_norm": 0.9683337211608887, "learning_rate": 4.6257368013874184e-05, "loss": 0.3709, "num_input_tokens_seen": 30281232, "step": 31725 }, { "epoch": 2.588302471653479, "grad_norm": 0.5375257730484009, "learning_rate": 4.6255494524274324e-05, "loss": 0.3441, "num_input_tokens_seen": 30286240, "step": 31730 }, { "epoch": 2.5887103352638876, "grad_norm": 0.8083509802818298, "learning_rate": 4.6253620603834625e-05, "loss": 0.2761, "num_input_tokens_seen": 30290608, "step": 31735 }, { "epoch": 2.5891181988742966, "grad_norm": 1.2184959650039673, "learning_rate": 4.625174625259307e-05, "loss": 0.4201, "num_input_tokens_seen": 30295344, "step": 31740 }, { "epoch": 2.589526062484705, "grad_norm": 0.8732961416244507, "learning_rate": 4.624987147058764e-05, "loss": 0.2575, "num_input_tokens_seen": 30300624, "step": 31745 }, { "epoch": 2.5899339260951137, "grad_norm": 0.852469265460968, "learning_rate": 4.624799625785635e-05, "loss": 0.2918, "num_input_tokens_seen": 30305200, "step": 31750 }, { "epoch": 2.5903417897055223, "grad_norm": 1.7638088464736938, "learning_rate": 4.6246120614437204e-05, "loss": 0.2762, "num_input_tokens_seen": 30310448, "step": 31755 }, { "epoch": 2.590749653315931, "grad_norm": 3.0272741317749023, "learning_rate": 4.624424454036823e-05, "loss": 0.4392, "num_input_tokens_seen": 30315088, "step": 31760 }, { "epoch": 2.59115751692634, "grad_norm": 0.7537505030632019, "learning_rate": 4.624236803568743e-05, "loss": 0.1896, "num_input_tokens_seen": 30319744, "step": 31765 }, { "epoch": 2.5915653805367485, "grad_norm": 1.6924102306365967, "learning_rate": 4.624049110043287e-05, "loss": 0.4253, "num_input_tokens_seen": 30323216, "step": 31770 }, { "epoch": 2.591973244147157, "grad_norm": 1.9095345735549927, "learning_rate": 4.6238613734642575e-05, "loss": 0.3526, "num_input_tokens_seen": 30327808, "step": 31775 }, { "epoch": 2.592381107757566, "grad_norm": 2.374908685684204, "learning_rate": 4.6236735938354605e-05, "loss": 0.6141, "num_input_tokens_seen": 30332512, "step": 31780 }, { "epoch": 2.5927889713679746, "grad_norm": 0.9021396040916443, "learning_rate": 4.623485771160702e-05, "loss": 0.332, "num_input_tokens_seen": 30337344, "step": 31785 }, { "epoch": 2.593196834978383, "grad_norm": 0.2814795970916748, "learning_rate": 4.623297905443789e-05, "loss": 0.3578, "num_input_tokens_seen": 30341696, "step": 31790 }, { "epoch": 2.593604698588792, "grad_norm": 1.3892440795898438, "learning_rate": 4.6231099966885306e-05, "loss": 0.3527, "num_input_tokens_seen": 30346640, "step": 31795 }, { "epoch": 2.5940125621992007, "grad_norm": 1.5441218614578247, "learning_rate": 4.622922044898734e-05, "loss": 0.3277, "num_input_tokens_seen": 30351168, "step": 31800 }, { "epoch": 2.5944204258096093, "grad_norm": 1.6061371564865112, "learning_rate": 4.62273405007821e-05, "loss": 0.3704, "num_input_tokens_seen": 30356208, "step": 31805 }, { "epoch": 2.594828289420018, "grad_norm": 0.23523524403572083, "learning_rate": 4.6225460122307685e-05, "loss": 0.3955, "num_input_tokens_seen": 30360720, "step": 31810 }, { "epoch": 2.5952361530304264, "grad_norm": 0.24242816865444183, "learning_rate": 4.622357931360222e-05, "loss": 0.3843, "num_input_tokens_seen": 30365120, "step": 31815 }, { "epoch": 2.5956440166408354, "grad_norm": 1.2280194759368896, "learning_rate": 4.622169807470381e-05, "loss": 0.321, "num_input_tokens_seen": 30370240, "step": 31820 }, { "epoch": 2.596051880251244, "grad_norm": 1.0222795009613037, "learning_rate": 4.62198164056506e-05, "loss": 0.3156, "num_input_tokens_seen": 30375216, "step": 31825 }, { "epoch": 2.5964597438616526, "grad_norm": 0.258409708738327, "learning_rate": 4.6217934306480735e-05, "loss": 0.3648, "num_input_tokens_seen": 30380272, "step": 31830 }, { "epoch": 2.5968676074720616, "grad_norm": 0.2638256251811981, "learning_rate": 4.621605177723235e-05, "loss": 0.337, "num_input_tokens_seen": 30385024, "step": 31835 }, { "epoch": 2.59727547108247, "grad_norm": 0.6868801712989807, "learning_rate": 4.6214168817943615e-05, "loss": 0.3614, "num_input_tokens_seen": 30390144, "step": 31840 }, { "epoch": 2.5976833346928787, "grad_norm": 1.6462877988815308, "learning_rate": 4.621228542865269e-05, "loss": 0.3541, "num_input_tokens_seen": 30395600, "step": 31845 }, { "epoch": 2.5980911983032873, "grad_norm": 1.1176060438156128, "learning_rate": 4.6210401609397756e-05, "loss": 0.3424, "num_input_tokens_seen": 30400176, "step": 31850 }, { "epoch": 2.598499061913696, "grad_norm": 0.7176974415779114, "learning_rate": 4.620851736021699e-05, "loss": 0.3119, "num_input_tokens_seen": 30404400, "step": 31855 }, { "epoch": 2.598906925524105, "grad_norm": 0.7407941222190857, "learning_rate": 4.62066326811486e-05, "loss": 0.3264, "num_input_tokens_seen": 30409296, "step": 31860 }, { "epoch": 2.5993147891345134, "grad_norm": 0.8200281858444214, "learning_rate": 4.620474757223076e-05, "loss": 0.38, "num_input_tokens_seen": 30414432, "step": 31865 }, { "epoch": 2.599722652744922, "grad_norm": 0.7342855930328369, "learning_rate": 4.620286203350171e-05, "loss": 0.1994, "num_input_tokens_seen": 30420304, "step": 31870 }, { "epoch": 2.600130516355331, "grad_norm": 2.236299514770508, "learning_rate": 4.6200976064999656e-05, "loss": 0.5001, "num_input_tokens_seen": 30425568, "step": 31875 }, { "epoch": 2.6005383799657396, "grad_norm": 1.8622390031814575, "learning_rate": 4.619908966676282e-05, "loss": 0.3494, "num_input_tokens_seen": 30430272, "step": 31880 }, { "epoch": 2.600946243576148, "grad_norm": 1.4328999519348145, "learning_rate": 4.6197202838829444e-05, "loss": 0.3682, "num_input_tokens_seen": 30435040, "step": 31885 }, { "epoch": 2.6013541071865567, "grad_norm": 0.2950676679611206, "learning_rate": 4.619531558123778e-05, "loss": 0.3426, "num_input_tokens_seen": 30440224, "step": 31890 }, { "epoch": 2.6017619707969653, "grad_norm": 2.2199127674102783, "learning_rate": 4.6193427894026075e-05, "loss": 0.3726, "num_input_tokens_seen": 30445344, "step": 31895 }, { "epoch": 2.6021698344073743, "grad_norm": 1.585302472114563, "learning_rate": 4.619153977723259e-05, "loss": 0.3712, "num_input_tokens_seen": 30449808, "step": 31900 }, { "epoch": 2.602577698017783, "grad_norm": 0.33633336424827576, "learning_rate": 4.618965123089559e-05, "loss": 0.3781, "num_input_tokens_seen": 30454960, "step": 31905 }, { "epoch": 2.6029855616281914, "grad_norm": 1.0244454145431519, "learning_rate": 4.618776225505338e-05, "loss": 0.3271, "num_input_tokens_seen": 30459920, "step": 31910 }, { "epoch": 2.6033934252386004, "grad_norm": 0.8139963150024414, "learning_rate": 4.618587284974422e-05, "loss": 0.3261, "num_input_tokens_seen": 30465264, "step": 31915 }, { "epoch": 2.603801288849009, "grad_norm": 0.756013810634613, "learning_rate": 4.6183983015006426e-05, "loss": 0.3686, "num_input_tokens_seen": 30470896, "step": 31920 }, { "epoch": 2.6042091524594175, "grad_norm": 0.5010897517204285, "learning_rate": 4.618209275087829e-05, "loss": 0.4718, "num_input_tokens_seen": 30475680, "step": 31925 }, { "epoch": 2.604617016069826, "grad_norm": 1.0009514093399048, "learning_rate": 4.6180202057398145e-05, "loss": 0.3493, "num_input_tokens_seen": 30480800, "step": 31930 }, { "epoch": 2.6050248796802347, "grad_norm": 1.3107243776321411, "learning_rate": 4.6178310934604294e-05, "loss": 0.4289, "num_input_tokens_seen": 30486208, "step": 31935 }, { "epoch": 2.6054327432906437, "grad_norm": 1.1325079202651978, "learning_rate": 4.617641938253509e-05, "loss": 0.3636, "num_input_tokens_seen": 30491920, "step": 31940 }, { "epoch": 2.6058406069010522, "grad_norm": 0.40687912702560425, "learning_rate": 4.617452740122886e-05, "loss": 0.3224, "num_input_tokens_seen": 30496352, "step": 31945 }, { "epoch": 2.606248470511461, "grad_norm": 0.898068904876709, "learning_rate": 4.6172634990723954e-05, "loss": 0.3406, "num_input_tokens_seen": 30501056, "step": 31950 }, { "epoch": 2.60665633412187, "grad_norm": 0.7462062835693359, "learning_rate": 4.617074215105874e-05, "loss": 0.3284, "num_input_tokens_seen": 30506160, "step": 31955 }, { "epoch": 2.6070641977322784, "grad_norm": 0.7125483751296997, "learning_rate": 4.616884888227157e-05, "loss": 0.3418, "num_input_tokens_seen": 30511344, "step": 31960 }, { "epoch": 2.607472061342687, "grad_norm": 0.8605495095252991, "learning_rate": 4.616695518440083e-05, "loss": 0.364, "num_input_tokens_seen": 30516160, "step": 31965 }, { "epoch": 2.607879924953096, "grad_norm": 0.3026452660560608, "learning_rate": 4.6165061057484904e-05, "loss": 0.4429, "num_input_tokens_seen": 30520880, "step": 31970 }, { "epoch": 2.6082877885635045, "grad_norm": 0.9229789972305298, "learning_rate": 4.616316650156219e-05, "loss": 0.3475, "num_input_tokens_seen": 30525296, "step": 31975 }, { "epoch": 2.608695652173913, "grad_norm": 1.1405259370803833, "learning_rate": 4.616127151667108e-05, "loss": 0.3501, "num_input_tokens_seen": 30529904, "step": 31980 }, { "epoch": 2.6091035157843216, "grad_norm": 0.996536910533905, "learning_rate": 4.6159376102849985e-05, "loss": 0.3467, "num_input_tokens_seen": 30535216, "step": 31985 }, { "epoch": 2.60951137939473, "grad_norm": 0.7829341888427734, "learning_rate": 4.615748026013733e-05, "loss": 0.3145, "num_input_tokens_seen": 30540528, "step": 31990 }, { "epoch": 2.6099192430051392, "grad_norm": 1.408318042755127, "learning_rate": 4.615558398857154e-05, "loss": 0.4141, "num_input_tokens_seen": 30544192, "step": 31995 }, { "epoch": 2.610327106615548, "grad_norm": 0.567754864692688, "learning_rate": 4.615368728819105e-05, "loss": 0.3036, "num_input_tokens_seen": 30548784, "step": 32000 }, { "epoch": 2.6107349702259564, "grad_norm": 0.884316086769104, "learning_rate": 4.615179015903431e-05, "loss": 0.3665, "num_input_tokens_seen": 30553200, "step": 32005 }, { "epoch": 2.6111428338363654, "grad_norm": 1.2918421030044556, "learning_rate": 4.614989260113978e-05, "loss": 0.347, "num_input_tokens_seen": 30558272, "step": 32010 }, { "epoch": 2.611550697446774, "grad_norm": 1.0131969451904297, "learning_rate": 4.61479946145459e-05, "loss": 0.3566, "num_input_tokens_seen": 30563520, "step": 32015 }, { "epoch": 2.6119585610571825, "grad_norm": 0.961408257484436, "learning_rate": 4.614609619929116e-05, "loss": 0.3093, "num_input_tokens_seen": 30567968, "step": 32020 }, { "epoch": 2.612366424667591, "grad_norm": 0.7501046061515808, "learning_rate": 4.614419735541403e-05, "loss": 0.4023, "num_input_tokens_seen": 30572560, "step": 32025 }, { "epoch": 2.6127742882779996, "grad_norm": 1.4151678085327148, "learning_rate": 4.6142298082953015e-05, "loss": 0.3787, "num_input_tokens_seen": 30577232, "step": 32030 }, { "epoch": 2.6131821518884086, "grad_norm": 0.7828437089920044, "learning_rate": 4.61403983819466e-05, "loss": 0.3564, "num_input_tokens_seen": 30583264, "step": 32035 }, { "epoch": 2.613590015498817, "grad_norm": 1.1109356880187988, "learning_rate": 4.613849825243329e-05, "loss": 0.3885, "num_input_tokens_seen": 30589040, "step": 32040 }, { "epoch": 2.6139978791092258, "grad_norm": 0.8811289668083191, "learning_rate": 4.6136597694451596e-05, "loss": 0.3458, "num_input_tokens_seen": 30593920, "step": 32045 }, { "epoch": 2.6144057427196348, "grad_norm": 0.6401829719543457, "learning_rate": 4.613469670804006e-05, "loss": 0.3186, "num_input_tokens_seen": 30599264, "step": 32050 }, { "epoch": 2.6148136063300433, "grad_norm": 0.6670191884040833, "learning_rate": 4.6132795293237195e-05, "loss": 0.479, "num_input_tokens_seen": 30604144, "step": 32055 }, { "epoch": 2.615221469940452, "grad_norm": 1.512986183166504, "learning_rate": 4.613089345008156e-05, "loss": 0.3522, "num_input_tokens_seen": 30608240, "step": 32060 }, { "epoch": 2.6156293335508605, "grad_norm": 1.784974455833435, "learning_rate": 4.612899117861168e-05, "loss": 0.428, "num_input_tokens_seen": 30612976, "step": 32065 }, { "epoch": 2.616037197161269, "grad_norm": 0.6035772562026978, "learning_rate": 4.6127088478866134e-05, "loss": 0.3783, "num_input_tokens_seen": 30618128, "step": 32070 }, { "epoch": 2.616445060771678, "grad_norm": 0.5046669244766235, "learning_rate": 4.612518535088348e-05, "loss": 0.353, "num_input_tokens_seen": 30623952, "step": 32075 }, { "epoch": 2.6168529243820866, "grad_norm": 0.3238251805305481, "learning_rate": 4.61232817947023e-05, "loss": 0.4104, "num_input_tokens_seen": 30628528, "step": 32080 }, { "epoch": 2.617260787992495, "grad_norm": 0.9984463453292847, "learning_rate": 4.612137781036116e-05, "loss": 0.3467, "num_input_tokens_seen": 30633104, "step": 32085 }, { "epoch": 2.617668651602904, "grad_norm": 1.0969635248184204, "learning_rate": 4.6119473397898684e-05, "loss": 0.3689, "num_input_tokens_seen": 30636608, "step": 32090 }, { "epoch": 2.6180765152133127, "grad_norm": 1.0525730848312378, "learning_rate": 4.611756855735345e-05, "loss": 0.3405, "num_input_tokens_seen": 30641168, "step": 32095 }, { "epoch": 2.6184843788237213, "grad_norm": 1.467312216758728, "learning_rate": 4.611566328876408e-05, "loss": 0.3846, "num_input_tokens_seen": 30646032, "step": 32100 }, { "epoch": 2.61889224243413, "grad_norm": 0.8944982290267944, "learning_rate": 4.611375759216918e-05, "loss": 0.3435, "num_input_tokens_seen": 30650080, "step": 32105 }, { "epoch": 2.6193001060445384, "grad_norm": 0.5045035481452942, "learning_rate": 4.611185146760738e-05, "loss": 0.3993, "num_input_tokens_seen": 30654960, "step": 32110 }, { "epoch": 2.6197079696549475, "grad_norm": 0.7799606323242188, "learning_rate": 4.610994491511733e-05, "loss": 0.3468, "num_input_tokens_seen": 30658912, "step": 32115 }, { "epoch": 2.620115833265356, "grad_norm": 0.42279136180877686, "learning_rate": 4.610803793473767e-05, "loss": 0.3287, "num_input_tokens_seen": 30663888, "step": 32120 }, { "epoch": 2.6205236968757646, "grad_norm": 1.519595742225647, "learning_rate": 4.610613052650704e-05, "loss": 0.3377, "num_input_tokens_seen": 30667776, "step": 32125 }, { "epoch": 2.6209315604861736, "grad_norm": 0.7892705202102661, "learning_rate": 4.610422269046413e-05, "loss": 0.2315, "num_input_tokens_seen": 30672880, "step": 32130 }, { "epoch": 2.621339424096582, "grad_norm": 1.0449153184890747, "learning_rate": 4.6102314426647574e-05, "loss": 0.4402, "num_input_tokens_seen": 30677920, "step": 32135 }, { "epoch": 2.6217472877069907, "grad_norm": 0.5893991589546204, "learning_rate": 4.610040573509608e-05, "loss": 0.3099, "num_input_tokens_seen": 30682336, "step": 32140 }, { "epoch": 2.6221551513173997, "grad_norm": 0.6002178192138672, "learning_rate": 4.609849661584833e-05, "loss": 0.2187, "num_input_tokens_seen": 30687248, "step": 32145 }, { "epoch": 2.6225630149278083, "grad_norm": 0.6273693442344666, "learning_rate": 4.609658706894302e-05, "loss": 0.4688, "num_input_tokens_seen": 30692032, "step": 32150 }, { "epoch": 2.622970878538217, "grad_norm": 0.5335624814033508, "learning_rate": 4.6094677094418845e-05, "loss": 0.3601, "num_input_tokens_seen": 30696688, "step": 32155 }, { "epoch": 2.6233787421486254, "grad_norm": 0.3975595533847809, "learning_rate": 4.609276669231454e-05, "loss": 0.4171, "num_input_tokens_seen": 30701616, "step": 32160 }, { "epoch": 2.623786605759034, "grad_norm": 0.36155781149864197, "learning_rate": 4.6090855862668816e-05, "loss": 0.3542, "num_input_tokens_seen": 30706624, "step": 32165 }, { "epoch": 2.624194469369443, "grad_norm": 0.4645383358001709, "learning_rate": 4.60889446055204e-05, "loss": 0.3278, "num_input_tokens_seen": 30711856, "step": 32170 }, { "epoch": 2.6246023329798516, "grad_norm": 0.29676827788352966, "learning_rate": 4.608703292090803e-05, "loss": 0.3774, "num_input_tokens_seen": 30716672, "step": 32175 }, { "epoch": 2.62501019659026, "grad_norm": 0.29316967725753784, "learning_rate": 4.608512080887048e-05, "loss": 0.3578, "num_input_tokens_seen": 30721152, "step": 32180 }, { "epoch": 2.625418060200669, "grad_norm": 1.2825847864151, "learning_rate": 4.608320826944649e-05, "loss": 0.3617, "num_input_tokens_seen": 30725472, "step": 32185 }, { "epoch": 2.6258259238110777, "grad_norm": 0.28815406560897827, "learning_rate": 4.608129530267482e-05, "loss": 0.3392, "num_input_tokens_seen": 30730704, "step": 32190 }, { "epoch": 2.6262337874214863, "grad_norm": 1.467253565788269, "learning_rate": 4.607938190859426e-05, "loss": 0.3648, "num_input_tokens_seen": 30735920, "step": 32195 }, { "epoch": 2.626641651031895, "grad_norm": 0.5492637753486633, "learning_rate": 4.6077468087243567e-05, "loss": 0.3274, "num_input_tokens_seen": 30740432, "step": 32200 }, { "epoch": 2.6270495146423034, "grad_norm": 0.6857938170433044, "learning_rate": 4.6075553838661566e-05, "loss": 0.4068, "num_input_tokens_seen": 30745024, "step": 32205 }, { "epoch": 2.6274573782527124, "grad_norm": 0.40478333830833435, "learning_rate": 4.6073639162887043e-05, "loss": 0.3089, "num_input_tokens_seen": 30749504, "step": 32210 }, { "epoch": 2.627865241863121, "grad_norm": 0.35918739438056946, "learning_rate": 4.6071724059958806e-05, "loss": 0.4124, "num_input_tokens_seen": 30753856, "step": 32215 }, { "epoch": 2.6282731054735295, "grad_norm": 0.5993732810020447, "learning_rate": 4.606980852991568e-05, "loss": 0.3831, "num_input_tokens_seen": 30757520, "step": 32220 }, { "epoch": 2.6286809690839386, "grad_norm": 0.45944729447364807, "learning_rate": 4.606789257279649e-05, "loss": 0.3104, "num_input_tokens_seen": 30762688, "step": 32225 }, { "epoch": 2.629088832694347, "grad_norm": 0.6093338131904602, "learning_rate": 4.606597618864007e-05, "loss": 0.2957, "num_input_tokens_seen": 30766976, "step": 32230 }, { "epoch": 2.6294966963047557, "grad_norm": 0.7520883679389954, "learning_rate": 4.606405937748526e-05, "loss": 0.3697, "num_input_tokens_seen": 30772320, "step": 32235 }, { "epoch": 2.6299045599151643, "grad_norm": 3.684638738632202, "learning_rate": 4.606214213937092e-05, "loss": 0.5769, "num_input_tokens_seen": 30776288, "step": 32240 }, { "epoch": 2.630312423525573, "grad_norm": 1.6331310272216797, "learning_rate": 4.60602244743359e-05, "loss": 0.3972, "num_input_tokens_seen": 30782080, "step": 32245 }, { "epoch": 2.630720287135982, "grad_norm": 0.4205121397972107, "learning_rate": 4.6058306382419095e-05, "loss": 0.3307, "num_input_tokens_seen": 30786896, "step": 32250 }, { "epoch": 2.6311281507463904, "grad_norm": 1.1674748659133911, "learning_rate": 4.6056387863659356e-05, "loss": 0.3626, "num_input_tokens_seen": 30792208, "step": 32255 }, { "epoch": 2.631536014356799, "grad_norm": 0.3792734146118164, "learning_rate": 4.6054468918095586e-05, "loss": 0.3503, "num_input_tokens_seen": 30797328, "step": 32260 }, { "epoch": 2.631943877967208, "grad_norm": 0.25923094153404236, "learning_rate": 4.605254954576668e-05, "loss": 0.3605, "num_input_tokens_seen": 30801968, "step": 32265 }, { "epoch": 2.6323517415776165, "grad_norm": 1.1037479639053345, "learning_rate": 4.605062974671153e-05, "loss": 0.3499, "num_input_tokens_seen": 30807120, "step": 32270 }, { "epoch": 2.632759605188025, "grad_norm": 1.3727885484695435, "learning_rate": 4.604870952096907e-05, "loss": 0.3848, "num_input_tokens_seen": 30811056, "step": 32275 }, { "epoch": 2.6331674687984337, "grad_norm": 0.26739808917045593, "learning_rate": 4.604678886857821e-05, "loss": 0.3362, "num_input_tokens_seen": 30815872, "step": 32280 }, { "epoch": 2.6335753324088422, "grad_norm": 0.34911444783210754, "learning_rate": 4.604486778957788e-05, "loss": 0.3278, "num_input_tokens_seen": 30821936, "step": 32285 }, { "epoch": 2.6339831960192512, "grad_norm": 0.7292138338088989, "learning_rate": 4.604294628400703e-05, "loss": 0.32, "num_input_tokens_seen": 30827472, "step": 32290 }, { "epoch": 2.63439105962966, "grad_norm": 1.462841510772705, "learning_rate": 4.6041024351904596e-05, "loss": 0.4565, "num_input_tokens_seen": 30831776, "step": 32295 }, { "epoch": 2.6347989232400684, "grad_norm": 1.172743320465088, "learning_rate": 4.6039101993309534e-05, "loss": 0.3776, "num_input_tokens_seen": 30837328, "step": 32300 }, { "epoch": 2.6352067868504774, "grad_norm": 0.3771102726459503, "learning_rate": 4.6037179208260814e-05, "loss": 0.3064, "num_input_tokens_seen": 30841808, "step": 32305 }, { "epoch": 2.635614650460886, "grad_norm": 0.7944018840789795, "learning_rate": 4.603525599679742e-05, "loss": 0.4061, "num_input_tokens_seen": 30846992, "step": 32310 }, { "epoch": 2.6360225140712945, "grad_norm": 1.199871301651001, "learning_rate": 4.6033332358958326e-05, "loss": 0.3981, "num_input_tokens_seen": 30851776, "step": 32315 }, { "epoch": 2.636430377681703, "grad_norm": 1.116514801979065, "learning_rate": 4.603140829478251e-05, "loss": 0.3776, "num_input_tokens_seen": 30856672, "step": 32320 }, { "epoch": 2.636838241292112, "grad_norm": 0.4635498821735382, "learning_rate": 4.6029483804308996e-05, "loss": 0.3077, "num_input_tokens_seen": 30861008, "step": 32325 }, { "epoch": 2.6372461049025206, "grad_norm": 0.5749638676643372, "learning_rate": 4.602755888757678e-05, "loss": 0.2538, "num_input_tokens_seen": 30865792, "step": 32330 }, { "epoch": 2.637653968512929, "grad_norm": 0.5629671812057495, "learning_rate": 4.602563354462488e-05, "loss": 0.421, "num_input_tokens_seen": 30870112, "step": 32335 }, { "epoch": 2.638061832123338, "grad_norm": 0.6386051774024963, "learning_rate": 4.602370777549232e-05, "loss": 0.3172, "num_input_tokens_seen": 30876096, "step": 32340 }, { "epoch": 2.638469695733747, "grad_norm": 1.10566246509552, "learning_rate": 4.602178158021814e-05, "loss": 0.2593, "num_input_tokens_seen": 30880912, "step": 32345 }, { "epoch": 2.6388775593441554, "grad_norm": 0.7466370463371277, "learning_rate": 4.6019854958841375e-05, "loss": 0.3131, "num_input_tokens_seen": 30885824, "step": 32350 }, { "epoch": 2.639285422954564, "grad_norm": 0.6035435795783997, "learning_rate": 4.601792791140108e-05, "loss": 0.5048, "num_input_tokens_seen": 30890432, "step": 32355 }, { "epoch": 2.639693286564973, "grad_norm": 0.7388581037521362, "learning_rate": 4.601600043793632e-05, "loss": 0.3179, "num_input_tokens_seen": 30895552, "step": 32360 }, { "epoch": 2.6401011501753815, "grad_norm": 1.0673495531082153, "learning_rate": 4.601407253848617e-05, "loss": 0.3905, "num_input_tokens_seen": 30900928, "step": 32365 }, { "epoch": 2.64050901378579, "grad_norm": 1.062774658203125, "learning_rate": 4.601214421308969e-05, "loss": 0.37, "num_input_tokens_seen": 30905168, "step": 32370 }, { "epoch": 2.6409168773961986, "grad_norm": 1.0290604829788208, "learning_rate": 4.6010215461785974e-05, "loss": 0.3635, "num_input_tokens_seen": 30910240, "step": 32375 }, { "epoch": 2.641324741006607, "grad_norm": 1.1194751262664795, "learning_rate": 4.600828628461413e-05, "loss": 0.3573, "num_input_tokens_seen": 30915584, "step": 32380 }, { "epoch": 2.641732604617016, "grad_norm": 1.1717592477798462, "learning_rate": 4.600635668161324e-05, "loss": 0.3341, "num_input_tokens_seen": 30920992, "step": 32385 }, { "epoch": 2.6421404682274248, "grad_norm": 0.38277116417884827, "learning_rate": 4.600442665282242e-05, "loss": 0.3277, "num_input_tokens_seen": 30925664, "step": 32390 }, { "epoch": 2.6425483318378333, "grad_norm": 0.40840864181518555, "learning_rate": 4.600249619828081e-05, "loss": 0.3755, "num_input_tokens_seen": 30930880, "step": 32395 }, { "epoch": 2.6429561954482423, "grad_norm": 0.4971052408218384, "learning_rate": 4.6000565318027525e-05, "loss": 0.3695, "num_input_tokens_seen": 30935968, "step": 32400 }, { "epoch": 2.643364059058651, "grad_norm": 0.23860527575016022, "learning_rate": 4.5998634012101705e-05, "loss": 0.3527, "num_input_tokens_seen": 30941232, "step": 32405 }, { "epoch": 2.6437719226690595, "grad_norm": 1.3747889995574951, "learning_rate": 4.5996702280542494e-05, "loss": 0.3819, "num_input_tokens_seen": 30945664, "step": 32410 }, { "epoch": 2.644179786279468, "grad_norm": 0.9864550232887268, "learning_rate": 4.5994770123389045e-05, "loss": 0.3601, "num_input_tokens_seen": 30950160, "step": 32415 }, { "epoch": 2.6445876498898766, "grad_norm": 0.2716001272201538, "learning_rate": 4.599283754068053e-05, "loss": 0.3453, "num_input_tokens_seen": 30955696, "step": 32420 }, { "epoch": 2.6449955135002856, "grad_norm": 1.096826434135437, "learning_rate": 4.5990904532456117e-05, "loss": 0.3556, "num_input_tokens_seen": 30960608, "step": 32425 }, { "epoch": 2.645403377110694, "grad_norm": 1.1812288761138916, "learning_rate": 4.5988971098755e-05, "loss": 0.3465, "num_input_tokens_seen": 30965296, "step": 32430 }, { "epoch": 2.6458112407211027, "grad_norm": 0.5364109873771667, "learning_rate": 4.598703723961634e-05, "loss": 0.3315, "num_input_tokens_seen": 30969760, "step": 32435 }, { "epoch": 2.6462191043315118, "grad_norm": 1.5715845823287964, "learning_rate": 4.5985102955079366e-05, "loss": 0.3456, "num_input_tokens_seen": 30975040, "step": 32440 }, { "epoch": 2.6466269679419203, "grad_norm": 0.5071752667427063, "learning_rate": 4.5983168245183264e-05, "loss": 0.3611, "num_input_tokens_seen": 30979504, "step": 32445 }, { "epoch": 2.647034831552329, "grad_norm": 0.8018231987953186, "learning_rate": 4.598123310996726e-05, "loss": 0.3312, "num_input_tokens_seen": 30984976, "step": 32450 }, { "epoch": 2.6474426951627374, "grad_norm": 1.0358368158340454, "learning_rate": 4.597929754947058e-05, "loss": 0.31, "num_input_tokens_seen": 30989232, "step": 32455 }, { "epoch": 2.647850558773146, "grad_norm": 1.494948148727417, "learning_rate": 4.597736156373245e-05, "loss": 0.3773, "num_input_tokens_seen": 30993456, "step": 32460 }, { "epoch": 2.648258422383555, "grad_norm": 1.2587406635284424, "learning_rate": 4.5975425152792115e-05, "loss": 0.3873, "num_input_tokens_seen": 30998432, "step": 32465 }, { "epoch": 2.6486662859939636, "grad_norm": 1.3839523792266846, "learning_rate": 4.597348831668883e-05, "loss": 0.3071, "num_input_tokens_seen": 31002976, "step": 32470 }, { "epoch": 2.649074149604372, "grad_norm": 0.6628363728523254, "learning_rate": 4.5971551055461845e-05, "loss": 0.4175, "num_input_tokens_seen": 31007664, "step": 32475 }, { "epoch": 2.649482013214781, "grad_norm": 1.9386088848114014, "learning_rate": 4.596961336915043e-05, "loss": 0.3606, "num_input_tokens_seen": 31011440, "step": 32480 }, { "epoch": 2.6498898768251897, "grad_norm": 1.6243951320648193, "learning_rate": 4.596767525779386e-05, "loss": 0.3424, "num_input_tokens_seen": 31016496, "step": 32485 }, { "epoch": 2.6502977404355983, "grad_norm": 0.7505528330802917, "learning_rate": 4.596573672143143e-05, "loss": 0.3631, "num_input_tokens_seen": 31021824, "step": 32490 }, { "epoch": 2.650705604046007, "grad_norm": 1.2673282623291016, "learning_rate": 4.5963797760102424e-05, "loss": 0.3417, "num_input_tokens_seen": 31026016, "step": 32495 }, { "epoch": 2.6511134676564154, "grad_norm": 1.0942597389221191, "learning_rate": 4.5961858373846146e-05, "loss": 0.3546, "num_input_tokens_seen": 31031104, "step": 32500 }, { "epoch": 2.6515213312668244, "grad_norm": 1.0623670816421509, "learning_rate": 4.59599185627019e-05, "loss": 0.3529, "num_input_tokens_seen": 31036640, "step": 32505 }, { "epoch": 2.651929194877233, "grad_norm": 0.944590151309967, "learning_rate": 4.595797832670902e-05, "loss": 0.3233, "num_input_tokens_seen": 31041504, "step": 32510 }, { "epoch": 2.6523370584876416, "grad_norm": 0.923453688621521, "learning_rate": 4.5956037665906816e-05, "loss": 0.339, "num_input_tokens_seen": 31046064, "step": 32515 }, { "epoch": 2.6527449220980506, "grad_norm": 0.7228160500526428, "learning_rate": 4.595409658033464e-05, "loss": 0.2215, "num_input_tokens_seen": 31050384, "step": 32520 }, { "epoch": 2.653152785708459, "grad_norm": 1.8185063600540161, "learning_rate": 4.5952155070031824e-05, "loss": 0.491, "num_input_tokens_seen": 31054800, "step": 32525 }, { "epoch": 2.6535606493188677, "grad_norm": 0.9482154846191406, "learning_rate": 4.595021313503773e-05, "loss": 0.5319, "num_input_tokens_seen": 31060016, "step": 32530 }, { "epoch": 2.6539685129292767, "grad_norm": 0.6765255331993103, "learning_rate": 4.594827077539172e-05, "loss": 0.3604, "num_input_tokens_seen": 31065312, "step": 32535 }, { "epoch": 2.6543763765396853, "grad_norm": 0.6775289177894592, "learning_rate": 4.594632799113316e-05, "loss": 0.2683, "num_input_tokens_seen": 31070720, "step": 32540 }, { "epoch": 2.654784240150094, "grad_norm": 0.558657705783844, "learning_rate": 4.594438478230143e-05, "loss": 0.3834, "num_input_tokens_seen": 31075616, "step": 32545 }, { "epoch": 2.6551921037605024, "grad_norm": 1.1509628295898438, "learning_rate": 4.5942441148935925e-05, "loss": 0.3265, "num_input_tokens_seen": 31080288, "step": 32550 }, { "epoch": 2.655599967370911, "grad_norm": 1.1795300245285034, "learning_rate": 4.594049709107604e-05, "loss": 0.3316, "num_input_tokens_seen": 31085392, "step": 32555 }, { "epoch": 2.65600783098132, "grad_norm": 0.9261160492897034, "learning_rate": 4.593855260876118e-05, "loss": 0.3733, "num_input_tokens_seen": 31089488, "step": 32560 }, { "epoch": 2.6564156945917285, "grad_norm": 0.8084679841995239, "learning_rate": 4.593660770203074e-05, "loss": 0.2465, "num_input_tokens_seen": 31095088, "step": 32565 }, { "epoch": 2.656823558202137, "grad_norm": 3.1491732597351074, "learning_rate": 4.593466237092417e-05, "loss": 0.3185, "num_input_tokens_seen": 31099232, "step": 32570 }, { "epoch": 2.657231421812546, "grad_norm": 1.605055809020996, "learning_rate": 4.5932716615480884e-05, "loss": 0.4661, "num_input_tokens_seen": 31104160, "step": 32575 }, { "epoch": 2.6576392854229547, "grad_norm": 0.8035802245140076, "learning_rate": 4.5930770435740333e-05, "loss": 0.3255, "num_input_tokens_seen": 31109360, "step": 32580 }, { "epoch": 2.6580471490333633, "grad_norm": 4.361819744110107, "learning_rate": 4.592882383174196e-05, "loss": 0.2979, "num_input_tokens_seen": 31113888, "step": 32585 }, { "epoch": 2.658455012643772, "grad_norm": 1.3304717540740967, "learning_rate": 4.592687680352521e-05, "loss": 0.3273, "num_input_tokens_seen": 31119120, "step": 32590 }, { "epoch": 2.6588628762541804, "grad_norm": 2.512793779373169, "learning_rate": 4.592492935112957e-05, "loss": 0.4191, "num_input_tokens_seen": 31123808, "step": 32595 }, { "epoch": 2.6592707398645894, "grad_norm": 0.4913718104362488, "learning_rate": 4.592298147459451e-05, "loss": 0.2884, "num_input_tokens_seen": 31128384, "step": 32600 }, { "epoch": 2.659678603474998, "grad_norm": 1.2310099601745605, "learning_rate": 4.5921033173959486e-05, "loss": 0.296, "num_input_tokens_seen": 31132080, "step": 32605 }, { "epoch": 2.6600864670854065, "grad_norm": 1.2791664600372314, "learning_rate": 4.591908444926403e-05, "loss": 0.2588, "num_input_tokens_seen": 31136912, "step": 32610 }, { "epoch": 2.6604943306958155, "grad_norm": 0.48809197545051575, "learning_rate": 4.591713530054761e-05, "loss": 0.2113, "num_input_tokens_seen": 31141488, "step": 32615 }, { "epoch": 2.660902194306224, "grad_norm": 5.609803676605225, "learning_rate": 4.591518572784974e-05, "loss": 0.2786, "num_input_tokens_seen": 31145728, "step": 32620 }, { "epoch": 2.6613100579166327, "grad_norm": 1.1765464544296265, "learning_rate": 4.5913235731209955e-05, "loss": 0.3342, "num_input_tokens_seen": 31150096, "step": 32625 }, { "epoch": 2.6617179215270412, "grad_norm": 1.5980753898620605, "learning_rate": 4.591128531066776e-05, "loss": 0.2163, "num_input_tokens_seen": 31155232, "step": 32630 }, { "epoch": 2.66212578513745, "grad_norm": 2.1313107013702393, "learning_rate": 4.59093344662627e-05, "loss": 0.4472, "num_input_tokens_seen": 31160352, "step": 32635 }, { "epoch": 2.662533648747859, "grad_norm": 1.3616600036621094, "learning_rate": 4.590738319803432e-05, "loss": 0.369, "num_input_tokens_seen": 31165696, "step": 32640 }, { "epoch": 2.6629415123582674, "grad_norm": 1.0436521768569946, "learning_rate": 4.590543150602216e-05, "loss": 0.2824, "num_input_tokens_seen": 31169776, "step": 32645 }, { "epoch": 2.663349375968676, "grad_norm": 1.5991528034210205, "learning_rate": 4.590347939026579e-05, "loss": 0.3223, "num_input_tokens_seen": 31174112, "step": 32650 }, { "epoch": 2.663757239579085, "grad_norm": 6.999592304229736, "learning_rate": 4.590152685080478e-05, "loss": 0.4168, "num_input_tokens_seen": 31179456, "step": 32655 }, { "epoch": 2.6641651031894935, "grad_norm": 0.5937395095825195, "learning_rate": 4.5899573887678694e-05, "loss": 0.3922, "num_input_tokens_seen": 31184336, "step": 32660 }, { "epoch": 2.664572966799902, "grad_norm": 1.357299566268921, "learning_rate": 4.589762050092713e-05, "loss": 0.387, "num_input_tokens_seen": 31187920, "step": 32665 }, { "epoch": 2.6649808304103106, "grad_norm": 0.40338000655174255, "learning_rate": 4.589566669058968e-05, "loss": 0.3968, "num_input_tokens_seen": 31192896, "step": 32670 }, { "epoch": 2.665388694020719, "grad_norm": 1.1476515531539917, "learning_rate": 4.589371245670594e-05, "loss": 0.3388, "num_input_tokens_seen": 31197376, "step": 32675 }, { "epoch": 2.665796557631128, "grad_norm": 0.7796134352684021, "learning_rate": 4.5891757799315525e-05, "loss": 0.2765, "num_input_tokens_seen": 31203024, "step": 32680 }, { "epoch": 2.666204421241537, "grad_norm": 1.5180050134658813, "learning_rate": 4.588980271845806e-05, "loss": 0.3747, "num_input_tokens_seen": 31207888, "step": 32685 }, { "epoch": 2.6666122848519453, "grad_norm": 1.8991174697875977, "learning_rate": 4.5887847214173175e-05, "loss": 0.3637, "num_input_tokens_seen": 31212896, "step": 32690 }, { "epoch": 2.6670201484623544, "grad_norm": 1.0057876110076904, "learning_rate": 4.58858912865005e-05, "loss": 0.4312, "num_input_tokens_seen": 31218496, "step": 32695 }, { "epoch": 2.667428012072763, "grad_norm": 1.953548789024353, "learning_rate": 4.5883934935479676e-05, "loss": 0.5613, "num_input_tokens_seen": 31223200, "step": 32700 }, { "epoch": 2.6678358756831715, "grad_norm": 0.38621988892555237, "learning_rate": 4.588197816115038e-05, "loss": 0.3678, "num_input_tokens_seen": 31227296, "step": 32705 }, { "epoch": 2.6682437392935805, "grad_norm": 1.720987319946289, "learning_rate": 4.588002096355225e-05, "loss": 0.4051, "num_input_tokens_seen": 31232112, "step": 32710 }, { "epoch": 2.668651602903989, "grad_norm": 0.8208847045898438, "learning_rate": 4.5878063342724976e-05, "loss": 0.3286, "num_input_tokens_seen": 31236832, "step": 32715 }, { "epoch": 2.6690594665143976, "grad_norm": 9.085817337036133, "learning_rate": 4.5876105298708215e-05, "loss": 0.4289, "num_input_tokens_seen": 31241584, "step": 32720 }, { "epoch": 2.669467330124806, "grad_norm": 0.8430898785591125, "learning_rate": 4.587414683154168e-05, "loss": 0.3656, "num_input_tokens_seen": 31247168, "step": 32725 }, { "epoch": 2.6698751937352148, "grad_norm": 0.6545255184173584, "learning_rate": 4.587218794126505e-05, "loss": 0.3682, "num_input_tokens_seen": 31252672, "step": 32730 }, { "epoch": 2.6702830573456238, "grad_norm": 0.4852977395057678, "learning_rate": 4.5870228627918056e-05, "loss": 0.3531, "num_input_tokens_seen": 31256544, "step": 32735 }, { "epoch": 2.6706909209560323, "grad_norm": 0.5668959617614746, "learning_rate": 4.586826889154039e-05, "loss": 0.3202, "num_input_tokens_seen": 31261504, "step": 32740 }, { "epoch": 2.671098784566441, "grad_norm": 1.0594831705093384, "learning_rate": 4.5866308732171776e-05, "loss": 0.2701, "num_input_tokens_seen": 31265808, "step": 32745 }, { "epoch": 2.67150664817685, "grad_norm": 3.36501407623291, "learning_rate": 4.586434814985195e-05, "loss": 0.3022, "num_input_tokens_seen": 31270864, "step": 32750 }, { "epoch": 2.6719145117872585, "grad_norm": 3.40690279006958, "learning_rate": 4.586238714462066e-05, "loss": 0.3979, "num_input_tokens_seen": 31275216, "step": 32755 }, { "epoch": 2.672322375397667, "grad_norm": 22.048961639404297, "learning_rate": 4.586042571651764e-05, "loss": 0.3254, "num_input_tokens_seen": 31280192, "step": 32760 }, { "epoch": 2.6727302390080756, "grad_norm": 2.009488344192505, "learning_rate": 4.585846386558266e-05, "loss": 0.3789, "num_input_tokens_seen": 31285184, "step": 32765 }, { "epoch": 2.673138102618484, "grad_norm": 5.666032791137695, "learning_rate": 4.585650159185547e-05, "loss": 0.2838, "num_input_tokens_seen": 31290224, "step": 32770 }, { "epoch": 2.673545966228893, "grad_norm": 1.4762136936187744, "learning_rate": 4.585453889537587e-05, "loss": 0.3323, "num_input_tokens_seen": 31295600, "step": 32775 }, { "epoch": 2.6739538298393017, "grad_norm": 1.1867001056671143, "learning_rate": 4.585257577618363e-05, "loss": 0.2313, "num_input_tokens_seen": 31300176, "step": 32780 }, { "epoch": 2.6743616934497103, "grad_norm": 3.0366451740264893, "learning_rate": 4.585061223431852e-05, "loss": 0.3333, "num_input_tokens_seen": 31304560, "step": 32785 }, { "epoch": 2.6747695570601193, "grad_norm": 2.389695167541504, "learning_rate": 4.584864826982037e-05, "loss": 0.361, "num_input_tokens_seen": 31309984, "step": 32790 }, { "epoch": 2.675177420670528, "grad_norm": 0.3709377944469452, "learning_rate": 4.5846683882728985e-05, "loss": 0.1856, "num_input_tokens_seen": 31313072, "step": 32795 }, { "epoch": 2.6755852842809364, "grad_norm": 4.478715896606445, "learning_rate": 4.5844719073084175e-05, "loss": 0.2751, "num_input_tokens_seen": 31318880, "step": 32800 }, { "epoch": 2.675993147891345, "grad_norm": 0.8181533813476562, "learning_rate": 4.5842753840925764e-05, "loss": 0.4223, "num_input_tokens_seen": 31323424, "step": 32805 }, { "epoch": 2.6764010115017536, "grad_norm": 8.075021743774414, "learning_rate": 4.5840788186293584e-05, "loss": 0.2043, "num_input_tokens_seen": 31329040, "step": 32810 }, { "epoch": 2.6768088751121626, "grad_norm": 2.594733238220215, "learning_rate": 4.5838822109227486e-05, "loss": 0.4932, "num_input_tokens_seen": 31334112, "step": 32815 }, { "epoch": 2.677216738722571, "grad_norm": 1.8783091306686401, "learning_rate": 4.583685560976733e-05, "loss": 0.3235, "num_input_tokens_seen": 31338544, "step": 32820 }, { "epoch": 2.6776246023329797, "grad_norm": 1.3235400915145874, "learning_rate": 4.583488868795295e-05, "loss": 0.2076, "num_input_tokens_seen": 31342752, "step": 32825 }, { "epoch": 2.6780324659433887, "grad_norm": 1.8725850582122803, "learning_rate": 4.583292134382423e-05, "loss": 0.3893, "num_input_tokens_seen": 31348224, "step": 32830 }, { "epoch": 2.6784403295537973, "grad_norm": 2.176744222640991, "learning_rate": 4.583095357742105e-05, "loss": 0.3123, "num_input_tokens_seen": 31352768, "step": 32835 }, { "epoch": 2.678848193164206, "grad_norm": 2.0189900398254395, "learning_rate": 4.5828985388783295e-05, "loss": 0.367, "num_input_tokens_seen": 31358224, "step": 32840 }, { "epoch": 2.6792560567746144, "grad_norm": 1.6054329872131348, "learning_rate": 4.582701677795086e-05, "loss": 0.2645, "num_input_tokens_seen": 31363008, "step": 32845 }, { "epoch": 2.679663920385023, "grad_norm": 4.410376071929932, "learning_rate": 4.5825047744963645e-05, "loss": 0.318, "num_input_tokens_seen": 31367712, "step": 32850 }, { "epoch": 2.680071783995432, "grad_norm": 4.882238864898682, "learning_rate": 4.582307828986155e-05, "loss": 0.5762, "num_input_tokens_seen": 31372432, "step": 32855 }, { "epoch": 2.6804796476058406, "grad_norm": 7.112086772918701, "learning_rate": 4.5821108412684524e-05, "loss": 0.3935, "num_input_tokens_seen": 31378128, "step": 32860 }, { "epoch": 2.680887511216249, "grad_norm": 0.8200653195381165, "learning_rate": 4.5819138113472464e-05, "loss": 0.353, "num_input_tokens_seen": 31383232, "step": 32865 }, { "epoch": 2.681295374826658, "grad_norm": 3.529209613800049, "learning_rate": 4.5817167392265324e-05, "loss": 0.4653, "num_input_tokens_seen": 31388000, "step": 32870 }, { "epoch": 2.6817032384370667, "grad_norm": 96.51617431640625, "learning_rate": 4.581519624910305e-05, "loss": 0.359, "num_input_tokens_seen": 31392640, "step": 32875 }, { "epoch": 2.6821111020474753, "grad_norm": 1.3705404996871948, "learning_rate": 4.581322468402559e-05, "loss": 0.2487, "num_input_tokens_seen": 31397248, "step": 32880 }, { "epoch": 2.6825189656578843, "grad_norm": 0.9238986968994141, "learning_rate": 4.581125269707291e-05, "loss": 0.3893, "num_input_tokens_seen": 31402112, "step": 32885 }, { "epoch": 2.682926829268293, "grad_norm": 7.468413352966309, "learning_rate": 4.580928028828498e-05, "loss": 0.2445, "num_input_tokens_seen": 31407296, "step": 32890 }, { "epoch": 2.6833346928787014, "grad_norm": 2.5541133880615234, "learning_rate": 4.580730745770178e-05, "loss": 0.1654, "num_input_tokens_seen": 31411776, "step": 32895 }, { "epoch": 2.68374255648911, "grad_norm": 0.6042171716690063, "learning_rate": 4.580533420536331e-05, "loss": 0.4499, "num_input_tokens_seen": 31416352, "step": 32900 }, { "epoch": 2.6841504200995185, "grad_norm": 2.1475467681884766, "learning_rate": 4.580336053130954e-05, "loss": 0.3357, "num_input_tokens_seen": 31421808, "step": 32905 }, { "epoch": 2.6845582837099276, "grad_norm": 15.137677192687988, "learning_rate": 4.58013864355805e-05, "loss": 0.2344, "num_input_tokens_seen": 31426960, "step": 32910 }, { "epoch": 2.684966147320336, "grad_norm": 34.65060043334961, "learning_rate": 4.57994119182162e-05, "loss": 0.6742, "num_input_tokens_seen": 31432176, "step": 32915 }, { "epoch": 2.6853740109307447, "grad_norm": 8.413434028625488, "learning_rate": 4.5797436979256655e-05, "loss": 0.5197, "num_input_tokens_seen": 31437408, "step": 32920 }, { "epoch": 2.6857818745411537, "grad_norm": 3.0526835918426514, "learning_rate": 4.57954616187419e-05, "loss": 0.3915, "num_input_tokens_seen": 31442656, "step": 32925 }, { "epoch": 2.6861897381515623, "grad_norm": 2.6921675205230713, "learning_rate": 4.5793485836711966e-05, "loss": 0.3012, "num_input_tokens_seen": 31447680, "step": 32930 }, { "epoch": 2.686597601761971, "grad_norm": 2.9740512371063232, "learning_rate": 4.579150963320692e-05, "loss": 0.5331, "num_input_tokens_seen": 31452320, "step": 32935 }, { "epoch": 2.6870054653723794, "grad_norm": 8.33051872253418, "learning_rate": 4.578953300826681e-05, "loss": 0.3509, "num_input_tokens_seen": 31457792, "step": 32940 }, { "epoch": 2.687413328982788, "grad_norm": 2.8125154972076416, "learning_rate": 4.578755596193169e-05, "loss": 0.3393, "num_input_tokens_seen": 31463168, "step": 32945 }, { "epoch": 2.687821192593197, "grad_norm": 1.3642147779464722, "learning_rate": 4.5785578494241655e-05, "loss": 0.2194, "num_input_tokens_seen": 31467424, "step": 32950 }, { "epoch": 2.6882290562036055, "grad_norm": 60.439876556396484, "learning_rate": 4.5783600605236774e-05, "loss": 0.5654, "num_input_tokens_seen": 31471312, "step": 32955 }, { "epoch": 2.688636919814014, "grad_norm": 2.410526990890503, "learning_rate": 4.5781622294957136e-05, "loss": 0.4081, "num_input_tokens_seen": 31476160, "step": 32960 }, { "epoch": 2.689044783424423, "grad_norm": 11.614025115966797, "learning_rate": 4.577964356344284e-05, "loss": 0.3361, "num_input_tokens_seen": 31480352, "step": 32965 }, { "epoch": 2.6894526470348317, "grad_norm": 2.243173599243164, "learning_rate": 4.577766441073401e-05, "loss": 0.3657, "num_input_tokens_seen": 31485504, "step": 32970 }, { "epoch": 2.6898605106452402, "grad_norm": 20.807802200317383, "learning_rate": 4.5775684836870746e-05, "loss": 0.3665, "num_input_tokens_seen": 31490592, "step": 32975 }, { "epoch": 2.690268374255649, "grad_norm": 4.199598789215088, "learning_rate": 4.577370484189318e-05, "loss": 0.3569, "num_input_tokens_seen": 31495008, "step": 32980 }, { "epoch": 2.6906762378660574, "grad_norm": 3.9723193645477295, "learning_rate": 4.5771724425841443e-05, "loss": 0.2888, "num_input_tokens_seen": 31499664, "step": 32985 }, { "epoch": 2.6910841014764664, "grad_norm": 1.4767229557037354, "learning_rate": 4.5769743588755675e-05, "loss": 0.3942, "num_input_tokens_seen": 31504944, "step": 32990 }, { "epoch": 2.691491965086875, "grad_norm": 1.770972728729248, "learning_rate": 4.5767762330676033e-05, "loss": 0.3322, "num_input_tokens_seen": 31509744, "step": 32995 }, { "epoch": 2.6918998286972835, "grad_norm": 18.72885513305664, "learning_rate": 4.5765780651642674e-05, "loss": 0.3953, "num_input_tokens_seen": 31514800, "step": 33000 }, { "epoch": 2.6923076923076925, "grad_norm": 2.24310040473938, "learning_rate": 4.576379855169577e-05, "loss": 0.4285, "num_input_tokens_seen": 31520256, "step": 33005 }, { "epoch": 2.692715555918101, "grad_norm": 6.12582540512085, "learning_rate": 4.576181603087548e-05, "loss": 0.3582, "num_input_tokens_seen": 31525200, "step": 33010 }, { "epoch": 2.6931234195285096, "grad_norm": 2.757840156555176, "learning_rate": 4.575983308922202e-05, "loss": 0.2534, "num_input_tokens_seen": 31530832, "step": 33015 }, { "epoch": 2.693531283138918, "grad_norm": 0.765516459941864, "learning_rate": 4.575784972677555e-05, "loss": 0.3633, "num_input_tokens_seen": 31535712, "step": 33020 }, { "epoch": 2.6939391467493268, "grad_norm": 6.035315990447998, "learning_rate": 4.575586594357628e-05, "loss": 0.4568, "num_input_tokens_seen": 31540256, "step": 33025 }, { "epoch": 2.694347010359736, "grad_norm": 5.615767002105713, "learning_rate": 4.575388173966444e-05, "loss": 0.7887, "num_input_tokens_seen": 31544256, "step": 33030 }, { "epoch": 2.6947548739701443, "grad_norm": 7.04794454574585, "learning_rate": 4.575189711508023e-05, "loss": 0.4439, "num_input_tokens_seen": 31548928, "step": 33035 }, { "epoch": 2.695162737580553, "grad_norm": 2.577148914337158, "learning_rate": 4.574991206986389e-05, "loss": 0.5287, "num_input_tokens_seen": 31553776, "step": 33040 }, { "epoch": 2.695570601190962, "grad_norm": 1.7350026369094849, "learning_rate": 4.5747926604055645e-05, "loss": 0.3695, "num_input_tokens_seen": 31558848, "step": 33045 }, { "epoch": 2.6959784648013705, "grad_norm": 2.3868532180786133, "learning_rate": 4.574594071769575e-05, "loss": 0.4033, "num_input_tokens_seen": 31564112, "step": 33050 }, { "epoch": 2.696386328411779, "grad_norm": 1.3790934085845947, "learning_rate": 4.574395441082444e-05, "loss": 0.5212, "num_input_tokens_seen": 31569408, "step": 33055 }, { "epoch": 2.6967941920221876, "grad_norm": 1.5537382364273071, "learning_rate": 4.5741967683482e-05, "loss": 0.4622, "num_input_tokens_seen": 31573760, "step": 33060 }, { "epoch": 2.697202055632596, "grad_norm": 1.990342378616333, "learning_rate": 4.573998053570868e-05, "loss": 0.3052, "num_input_tokens_seen": 31578544, "step": 33065 }, { "epoch": 2.697609919243005, "grad_norm": 1.8910542726516724, "learning_rate": 4.5737992967544776e-05, "loss": 0.2759, "num_input_tokens_seen": 31583552, "step": 33070 }, { "epoch": 2.6980177828534138, "grad_norm": 0.6429563164710999, "learning_rate": 4.573600497903056e-05, "loss": 0.282, "num_input_tokens_seen": 31588272, "step": 33075 }, { "epoch": 2.6984256464638223, "grad_norm": 7.626622676849365, "learning_rate": 4.573401657020634e-05, "loss": 0.7345, "num_input_tokens_seen": 31592960, "step": 33080 }, { "epoch": 2.6988335100742313, "grad_norm": 4.105254650115967, "learning_rate": 4.573202774111241e-05, "loss": 0.3851, "num_input_tokens_seen": 31596608, "step": 33085 }, { "epoch": 2.69924137368464, "grad_norm": 8.289080619812012, "learning_rate": 4.573003849178909e-05, "loss": 0.5089, "num_input_tokens_seen": 31601632, "step": 33090 }, { "epoch": 2.6996492372950485, "grad_norm": 3.914741277694702, "learning_rate": 4.572804882227669e-05, "loss": 0.3392, "num_input_tokens_seen": 31606704, "step": 33095 }, { "epoch": 2.7000571009054575, "grad_norm": 1.6529382467269897, "learning_rate": 4.572605873261556e-05, "loss": 0.3666, "num_input_tokens_seen": 31611568, "step": 33100 }, { "epoch": 2.700464964515866, "grad_norm": 2.0847506523132324, "learning_rate": 4.5724068222846024e-05, "loss": 0.3695, "num_input_tokens_seen": 31616608, "step": 33105 }, { "epoch": 2.7008728281262746, "grad_norm": 2.840426206588745, "learning_rate": 4.572207729300843e-05, "loss": 0.3833, "num_input_tokens_seen": 31622368, "step": 33110 }, { "epoch": 2.701280691736683, "grad_norm": 1.7540369033813477, "learning_rate": 4.5720085943143135e-05, "loss": 0.3423, "num_input_tokens_seen": 31627536, "step": 33115 }, { "epoch": 2.7016885553470917, "grad_norm": 3.3820791244506836, "learning_rate": 4.57180941732905e-05, "loss": 0.3337, "num_input_tokens_seen": 31632624, "step": 33120 }, { "epoch": 2.7020964189575007, "grad_norm": 5.95564079284668, "learning_rate": 4.57161019834909e-05, "loss": 0.3591, "num_input_tokens_seen": 31637296, "step": 33125 }, { "epoch": 2.7025042825679093, "grad_norm": 1.9041162729263306, "learning_rate": 4.571410937378472e-05, "loss": 0.2897, "num_input_tokens_seen": 31642128, "step": 33130 }, { "epoch": 2.702912146178318, "grad_norm": 2.302772283554077, "learning_rate": 4.571211634421234e-05, "loss": 0.5321, "num_input_tokens_seen": 31647104, "step": 33135 }, { "epoch": 2.703320009788727, "grad_norm": 6.069492340087891, "learning_rate": 4.5710122894814165e-05, "loss": 0.3928, "num_input_tokens_seen": 31652816, "step": 33140 }, { "epoch": 2.7037278733991355, "grad_norm": 1.4144740104675293, "learning_rate": 4.57081290256306e-05, "loss": 0.3452, "num_input_tokens_seen": 31657248, "step": 33145 }, { "epoch": 2.704135737009544, "grad_norm": 2.5086565017700195, "learning_rate": 4.570613473670205e-05, "loss": 0.6317, "num_input_tokens_seen": 31662256, "step": 33150 }, { "epoch": 2.7045436006199526, "grad_norm": 76.96359252929688, "learning_rate": 4.570414002806896e-05, "loss": 0.6709, "num_input_tokens_seen": 31667600, "step": 33155 }, { "epoch": 2.704951464230361, "grad_norm": 2.5444748401641846, "learning_rate": 4.570214489977175e-05, "loss": 0.3709, "num_input_tokens_seen": 31672736, "step": 33160 }, { "epoch": 2.70535932784077, "grad_norm": 1.5688670873641968, "learning_rate": 4.570014935185085e-05, "loss": 0.3183, "num_input_tokens_seen": 31677648, "step": 33165 }, { "epoch": 2.7057671914511787, "grad_norm": 1.715634822845459, "learning_rate": 4.569815338434672e-05, "loss": 0.4223, "num_input_tokens_seen": 31682256, "step": 33170 }, { "epoch": 2.7061750550615873, "grad_norm": 0.8006486892700195, "learning_rate": 4.569615699729982e-05, "loss": 0.4788, "num_input_tokens_seen": 31686592, "step": 33175 }, { "epoch": 2.7065829186719963, "grad_norm": 1.9002019166946411, "learning_rate": 4.569416019075061e-05, "loss": 0.3449, "num_input_tokens_seen": 31690704, "step": 33180 }, { "epoch": 2.706990782282405, "grad_norm": 0.46172991394996643, "learning_rate": 4.569216296473957e-05, "loss": 0.3439, "num_input_tokens_seen": 31694768, "step": 33185 }, { "epoch": 2.7073986458928134, "grad_norm": 28.629526138305664, "learning_rate": 4.569016531930718e-05, "loss": 0.3773, "num_input_tokens_seen": 31699232, "step": 33190 }, { "epoch": 2.707806509503222, "grad_norm": 0.9368143677711487, "learning_rate": 4.5688167254493926e-05, "loss": 0.3476, "num_input_tokens_seen": 31703376, "step": 33195 }, { "epoch": 2.7082143731136306, "grad_norm": 0.9407109022140503, "learning_rate": 4.568616877034031e-05, "loss": 0.3608, "num_input_tokens_seen": 31709088, "step": 33200 }, { "epoch": 2.7086222367240396, "grad_norm": 1.5253514051437378, "learning_rate": 4.568416986688685e-05, "loss": 0.3223, "num_input_tokens_seen": 31713360, "step": 33205 }, { "epoch": 2.709030100334448, "grad_norm": 0.7335667610168457, "learning_rate": 4.568217054417405e-05, "loss": 0.3633, "num_input_tokens_seen": 31718448, "step": 33210 }, { "epoch": 2.7094379639448567, "grad_norm": 3.380143165588379, "learning_rate": 4.568017080224245e-05, "loss": 0.3997, "num_input_tokens_seen": 31723664, "step": 33215 }, { "epoch": 2.7098458275552657, "grad_norm": 2.57804536819458, "learning_rate": 4.567817064113257e-05, "loss": 0.3444, "num_input_tokens_seen": 31728560, "step": 33220 }, { "epoch": 2.7102536911656743, "grad_norm": 0.7786710858345032, "learning_rate": 4.567617006088496e-05, "loss": 0.3713, "num_input_tokens_seen": 31733776, "step": 33225 }, { "epoch": 2.710661554776083, "grad_norm": 1.1029654741287231, "learning_rate": 4.567416906154016e-05, "loss": 0.374, "num_input_tokens_seen": 31739184, "step": 33230 }, { "epoch": 2.7110694183864914, "grad_norm": 3.531503915786743, "learning_rate": 4.567216764313875e-05, "loss": 0.3554, "num_input_tokens_seen": 31743984, "step": 33235 }, { "epoch": 2.7114772819969, "grad_norm": 1.5633193254470825, "learning_rate": 4.567016580572128e-05, "loss": 0.367, "num_input_tokens_seen": 31749296, "step": 33240 }, { "epoch": 2.711885145607309, "grad_norm": 1.4344333410263062, "learning_rate": 4.5668163549328334e-05, "loss": 0.3324, "num_input_tokens_seen": 31754944, "step": 33245 }, { "epoch": 2.7122930092177175, "grad_norm": 1.4352608919143677, "learning_rate": 4.5666160874000495e-05, "loss": 0.3818, "num_input_tokens_seen": 31759440, "step": 33250 }, { "epoch": 2.712700872828126, "grad_norm": 0.7346424460411072, "learning_rate": 4.566415777977835e-05, "loss": 0.2876, "num_input_tokens_seen": 31763664, "step": 33255 }, { "epoch": 2.713108736438535, "grad_norm": 2.297327995300293, "learning_rate": 4.566215426670252e-05, "loss": 0.3736, "num_input_tokens_seen": 31768256, "step": 33260 }, { "epoch": 2.7135166000489437, "grad_norm": 1.4125471115112305, "learning_rate": 4.5660150334813595e-05, "loss": 0.3631, "num_input_tokens_seen": 31773360, "step": 33265 }, { "epoch": 2.7139244636593522, "grad_norm": 0.5963151454925537, "learning_rate": 4.56581459841522e-05, "loss": 0.4236, "num_input_tokens_seen": 31777648, "step": 33270 }, { "epoch": 2.7143323272697613, "grad_norm": 39.259788513183594, "learning_rate": 4.565614121475897e-05, "loss": 0.3534, "num_input_tokens_seen": 31782656, "step": 33275 }, { "epoch": 2.71474019088017, "grad_norm": 0.4074089825153351, "learning_rate": 4.565413602667453e-05, "loss": 0.3218, "num_input_tokens_seen": 31787648, "step": 33280 }, { "epoch": 2.7151480544905784, "grad_norm": 0.7991005778312683, "learning_rate": 4.565213041993953e-05, "loss": 0.3824, "num_input_tokens_seen": 31792336, "step": 33285 }, { "epoch": 2.715555918100987, "grad_norm": 1.5794665813446045, "learning_rate": 4.5650124394594626e-05, "loss": 0.3951, "num_input_tokens_seen": 31796272, "step": 33290 }, { "epoch": 2.7159637817113955, "grad_norm": 3.040189504623413, "learning_rate": 4.564811795068048e-05, "loss": 0.3387, "num_input_tokens_seen": 31801104, "step": 33295 }, { "epoch": 2.7163716453218045, "grad_norm": 2.3556900024414062, "learning_rate": 4.564611108823774e-05, "loss": 0.5474, "num_input_tokens_seen": 31806624, "step": 33300 }, { "epoch": 2.716779508932213, "grad_norm": 0.379311740398407, "learning_rate": 4.5644103807307116e-05, "loss": 0.3743, "num_input_tokens_seen": 31810784, "step": 33305 }, { "epoch": 2.7171873725426217, "grad_norm": 0.8469133377075195, "learning_rate": 4.564209610792928e-05, "loss": 0.3627, "num_input_tokens_seen": 31814960, "step": 33310 }, { "epoch": 2.7175952361530307, "grad_norm": 1.874826431274414, "learning_rate": 4.564008799014492e-05, "loss": 0.5644, "num_input_tokens_seen": 31819808, "step": 33315 }, { "epoch": 2.7180030997634392, "grad_norm": 1.5794172286987305, "learning_rate": 4.563807945399475e-05, "loss": 0.3739, "num_input_tokens_seen": 31824976, "step": 33320 }, { "epoch": 2.718410963373848, "grad_norm": 0.8697324395179749, "learning_rate": 4.563607049951948e-05, "loss": 0.3065, "num_input_tokens_seen": 31829296, "step": 33325 }, { "epoch": 2.7188188269842564, "grad_norm": 1.4702744483947754, "learning_rate": 4.5634061126759826e-05, "loss": 0.3971, "num_input_tokens_seen": 31834160, "step": 33330 }, { "epoch": 2.719226690594665, "grad_norm": 0.42880308628082275, "learning_rate": 4.563205133575652e-05, "loss": 0.3097, "num_input_tokens_seen": 31838144, "step": 33335 }, { "epoch": 2.719634554205074, "grad_norm": 1.077126145362854, "learning_rate": 4.563004112655031e-05, "loss": 0.4089, "num_input_tokens_seen": 31843504, "step": 33340 }, { "epoch": 2.7200424178154825, "grad_norm": 1.5158250331878662, "learning_rate": 4.562803049918192e-05, "loss": 0.426, "num_input_tokens_seen": 31848048, "step": 33345 }, { "epoch": 2.720450281425891, "grad_norm": 1.6367957592010498, "learning_rate": 4.562601945369213e-05, "loss": 0.3732, "num_input_tokens_seen": 31851504, "step": 33350 }, { "epoch": 2.7208581450363, "grad_norm": 0.5531629920005798, "learning_rate": 4.562400799012168e-05, "loss": 0.3584, "num_input_tokens_seen": 31856528, "step": 33355 }, { "epoch": 2.7212660086467086, "grad_norm": 0.36978429555892944, "learning_rate": 4.562199610851136e-05, "loss": 0.3632, "num_input_tokens_seen": 31861440, "step": 33360 }, { "epoch": 2.721673872257117, "grad_norm": 1.4141786098480225, "learning_rate": 4.561998380890194e-05, "loss": 0.3332, "num_input_tokens_seen": 31865424, "step": 33365 }, { "epoch": 2.7220817358675258, "grad_norm": 1.2157390117645264, "learning_rate": 4.56179710913342e-05, "loss": 0.2842, "num_input_tokens_seen": 31870928, "step": 33370 }, { "epoch": 2.7224895994779343, "grad_norm": 3.1609957218170166, "learning_rate": 4.561595795584895e-05, "loss": 0.5782, "num_input_tokens_seen": 31875776, "step": 33375 }, { "epoch": 2.7228974630883434, "grad_norm": 0.46259862184524536, "learning_rate": 4.5613944402487005e-05, "loss": 0.4935, "num_input_tokens_seen": 31879664, "step": 33380 }, { "epoch": 2.723305326698752, "grad_norm": 1.0107507705688477, "learning_rate": 4.561193043128915e-05, "loss": 0.3394, "num_input_tokens_seen": 31883680, "step": 33385 }, { "epoch": 2.7237131903091605, "grad_norm": 0.8426645994186401, "learning_rate": 4.560991604229623e-05, "loss": 0.5338, "num_input_tokens_seen": 31888128, "step": 33390 }, { "epoch": 2.7241210539195695, "grad_norm": 1.2578521966934204, "learning_rate": 4.560790123554908e-05, "loss": 0.3386, "num_input_tokens_seen": 31893296, "step": 33395 }, { "epoch": 2.724528917529978, "grad_norm": 0.4510194659233093, "learning_rate": 4.560588601108851e-05, "loss": 0.3816, "num_input_tokens_seen": 31898224, "step": 33400 }, { "epoch": 2.7249367811403866, "grad_norm": 1.405479907989502, "learning_rate": 4.5603870368955406e-05, "loss": 0.3716, "num_input_tokens_seen": 31903616, "step": 33405 }, { "epoch": 2.725344644750795, "grad_norm": 0.47817519307136536, "learning_rate": 4.5601854309190596e-05, "loss": 0.3702, "num_input_tokens_seen": 31908304, "step": 33410 }, { "epoch": 2.7257525083612038, "grad_norm": 1.2196643352508545, "learning_rate": 4.559983783183495e-05, "loss": 0.3459, "num_input_tokens_seen": 31912416, "step": 33415 }, { "epoch": 2.7261603719716128, "grad_norm": 2.2537550926208496, "learning_rate": 4.559782093692935e-05, "loss": 0.3658, "num_input_tokens_seen": 31916976, "step": 33420 }, { "epoch": 2.7265682355820213, "grad_norm": 4.191248893737793, "learning_rate": 4.5595803624514664e-05, "loss": 0.3887, "num_input_tokens_seen": 31921392, "step": 33425 }, { "epoch": 2.72697609919243, "grad_norm": 0.39933353662490845, "learning_rate": 4.5593785894631804e-05, "loss": 0.3573, "num_input_tokens_seen": 31926032, "step": 33430 }, { "epoch": 2.727383962802839, "grad_norm": 0.3888268768787384, "learning_rate": 4.559176774732165e-05, "loss": 0.3429, "num_input_tokens_seen": 31930720, "step": 33435 }, { "epoch": 2.7277918264132475, "grad_norm": 4.2298102378845215, "learning_rate": 4.558974918262511e-05, "loss": 0.3555, "num_input_tokens_seen": 31936240, "step": 33440 }, { "epoch": 2.728199690023656, "grad_norm": 1.4459885358810425, "learning_rate": 4.55877302005831e-05, "loss": 0.3812, "num_input_tokens_seen": 31940432, "step": 33445 }, { "epoch": 2.728607553634065, "grad_norm": 0.5134943127632141, "learning_rate": 4.5585710801236546e-05, "loss": 0.3619, "num_input_tokens_seen": 31944928, "step": 33450 }, { "epoch": 2.7290154172444736, "grad_norm": 1.663153052330017, "learning_rate": 4.558369098462639e-05, "loss": 0.3804, "num_input_tokens_seen": 31950384, "step": 33455 }, { "epoch": 2.729423280854882, "grad_norm": 2.0637755393981934, "learning_rate": 4.558167075079356e-05, "loss": 0.393, "num_input_tokens_seen": 31954960, "step": 33460 }, { "epoch": 2.7298311444652907, "grad_norm": 1.4142357110977173, "learning_rate": 4.557965009977901e-05, "loss": 0.3527, "num_input_tokens_seen": 31959536, "step": 33465 }, { "epoch": 2.7302390080756993, "grad_norm": 0.4810298681259155, "learning_rate": 4.55776290316237e-05, "loss": 0.4417, "num_input_tokens_seen": 31964336, "step": 33470 }, { "epoch": 2.7306468716861083, "grad_norm": 2.75130558013916, "learning_rate": 4.5575607546368584e-05, "loss": 0.336, "num_input_tokens_seen": 31969744, "step": 33475 }, { "epoch": 2.731054735296517, "grad_norm": 1.3308745622634888, "learning_rate": 4.557358564405465e-05, "loss": 0.333, "num_input_tokens_seen": 31975120, "step": 33480 }, { "epoch": 2.7314625989069254, "grad_norm": 1.1053094863891602, "learning_rate": 4.557156332472289e-05, "loss": 0.4084, "num_input_tokens_seen": 31979808, "step": 33485 }, { "epoch": 2.7318704625173345, "grad_norm": 1.203610897064209, "learning_rate": 4.556954058841427e-05, "loss": 0.2921, "num_input_tokens_seen": 31984064, "step": 33490 }, { "epoch": 2.732278326127743, "grad_norm": 1.1299272775650024, "learning_rate": 4.5567517435169804e-05, "loss": 0.347, "num_input_tokens_seen": 31988432, "step": 33495 }, { "epoch": 2.7326861897381516, "grad_norm": 2.939408302307129, "learning_rate": 4.5565493865030504e-05, "loss": 0.3547, "num_input_tokens_seen": 31993456, "step": 33500 }, { "epoch": 2.73309405334856, "grad_norm": 1.5833250284194946, "learning_rate": 4.556346987803738e-05, "loss": 0.358, "num_input_tokens_seen": 31997936, "step": 33505 }, { "epoch": 2.7335019169589687, "grad_norm": 2.012261152267456, "learning_rate": 4.556144547423146e-05, "loss": 0.4003, "num_input_tokens_seen": 32002688, "step": 33510 }, { "epoch": 2.7339097805693777, "grad_norm": 0.5296935439109802, "learning_rate": 4.555942065365377e-05, "loss": 0.2969, "num_input_tokens_seen": 32007312, "step": 33515 }, { "epoch": 2.7343176441797863, "grad_norm": 1.8635700941085815, "learning_rate": 4.555739541634537e-05, "loss": 0.3368, "num_input_tokens_seen": 32012016, "step": 33520 }, { "epoch": 2.734725507790195, "grad_norm": 1.22213613986969, "learning_rate": 4.555536976234729e-05, "loss": 0.3846, "num_input_tokens_seen": 32018304, "step": 33525 }, { "epoch": 2.735133371400604, "grad_norm": 1.2242625951766968, "learning_rate": 4.555334369170061e-05, "loss": 0.4063, "num_input_tokens_seen": 32024128, "step": 33530 }, { "epoch": 2.7355412350110124, "grad_norm": 0.655509352684021, "learning_rate": 4.555131720444638e-05, "loss": 0.3696, "num_input_tokens_seen": 32029280, "step": 33535 }, { "epoch": 2.735949098621421, "grad_norm": 1.3023462295532227, "learning_rate": 4.554929030062569e-05, "loss": 0.3427, "num_input_tokens_seen": 32033648, "step": 33540 }, { "epoch": 2.7363569622318296, "grad_norm": 3.483489513397217, "learning_rate": 4.5547262980279605e-05, "loss": 0.3914, "num_input_tokens_seen": 32039408, "step": 33545 }, { "epoch": 2.736764825842238, "grad_norm": 2.3356494903564453, "learning_rate": 4.5545235243449244e-05, "loss": 0.3565, "num_input_tokens_seen": 32044016, "step": 33550 }, { "epoch": 2.737172689452647, "grad_norm": 1.686608910560608, "learning_rate": 4.554320709017569e-05, "loss": 0.4441, "num_input_tokens_seen": 32049584, "step": 33555 }, { "epoch": 2.7375805530630557, "grad_norm": 0.8402540683746338, "learning_rate": 4.554117852050005e-05, "loss": 0.4019, "num_input_tokens_seen": 32054128, "step": 33560 }, { "epoch": 2.7379884166734643, "grad_norm": 3.1202492713928223, "learning_rate": 4.553914953446346e-05, "loss": 0.4191, "num_input_tokens_seen": 32058048, "step": 33565 }, { "epoch": 2.7383962802838733, "grad_norm": 1.551747441291809, "learning_rate": 4.5537120132107036e-05, "loss": 0.3406, "num_input_tokens_seen": 32062512, "step": 33570 }, { "epoch": 2.738804143894282, "grad_norm": 0.4645284414291382, "learning_rate": 4.553509031347191e-05, "loss": 0.3433, "num_input_tokens_seen": 32067520, "step": 33575 }, { "epoch": 2.7392120075046904, "grad_norm": 0.4555371403694153, "learning_rate": 4.5533060078599226e-05, "loss": 0.3473, "num_input_tokens_seen": 32073024, "step": 33580 }, { "epoch": 2.739619871115099, "grad_norm": 0.5007641911506653, "learning_rate": 4.553102942753015e-05, "loss": 0.384, "num_input_tokens_seen": 32077488, "step": 33585 }, { "epoch": 2.7400277347255075, "grad_norm": 1.533215880393982, "learning_rate": 4.552899836030582e-05, "loss": 0.3889, "num_input_tokens_seen": 32080864, "step": 33590 }, { "epoch": 2.7404355983359165, "grad_norm": 2.2467284202575684, "learning_rate": 4.552696687696742e-05, "loss": 0.392, "num_input_tokens_seen": 32085008, "step": 33595 }, { "epoch": 2.740843461946325, "grad_norm": 2.160858392715454, "learning_rate": 4.5524934977556125e-05, "loss": 0.3471, "num_input_tokens_seen": 32089328, "step": 33600 }, { "epoch": 2.7412513255567337, "grad_norm": 2.091045379638672, "learning_rate": 4.552290266211312e-05, "loss": 0.4178, "num_input_tokens_seen": 32094320, "step": 33605 }, { "epoch": 2.7416591891671427, "grad_norm": 0.6301222443580627, "learning_rate": 4.55208699306796e-05, "loss": 0.3627, "num_input_tokens_seen": 32099168, "step": 33610 }, { "epoch": 2.7420670527775513, "grad_norm": 1.3803836107254028, "learning_rate": 4.551883678329677e-05, "loss": 0.3807, "num_input_tokens_seen": 32103984, "step": 33615 }, { "epoch": 2.74247491638796, "grad_norm": 0.3240641951560974, "learning_rate": 4.551680322000583e-05, "loss": 0.3452, "num_input_tokens_seen": 32108688, "step": 33620 }, { "epoch": 2.7428827799983684, "grad_norm": 1.6869100332260132, "learning_rate": 4.5514769240848006e-05, "loss": 0.4071, "num_input_tokens_seen": 32113072, "step": 33625 }, { "epoch": 2.7432906436087774, "grad_norm": 0.8857921957969666, "learning_rate": 4.551273484586453e-05, "loss": 0.3628, "num_input_tokens_seen": 32117824, "step": 33630 }, { "epoch": 2.743698507219186, "grad_norm": 1.6817386150360107, "learning_rate": 4.551070003509663e-05, "loss": 0.3084, "num_input_tokens_seen": 32122352, "step": 33635 }, { "epoch": 2.7441063708295945, "grad_norm": 0.7692916989326477, "learning_rate": 4.550866480858557e-05, "loss": 0.3669, "num_input_tokens_seen": 32126672, "step": 33640 }, { "epoch": 2.744514234440003, "grad_norm": 0.7866344451904297, "learning_rate": 4.550662916637257e-05, "loss": 0.5217, "num_input_tokens_seen": 32131152, "step": 33645 }, { "epoch": 2.744922098050412, "grad_norm": 1.7414926290512085, "learning_rate": 4.550459310849893e-05, "loss": 0.4178, "num_input_tokens_seen": 32136400, "step": 33650 }, { "epoch": 2.7453299616608207, "grad_norm": 0.7208691239356995, "learning_rate": 4.550255663500589e-05, "loss": 0.4424, "num_input_tokens_seen": 32141248, "step": 33655 }, { "epoch": 2.7457378252712292, "grad_norm": 1.3885027170181274, "learning_rate": 4.5500519745934735e-05, "loss": 0.3399, "num_input_tokens_seen": 32146048, "step": 33660 }, { "epoch": 2.7461456888816382, "grad_norm": 0.7711026072502136, "learning_rate": 4.549848244132676e-05, "loss": 0.3327, "num_input_tokens_seen": 32151088, "step": 33665 }, { "epoch": 2.746553552492047, "grad_norm": 1.908531665802002, "learning_rate": 4.5496444721223266e-05, "loss": 0.3563, "num_input_tokens_seen": 32155120, "step": 33670 }, { "epoch": 2.7469614161024554, "grad_norm": 0.3708544075489044, "learning_rate": 4.5494406585665535e-05, "loss": 0.3594, "num_input_tokens_seen": 32160272, "step": 33675 }, { "epoch": 2.747369279712864, "grad_norm": 0.5539435744285583, "learning_rate": 4.5492368034694894e-05, "loss": 0.3443, "num_input_tokens_seen": 32165040, "step": 33680 }, { "epoch": 2.7477771433232725, "grad_norm": 2.3044323921203613, "learning_rate": 4.549032906835266e-05, "loss": 0.3648, "num_input_tokens_seen": 32169360, "step": 33685 }, { "epoch": 2.7481850069336815, "grad_norm": 0.570445716381073, "learning_rate": 4.5488289686680164e-05, "loss": 0.3564, "num_input_tokens_seen": 32175104, "step": 33690 }, { "epoch": 2.74859287054409, "grad_norm": 1.789771318435669, "learning_rate": 4.548624988971874e-05, "loss": 0.3445, "num_input_tokens_seen": 32180064, "step": 33695 }, { "epoch": 2.7490007341544986, "grad_norm": 2.4318323135375977, "learning_rate": 4.548420967750974e-05, "loss": 0.3494, "num_input_tokens_seen": 32184576, "step": 33700 }, { "epoch": 2.7494085977649076, "grad_norm": 1.5643763542175293, "learning_rate": 4.548216905009451e-05, "loss": 0.3611, "num_input_tokens_seen": 32189920, "step": 33705 }, { "epoch": 2.749816461375316, "grad_norm": 1.0109792947769165, "learning_rate": 4.548012800751442e-05, "loss": 0.3359, "num_input_tokens_seen": 32194400, "step": 33710 }, { "epoch": 2.7502243249857248, "grad_norm": 0.6933561563491821, "learning_rate": 4.547808654981084e-05, "loss": 0.4592, "num_input_tokens_seen": 32199120, "step": 33715 }, { "epoch": 2.7506321885961333, "grad_norm": 2.232008695602417, "learning_rate": 4.5476044677025144e-05, "loss": 0.3672, "num_input_tokens_seen": 32203808, "step": 33720 }, { "epoch": 2.751040052206542, "grad_norm": 1.0710678100585938, "learning_rate": 4.5474002389198714e-05, "loss": 0.3934, "num_input_tokens_seen": 32208016, "step": 33725 }, { "epoch": 2.751447915816951, "grad_norm": 1.0779613256454468, "learning_rate": 4.5471959686372965e-05, "loss": 0.3995, "num_input_tokens_seen": 32212272, "step": 33730 }, { "epoch": 2.7518557794273595, "grad_norm": 4.275397300720215, "learning_rate": 4.546991656858929e-05, "loss": 0.3967, "num_input_tokens_seen": 32217360, "step": 33735 }, { "epoch": 2.752263643037768, "grad_norm": 1.4285072088241577, "learning_rate": 4.546787303588911e-05, "loss": 0.3582, "num_input_tokens_seen": 32223008, "step": 33740 }, { "epoch": 2.752671506648177, "grad_norm": 0.7828292846679688, "learning_rate": 4.546582908831384e-05, "loss": 0.3576, "num_input_tokens_seen": 32227744, "step": 33745 }, { "epoch": 2.7530793702585856, "grad_norm": 1.0812053680419922, "learning_rate": 4.5463784725904904e-05, "loss": 0.367, "num_input_tokens_seen": 32231664, "step": 33750 }, { "epoch": 2.753487233868994, "grad_norm": 1.588985800743103, "learning_rate": 4.5461739948703754e-05, "loss": 0.3663, "num_input_tokens_seen": 32236368, "step": 33755 }, { "epoch": 2.7538950974794028, "grad_norm": 1.9979549646377563, "learning_rate": 4.5459694756751823e-05, "loss": 0.3505, "num_input_tokens_seen": 32240912, "step": 33760 }, { "epoch": 2.7543029610898113, "grad_norm": 0.26428309082984924, "learning_rate": 4.5457649150090576e-05, "loss": 0.3566, "num_input_tokens_seen": 32245440, "step": 33765 }, { "epoch": 2.7547108247002203, "grad_norm": 1.1106451749801636, "learning_rate": 4.545560312876148e-05, "loss": 0.3052, "num_input_tokens_seen": 32249536, "step": 33770 }, { "epoch": 2.755118688310629, "grad_norm": 0.9848666191101074, "learning_rate": 4.5453556692805996e-05, "loss": 0.28, "num_input_tokens_seen": 32253600, "step": 33775 }, { "epoch": 2.7555265519210375, "grad_norm": 2.299257755279541, "learning_rate": 4.54515098422656e-05, "loss": 0.3284, "num_input_tokens_seen": 32258576, "step": 33780 }, { "epoch": 2.7559344155314465, "grad_norm": 0.8491400480270386, "learning_rate": 4.54494625771818e-05, "loss": 0.2909, "num_input_tokens_seen": 32263648, "step": 33785 }, { "epoch": 2.756342279141855, "grad_norm": 1.0243576765060425, "learning_rate": 4.5447414897596084e-05, "loss": 0.4635, "num_input_tokens_seen": 32268368, "step": 33790 }, { "epoch": 2.7567501427522636, "grad_norm": 1.06209397315979, "learning_rate": 4.544536680354995e-05, "loss": 0.2038, "num_input_tokens_seen": 32274160, "step": 33795 }, { "epoch": 2.757158006362672, "grad_norm": 1.4689898490905762, "learning_rate": 4.544331829508492e-05, "loss": 0.3475, "num_input_tokens_seen": 32278736, "step": 33800 }, { "epoch": 2.7575658699730807, "grad_norm": 0.7113247513771057, "learning_rate": 4.544126937224252e-05, "loss": 0.3108, "num_input_tokens_seen": 32283632, "step": 33805 }, { "epoch": 2.7579737335834897, "grad_norm": 0.8237814903259277, "learning_rate": 4.5439220035064276e-05, "loss": 0.5361, "num_input_tokens_seen": 32288080, "step": 33810 }, { "epoch": 2.7583815971938983, "grad_norm": 0.7843502759933472, "learning_rate": 4.5437170283591724e-05, "loss": 0.3563, "num_input_tokens_seen": 32292656, "step": 33815 }, { "epoch": 2.758789460804307, "grad_norm": 2.2851459980010986, "learning_rate": 4.543512011786641e-05, "loss": 0.3381, "num_input_tokens_seen": 32297760, "step": 33820 }, { "epoch": 2.759197324414716, "grad_norm": 1.1421974897384644, "learning_rate": 4.543306953792991e-05, "loss": 0.3587, "num_input_tokens_seen": 32302784, "step": 33825 }, { "epoch": 2.7596051880251244, "grad_norm": 0.7190225124359131, "learning_rate": 4.5431018543823757e-05, "loss": 0.3597, "num_input_tokens_seen": 32308368, "step": 33830 }, { "epoch": 2.760013051635533, "grad_norm": 0.4742400050163269, "learning_rate": 4.542896713558955e-05, "loss": 0.3281, "num_input_tokens_seen": 32313328, "step": 33835 }, { "epoch": 2.760420915245942, "grad_norm": 12.282395362854004, "learning_rate": 4.542691531326885e-05, "loss": 0.4234, "num_input_tokens_seen": 32318704, "step": 33840 }, { "epoch": 2.7608287788563506, "grad_norm": 1.2236448526382446, "learning_rate": 4.5424863076903265e-05, "loss": 0.3605, "num_input_tokens_seen": 32323520, "step": 33845 }, { "epoch": 2.761236642466759, "grad_norm": 1.3066521883010864, "learning_rate": 4.542281042653438e-05, "loss": 0.3861, "num_input_tokens_seen": 32328064, "step": 33850 }, { "epoch": 2.7616445060771677, "grad_norm": 2.6381032466888428, "learning_rate": 4.542075736220381e-05, "loss": 0.3507, "num_input_tokens_seen": 32333440, "step": 33855 }, { "epoch": 2.7620523696875763, "grad_norm": 19.884592056274414, "learning_rate": 4.541870388395316e-05, "loss": 0.3485, "num_input_tokens_seen": 32338176, "step": 33860 }, { "epoch": 2.7624602332979853, "grad_norm": 1.908839225769043, "learning_rate": 4.541664999182407e-05, "loss": 0.3758, "num_input_tokens_seen": 32342752, "step": 33865 }, { "epoch": 2.762868096908394, "grad_norm": 1.3318078517913818, "learning_rate": 4.541459568585814e-05, "loss": 0.3531, "num_input_tokens_seen": 32346736, "step": 33870 }, { "epoch": 2.7632759605188024, "grad_norm": 1.780387043952942, "learning_rate": 4.541254096609705e-05, "loss": 0.3441, "num_input_tokens_seen": 32351680, "step": 33875 }, { "epoch": 2.7636838241292114, "grad_norm": 1.3081353902816772, "learning_rate": 4.541048583258242e-05, "loss": 0.3716, "num_input_tokens_seen": 32356864, "step": 33880 }, { "epoch": 2.76409168773962, "grad_norm": 4.842220306396484, "learning_rate": 4.540843028535591e-05, "loss": 0.3324, "num_input_tokens_seen": 32361568, "step": 33885 }, { "epoch": 2.7644995513500286, "grad_norm": 2.455956220626831, "learning_rate": 4.54063743244592e-05, "loss": 0.4087, "num_input_tokens_seen": 32366192, "step": 33890 }, { "epoch": 2.764907414960437, "grad_norm": 0.662810206413269, "learning_rate": 4.540431794993395e-05, "loss": 0.3327, "num_input_tokens_seen": 32370896, "step": 33895 }, { "epoch": 2.7653152785708457, "grad_norm": 1.5117735862731934, "learning_rate": 4.540226116182185e-05, "loss": 0.3318, "num_input_tokens_seen": 32375568, "step": 33900 }, { "epoch": 2.7657231421812547, "grad_norm": 2.4895641803741455, "learning_rate": 4.540020396016458e-05, "loss": 0.3447, "num_input_tokens_seen": 32380384, "step": 33905 }, { "epoch": 2.7661310057916633, "grad_norm": 0.9788047671318054, "learning_rate": 4.539814634500384e-05, "loss": 0.3663, "num_input_tokens_seen": 32385344, "step": 33910 }, { "epoch": 2.766538869402072, "grad_norm": 1.5709693431854248, "learning_rate": 4.5396088316381345e-05, "loss": 0.3513, "num_input_tokens_seen": 32390144, "step": 33915 }, { "epoch": 2.766946733012481, "grad_norm": 1.8377577066421509, "learning_rate": 4.539402987433881e-05, "loss": 0.389, "num_input_tokens_seen": 32395392, "step": 33920 }, { "epoch": 2.7673545966228894, "grad_norm": 1.0003219842910767, "learning_rate": 4.539197101891795e-05, "loss": 0.3651, "num_input_tokens_seen": 32399088, "step": 33925 }, { "epoch": 2.767762460233298, "grad_norm": 4.935713768005371, "learning_rate": 4.538991175016051e-05, "loss": 0.3551, "num_input_tokens_seen": 32404192, "step": 33930 }, { "epoch": 2.7681703238437065, "grad_norm": 0.9965159296989441, "learning_rate": 4.538785206810821e-05, "loss": 0.3199, "num_input_tokens_seen": 32408528, "step": 33935 }, { "epoch": 2.768578187454115, "grad_norm": 2.976511001586914, "learning_rate": 4.538579197280283e-05, "loss": 0.3344, "num_input_tokens_seen": 32413056, "step": 33940 }, { "epoch": 2.768986051064524, "grad_norm": 1.3181779384613037, "learning_rate": 4.5383731464286096e-05, "loss": 0.3903, "num_input_tokens_seen": 32418624, "step": 33945 }, { "epoch": 2.7693939146749327, "grad_norm": 0.788367748260498, "learning_rate": 4.538167054259978e-05, "loss": 0.3072, "num_input_tokens_seen": 32423680, "step": 33950 }, { "epoch": 2.7698017782853412, "grad_norm": 0.9941954612731934, "learning_rate": 4.5379609207785686e-05, "loss": 0.4366, "num_input_tokens_seen": 32428400, "step": 33955 }, { "epoch": 2.7702096418957503, "grad_norm": 1.9372307062149048, "learning_rate": 4.537754745988556e-05, "loss": 0.2631, "num_input_tokens_seen": 32433536, "step": 33960 }, { "epoch": 2.770617505506159, "grad_norm": 0.7622211575508118, "learning_rate": 4.53754852989412e-05, "loss": 0.4139, "num_input_tokens_seen": 32437616, "step": 33965 }, { "epoch": 2.7710253691165674, "grad_norm": 2.0711169242858887, "learning_rate": 4.537342272499442e-05, "loss": 0.4201, "num_input_tokens_seen": 32442288, "step": 33970 }, { "epoch": 2.771433232726976, "grad_norm": 4.360266208648682, "learning_rate": 4.537135973808702e-05, "loss": 0.3533, "num_input_tokens_seen": 32447472, "step": 33975 }, { "epoch": 2.7718410963373845, "grad_norm": 1.8037676811218262, "learning_rate": 4.536929633826081e-05, "loss": 0.4144, "num_input_tokens_seen": 32452080, "step": 33980 }, { "epoch": 2.7722489599477935, "grad_norm": 0.8732520937919617, "learning_rate": 4.536723252555762e-05, "loss": 0.2938, "num_input_tokens_seen": 32457056, "step": 33985 }, { "epoch": 2.772656823558202, "grad_norm": 2.3369140625, "learning_rate": 4.5365168300019276e-05, "loss": 0.3464, "num_input_tokens_seen": 32462544, "step": 33990 }, { "epoch": 2.7730646871686107, "grad_norm": 0.6302650570869446, "learning_rate": 4.536310366168763e-05, "loss": 0.3617, "num_input_tokens_seen": 32468000, "step": 33995 }, { "epoch": 2.7734725507790197, "grad_norm": 1.1810864210128784, "learning_rate": 4.536103861060452e-05, "loss": 0.3838, "num_input_tokens_seen": 32472592, "step": 34000 }, { "epoch": 2.7738804143894282, "grad_norm": 2.2305431365966797, "learning_rate": 4.535897314681182e-05, "loss": 0.3629, "num_input_tokens_seen": 32477600, "step": 34005 }, { "epoch": 2.774288277999837, "grad_norm": 3.666177988052368, "learning_rate": 4.535690727035138e-05, "loss": 0.3679, "num_input_tokens_seen": 32483200, "step": 34010 }, { "epoch": 2.774696141610246, "grad_norm": 1.3630999326705933, "learning_rate": 4.535484098126508e-05, "loss": 0.3906, "num_input_tokens_seen": 32487808, "step": 34015 }, { "epoch": 2.7751040052206544, "grad_norm": 0.5251386761665344, "learning_rate": 4.53527742795948e-05, "loss": 0.3717, "num_input_tokens_seen": 32491792, "step": 34020 }, { "epoch": 2.775511868831063, "grad_norm": 0.8062968254089355, "learning_rate": 4.5350707165382436e-05, "loss": 0.3137, "num_input_tokens_seen": 32496400, "step": 34025 }, { "epoch": 2.7759197324414715, "grad_norm": 2.1770498752593994, "learning_rate": 4.5348639638669884e-05, "loss": 0.3813, "num_input_tokens_seen": 32500080, "step": 34030 }, { "epoch": 2.77632759605188, "grad_norm": 2.485365629196167, "learning_rate": 4.534657169949905e-05, "loss": 0.4922, "num_input_tokens_seen": 32504832, "step": 34035 }, { "epoch": 2.776735459662289, "grad_norm": 0.6234850883483887, "learning_rate": 4.534450334791186e-05, "loss": 0.4394, "num_input_tokens_seen": 32509344, "step": 34040 }, { "epoch": 2.7771433232726976, "grad_norm": 0.6720994710922241, "learning_rate": 4.534243458395023e-05, "loss": 0.2652, "num_input_tokens_seen": 32513504, "step": 34045 }, { "epoch": 2.777551186883106, "grad_norm": 0.871216356754303, "learning_rate": 4.5340365407656095e-05, "loss": 0.3606, "num_input_tokens_seen": 32518768, "step": 34050 }, { "epoch": 2.777959050493515, "grad_norm": 1.53126859664917, "learning_rate": 4.533829581907139e-05, "loss": 0.4245, "num_input_tokens_seen": 32523856, "step": 34055 }, { "epoch": 2.778366914103924, "grad_norm": 1.2157191038131714, "learning_rate": 4.533622581823808e-05, "loss": 0.3941, "num_input_tokens_seen": 32529232, "step": 34060 }, { "epoch": 2.7787747777143323, "grad_norm": 0.800754725933075, "learning_rate": 4.5334155405198106e-05, "loss": 0.3576, "num_input_tokens_seen": 32534432, "step": 34065 }, { "epoch": 2.779182641324741, "grad_norm": 7.652052402496338, "learning_rate": 4.533208457999345e-05, "loss": 0.3534, "num_input_tokens_seen": 32539488, "step": 34070 }, { "epoch": 2.7795905049351495, "grad_norm": 0.34103086590766907, "learning_rate": 4.533001334266608e-05, "loss": 0.3293, "num_input_tokens_seen": 32543744, "step": 34075 }, { "epoch": 2.7799983685455585, "grad_norm": 0.20625825226306915, "learning_rate": 4.532794169325797e-05, "loss": 0.4465, "num_input_tokens_seen": 32548256, "step": 34080 }, { "epoch": 2.780406232155967, "grad_norm": 0.4753129482269287, "learning_rate": 4.5325869631811126e-05, "loss": 0.3702, "num_input_tokens_seen": 32553440, "step": 34085 }, { "epoch": 2.7808140957663756, "grad_norm": 1.54166841506958, "learning_rate": 4.5323797158367534e-05, "loss": 0.3702, "num_input_tokens_seen": 32558576, "step": 34090 }, { "epoch": 2.7812219593767846, "grad_norm": 0.21195809543132782, "learning_rate": 4.532172427296922e-05, "loss": 0.3571, "num_input_tokens_seen": 32563760, "step": 34095 }, { "epoch": 2.781629822987193, "grad_norm": 1.3064059019088745, "learning_rate": 4.531965097565818e-05, "loss": 0.3384, "num_input_tokens_seen": 32569008, "step": 34100 }, { "epoch": 2.7820376865976018, "grad_norm": 1.0975353717803955, "learning_rate": 4.5317577266476455e-05, "loss": 0.3015, "num_input_tokens_seen": 32574464, "step": 34105 }, { "epoch": 2.7824455502080103, "grad_norm": 3.701423168182373, "learning_rate": 4.5315503145466076e-05, "loss": 0.4252, "num_input_tokens_seen": 32579408, "step": 34110 }, { "epoch": 2.782853413818419, "grad_norm": 0.6574869155883789, "learning_rate": 4.531342861266907e-05, "loss": 0.4137, "num_input_tokens_seen": 32584704, "step": 34115 }, { "epoch": 2.783261277428828, "grad_norm": 0.9925846457481384, "learning_rate": 4.531135366812751e-05, "loss": 0.3137, "num_input_tokens_seen": 32589216, "step": 34120 }, { "epoch": 2.7836691410392365, "grad_norm": 1.038442850112915, "learning_rate": 4.530927831188344e-05, "loss": 0.3902, "num_input_tokens_seen": 32593392, "step": 34125 }, { "epoch": 2.784077004649645, "grad_norm": 0.33560940623283386, "learning_rate": 4.530720254397892e-05, "loss": 0.3412, "num_input_tokens_seen": 32597872, "step": 34130 }, { "epoch": 2.784484868260054, "grad_norm": 0.40666845440864563, "learning_rate": 4.530512636445604e-05, "loss": 0.3866, "num_input_tokens_seen": 32601888, "step": 34135 }, { "epoch": 2.7848927318704626, "grad_norm": 1.2683011293411255, "learning_rate": 4.5303049773356873e-05, "loss": 0.3721, "num_input_tokens_seen": 32606544, "step": 34140 }, { "epoch": 2.785300595480871, "grad_norm": 1.1757632493972778, "learning_rate": 4.5300972770723514e-05, "loss": 0.3553, "num_input_tokens_seen": 32610832, "step": 34145 }, { "epoch": 2.7857084590912797, "grad_norm": 1.1506788730621338, "learning_rate": 4.529889535659807e-05, "loss": 0.3523, "num_input_tokens_seen": 32615248, "step": 34150 }, { "epoch": 2.7861163227016883, "grad_norm": 0.2872999310493469, "learning_rate": 4.529681753102264e-05, "loss": 0.333, "num_input_tokens_seen": 32619648, "step": 34155 }, { "epoch": 2.7865241863120973, "grad_norm": 0.3161177635192871, "learning_rate": 4.529473929403934e-05, "loss": 0.4318, "num_input_tokens_seen": 32624928, "step": 34160 }, { "epoch": 2.786932049922506, "grad_norm": 0.3544537127017975, "learning_rate": 4.52926606456903e-05, "loss": 0.3454, "num_input_tokens_seen": 32628816, "step": 34165 }, { "epoch": 2.7873399135329144, "grad_norm": 0.6470952033996582, "learning_rate": 4.5290581586017655e-05, "loss": 0.3447, "num_input_tokens_seen": 32633600, "step": 34170 }, { "epoch": 2.7877477771433234, "grad_norm": 1.12351393699646, "learning_rate": 4.5288502115063535e-05, "loss": 0.3494, "num_input_tokens_seen": 32638704, "step": 34175 }, { "epoch": 2.788155640753732, "grad_norm": 0.7931867241859436, "learning_rate": 4.528642223287011e-05, "loss": 0.3306, "num_input_tokens_seen": 32642912, "step": 34180 }, { "epoch": 2.7885635043641406, "grad_norm": 0.5632451176643372, "learning_rate": 4.5284341939479514e-05, "loss": 0.3792, "num_input_tokens_seen": 32647792, "step": 34185 }, { "epoch": 2.7889713679745496, "grad_norm": 0.3552156090736389, "learning_rate": 4.528226123493393e-05, "loss": 0.3922, "num_input_tokens_seen": 32653008, "step": 34190 }, { "epoch": 2.789379231584958, "grad_norm": 0.34542030096054077, "learning_rate": 4.528018011927554e-05, "loss": 0.3607, "num_input_tokens_seen": 32656464, "step": 34195 }, { "epoch": 2.7897870951953667, "grad_norm": 0.9395172595977783, "learning_rate": 4.52780985925465e-05, "loss": 0.2873, "num_input_tokens_seen": 32661344, "step": 34200 }, { "epoch": 2.7901949588057753, "grad_norm": 0.7872116565704346, "learning_rate": 4.527601665478903e-05, "loss": 0.3161, "num_input_tokens_seen": 32665840, "step": 34205 }, { "epoch": 2.790602822416184, "grad_norm": 0.6672973036766052, "learning_rate": 4.527393430604531e-05, "loss": 0.3112, "num_input_tokens_seen": 32669952, "step": 34210 }, { "epoch": 2.791010686026593, "grad_norm": 0.7216242551803589, "learning_rate": 4.5271851546357564e-05, "loss": 0.3751, "num_input_tokens_seen": 32674256, "step": 34215 }, { "epoch": 2.7914185496370014, "grad_norm": 0.3672170042991638, "learning_rate": 4.5269768375768e-05, "loss": 0.3983, "num_input_tokens_seen": 32678992, "step": 34220 }, { "epoch": 2.79182641324741, "grad_norm": 1.227338194847107, "learning_rate": 4.5267684794318844e-05, "loss": 0.4029, "num_input_tokens_seen": 32683552, "step": 34225 }, { "epoch": 2.792234276857819, "grad_norm": 0.8420300483703613, "learning_rate": 4.526560080205232e-05, "loss": 0.3208, "num_input_tokens_seen": 32688768, "step": 34230 }, { "epoch": 2.7926421404682276, "grad_norm": 1.5699011087417603, "learning_rate": 4.5263516399010694e-05, "loss": 0.462, "num_input_tokens_seen": 32693536, "step": 34235 }, { "epoch": 2.793050004078636, "grad_norm": 0.577675998210907, "learning_rate": 4.526143158523619e-05, "loss": 0.4235, "num_input_tokens_seen": 32698640, "step": 34240 }, { "epoch": 2.7934578676890447, "grad_norm": 0.2939698100090027, "learning_rate": 4.525934636077108e-05, "loss": 0.4437, "num_input_tokens_seen": 32703952, "step": 34245 }, { "epoch": 2.7938657312994533, "grad_norm": 1.1604409217834473, "learning_rate": 4.5257260725657634e-05, "loss": 0.3479, "num_input_tokens_seen": 32708256, "step": 34250 }, { "epoch": 2.7942735949098623, "grad_norm": 1.3379648923873901, "learning_rate": 4.525517467993811e-05, "loss": 0.3644, "num_input_tokens_seen": 32712784, "step": 34255 }, { "epoch": 2.794681458520271, "grad_norm": 1.8908947706222534, "learning_rate": 4.525308822365481e-05, "loss": 0.4074, "num_input_tokens_seen": 32716640, "step": 34260 }, { "epoch": 2.7950893221306794, "grad_norm": 0.939155638217926, "learning_rate": 4.525100135685002e-05, "loss": 0.4039, "num_input_tokens_seen": 32720688, "step": 34265 }, { "epoch": 2.7954971857410884, "grad_norm": 0.8343786001205444, "learning_rate": 4.524891407956603e-05, "loss": 0.3688, "num_input_tokens_seen": 32725216, "step": 34270 }, { "epoch": 2.795905049351497, "grad_norm": 1.1300181150436401, "learning_rate": 4.524682639184516e-05, "loss": 0.3052, "num_input_tokens_seen": 32730544, "step": 34275 }, { "epoch": 2.7963129129619055, "grad_norm": 1.2473421096801758, "learning_rate": 4.524473829372972e-05, "loss": 0.3855, "num_input_tokens_seen": 32736000, "step": 34280 }, { "epoch": 2.796720776572314, "grad_norm": 0.3403699994087219, "learning_rate": 4.524264978526204e-05, "loss": 0.3536, "num_input_tokens_seen": 32740208, "step": 34285 }, { "epoch": 2.7971286401827227, "grad_norm": 1.452826738357544, "learning_rate": 4.524056086648445e-05, "loss": 0.3665, "num_input_tokens_seen": 32745680, "step": 34290 }, { "epoch": 2.7975365037931317, "grad_norm": 0.4504246115684509, "learning_rate": 4.523847153743929e-05, "loss": 0.3511, "num_input_tokens_seen": 32749840, "step": 34295 }, { "epoch": 2.7979443674035402, "grad_norm": 0.3667547106742859, "learning_rate": 4.523638179816891e-05, "loss": 0.3533, "num_input_tokens_seen": 32754848, "step": 34300 }, { "epoch": 2.798352231013949, "grad_norm": 1.0502452850341797, "learning_rate": 4.523429164871568e-05, "loss": 0.3605, "num_input_tokens_seen": 32759648, "step": 34305 }, { "epoch": 2.798760094624358, "grad_norm": 0.44202667474746704, "learning_rate": 4.5232201089121954e-05, "loss": 0.3173, "num_input_tokens_seen": 32764640, "step": 34310 }, { "epoch": 2.7991679582347664, "grad_norm": 1.106314778327942, "learning_rate": 4.52301101194301e-05, "loss": 0.3099, "num_input_tokens_seen": 32769552, "step": 34315 }, { "epoch": 2.799575821845175, "grad_norm": 0.6167091131210327, "learning_rate": 4.5228018739682516e-05, "loss": 0.3877, "num_input_tokens_seen": 32774656, "step": 34320 }, { "epoch": 2.7999836854555835, "grad_norm": 2.7297356128692627, "learning_rate": 4.5225926949921587e-05, "loss": 0.4002, "num_input_tokens_seen": 32779136, "step": 34325 }, { "epoch": 2.800391549065992, "grad_norm": 0.40193742513656616, "learning_rate": 4.522383475018971e-05, "loss": 0.4095, "num_input_tokens_seen": 32783552, "step": 34330 }, { "epoch": 2.800799412676401, "grad_norm": 0.592849850654602, "learning_rate": 4.52217421405293e-05, "loss": 0.3513, "num_input_tokens_seen": 32787728, "step": 34335 }, { "epoch": 2.8012072762868097, "grad_norm": 2.000295400619507, "learning_rate": 4.521964912098276e-05, "loss": 0.3537, "num_input_tokens_seen": 32791616, "step": 34340 }, { "epoch": 2.801615139897218, "grad_norm": 1.86080002784729, "learning_rate": 4.521755569159254e-05, "loss": 0.4358, "num_input_tokens_seen": 32795936, "step": 34345 }, { "epoch": 2.8020230035076272, "grad_norm": 0.7743188738822937, "learning_rate": 4.521546185240105e-05, "loss": 0.4093, "num_input_tokens_seen": 32800944, "step": 34350 }, { "epoch": 2.802430867118036, "grad_norm": 0.9130154848098755, "learning_rate": 4.521336760345074e-05, "loss": 0.3021, "num_input_tokens_seen": 32806176, "step": 34355 }, { "epoch": 2.8028387307284444, "grad_norm": 0.8126935958862305, "learning_rate": 4.5211272944784055e-05, "loss": 0.3674, "num_input_tokens_seen": 32810480, "step": 34360 }, { "epoch": 2.803246594338853, "grad_norm": 4.405815124511719, "learning_rate": 4.520917787644346e-05, "loss": 0.3832, "num_input_tokens_seen": 32816192, "step": 34365 }, { "epoch": 2.8036544579492615, "grad_norm": 1.1107239723205566, "learning_rate": 4.520708239847141e-05, "loss": 0.3515, "num_input_tokens_seen": 32821376, "step": 34370 }, { "epoch": 2.8040623215596705, "grad_norm": 1.700858235359192, "learning_rate": 4.52049865109104e-05, "loss": 0.3353, "num_input_tokens_seen": 32826112, "step": 34375 }, { "epoch": 2.804470185170079, "grad_norm": 0.8676849007606506, "learning_rate": 4.5202890213802886e-05, "loss": 0.2599, "num_input_tokens_seen": 32831600, "step": 34380 }, { "epoch": 2.8048780487804876, "grad_norm": 0.707111656665802, "learning_rate": 4.520079350719137e-05, "loss": 0.3803, "num_input_tokens_seen": 32835616, "step": 34385 }, { "epoch": 2.8052859123908966, "grad_norm": 0.7160735130310059, "learning_rate": 4.519869639111836e-05, "loss": 0.4068, "num_input_tokens_seen": 32840896, "step": 34390 }, { "epoch": 2.805693776001305, "grad_norm": 0.7349116206169128, "learning_rate": 4.519659886562635e-05, "loss": 0.4059, "num_input_tokens_seen": 32845440, "step": 34395 }, { "epoch": 2.8061016396117138, "grad_norm": 0.4913007915019989, "learning_rate": 4.5194500930757874e-05, "loss": 0.3628, "num_input_tokens_seen": 32850816, "step": 34400 }, { "epoch": 2.806509503222123, "grad_norm": 0.45573729276657104, "learning_rate": 4.519240258655544e-05, "loss": 0.3446, "num_input_tokens_seen": 32854816, "step": 34405 }, { "epoch": 2.8069173668325313, "grad_norm": 0.5946027040481567, "learning_rate": 4.519030383306159e-05, "loss": 0.3725, "num_input_tokens_seen": 32860384, "step": 34410 }, { "epoch": 2.80732523044294, "grad_norm": 7.092647552490234, "learning_rate": 4.518820467031885e-05, "loss": 0.3443, "num_input_tokens_seen": 32865584, "step": 34415 }, { "epoch": 2.8077330940533485, "grad_norm": 0.9778223633766174, "learning_rate": 4.518610509836979e-05, "loss": 0.3681, "num_input_tokens_seen": 32870848, "step": 34420 }, { "epoch": 2.808140957663757, "grad_norm": 1.1967895030975342, "learning_rate": 4.518400511725695e-05, "loss": 0.2964, "num_input_tokens_seen": 32875168, "step": 34425 }, { "epoch": 2.808548821274166, "grad_norm": 0.802496075630188, "learning_rate": 4.5181904727022906e-05, "loss": 0.3179, "num_input_tokens_seen": 32879632, "step": 34430 }, { "epoch": 2.8089566848845746, "grad_norm": 0.6140921115875244, "learning_rate": 4.517980392771023e-05, "loss": 0.3121, "num_input_tokens_seen": 32883408, "step": 34435 }, { "epoch": 2.809364548494983, "grad_norm": 0.7195757627487183, "learning_rate": 4.5177702719361506e-05, "loss": 0.3714, "num_input_tokens_seen": 32888160, "step": 34440 }, { "epoch": 2.809772412105392, "grad_norm": 1.3195359706878662, "learning_rate": 4.5175601102019316e-05, "loss": 0.3133, "num_input_tokens_seen": 32892992, "step": 34445 }, { "epoch": 2.8101802757158008, "grad_norm": 0.931793212890625, "learning_rate": 4.5173499075726264e-05, "loss": 0.2215, "num_input_tokens_seen": 32897760, "step": 34450 }, { "epoch": 2.8105881393262093, "grad_norm": 2.3859994411468506, "learning_rate": 4.517139664052497e-05, "loss": 0.4314, "num_input_tokens_seen": 32901264, "step": 34455 }, { "epoch": 2.810996002936618, "grad_norm": 1.0416454076766968, "learning_rate": 4.516929379645802e-05, "loss": 0.4006, "num_input_tokens_seen": 32905776, "step": 34460 }, { "epoch": 2.8114038665470265, "grad_norm": 1.3130037784576416, "learning_rate": 4.516719054356807e-05, "loss": 0.3245, "num_input_tokens_seen": 32910320, "step": 34465 }, { "epoch": 2.8118117301574355, "grad_norm": 2.0010244846343994, "learning_rate": 4.516508688189773e-05, "loss": 0.415, "num_input_tokens_seen": 32914752, "step": 34470 }, { "epoch": 2.812219593767844, "grad_norm": 1.489691972732544, "learning_rate": 4.5162982811489654e-05, "loss": 0.357, "num_input_tokens_seen": 32919424, "step": 34475 }, { "epoch": 2.8126274573782526, "grad_norm": 0.9858267903327942, "learning_rate": 4.516087833238647e-05, "loss": 0.3189, "num_input_tokens_seen": 32924640, "step": 34480 }, { "epoch": 2.8130353209886616, "grad_norm": 1.174626350402832, "learning_rate": 4.515877344463086e-05, "loss": 0.341, "num_input_tokens_seen": 32929472, "step": 34485 }, { "epoch": 2.81344318459907, "grad_norm": 1.1766605377197266, "learning_rate": 4.515666814826548e-05, "loss": 0.3403, "num_input_tokens_seen": 32935424, "step": 34490 }, { "epoch": 2.8138510482094787, "grad_norm": 0.40838587284088135, "learning_rate": 4.5154562443333e-05, "loss": 0.3456, "num_input_tokens_seen": 32941024, "step": 34495 }, { "epoch": 2.8142589118198873, "grad_norm": 0.5090933442115784, "learning_rate": 4.51524563298761e-05, "loss": 0.3837, "num_input_tokens_seen": 32946256, "step": 34500 }, { "epoch": 2.814666775430296, "grad_norm": 1.529555082321167, "learning_rate": 4.515034980793748e-05, "loss": 0.398, "num_input_tokens_seen": 32950784, "step": 34505 }, { "epoch": 2.815074639040705, "grad_norm": 0.5207375884056091, "learning_rate": 4.514824287755982e-05, "loss": 0.3625, "num_input_tokens_seen": 32955792, "step": 34510 }, { "epoch": 2.8154825026511134, "grad_norm": 0.7466709017753601, "learning_rate": 4.514613553878585e-05, "loss": 0.3625, "num_input_tokens_seen": 32960448, "step": 34515 }, { "epoch": 2.815890366261522, "grad_norm": 1.306535005569458, "learning_rate": 4.5144027791658264e-05, "loss": 0.3377, "num_input_tokens_seen": 32965456, "step": 34520 }, { "epoch": 2.816298229871931, "grad_norm": 1.1775639057159424, "learning_rate": 4.51419196362198e-05, "loss": 0.3739, "num_input_tokens_seen": 32970416, "step": 34525 }, { "epoch": 2.8167060934823396, "grad_norm": 0.39893633127212524, "learning_rate": 4.5139811072513184e-05, "loss": 0.3622, "num_input_tokens_seen": 32975520, "step": 34530 }, { "epoch": 2.817113957092748, "grad_norm": 0.8973478674888611, "learning_rate": 4.513770210058115e-05, "loss": 0.3643, "num_input_tokens_seen": 32980672, "step": 34535 }, { "epoch": 2.8175218207031567, "grad_norm": 0.8827658295631409, "learning_rate": 4.513559272046646e-05, "loss": 0.2935, "num_input_tokens_seen": 32985472, "step": 34540 }, { "epoch": 2.8179296843135653, "grad_norm": 0.9549006223678589, "learning_rate": 4.5133482932211856e-05, "loss": 0.3482, "num_input_tokens_seen": 32990624, "step": 34545 }, { "epoch": 2.8183375479239743, "grad_norm": 0.566764771938324, "learning_rate": 4.513137273586011e-05, "loss": 0.4712, "num_input_tokens_seen": 32995200, "step": 34550 }, { "epoch": 2.818745411534383, "grad_norm": 0.8070182800292969, "learning_rate": 4.512926213145399e-05, "loss": 0.3173, "num_input_tokens_seen": 33000736, "step": 34555 }, { "epoch": 2.8191532751447914, "grad_norm": 1.0839428901672363, "learning_rate": 4.512715111903628e-05, "loss": 0.4042, "num_input_tokens_seen": 33005808, "step": 34560 }, { "epoch": 2.8195611387552004, "grad_norm": 0.8949145674705505, "learning_rate": 4.5125039698649776e-05, "loss": 0.3531, "num_input_tokens_seen": 33010864, "step": 34565 }, { "epoch": 2.819969002365609, "grad_norm": 0.47019484639167786, "learning_rate": 4.5122927870337265e-05, "loss": 0.346, "num_input_tokens_seen": 33015728, "step": 34570 }, { "epoch": 2.8203768659760176, "grad_norm": 0.5491302013397217, "learning_rate": 4.5120815634141554e-05, "loss": 0.3289, "num_input_tokens_seen": 33020784, "step": 34575 }, { "epoch": 2.8207847295864266, "grad_norm": 0.3830502927303314, "learning_rate": 4.511870299010546e-05, "loss": 0.3937, "num_input_tokens_seen": 33025984, "step": 34580 }, { "epoch": 2.821192593196835, "grad_norm": 0.5543428063392639, "learning_rate": 4.511658993827182e-05, "loss": 0.3379, "num_input_tokens_seen": 33030960, "step": 34585 }, { "epoch": 2.8216004568072437, "grad_norm": 0.3013485372066498, "learning_rate": 4.511447647868343e-05, "loss": 0.3466, "num_input_tokens_seen": 33036064, "step": 34590 }, { "epoch": 2.8220083204176523, "grad_norm": 0.9196423888206482, "learning_rate": 4.511236261138316e-05, "loss": 0.333, "num_input_tokens_seen": 33041168, "step": 34595 }, { "epoch": 2.822416184028061, "grad_norm": 0.2867053151130676, "learning_rate": 4.511024833641384e-05, "loss": 0.3309, "num_input_tokens_seen": 33045808, "step": 34600 }, { "epoch": 2.82282404763847, "grad_norm": 0.39258015155792236, "learning_rate": 4.5108133653818335e-05, "loss": 0.3449, "num_input_tokens_seen": 33051008, "step": 34605 }, { "epoch": 2.8232319112488784, "grad_norm": 0.4051036536693573, "learning_rate": 4.510601856363951e-05, "loss": 0.3496, "num_input_tokens_seen": 33055712, "step": 34610 }, { "epoch": 2.823639774859287, "grad_norm": 1.2937361001968384, "learning_rate": 4.510390306592022e-05, "loss": 0.3532, "num_input_tokens_seen": 33060048, "step": 34615 }, { "epoch": 2.824047638469696, "grad_norm": 1.172669768333435, "learning_rate": 4.510178716070337e-05, "loss": 0.3543, "num_input_tokens_seen": 33065328, "step": 34620 }, { "epoch": 2.8244555020801045, "grad_norm": 1.310756802558899, "learning_rate": 4.5099670848031825e-05, "loss": 0.3503, "num_input_tokens_seen": 33069824, "step": 34625 }, { "epoch": 2.824863365690513, "grad_norm": 0.45315006375312805, "learning_rate": 4.50975541279485e-05, "loss": 0.421, "num_input_tokens_seen": 33074048, "step": 34630 }, { "epoch": 2.8252712293009217, "grad_norm": 0.2827436327934265, "learning_rate": 4.50954370004963e-05, "loss": 0.412, "num_input_tokens_seen": 33079552, "step": 34635 }, { "epoch": 2.8256790929113302, "grad_norm": 1.2123298645019531, "learning_rate": 4.509331946571812e-05, "loss": 0.3609, "num_input_tokens_seen": 33084160, "step": 34640 }, { "epoch": 2.8260869565217392, "grad_norm": 0.3821793496608734, "learning_rate": 4.509120152365689e-05, "loss": 0.3469, "num_input_tokens_seen": 33089344, "step": 34645 }, { "epoch": 2.826494820132148, "grad_norm": 0.5446648001670837, "learning_rate": 4.5089083174355554e-05, "loss": 0.3298, "num_input_tokens_seen": 33094112, "step": 34650 }, { "epoch": 2.8269026837425564, "grad_norm": 0.6811082363128662, "learning_rate": 4.5086964417857025e-05, "loss": 0.3198, "num_input_tokens_seen": 33099456, "step": 34655 }, { "epoch": 2.8273105473529654, "grad_norm": 0.6423919796943665, "learning_rate": 4.5084845254204266e-05, "loss": 0.4159, "num_input_tokens_seen": 33104800, "step": 34660 }, { "epoch": 2.827718410963374, "grad_norm": 0.6700151562690735, "learning_rate": 4.508272568344023e-05, "loss": 0.3394, "num_input_tokens_seen": 33110304, "step": 34665 }, { "epoch": 2.8281262745737825, "grad_norm": 0.6301929950714111, "learning_rate": 4.5080605705607885e-05, "loss": 0.3789, "num_input_tokens_seen": 33115248, "step": 34670 }, { "epoch": 2.828534138184191, "grad_norm": 1.407396912574768, "learning_rate": 4.507848532075019e-05, "loss": 0.285, "num_input_tokens_seen": 33119472, "step": 34675 }, { "epoch": 2.8289420017945996, "grad_norm": 1.6928094625473022, "learning_rate": 4.507636452891013e-05, "loss": 0.376, "num_input_tokens_seen": 33124000, "step": 34680 }, { "epoch": 2.8293498654050087, "grad_norm": 0.7140453457832336, "learning_rate": 4.507424333013069e-05, "loss": 0.3076, "num_input_tokens_seen": 33128960, "step": 34685 }, { "epoch": 2.8297577290154172, "grad_norm": 0.8094459772109985, "learning_rate": 4.507212172445487e-05, "loss": 0.3142, "num_input_tokens_seen": 33133456, "step": 34690 }, { "epoch": 2.830165592625826, "grad_norm": 1.0364874601364136, "learning_rate": 4.506999971192567e-05, "loss": 0.3756, "num_input_tokens_seen": 33138272, "step": 34695 }, { "epoch": 2.830573456236235, "grad_norm": 0.7682612538337708, "learning_rate": 4.5067877292586105e-05, "loss": 0.3492, "num_input_tokens_seen": 33143344, "step": 34700 }, { "epoch": 2.8309813198466434, "grad_norm": 0.7360198497772217, "learning_rate": 4.506575446647919e-05, "loss": 0.2987, "num_input_tokens_seen": 33147088, "step": 34705 }, { "epoch": 2.831389183457052, "grad_norm": 0.98378986120224, "learning_rate": 4.5063631233647965e-05, "loss": 0.3534, "num_input_tokens_seen": 33152128, "step": 34710 }, { "epoch": 2.8317970470674605, "grad_norm": 0.9105497598648071, "learning_rate": 4.506150759413546e-05, "loss": 0.2873, "num_input_tokens_seen": 33156560, "step": 34715 }, { "epoch": 2.832204910677869, "grad_norm": 0.9053201675415039, "learning_rate": 4.5059383547984724e-05, "loss": 0.315, "num_input_tokens_seen": 33160960, "step": 34720 }, { "epoch": 2.832612774288278, "grad_norm": 1.8968027830123901, "learning_rate": 4.50572590952388e-05, "loss": 0.5295, "num_input_tokens_seen": 33165120, "step": 34725 }, { "epoch": 2.8330206378986866, "grad_norm": 0.8926326632499695, "learning_rate": 4.505513423594076e-05, "loss": 0.3166, "num_input_tokens_seen": 33170064, "step": 34730 }, { "epoch": 2.833428501509095, "grad_norm": 0.2952360212802887, "learning_rate": 4.505300897013367e-05, "loss": 0.3422, "num_input_tokens_seen": 33174992, "step": 34735 }, { "epoch": 2.833836365119504, "grad_norm": 0.7437503337860107, "learning_rate": 4.505088329786061e-05, "loss": 0.2837, "num_input_tokens_seen": 33179616, "step": 34740 }, { "epoch": 2.8342442287299128, "grad_norm": 0.97618168592453, "learning_rate": 4.504875721916466e-05, "loss": 0.3525, "num_input_tokens_seen": 33184064, "step": 34745 }, { "epoch": 2.8346520923403213, "grad_norm": 0.9805252552032471, "learning_rate": 4.504663073408893e-05, "loss": 0.311, "num_input_tokens_seen": 33188000, "step": 34750 }, { "epoch": 2.8350599559507303, "grad_norm": 0.8901007175445557, "learning_rate": 4.504450384267651e-05, "loss": 0.4334, "num_input_tokens_seen": 33193136, "step": 34755 }, { "epoch": 2.835467819561139, "grad_norm": 0.5434440970420837, "learning_rate": 4.504237654497051e-05, "loss": 0.3787, "num_input_tokens_seen": 33197888, "step": 34760 }, { "epoch": 2.8358756831715475, "grad_norm": 0.3006003201007843, "learning_rate": 4.504024884101406e-05, "loss": 0.3612, "num_input_tokens_seen": 33202368, "step": 34765 }, { "epoch": 2.836283546781956, "grad_norm": 0.20776984095573425, "learning_rate": 4.503812073085028e-05, "loss": 0.3699, "num_input_tokens_seen": 33206896, "step": 34770 }, { "epoch": 2.8366914103923646, "grad_norm": 0.7560240030288696, "learning_rate": 4.503599221452231e-05, "loss": 0.3468, "num_input_tokens_seen": 33212176, "step": 34775 }, { "epoch": 2.8370992740027736, "grad_norm": 0.6734963059425354, "learning_rate": 4.5033863292073286e-05, "loss": 0.3738, "num_input_tokens_seen": 33217536, "step": 34780 }, { "epoch": 2.837507137613182, "grad_norm": 0.7174279689788818, "learning_rate": 4.503173396354637e-05, "loss": 0.4081, "num_input_tokens_seen": 33222256, "step": 34785 }, { "epoch": 2.8379150012235907, "grad_norm": 0.6015508770942688, "learning_rate": 4.502960422898471e-05, "loss": 0.3586, "num_input_tokens_seen": 33227600, "step": 34790 }, { "epoch": 2.8383228648339998, "grad_norm": 4.14130163192749, "learning_rate": 4.502747408843149e-05, "loss": 0.3537, "num_input_tokens_seen": 33232688, "step": 34795 }, { "epoch": 2.8387307284444083, "grad_norm": 1.290551781654358, "learning_rate": 4.5025343541929886e-05, "loss": 0.365, "num_input_tokens_seen": 33237104, "step": 34800 }, { "epoch": 2.839138592054817, "grad_norm": 0.9543291330337524, "learning_rate": 4.502321258952307e-05, "loss": 0.3574, "num_input_tokens_seen": 33242704, "step": 34805 }, { "epoch": 2.8395464556652255, "grad_norm": 0.9914106726646423, "learning_rate": 4.502108123125425e-05, "loss": 0.3143, "num_input_tokens_seen": 33247056, "step": 34810 }, { "epoch": 2.839954319275634, "grad_norm": 0.8997746706008911, "learning_rate": 4.501894946716661e-05, "loss": 0.2811, "num_input_tokens_seen": 33252272, "step": 34815 }, { "epoch": 2.840362182886043, "grad_norm": 0.6872542500495911, "learning_rate": 4.501681729730338e-05, "loss": 0.3886, "num_input_tokens_seen": 33256624, "step": 34820 }, { "epoch": 2.8407700464964516, "grad_norm": 0.44413474202156067, "learning_rate": 4.5014684721707766e-05, "loss": 0.3917, "num_input_tokens_seen": 33261904, "step": 34825 }, { "epoch": 2.84117791010686, "grad_norm": 0.9028361439704895, "learning_rate": 4.5012551740423e-05, "loss": 0.4075, "num_input_tokens_seen": 33266352, "step": 34830 }, { "epoch": 2.841585773717269, "grad_norm": 0.4320235848426819, "learning_rate": 4.501041835349231e-05, "loss": 0.3383, "num_input_tokens_seen": 33270400, "step": 34835 }, { "epoch": 2.8419936373276777, "grad_norm": 0.30184057354927063, "learning_rate": 4.5008284560958944e-05, "loss": 0.3625, "num_input_tokens_seen": 33275216, "step": 34840 }, { "epoch": 2.8424015009380863, "grad_norm": 0.8263880610466003, "learning_rate": 4.500615036286615e-05, "loss": 0.3334, "num_input_tokens_seen": 33279568, "step": 34845 }, { "epoch": 2.842809364548495, "grad_norm": 0.9950956106185913, "learning_rate": 4.500401575925719e-05, "loss": 0.3659, "num_input_tokens_seen": 33284672, "step": 34850 }, { "epoch": 2.8432172281589034, "grad_norm": 1.2495442628860474, "learning_rate": 4.500188075017534e-05, "loss": 0.38, "num_input_tokens_seen": 33290032, "step": 34855 }, { "epoch": 2.8436250917693124, "grad_norm": 1.0110886096954346, "learning_rate": 4.4999745335663855e-05, "loss": 0.3442, "num_input_tokens_seen": 33294816, "step": 34860 }, { "epoch": 2.844032955379721, "grad_norm": 1.1031385660171509, "learning_rate": 4.4997609515766035e-05, "loss": 0.3314, "num_input_tokens_seen": 33300320, "step": 34865 }, { "epoch": 2.8444408189901296, "grad_norm": 0.4262484014034271, "learning_rate": 4.499547329052517e-05, "loss": 0.3784, "num_input_tokens_seen": 33305264, "step": 34870 }, { "epoch": 2.8448486826005386, "grad_norm": 1.334958791732788, "learning_rate": 4.499333665998455e-05, "loss": 0.3599, "num_input_tokens_seen": 33309696, "step": 34875 }, { "epoch": 2.845256546210947, "grad_norm": 0.1752273589372635, "learning_rate": 4.4991199624187504e-05, "loss": 0.3785, "num_input_tokens_seen": 33314592, "step": 34880 }, { "epoch": 2.8456644098213557, "grad_norm": 1.4290237426757812, "learning_rate": 4.498906218317733e-05, "loss": 0.3843, "num_input_tokens_seen": 33319296, "step": 34885 }, { "epoch": 2.8460722734317643, "grad_norm": 0.8264508247375488, "learning_rate": 4.498692433699736e-05, "loss": 0.264, "num_input_tokens_seen": 33324288, "step": 34890 }, { "epoch": 2.846480137042173, "grad_norm": 1.97886323928833, "learning_rate": 4.498478608569092e-05, "loss": 0.4274, "num_input_tokens_seen": 33328768, "step": 34895 }, { "epoch": 2.846888000652582, "grad_norm": 1.807268500328064, "learning_rate": 4.4982647429301364e-05, "loss": 0.483, "num_input_tokens_seen": 33333536, "step": 34900 }, { "epoch": 2.8472958642629904, "grad_norm": 0.698943018913269, "learning_rate": 4.4980508367872036e-05, "loss": 0.388, "num_input_tokens_seen": 33338336, "step": 34905 }, { "epoch": 2.847703727873399, "grad_norm": 0.6045858263969421, "learning_rate": 4.497836890144629e-05, "loss": 0.3852, "num_input_tokens_seen": 33343856, "step": 34910 }, { "epoch": 2.848111591483808, "grad_norm": 0.2914294898509979, "learning_rate": 4.49762290300675e-05, "loss": 0.3468, "num_input_tokens_seen": 33348992, "step": 34915 }, { "epoch": 2.8485194550942166, "grad_norm": 0.8288007378578186, "learning_rate": 4.497408875377904e-05, "loss": 0.323, "num_input_tokens_seen": 33353824, "step": 34920 }, { "epoch": 2.848927318704625, "grad_norm": 0.6423128247261047, "learning_rate": 4.497194807262428e-05, "loss": 0.3283, "num_input_tokens_seen": 33358704, "step": 34925 }, { "epoch": 2.8493351823150337, "grad_norm": 1.8301239013671875, "learning_rate": 4.4969806986646624e-05, "loss": 0.472, "num_input_tokens_seen": 33363152, "step": 34930 }, { "epoch": 2.8497430459254427, "grad_norm": 0.6944483518600464, "learning_rate": 4.496766549588946e-05, "loss": 0.3381, "num_input_tokens_seen": 33368064, "step": 34935 }, { "epoch": 2.8501509095358513, "grad_norm": 0.9037209153175354, "learning_rate": 4.496552360039621e-05, "loss": 0.3843, "num_input_tokens_seen": 33372000, "step": 34940 }, { "epoch": 2.85055877314626, "grad_norm": 1.0633593797683716, "learning_rate": 4.4963381300210274e-05, "loss": 0.3534, "num_input_tokens_seen": 33376976, "step": 34945 }, { "epoch": 2.8509666367566684, "grad_norm": 0.19863823056221008, "learning_rate": 4.4961238595375084e-05, "loss": 0.3757, "num_input_tokens_seen": 33381296, "step": 34950 }, { "epoch": 2.8513745003670774, "grad_norm": 0.5552160739898682, "learning_rate": 4.495909548593407e-05, "loss": 0.3359, "num_input_tokens_seen": 33385888, "step": 34955 }, { "epoch": 2.851782363977486, "grad_norm": 0.526351273059845, "learning_rate": 4.4956951971930675e-05, "loss": 0.3376, "num_input_tokens_seen": 33390896, "step": 34960 }, { "epoch": 2.8521902275878945, "grad_norm": 1.3888256549835205, "learning_rate": 4.4954808053408335e-05, "loss": 0.3695, "num_input_tokens_seen": 33396304, "step": 34965 }, { "epoch": 2.8525980911983035, "grad_norm": 1.229526400566101, "learning_rate": 4.4952663730410516e-05, "loss": 0.3594, "num_input_tokens_seen": 33400912, "step": 34970 }, { "epoch": 2.853005954808712, "grad_norm": 0.36909663677215576, "learning_rate": 4.495051900298069e-05, "loss": 0.2953, "num_input_tokens_seen": 33405920, "step": 34975 }, { "epoch": 2.8534138184191207, "grad_norm": 0.41236451268196106, "learning_rate": 4.494837387116232e-05, "loss": 0.4062, "num_input_tokens_seen": 33410848, "step": 34980 }, { "epoch": 2.8538216820295292, "grad_norm": 1.1959350109100342, "learning_rate": 4.4946228334998884e-05, "loss": 0.367, "num_input_tokens_seen": 33415440, "step": 34985 }, { "epoch": 2.854229545639938, "grad_norm": 1.109352946281433, "learning_rate": 4.494408239453387e-05, "loss": 0.3892, "num_input_tokens_seen": 33419408, "step": 34990 }, { "epoch": 2.854637409250347, "grad_norm": 0.5289310812950134, "learning_rate": 4.494193604981079e-05, "loss": 0.3236, "num_input_tokens_seen": 33424608, "step": 34995 }, { "epoch": 2.8550452728607554, "grad_norm": 0.46008366346359253, "learning_rate": 4.493978930087314e-05, "loss": 0.3458, "num_input_tokens_seen": 33429760, "step": 35000 }, { "epoch": 2.855453136471164, "grad_norm": 0.6818809509277344, "learning_rate": 4.493764214776443e-05, "loss": 0.3339, "num_input_tokens_seen": 33434240, "step": 35005 }, { "epoch": 2.855861000081573, "grad_norm": 0.7163242101669312, "learning_rate": 4.493549459052818e-05, "loss": 0.3093, "num_input_tokens_seen": 33438272, "step": 35010 }, { "epoch": 2.8562688636919815, "grad_norm": 0.7555489540100098, "learning_rate": 4.493334662920794e-05, "loss": 0.362, "num_input_tokens_seen": 33443184, "step": 35015 }, { "epoch": 2.85667672730239, "grad_norm": 0.3634403347969055, "learning_rate": 4.493119826384722e-05, "loss": 0.3945, "num_input_tokens_seen": 33447376, "step": 35020 }, { "epoch": 2.8570845909127986, "grad_norm": 0.26931026577949524, "learning_rate": 4.492904949448959e-05, "loss": 0.357, "num_input_tokens_seen": 33452080, "step": 35025 }, { "epoch": 2.857492454523207, "grad_norm": 0.4807584881782532, "learning_rate": 4.4926900321178595e-05, "loss": 0.3635, "num_input_tokens_seen": 33457200, "step": 35030 }, { "epoch": 2.8579003181336162, "grad_norm": 1.4845926761627197, "learning_rate": 4.4924750743957796e-05, "loss": 0.3382, "num_input_tokens_seen": 33462592, "step": 35035 }, { "epoch": 2.858308181744025, "grad_norm": 1.256473422050476, "learning_rate": 4.492260076287076e-05, "loss": 0.3574, "num_input_tokens_seen": 33467488, "step": 35040 }, { "epoch": 2.8587160453544334, "grad_norm": 1.1956357955932617, "learning_rate": 4.492045037796108e-05, "loss": 0.4233, "num_input_tokens_seen": 33472512, "step": 35045 }, { "epoch": 2.8591239089648424, "grad_norm": 0.9588830471038818, "learning_rate": 4.4918299589272335e-05, "loss": 0.341, "num_input_tokens_seen": 33477840, "step": 35050 }, { "epoch": 2.859531772575251, "grad_norm": 0.560096263885498, "learning_rate": 4.4916148396848115e-05, "loss": 0.3411, "num_input_tokens_seen": 33483088, "step": 35055 }, { "epoch": 2.8599396361856595, "grad_norm": 0.8175202012062073, "learning_rate": 4.491399680073203e-05, "loss": 0.305, "num_input_tokens_seen": 33488112, "step": 35060 }, { "epoch": 2.860347499796068, "grad_norm": 0.6444391012191772, "learning_rate": 4.49118448009677e-05, "loss": 0.2405, "num_input_tokens_seen": 33493008, "step": 35065 }, { "epoch": 2.8607553634064766, "grad_norm": 0.7099287509918213, "learning_rate": 4.4909692397598735e-05, "loss": 0.3537, "num_input_tokens_seen": 33497392, "step": 35070 }, { "epoch": 2.8611632270168856, "grad_norm": 0.9328774809837341, "learning_rate": 4.490753959066876e-05, "loss": 0.3911, "num_input_tokens_seen": 33502240, "step": 35075 }, { "epoch": 2.861571090627294, "grad_norm": 0.5261471271514893, "learning_rate": 4.490538638022142e-05, "loss": 0.5662, "num_input_tokens_seen": 33507296, "step": 35080 }, { "epoch": 2.8619789542377028, "grad_norm": 0.7194706797599792, "learning_rate": 4.4903232766300355e-05, "loss": 0.2041, "num_input_tokens_seen": 33511952, "step": 35085 }, { "epoch": 2.8623868178481118, "grad_norm": 0.7702708840370178, "learning_rate": 4.490107874894922e-05, "loss": 0.3444, "num_input_tokens_seen": 33516288, "step": 35090 }, { "epoch": 2.8627946814585203, "grad_norm": 0.6239781975746155, "learning_rate": 4.489892432821168e-05, "loss": 0.3358, "num_input_tokens_seen": 33521344, "step": 35095 }, { "epoch": 2.863202545068929, "grad_norm": 0.3187815546989441, "learning_rate": 4.4896769504131386e-05, "loss": 0.3922, "num_input_tokens_seen": 33525680, "step": 35100 }, { "epoch": 2.8636104086793375, "grad_norm": 0.5701496005058289, "learning_rate": 4.489461427675204e-05, "loss": 0.3388, "num_input_tokens_seen": 33530848, "step": 35105 }, { "epoch": 2.864018272289746, "grad_norm": 0.915850818157196, "learning_rate": 4.489245864611732e-05, "loss": 0.3192, "num_input_tokens_seen": 33536080, "step": 35110 }, { "epoch": 2.864426135900155, "grad_norm": 0.4962683320045471, "learning_rate": 4.4890302612270907e-05, "loss": 0.3559, "num_input_tokens_seen": 33540560, "step": 35115 }, { "epoch": 2.8648339995105636, "grad_norm": 0.8518710732460022, "learning_rate": 4.488814617525651e-05, "loss": 0.3616, "num_input_tokens_seen": 33545248, "step": 35120 }, { "epoch": 2.865241863120972, "grad_norm": 0.3497065305709839, "learning_rate": 4.488598933511785e-05, "loss": 0.3202, "num_input_tokens_seen": 33550160, "step": 35125 }, { "epoch": 2.865649726731381, "grad_norm": 0.3651590943336487, "learning_rate": 4.488383209189863e-05, "loss": 0.355, "num_input_tokens_seen": 33554848, "step": 35130 }, { "epoch": 2.8660575903417898, "grad_norm": 0.8111446499824524, "learning_rate": 4.488167444564258e-05, "loss": 0.2815, "num_input_tokens_seen": 33560256, "step": 35135 }, { "epoch": 2.8664654539521983, "grad_norm": 0.7495055198669434, "learning_rate": 4.487951639639345e-05, "loss": 0.4454, "num_input_tokens_seen": 33564928, "step": 35140 }, { "epoch": 2.8668733175626073, "grad_norm": 0.7831985354423523, "learning_rate": 4.487735794419496e-05, "loss": 0.325, "num_input_tokens_seen": 33569760, "step": 35145 }, { "epoch": 2.867281181173016, "grad_norm": 0.7535200119018555, "learning_rate": 4.487519908909087e-05, "loss": 0.421, "num_input_tokens_seen": 33574800, "step": 35150 }, { "epoch": 2.8676890447834245, "grad_norm": 0.2866382598876953, "learning_rate": 4.487303983112494e-05, "loss": 0.3432, "num_input_tokens_seen": 33579920, "step": 35155 }, { "epoch": 2.868096908393833, "grad_norm": 1.0574692487716675, "learning_rate": 4.4870880170340944e-05, "loss": 0.3542, "num_input_tokens_seen": 33584992, "step": 35160 }, { "epoch": 2.8685047720042416, "grad_norm": 0.9700818061828613, "learning_rate": 4.4868720106782646e-05, "loss": 0.3434, "num_input_tokens_seen": 33590208, "step": 35165 }, { "epoch": 2.8689126356146506, "grad_norm": 1.1820213794708252, "learning_rate": 4.486655964049383e-05, "loss": 0.4217, "num_input_tokens_seen": 33594416, "step": 35170 }, { "epoch": 2.869320499225059, "grad_norm": 1.0284816026687622, "learning_rate": 4.48643987715183e-05, "loss": 0.3968, "num_input_tokens_seen": 33599408, "step": 35175 }, { "epoch": 2.8697283628354677, "grad_norm": 0.786132276058197, "learning_rate": 4.486223749989985e-05, "loss": 0.3254, "num_input_tokens_seen": 33604496, "step": 35180 }, { "epoch": 2.8701362264458767, "grad_norm": 0.43018046021461487, "learning_rate": 4.486007582568228e-05, "loss": 0.4734, "num_input_tokens_seen": 33609104, "step": 35185 }, { "epoch": 2.8705440900562853, "grad_norm": 0.6059141159057617, "learning_rate": 4.485791374890941e-05, "loss": 0.4033, "num_input_tokens_seen": 33614352, "step": 35190 }, { "epoch": 2.870951953666694, "grad_norm": 1.11985445022583, "learning_rate": 4.4855751269625066e-05, "loss": 0.3627, "num_input_tokens_seen": 33619136, "step": 35195 }, { "epoch": 2.8713598172771024, "grad_norm": 0.26515868306159973, "learning_rate": 4.4853588387873084e-05, "loss": 0.3509, "num_input_tokens_seen": 33625248, "step": 35200 }, { "epoch": 2.871767680887511, "grad_norm": 1.057561993598938, "learning_rate": 4.4851425103697294e-05, "loss": 0.3402, "num_input_tokens_seen": 33630496, "step": 35205 }, { "epoch": 2.87217554449792, "grad_norm": 0.9548348784446716, "learning_rate": 4.484926141714157e-05, "loss": 0.3494, "num_input_tokens_seen": 33634768, "step": 35210 }, { "epoch": 2.8725834081083286, "grad_norm": 0.880047619342804, "learning_rate": 4.484709732824974e-05, "loss": 0.361, "num_input_tokens_seen": 33640256, "step": 35215 }, { "epoch": 2.872991271718737, "grad_norm": 0.4630119502544403, "learning_rate": 4.4844932837065676e-05, "loss": 0.3026, "num_input_tokens_seen": 33644896, "step": 35220 }, { "epoch": 2.873399135329146, "grad_norm": 0.681430995464325, "learning_rate": 4.484276794363327e-05, "loss": 0.3129, "num_input_tokens_seen": 33649872, "step": 35225 }, { "epoch": 2.8738069989395547, "grad_norm": 0.670143187046051, "learning_rate": 4.484060264799638e-05, "loss": 0.4188, "num_input_tokens_seen": 33654848, "step": 35230 }, { "epoch": 2.8742148625499633, "grad_norm": 1.4695078134536743, "learning_rate": 4.48384369501989e-05, "loss": 0.4936, "num_input_tokens_seen": 33659952, "step": 35235 }, { "epoch": 2.874622726160372, "grad_norm": 0.3001847267150879, "learning_rate": 4.483627085028475e-05, "loss": 0.3138, "num_input_tokens_seen": 33665264, "step": 35240 }, { "epoch": 2.8750305897707804, "grad_norm": 1.1736780405044556, "learning_rate": 4.48341043482978e-05, "loss": 0.3883, "num_input_tokens_seen": 33670256, "step": 35245 }, { "epoch": 2.8754384533811894, "grad_norm": 0.20925647020339966, "learning_rate": 4.4831937444281996e-05, "loss": 0.3431, "num_input_tokens_seen": 33674736, "step": 35250 }, { "epoch": 2.875846316991598, "grad_norm": 0.8770922422409058, "learning_rate": 4.4829770138281246e-05, "loss": 0.3507, "num_input_tokens_seen": 33679040, "step": 35255 }, { "epoch": 2.8762541806020065, "grad_norm": 1.22269606590271, "learning_rate": 4.4827602430339476e-05, "loss": 0.366, "num_input_tokens_seen": 33684384, "step": 35260 }, { "epoch": 2.8766620442124156, "grad_norm": 0.7934519648551941, "learning_rate": 4.4825434320500624e-05, "loss": 0.2994, "num_input_tokens_seen": 33689104, "step": 35265 }, { "epoch": 2.877069907822824, "grad_norm": 1.52285635471344, "learning_rate": 4.4823265808808653e-05, "loss": 0.4253, "num_input_tokens_seen": 33693696, "step": 35270 }, { "epoch": 2.8774777714332327, "grad_norm": 0.4296721816062927, "learning_rate": 4.4821096895307516e-05, "loss": 0.3137, "num_input_tokens_seen": 33698528, "step": 35275 }, { "epoch": 2.8778856350436413, "grad_norm": 0.35303348302841187, "learning_rate": 4.481892758004115e-05, "loss": 0.3455, "num_input_tokens_seen": 33702704, "step": 35280 }, { "epoch": 2.87829349865405, "grad_norm": 1.2797067165374756, "learning_rate": 4.481675786305355e-05, "loss": 0.4212, "num_input_tokens_seen": 33706768, "step": 35285 }, { "epoch": 2.878701362264459, "grad_norm": 0.9403887391090393, "learning_rate": 4.481458774438869e-05, "loss": 0.3329, "num_input_tokens_seen": 33711424, "step": 35290 }, { "epoch": 2.8791092258748674, "grad_norm": 1.261725664138794, "learning_rate": 4.481241722409055e-05, "loss": 0.3452, "num_input_tokens_seen": 33715056, "step": 35295 }, { "epoch": 2.879517089485276, "grad_norm": 0.35032710433006287, "learning_rate": 4.481024630220314e-05, "loss": 0.3426, "num_input_tokens_seen": 33719184, "step": 35300 }, { "epoch": 2.879924953095685, "grad_norm": 0.25341537594795227, "learning_rate": 4.4808074978770455e-05, "loss": 0.4129, "num_input_tokens_seen": 33724336, "step": 35305 }, { "epoch": 2.8803328167060935, "grad_norm": 1.1509419679641724, "learning_rate": 4.48059032538365e-05, "loss": 0.3283, "num_input_tokens_seen": 33728944, "step": 35310 }, { "epoch": 2.880740680316502, "grad_norm": 0.9911386370658875, "learning_rate": 4.48037311274453e-05, "loss": 0.3596, "num_input_tokens_seen": 33733664, "step": 35315 }, { "epoch": 2.881148543926911, "grad_norm": 1.057684063911438, "learning_rate": 4.4801558599640895e-05, "loss": 0.35, "num_input_tokens_seen": 33738384, "step": 35320 }, { "epoch": 2.8815564075373197, "grad_norm": 1.3202934265136719, "learning_rate": 4.4799385670467305e-05, "loss": 0.3634, "num_input_tokens_seen": 33742672, "step": 35325 }, { "epoch": 2.8819642711477282, "grad_norm": 0.37317514419555664, "learning_rate": 4.4797212339968584e-05, "loss": 0.4235, "num_input_tokens_seen": 33747584, "step": 35330 }, { "epoch": 2.882372134758137, "grad_norm": 0.21426498889923096, "learning_rate": 4.479503860818879e-05, "loss": 0.3699, "num_input_tokens_seen": 33752560, "step": 35335 }, { "epoch": 2.8827799983685454, "grad_norm": 1.1327602863311768, "learning_rate": 4.4792864475171955e-05, "loss": 0.3587, "num_input_tokens_seen": 33757232, "step": 35340 }, { "epoch": 2.8831878619789544, "grad_norm": 1.0540653467178345, "learning_rate": 4.479068994096218e-05, "loss": 0.3284, "num_input_tokens_seen": 33762608, "step": 35345 }, { "epoch": 2.883595725589363, "grad_norm": 0.49736735224723816, "learning_rate": 4.478851500560353e-05, "loss": 0.3208, "num_input_tokens_seen": 33767744, "step": 35350 }, { "epoch": 2.8840035891997715, "grad_norm": 0.4840952455997467, "learning_rate": 4.4786339669140086e-05, "loss": 0.3876, "num_input_tokens_seen": 33772448, "step": 35355 }, { "epoch": 2.8844114528101805, "grad_norm": 0.4835358262062073, "learning_rate": 4.478416393161595e-05, "loss": 0.3047, "num_input_tokens_seen": 33777632, "step": 35360 }, { "epoch": 2.884819316420589, "grad_norm": 0.7185850143432617, "learning_rate": 4.478198779307521e-05, "loss": 0.2721, "num_input_tokens_seen": 33781840, "step": 35365 }, { "epoch": 2.8852271800309977, "grad_norm": 0.7739556431770325, "learning_rate": 4.477981125356199e-05, "loss": 0.3419, "num_input_tokens_seen": 33786512, "step": 35370 }, { "epoch": 2.885635043641406, "grad_norm": 0.6990134119987488, "learning_rate": 4.4777634313120396e-05, "loss": 0.3279, "num_input_tokens_seen": 33791264, "step": 35375 }, { "epoch": 2.886042907251815, "grad_norm": 0.564548909664154, "learning_rate": 4.477545697179456e-05, "loss": 0.3463, "num_input_tokens_seen": 33796032, "step": 35380 }, { "epoch": 2.886450770862224, "grad_norm": 0.6572219133377075, "learning_rate": 4.4773279229628616e-05, "loss": 0.3712, "num_input_tokens_seen": 33801248, "step": 35385 }, { "epoch": 2.8868586344726324, "grad_norm": 0.6650383472442627, "learning_rate": 4.47711010866667e-05, "loss": 0.3502, "num_input_tokens_seen": 33806224, "step": 35390 }, { "epoch": 2.887266498083041, "grad_norm": 0.7473714351654053, "learning_rate": 4.4768922542952975e-05, "loss": 0.392, "num_input_tokens_seen": 33810880, "step": 35395 }, { "epoch": 2.88767436169345, "grad_norm": 0.8477562665939331, "learning_rate": 4.476674359853158e-05, "loss": 0.3674, "num_input_tokens_seen": 33815632, "step": 35400 }, { "epoch": 2.8880822253038585, "grad_norm": 0.5891767144203186, "learning_rate": 4.4764564253446694e-05, "loss": 0.3665, "num_input_tokens_seen": 33821120, "step": 35405 }, { "epoch": 2.888490088914267, "grad_norm": 1.116187572479248, "learning_rate": 4.4762384507742496e-05, "loss": 0.3409, "num_input_tokens_seen": 33825296, "step": 35410 }, { "epoch": 2.8888979525246756, "grad_norm": 1.1436882019042969, "learning_rate": 4.4760204361463155e-05, "loss": 0.4169, "num_input_tokens_seen": 33830416, "step": 35415 }, { "epoch": 2.889305816135084, "grad_norm": 0.8369410037994385, "learning_rate": 4.4758023814652874e-05, "loss": 0.3431, "num_input_tokens_seen": 33835024, "step": 35420 }, { "epoch": 2.889713679745493, "grad_norm": 1.4930814504623413, "learning_rate": 4.475584286735583e-05, "loss": 0.3763, "num_input_tokens_seen": 33839856, "step": 35425 }, { "epoch": 2.8901215433559018, "grad_norm": 1.1001412868499756, "learning_rate": 4.475366151961626e-05, "loss": 0.3621, "num_input_tokens_seen": 33843984, "step": 35430 }, { "epoch": 2.8905294069663103, "grad_norm": 0.9273344278335571, "learning_rate": 4.475147977147836e-05, "loss": 0.3817, "num_input_tokens_seen": 33848960, "step": 35435 }, { "epoch": 2.8909372705767193, "grad_norm": 1.107460379600525, "learning_rate": 4.4749297622986363e-05, "loss": 0.3768, "num_input_tokens_seen": 33853984, "step": 35440 }, { "epoch": 2.891345134187128, "grad_norm": 0.7787270545959473, "learning_rate": 4.47471150741845e-05, "loss": 0.3129, "num_input_tokens_seen": 33859616, "step": 35445 }, { "epoch": 2.8917529977975365, "grad_norm": 0.6520934104919434, "learning_rate": 4.474493212511699e-05, "loss": 0.3637, "num_input_tokens_seen": 33864832, "step": 35450 }, { "epoch": 2.892160861407945, "grad_norm": 0.7505086064338684, "learning_rate": 4.474274877582811e-05, "loss": 0.3637, "num_input_tokens_seen": 33870544, "step": 35455 }, { "epoch": 2.8925687250183536, "grad_norm": 0.13342463970184326, "learning_rate": 4.474056502636209e-05, "loss": 0.3582, "num_input_tokens_seen": 33874768, "step": 35460 }, { "epoch": 2.8929765886287626, "grad_norm": 0.23878812789916992, "learning_rate": 4.473838087676321e-05, "loss": 0.3162, "num_input_tokens_seen": 33879312, "step": 35465 }, { "epoch": 2.893384452239171, "grad_norm": 0.15071243047714233, "learning_rate": 4.4736196327075745e-05, "loss": 0.2929, "num_input_tokens_seen": 33884336, "step": 35470 }, { "epoch": 2.8937923158495797, "grad_norm": 0.31470900774002075, "learning_rate": 4.4734011377343955e-05, "loss": 0.3342, "num_input_tokens_seen": 33889216, "step": 35475 }, { "epoch": 2.8942001794599888, "grad_norm": 0.721883237361908, "learning_rate": 4.4731826027612144e-05, "loss": 0.1919, "num_input_tokens_seen": 33893888, "step": 35480 }, { "epoch": 2.8946080430703973, "grad_norm": 1.1591784954071045, "learning_rate": 4.4729640277924604e-05, "loss": 0.352, "num_input_tokens_seen": 33898928, "step": 35485 }, { "epoch": 2.895015906680806, "grad_norm": 2.3678200244903564, "learning_rate": 4.472745412832564e-05, "loss": 0.6053, "num_input_tokens_seen": 33903392, "step": 35490 }, { "epoch": 2.895423770291215, "grad_norm": 0.4857066869735718, "learning_rate": 4.4725267578859565e-05, "loss": 0.4942, "num_input_tokens_seen": 33907856, "step": 35495 }, { "epoch": 2.8958316339016235, "grad_norm": 1.0541852712631226, "learning_rate": 4.472308062957069e-05, "loss": 0.3618, "num_input_tokens_seen": 33913280, "step": 35500 }, { "epoch": 2.896239497512032, "grad_norm": 1.0626654624938965, "learning_rate": 4.4720893280503365e-05, "loss": 0.3781, "num_input_tokens_seen": 33917936, "step": 35505 }, { "epoch": 2.8966473611224406, "grad_norm": 1.033247470855713, "learning_rate": 4.4718705531701896e-05, "loss": 0.3958, "num_input_tokens_seen": 33922880, "step": 35510 }, { "epoch": 2.897055224732849, "grad_norm": 0.9290786385536194, "learning_rate": 4.471651738321066e-05, "loss": 0.3891, "num_input_tokens_seen": 33928048, "step": 35515 }, { "epoch": 2.897463088343258, "grad_norm": 0.7197305560112, "learning_rate": 4.471432883507399e-05, "loss": 0.265, "num_input_tokens_seen": 33933264, "step": 35520 }, { "epoch": 2.8978709519536667, "grad_norm": 0.8602463603019714, "learning_rate": 4.471213988733626e-05, "loss": 0.2603, "num_input_tokens_seen": 33936896, "step": 35525 }, { "epoch": 2.8982788155640753, "grad_norm": 0.7545309066772461, "learning_rate": 4.4709950540041814e-05, "loss": 0.5569, "num_input_tokens_seen": 33941168, "step": 35530 }, { "epoch": 2.8986866791744843, "grad_norm": 0.5725820064544678, "learning_rate": 4.4707760793235055e-05, "loss": 0.4516, "num_input_tokens_seen": 33946096, "step": 35535 }, { "epoch": 2.899094542784893, "grad_norm": 0.5615876317024231, "learning_rate": 4.4705570646960356e-05, "loss": 0.3729, "num_input_tokens_seen": 33950976, "step": 35540 }, { "epoch": 2.8995024063953014, "grad_norm": 0.31872016191482544, "learning_rate": 4.470338010126211e-05, "loss": 0.4011, "num_input_tokens_seen": 33956144, "step": 35545 }, { "epoch": 2.89991027000571, "grad_norm": 0.24972975254058838, "learning_rate": 4.470118915618473e-05, "loss": 0.3893, "num_input_tokens_seen": 33961184, "step": 35550 }, { "epoch": 2.9003181336161186, "grad_norm": 1.5920220613479614, "learning_rate": 4.469899781177261e-05, "loss": 0.3707, "num_input_tokens_seen": 33966192, "step": 35555 }, { "epoch": 2.9007259972265276, "grad_norm": 0.8731734156608582, "learning_rate": 4.469680606807018e-05, "loss": 0.3843, "num_input_tokens_seen": 33970784, "step": 35560 }, { "epoch": 2.901133860836936, "grad_norm": 0.4427167475223541, "learning_rate": 4.469461392512186e-05, "loss": 0.2929, "num_input_tokens_seen": 33976384, "step": 35565 }, { "epoch": 2.9015417244473447, "grad_norm": 0.4713517725467682, "learning_rate": 4.469242138297208e-05, "loss": 0.3489, "num_input_tokens_seen": 33981616, "step": 35570 }, { "epoch": 2.9019495880577537, "grad_norm": 1.2709338665008545, "learning_rate": 4.469022844166529e-05, "loss": 0.4727, "num_input_tokens_seen": 33986416, "step": 35575 }, { "epoch": 2.9023574516681623, "grad_norm": 1.1803193092346191, "learning_rate": 4.468803510124595e-05, "loss": 0.3599, "num_input_tokens_seen": 33991680, "step": 35580 }, { "epoch": 2.902765315278571, "grad_norm": 0.8709461688995361, "learning_rate": 4.4685841361758485e-05, "loss": 0.3673, "num_input_tokens_seen": 33996816, "step": 35585 }, { "epoch": 2.9031731788889794, "grad_norm": 0.8152548670768738, "learning_rate": 4.468364722324739e-05, "loss": 0.3457, "num_input_tokens_seen": 34001376, "step": 35590 }, { "epoch": 2.903581042499388, "grad_norm": 0.48765695095062256, "learning_rate": 4.4681452685757126e-05, "loss": 0.3842, "num_input_tokens_seen": 34006208, "step": 35595 }, { "epoch": 2.903988906109797, "grad_norm": 0.17146776616573334, "learning_rate": 4.467925774933218e-05, "loss": 0.3291, "num_input_tokens_seen": 34010288, "step": 35600 }, { "epoch": 2.9043967697202056, "grad_norm": 0.7493979334831238, "learning_rate": 4.467706241401704e-05, "loss": 0.3054, "num_input_tokens_seen": 34014768, "step": 35605 }, { "epoch": 2.904804633330614, "grad_norm": 0.4568049907684326, "learning_rate": 4.467486667985621e-05, "loss": 0.3256, "num_input_tokens_seen": 34019872, "step": 35610 }, { "epoch": 2.905212496941023, "grad_norm": 1.248075246810913, "learning_rate": 4.467267054689419e-05, "loss": 0.3891, "num_input_tokens_seen": 34024208, "step": 35615 }, { "epoch": 2.9056203605514317, "grad_norm": 0.18987886607646942, "learning_rate": 4.46704740151755e-05, "loss": 0.3509, "num_input_tokens_seen": 34028912, "step": 35620 }, { "epoch": 2.9060282241618403, "grad_norm": 0.37143129110336304, "learning_rate": 4.4668277084744655e-05, "loss": 0.3595, "num_input_tokens_seen": 34034384, "step": 35625 }, { "epoch": 2.906436087772249, "grad_norm": 0.1931600719690323, "learning_rate": 4.4666079755646195e-05, "loss": 0.3464, "num_input_tokens_seen": 34039056, "step": 35630 }, { "epoch": 2.9068439513826574, "grad_norm": 0.3113010823726654, "learning_rate": 4.466388202792465e-05, "loss": 0.3571, "num_input_tokens_seen": 34043488, "step": 35635 }, { "epoch": 2.9072518149930664, "grad_norm": 1.1824313402175903, "learning_rate": 4.466168390162458e-05, "loss": 0.3454, "num_input_tokens_seen": 34048800, "step": 35640 }, { "epoch": 2.907659678603475, "grad_norm": 1.1034694910049438, "learning_rate": 4.4659485376790525e-05, "loss": 0.3824, "num_input_tokens_seen": 34053824, "step": 35645 }, { "epoch": 2.9080675422138835, "grad_norm": 0.9584998488426208, "learning_rate": 4.465728645346706e-05, "loss": 0.3709, "num_input_tokens_seen": 34057888, "step": 35650 }, { "epoch": 2.9084754058242925, "grad_norm": 1.130475401878357, "learning_rate": 4.4655087131698746e-05, "loss": 0.3485, "num_input_tokens_seen": 34062912, "step": 35655 }, { "epoch": 2.908883269434701, "grad_norm": 0.16875462234020233, "learning_rate": 4.4652887411530177e-05, "loss": 0.3531, "num_input_tokens_seen": 34067808, "step": 35660 }, { "epoch": 2.9092911330451097, "grad_norm": 0.8519350290298462, "learning_rate": 4.465068729300592e-05, "loss": 0.3174, "num_input_tokens_seen": 34073792, "step": 35665 }, { "epoch": 2.9096989966555182, "grad_norm": 0.7562743425369263, "learning_rate": 4.46484867761706e-05, "loss": 0.2646, "num_input_tokens_seen": 34078368, "step": 35670 }, { "epoch": 2.910106860265927, "grad_norm": 0.6107056140899658, "learning_rate": 4.464628586106878e-05, "loss": 0.375, "num_input_tokens_seen": 34083056, "step": 35675 }, { "epoch": 2.910514723876336, "grad_norm": 0.30239248275756836, "learning_rate": 4.464408454774511e-05, "loss": 0.3311, "num_input_tokens_seen": 34087152, "step": 35680 }, { "epoch": 2.9109225874867444, "grad_norm": 1.3213732242584229, "learning_rate": 4.464188283624419e-05, "loss": 0.3832, "num_input_tokens_seen": 34091968, "step": 35685 }, { "epoch": 2.911330451097153, "grad_norm": 0.11280937492847443, "learning_rate": 4.463968072661064e-05, "loss": 0.4034, "num_input_tokens_seen": 34095920, "step": 35690 }, { "epoch": 2.911738314707562, "grad_norm": 0.3116047978401184, "learning_rate": 4.4637478218889116e-05, "loss": 0.293, "num_input_tokens_seen": 34101072, "step": 35695 }, { "epoch": 2.9121461783179705, "grad_norm": 0.7425588369369507, "learning_rate": 4.4635275313124256e-05, "loss": 0.3199, "num_input_tokens_seen": 34105232, "step": 35700 }, { "epoch": 2.912554041928379, "grad_norm": 0.3347518742084503, "learning_rate": 4.46330720093607e-05, "loss": 0.3091, "num_input_tokens_seen": 34110544, "step": 35705 }, { "epoch": 2.912961905538788, "grad_norm": 0.3991498053073883, "learning_rate": 4.4630868307643124e-05, "loss": 0.4522, "num_input_tokens_seen": 34115248, "step": 35710 }, { "epoch": 2.9133697691491967, "grad_norm": 0.7776288390159607, "learning_rate": 4.462866420801619e-05, "loss": 0.3373, "num_input_tokens_seen": 34120944, "step": 35715 }, { "epoch": 2.913777632759605, "grad_norm": 0.392752468585968, "learning_rate": 4.4626459710524574e-05, "loss": 0.297, "num_input_tokens_seen": 34125968, "step": 35720 }, { "epoch": 2.914185496370014, "grad_norm": 0.8067547082901001, "learning_rate": 4.462425481521295e-05, "loss": 0.3753, "num_input_tokens_seen": 34130192, "step": 35725 }, { "epoch": 2.9145933599804223, "grad_norm": 0.26558011770248413, "learning_rate": 4.462204952212603e-05, "loss": 0.3448, "num_input_tokens_seen": 34134576, "step": 35730 }, { "epoch": 2.9150012235908314, "grad_norm": 0.39131397008895874, "learning_rate": 4.461984383130849e-05, "loss": 0.4076, "num_input_tokens_seen": 34138992, "step": 35735 }, { "epoch": 2.91540908720124, "grad_norm": 0.9753053188323975, "learning_rate": 4.4617637742805075e-05, "loss": 0.3384, "num_input_tokens_seen": 34143264, "step": 35740 }, { "epoch": 2.9158169508116485, "grad_norm": 0.47761163115501404, "learning_rate": 4.461543125666046e-05, "loss": 0.3414, "num_input_tokens_seen": 34147632, "step": 35745 }, { "epoch": 2.9162248144220575, "grad_norm": 0.850382924079895, "learning_rate": 4.4613224372919395e-05, "loss": 0.3557, "num_input_tokens_seen": 34152240, "step": 35750 }, { "epoch": 2.916632678032466, "grad_norm": 0.6324601173400879, "learning_rate": 4.461101709162661e-05, "loss": 0.3081, "num_input_tokens_seen": 34156880, "step": 35755 }, { "epoch": 2.9170405416428746, "grad_norm": 0.7388135194778442, "learning_rate": 4.460880941282683e-05, "loss": 0.3138, "num_input_tokens_seen": 34160832, "step": 35760 }, { "epoch": 2.917448405253283, "grad_norm": 0.5980014801025391, "learning_rate": 4.460660133656483e-05, "loss": 0.4203, "num_input_tokens_seen": 34165616, "step": 35765 }, { "epoch": 2.9178562688636918, "grad_norm": 0.6964578032493591, "learning_rate": 4.4604392862885346e-05, "loss": 0.2033, "num_input_tokens_seen": 34170112, "step": 35770 }, { "epoch": 2.9182641324741008, "grad_norm": 0.6942575573921204, "learning_rate": 4.4602183991833146e-05, "loss": 0.4645, "num_input_tokens_seen": 34175344, "step": 35775 }, { "epoch": 2.9186719960845093, "grad_norm": 0.553982138633728, "learning_rate": 4.459997472345301e-05, "loss": 0.4168, "num_input_tokens_seen": 34180480, "step": 35780 }, { "epoch": 2.919079859694918, "grad_norm": 0.5840718746185303, "learning_rate": 4.459776505778971e-05, "loss": 0.3388, "num_input_tokens_seen": 34184992, "step": 35785 }, { "epoch": 2.919487723305327, "grad_norm": 0.4760180711746216, "learning_rate": 4.459555499488804e-05, "loss": 0.3537, "num_input_tokens_seen": 34189248, "step": 35790 }, { "epoch": 2.9198955869157355, "grad_norm": 0.2812891900539398, "learning_rate": 4.4593344534792804e-05, "loss": 0.3265, "num_input_tokens_seen": 34193616, "step": 35795 }, { "epoch": 2.920303450526144, "grad_norm": 0.5517024397850037, "learning_rate": 4.45911336775488e-05, "loss": 0.3983, "num_input_tokens_seen": 34199008, "step": 35800 }, { "epoch": 2.9207113141365526, "grad_norm": 0.3228004276752472, "learning_rate": 4.458892242320084e-05, "loss": 0.3554, "num_input_tokens_seen": 34203920, "step": 35805 }, { "epoch": 2.921119177746961, "grad_norm": 0.5102766752243042, "learning_rate": 4.458671077179374e-05, "loss": 0.414, "num_input_tokens_seen": 34208128, "step": 35810 }, { "epoch": 2.92152704135737, "grad_norm": 0.2724549472332001, "learning_rate": 4.458449872337234e-05, "loss": 0.3821, "num_input_tokens_seen": 34213520, "step": 35815 }, { "epoch": 2.9219349049677787, "grad_norm": 0.9045986533164978, "learning_rate": 4.4582286277981465e-05, "loss": 0.3505, "num_input_tokens_seen": 34218368, "step": 35820 }, { "epoch": 2.9223427685781873, "grad_norm": 0.13037002086639404, "learning_rate": 4.458007343566598e-05, "loss": 0.3555, "num_input_tokens_seen": 34222480, "step": 35825 }, { "epoch": 2.9227506321885963, "grad_norm": 0.9451642632484436, "learning_rate": 4.4577860196470725e-05, "loss": 0.3525, "num_input_tokens_seen": 34227792, "step": 35830 }, { "epoch": 2.923158495799005, "grad_norm": 0.2118653953075409, "learning_rate": 4.457564656044056e-05, "loss": 0.3433, "num_input_tokens_seen": 34232560, "step": 35835 }, { "epoch": 2.9235663594094135, "grad_norm": 0.7396962642669678, "learning_rate": 4.457343252762036e-05, "loss": 0.3411, "num_input_tokens_seen": 34237760, "step": 35840 }, { "epoch": 2.923974223019822, "grad_norm": 0.19099746644496918, "learning_rate": 4.457121809805499e-05, "loss": 0.3615, "num_input_tokens_seen": 34242288, "step": 35845 }, { "epoch": 2.9243820866302306, "grad_norm": 0.896777331829071, "learning_rate": 4.4569003271789354e-05, "loss": 0.3496, "num_input_tokens_seen": 34247216, "step": 35850 }, { "epoch": 2.9247899502406396, "grad_norm": 0.8855394124984741, "learning_rate": 4.4566788048868335e-05, "loss": 0.3331, "num_input_tokens_seen": 34252320, "step": 35855 }, { "epoch": 2.925197813851048, "grad_norm": 0.4020702540874481, "learning_rate": 4.456457242933684e-05, "loss": 0.34, "num_input_tokens_seen": 34256816, "step": 35860 }, { "epoch": 2.9256056774614567, "grad_norm": 0.39091694355010986, "learning_rate": 4.456235641323977e-05, "loss": 0.3661, "num_input_tokens_seen": 34262192, "step": 35865 }, { "epoch": 2.9260135410718657, "grad_norm": 0.7560375928878784, "learning_rate": 4.456014000062205e-05, "loss": 0.3696, "num_input_tokens_seen": 34267280, "step": 35870 }, { "epoch": 2.9264214046822743, "grad_norm": 1.0521512031555176, "learning_rate": 4.455792319152861e-05, "loss": 0.3652, "num_input_tokens_seen": 34272464, "step": 35875 }, { "epoch": 2.926829268292683, "grad_norm": 0.9567497968673706, "learning_rate": 4.455570598600437e-05, "loss": 0.3398, "num_input_tokens_seen": 34277360, "step": 35880 }, { "epoch": 2.927237131903092, "grad_norm": 0.2638896703720093, "learning_rate": 4.455348838409428e-05, "loss": 0.3412, "num_input_tokens_seen": 34282656, "step": 35885 }, { "epoch": 2.9276449955135004, "grad_norm": 0.9237986207008362, "learning_rate": 4.4551270385843294e-05, "loss": 0.3521, "num_input_tokens_seen": 34286800, "step": 35890 }, { "epoch": 2.928052859123909, "grad_norm": 0.4770899713039398, "learning_rate": 4.4549051991296356e-05, "loss": 0.349, "num_input_tokens_seen": 34291024, "step": 35895 }, { "epoch": 2.9284607227343176, "grad_norm": 0.9132611155509949, "learning_rate": 4.454683320049845e-05, "loss": 0.3325, "num_input_tokens_seen": 34296336, "step": 35900 }, { "epoch": 2.928868586344726, "grad_norm": 0.24973690509796143, "learning_rate": 4.4544614013494535e-05, "loss": 0.3363, "num_input_tokens_seen": 34301264, "step": 35905 }, { "epoch": 2.929276449955135, "grad_norm": 0.49579307436943054, "learning_rate": 4.4542394430329606e-05, "loss": 0.3876, "num_input_tokens_seen": 34306576, "step": 35910 }, { "epoch": 2.9296843135655437, "grad_norm": 1.0575847625732422, "learning_rate": 4.454017445104864e-05, "loss": 0.3454, "num_input_tokens_seen": 34311536, "step": 35915 }, { "epoch": 2.9300921771759523, "grad_norm": 0.3768783211708069, "learning_rate": 4.4537954075696645e-05, "loss": 0.3595, "num_input_tokens_seen": 34316304, "step": 35920 }, { "epoch": 2.9305000407863613, "grad_norm": 1.0054190158843994, "learning_rate": 4.4535733304318614e-05, "loss": 0.3763, "num_input_tokens_seen": 34321696, "step": 35925 }, { "epoch": 2.93090790439677, "grad_norm": 1.0382486581802368, "learning_rate": 4.453351213695958e-05, "loss": 0.3665, "num_input_tokens_seen": 34326720, "step": 35930 }, { "epoch": 2.9313157680071784, "grad_norm": 0.2351590245962143, "learning_rate": 4.4531290573664554e-05, "loss": 0.3963, "num_input_tokens_seen": 34331424, "step": 35935 }, { "epoch": 2.931723631617587, "grad_norm": 1.2639926671981812, "learning_rate": 4.452906861447857e-05, "loss": 0.3376, "num_input_tokens_seen": 34335888, "step": 35940 }, { "epoch": 2.9321314952279955, "grad_norm": 0.8320841789245605, "learning_rate": 4.4526846259446654e-05, "loss": 0.3125, "num_input_tokens_seen": 34340320, "step": 35945 }, { "epoch": 2.9325393588384046, "grad_norm": 0.8246349096298218, "learning_rate": 4.452462350861387e-05, "loss": 0.3572, "num_input_tokens_seen": 34345584, "step": 35950 }, { "epoch": 2.932947222448813, "grad_norm": 0.7328673005104065, "learning_rate": 4.452240036202526e-05, "loss": 0.3274, "num_input_tokens_seen": 34350336, "step": 35955 }, { "epoch": 2.9333550860592217, "grad_norm": 0.2260785549879074, "learning_rate": 4.452017681972589e-05, "loss": 0.4249, "num_input_tokens_seen": 34355808, "step": 35960 }, { "epoch": 2.9337629496696307, "grad_norm": 0.43854665756225586, "learning_rate": 4.4517952881760836e-05, "loss": 0.3434, "num_input_tokens_seen": 34360240, "step": 35965 }, { "epoch": 2.9341708132800393, "grad_norm": 0.1781558245420456, "learning_rate": 4.451572854817516e-05, "loss": 0.351, "num_input_tokens_seen": 34364528, "step": 35970 }, { "epoch": 2.934578676890448, "grad_norm": 0.9185636639595032, "learning_rate": 4.4513503819013966e-05, "loss": 0.3539, "num_input_tokens_seen": 34370016, "step": 35975 }, { "epoch": 2.9349865405008564, "grad_norm": 0.34831246733665466, "learning_rate": 4.451127869432235e-05, "loss": 0.3491, "num_input_tokens_seen": 34375136, "step": 35980 }, { "epoch": 2.935394404111265, "grad_norm": 0.6132847666740417, "learning_rate": 4.450905317414539e-05, "loss": 0.3321, "num_input_tokens_seen": 34379536, "step": 35985 }, { "epoch": 2.935802267721674, "grad_norm": 0.6973806619644165, "learning_rate": 4.450682725852822e-05, "loss": 0.267, "num_input_tokens_seen": 34385504, "step": 35990 }, { "epoch": 2.9362101313320825, "grad_norm": 0.5425690412521362, "learning_rate": 4.450460094751595e-05, "loss": 0.3453, "num_input_tokens_seen": 34391360, "step": 35995 }, { "epoch": 2.936617994942491, "grad_norm": 1.3679622411727905, "learning_rate": 4.45023742411537e-05, "loss": 0.5316, "num_input_tokens_seen": 34395968, "step": 36000 }, { "epoch": 2.9370258585529, "grad_norm": 1.4084162712097168, "learning_rate": 4.450014713948663e-05, "loss": 0.3768, "num_input_tokens_seen": 34401216, "step": 36005 }, { "epoch": 2.9374337221633087, "grad_norm": 0.7728251218795776, "learning_rate": 4.449791964255984e-05, "loss": 0.2802, "num_input_tokens_seen": 34406448, "step": 36010 }, { "epoch": 2.9378415857737172, "grad_norm": 0.3634450435638428, "learning_rate": 4.4495691750418516e-05, "loss": 0.344, "num_input_tokens_seen": 34411360, "step": 36015 }, { "epoch": 2.938249449384126, "grad_norm": 0.8756048679351807, "learning_rate": 4.4493463463107806e-05, "loss": 0.3354, "num_input_tokens_seen": 34415472, "step": 36020 }, { "epoch": 2.9386573129945344, "grad_norm": 0.6280957460403442, "learning_rate": 4.449123478067287e-05, "loss": 0.2731, "num_input_tokens_seen": 34420272, "step": 36025 }, { "epoch": 2.9390651766049434, "grad_norm": 0.48325762152671814, "learning_rate": 4.448900570315889e-05, "loss": 0.3706, "num_input_tokens_seen": 34425696, "step": 36030 }, { "epoch": 2.939473040215352, "grad_norm": 0.7173362970352173, "learning_rate": 4.4486776230611046e-05, "loss": 0.3978, "num_input_tokens_seen": 34430608, "step": 36035 }, { "epoch": 2.9398809038257605, "grad_norm": 0.9204900860786438, "learning_rate": 4.448454636307453e-05, "loss": 0.3504, "num_input_tokens_seen": 34435136, "step": 36040 }, { "epoch": 2.9402887674361695, "grad_norm": 0.23605147004127502, "learning_rate": 4.448231610059454e-05, "loss": 0.3229, "num_input_tokens_seen": 34440064, "step": 36045 }, { "epoch": 2.940696631046578, "grad_norm": 0.7409541010856628, "learning_rate": 4.448008544321627e-05, "loss": 0.3547, "num_input_tokens_seen": 34444960, "step": 36050 }, { "epoch": 2.9411044946569866, "grad_norm": 1.2575769424438477, "learning_rate": 4.447785439098495e-05, "loss": 0.3497, "num_input_tokens_seen": 34449648, "step": 36055 }, { "epoch": 2.9415123582673957, "grad_norm": 0.39636868238449097, "learning_rate": 4.447562294394581e-05, "loss": 0.2838, "num_input_tokens_seen": 34454640, "step": 36060 }, { "epoch": 2.941920221877804, "grad_norm": 0.342688649892807, "learning_rate": 4.447339110214405e-05, "loss": 0.3862, "num_input_tokens_seen": 34458512, "step": 36065 }, { "epoch": 2.942328085488213, "grad_norm": 0.3369607627391815, "learning_rate": 4.447115886562494e-05, "loss": 0.3145, "num_input_tokens_seen": 34462912, "step": 36070 }, { "epoch": 2.9427359490986214, "grad_norm": 0.8391932249069214, "learning_rate": 4.446892623443371e-05, "loss": 0.3703, "num_input_tokens_seen": 34467520, "step": 36075 }, { "epoch": 2.94314381270903, "grad_norm": 0.7796626091003418, "learning_rate": 4.4466693208615616e-05, "loss": 0.3657, "num_input_tokens_seen": 34472656, "step": 36080 }, { "epoch": 2.943551676319439, "grad_norm": 0.8414037823677063, "learning_rate": 4.446445978821593e-05, "loss": 0.3488, "num_input_tokens_seen": 34478400, "step": 36085 }, { "epoch": 2.9439595399298475, "grad_norm": 0.2277744859457016, "learning_rate": 4.4462225973279906e-05, "loss": 0.3154, "num_input_tokens_seen": 34482592, "step": 36090 }, { "epoch": 2.944367403540256, "grad_norm": 0.2214338481426239, "learning_rate": 4.4459991763852835e-05, "loss": 0.3945, "num_input_tokens_seen": 34487696, "step": 36095 }, { "epoch": 2.944775267150665, "grad_norm": 0.168659046292305, "learning_rate": 4.445775715998001e-05, "loss": 0.3195, "num_input_tokens_seen": 34492496, "step": 36100 }, { "epoch": 2.9451831307610736, "grad_norm": 0.2824113667011261, "learning_rate": 4.4455522161706696e-05, "loss": 0.3029, "num_input_tokens_seen": 34497744, "step": 36105 }, { "epoch": 2.945590994371482, "grad_norm": 0.18684445321559906, "learning_rate": 4.445328676907823e-05, "loss": 0.3714, "num_input_tokens_seen": 34502624, "step": 36110 }, { "epoch": 2.9459988579818908, "grad_norm": 0.5591810345649719, "learning_rate": 4.4451050982139895e-05, "loss": 0.3596, "num_input_tokens_seen": 34508544, "step": 36115 }, { "epoch": 2.9464067215922993, "grad_norm": 1.0384703874588013, "learning_rate": 4.4448814800937033e-05, "loss": 0.3369, "num_input_tokens_seen": 34512928, "step": 36120 }, { "epoch": 2.9468145852027083, "grad_norm": 0.36174696683883667, "learning_rate": 4.4446578225514955e-05, "loss": 0.344, "num_input_tokens_seen": 34518416, "step": 36125 }, { "epoch": 2.947222448813117, "grad_norm": 0.6456682085990906, "learning_rate": 4.4444341255919e-05, "loss": 0.3671, "num_input_tokens_seen": 34523632, "step": 36130 }, { "epoch": 2.9476303124235255, "grad_norm": 0.6558370590209961, "learning_rate": 4.4442103892194515e-05, "loss": 0.369, "num_input_tokens_seen": 34527776, "step": 36135 }, { "epoch": 2.9480381760339345, "grad_norm": 0.754906177520752, "learning_rate": 4.443986613438684e-05, "loss": 0.3937, "num_input_tokens_seen": 34532592, "step": 36140 }, { "epoch": 2.948446039644343, "grad_norm": 0.3278273046016693, "learning_rate": 4.443762798254134e-05, "loss": 0.3819, "num_input_tokens_seen": 34537712, "step": 36145 }, { "epoch": 2.9488539032547516, "grad_norm": 0.23705430328845978, "learning_rate": 4.4435389436703375e-05, "loss": 0.3591, "num_input_tokens_seen": 34542832, "step": 36150 }, { "epoch": 2.94926176686516, "grad_norm": 0.1406974047422409, "learning_rate": 4.443315049691833e-05, "loss": 0.347, "num_input_tokens_seen": 34547584, "step": 36155 }, { "epoch": 2.9496696304755687, "grad_norm": 0.11115255951881409, "learning_rate": 4.443091116323159e-05, "loss": 0.3623, "num_input_tokens_seen": 34552112, "step": 36160 }, { "epoch": 2.9500774940859777, "grad_norm": 0.8046143651008606, "learning_rate": 4.442867143568853e-05, "loss": 0.3527, "num_input_tokens_seen": 34556112, "step": 36165 }, { "epoch": 2.9504853576963863, "grad_norm": 0.1940731257200241, "learning_rate": 4.442643131433455e-05, "loss": 0.399, "num_input_tokens_seen": 34560192, "step": 36170 }, { "epoch": 2.950893221306795, "grad_norm": 0.949006974697113, "learning_rate": 4.442419079921507e-05, "loss": 0.3459, "num_input_tokens_seen": 34565072, "step": 36175 }, { "epoch": 2.951301084917204, "grad_norm": 0.9900814294815063, "learning_rate": 4.442194989037549e-05, "loss": 0.3557, "num_input_tokens_seen": 34569152, "step": 36180 }, { "epoch": 2.9517089485276125, "grad_norm": 0.9313716292381287, "learning_rate": 4.441970858786124e-05, "loss": 0.3335, "num_input_tokens_seen": 34573712, "step": 36185 }, { "epoch": 2.952116812138021, "grad_norm": 0.8931686878204346, "learning_rate": 4.441746689171775e-05, "loss": 0.3461, "num_input_tokens_seen": 34577808, "step": 36190 }, { "epoch": 2.9525246757484296, "grad_norm": 0.8799084424972534, "learning_rate": 4.441522480199046e-05, "loss": 0.3494, "num_input_tokens_seen": 34582544, "step": 36195 }, { "epoch": 2.952932539358838, "grad_norm": 1.3060075044631958, "learning_rate": 4.441298231872482e-05, "loss": 0.3971, "num_input_tokens_seen": 34586480, "step": 36200 }, { "epoch": 2.953340402969247, "grad_norm": 0.8771615028381348, "learning_rate": 4.4410739441966266e-05, "loss": 0.3527, "num_input_tokens_seen": 34591392, "step": 36205 }, { "epoch": 2.9537482665796557, "grad_norm": 0.3315463662147522, "learning_rate": 4.440849617176027e-05, "loss": 0.3523, "num_input_tokens_seen": 34595920, "step": 36210 }, { "epoch": 2.9541561301900643, "grad_norm": 0.28002843260765076, "learning_rate": 4.4406252508152315e-05, "loss": 0.3303, "num_input_tokens_seen": 34601248, "step": 36215 }, { "epoch": 2.9545639938004733, "grad_norm": 1.1390706300735474, "learning_rate": 4.440400845118785e-05, "loss": 0.3932, "num_input_tokens_seen": 34605552, "step": 36220 }, { "epoch": 2.954971857410882, "grad_norm": 0.7909500002861023, "learning_rate": 4.440176400091239e-05, "loss": 0.2808, "num_input_tokens_seen": 34610384, "step": 36225 }, { "epoch": 2.9553797210212904, "grad_norm": 1.1683427095413208, "learning_rate": 4.439951915737143e-05, "loss": 0.4472, "num_input_tokens_seen": 34614416, "step": 36230 }, { "epoch": 2.955787584631699, "grad_norm": 0.25523504614830017, "learning_rate": 4.4397273920610446e-05, "loss": 0.3724, "num_input_tokens_seen": 34619488, "step": 36235 }, { "epoch": 2.956195448242108, "grad_norm": 0.2195502668619156, "learning_rate": 4.439502829067496e-05, "loss": 0.3427, "num_input_tokens_seen": 34624560, "step": 36240 }, { "epoch": 2.9566033118525166, "grad_norm": 0.2861330509185791, "learning_rate": 4.43927822676105e-05, "loss": 0.3615, "num_input_tokens_seen": 34629424, "step": 36245 }, { "epoch": 2.957011175462925, "grad_norm": 1.1400015354156494, "learning_rate": 4.439053585146258e-05, "loss": 0.3483, "num_input_tokens_seen": 34634512, "step": 36250 }, { "epoch": 2.9574190390733337, "grad_norm": 0.38477644324302673, "learning_rate": 4.4388289042276736e-05, "loss": 0.3257, "num_input_tokens_seen": 34639280, "step": 36255 }, { "epoch": 2.9578269026837427, "grad_norm": 0.499483585357666, "learning_rate": 4.4386041840098516e-05, "loss": 0.4347, "num_input_tokens_seen": 34643424, "step": 36260 }, { "epoch": 2.9582347662941513, "grad_norm": 0.6766588091850281, "learning_rate": 4.438379424497346e-05, "loss": 0.3831, "num_input_tokens_seen": 34647952, "step": 36265 }, { "epoch": 2.95864262990456, "grad_norm": 0.5778322219848633, "learning_rate": 4.438154625694714e-05, "loss": 0.2896, "num_input_tokens_seen": 34653520, "step": 36270 }, { "epoch": 2.959050493514969, "grad_norm": 0.5338958501815796, "learning_rate": 4.4379297876065105e-05, "loss": 0.4038, "num_input_tokens_seen": 34656944, "step": 36275 }, { "epoch": 2.9594583571253774, "grad_norm": 0.7380768656730652, "learning_rate": 4.437704910237294e-05, "loss": 0.26, "num_input_tokens_seen": 34662112, "step": 36280 }, { "epoch": 2.959866220735786, "grad_norm": 0.6856008172035217, "learning_rate": 4.4374799935916226e-05, "loss": 0.2743, "num_input_tokens_seen": 34667232, "step": 36285 }, { "epoch": 2.9602740843461945, "grad_norm": 1.5922144651412964, "learning_rate": 4.437255037674054e-05, "loss": 0.455, "num_input_tokens_seen": 34672960, "step": 36290 }, { "epoch": 2.960681947956603, "grad_norm": 1.4322913885116577, "learning_rate": 4.43703004248915e-05, "loss": 0.3892, "num_input_tokens_seen": 34678640, "step": 36295 }, { "epoch": 2.961089811567012, "grad_norm": 2.6030478477478027, "learning_rate": 4.4368050080414695e-05, "loss": 0.341, "num_input_tokens_seen": 34683344, "step": 36300 }, { "epoch": 2.9614976751774207, "grad_norm": 0.180578351020813, "learning_rate": 4.4365799343355754e-05, "loss": 0.349, "num_input_tokens_seen": 34688080, "step": 36305 }, { "epoch": 2.9619055387878293, "grad_norm": 0.22390970587730408, "learning_rate": 4.436354821376028e-05, "loss": 0.3557, "num_input_tokens_seen": 34692704, "step": 36310 }, { "epoch": 2.9623134023982383, "grad_norm": 0.8633840680122375, "learning_rate": 4.436129669167391e-05, "loss": 0.3261, "num_input_tokens_seen": 34697440, "step": 36315 }, { "epoch": 2.962721266008647, "grad_norm": 1.1188700199127197, "learning_rate": 4.435904477714229e-05, "loss": 0.4182, "num_input_tokens_seen": 34701952, "step": 36320 }, { "epoch": 2.9631291296190554, "grad_norm": 0.8259648680686951, "learning_rate": 4.435679247021106e-05, "loss": 0.3645, "num_input_tokens_seen": 34707008, "step": 36325 }, { "epoch": 2.963536993229464, "grad_norm": 0.1798725575208664, "learning_rate": 4.435453977092586e-05, "loss": 0.3307, "num_input_tokens_seen": 34711616, "step": 36330 }, { "epoch": 2.9639448568398725, "grad_norm": 0.9259164333343506, "learning_rate": 4.435228667933238e-05, "loss": 0.3696, "num_input_tokens_seen": 34717168, "step": 36335 }, { "epoch": 2.9643527204502815, "grad_norm": 0.8266151547431946, "learning_rate": 4.435003319547626e-05, "loss": 0.3375, "num_input_tokens_seen": 34722096, "step": 36340 }, { "epoch": 2.96476058406069, "grad_norm": 0.28572845458984375, "learning_rate": 4.4347779319403194e-05, "loss": 0.3852, "num_input_tokens_seen": 34726768, "step": 36345 }, { "epoch": 2.9651684476710987, "grad_norm": 0.37829771637916565, "learning_rate": 4.4345525051158865e-05, "loss": 0.3165, "num_input_tokens_seen": 34731568, "step": 36350 }, { "epoch": 2.9655763112815077, "grad_norm": 0.5224516987800598, "learning_rate": 4.434327039078896e-05, "loss": 0.3085, "num_input_tokens_seen": 34736240, "step": 36355 }, { "epoch": 2.9659841748919162, "grad_norm": 0.6585941314697266, "learning_rate": 4.434101533833919e-05, "loss": 0.3849, "num_input_tokens_seen": 34740944, "step": 36360 }, { "epoch": 2.966392038502325, "grad_norm": 0.7445654273033142, "learning_rate": 4.4338759893855246e-05, "loss": 0.382, "num_input_tokens_seen": 34745808, "step": 36365 }, { "epoch": 2.9667999021127334, "grad_norm": 0.15967395901679993, "learning_rate": 4.4336504057382866e-05, "loss": 0.3289, "num_input_tokens_seen": 34749664, "step": 36370 }, { "epoch": 2.967207765723142, "grad_norm": 0.402413010597229, "learning_rate": 4.4334247828967756e-05, "loss": 0.417, "num_input_tokens_seen": 34754496, "step": 36375 }, { "epoch": 2.967615629333551, "grad_norm": 0.1782739907503128, "learning_rate": 4.433199120865567e-05, "loss": 0.3534, "num_input_tokens_seen": 34758320, "step": 36380 }, { "epoch": 2.9680234929439595, "grad_norm": 0.6501496434211731, "learning_rate": 4.432973419649233e-05, "loss": 0.3342, "num_input_tokens_seen": 34762080, "step": 36385 }, { "epoch": 2.968431356554368, "grad_norm": 0.34640076756477356, "learning_rate": 4.432747679252349e-05, "loss": 0.3368, "num_input_tokens_seen": 34765904, "step": 36390 }, { "epoch": 2.968839220164777, "grad_norm": 0.7010729312896729, "learning_rate": 4.432521899679491e-05, "loss": 0.4059, "num_input_tokens_seen": 34770800, "step": 36395 }, { "epoch": 2.9692470837751856, "grad_norm": 0.16630001366138458, "learning_rate": 4.4322960809352354e-05, "loss": 0.3426, "num_input_tokens_seen": 34775056, "step": 36400 }, { "epoch": 2.969654947385594, "grad_norm": 0.2298184037208557, "learning_rate": 4.4320702230241596e-05, "loss": 0.3359, "num_input_tokens_seen": 34779648, "step": 36405 }, { "epoch": 2.9700628109960028, "grad_norm": 0.14699694514274597, "learning_rate": 4.43184432595084e-05, "loss": 0.3624, "num_input_tokens_seen": 34783552, "step": 36410 }, { "epoch": 2.9704706746064113, "grad_norm": 0.2743867337703705, "learning_rate": 4.431618389719858e-05, "loss": 0.3666, "num_input_tokens_seen": 34788496, "step": 36415 }, { "epoch": 2.9708785382168204, "grad_norm": 0.8343608379364014, "learning_rate": 4.431392414335792e-05, "loss": 0.3337, "num_input_tokens_seen": 34793168, "step": 36420 }, { "epoch": 2.971286401827229, "grad_norm": 1.099767804145813, "learning_rate": 4.431166399803223e-05, "loss": 0.3649, "num_input_tokens_seen": 34797584, "step": 36425 }, { "epoch": 2.9716942654376375, "grad_norm": 0.6689791083335876, "learning_rate": 4.430940346126731e-05, "loss": 0.2533, "num_input_tokens_seen": 34803408, "step": 36430 }, { "epoch": 2.9721021290480465, "grad_norm": 1.2873979806900024, "learning_rate": 4.430714253310898e-05, "loss": 0.4605, "num_input_tokens_seen": 34807408, "step": 36435 }, { "epoch": 2.972509992658455, "grad_norm": 0.7264421582221985, "learning_rate": 4.4304881213603086e-05, "loss": 0.25, "num_input_tokens_seen": 34811648, "step": 36440 }, { "epoch": 2.9729178562688636, "grad_norm": 0.5979545712471008, "learning_rate": 4.430261950279545e-05, "loss": 0.2321, "num_input_tokens_seen": 34816560, "step": 36445 }, { "epoch": 2.9733257198792726, "grad_norm": 0.5088936686515808, "learning_rate": 4.430035740073192e-05, "loss": 0.2519, "num_input_tokens_seen": 34821248, "step": 36450 }, { "epoch": 2.973733583489681, "grad_norm": 0.46428871154785156, "learning_rate": 4.429809490745834e-05, "loss": 0.432, "num_input_tokens_seen": 34825328, "step": 36455 }, { "epoch": 2.9741414471000898, "grad_norm": 0.46112731099128723, "learning_rate": 4.429583202302058e-05, "loss": 0.4803, "num_input_tokens_seen": 34829568, "step": 36460 }, { "epoch": 2.9745493107104983, "grad_norm": 0.7816520929336548, "learning_rate": 4.42935687474645e-05, "loss": 0.4366, "num_input_tokens_seen": 34834144, "step": 36465 }, { "epoch": 2.974957174320907, "grad_norm": 0.7788923382759094, "learning_rate": 4.429130508083599e-05, "loss": 0.3537, "num_input_tokens_seen": 34837776, "step": 36470 }, { "epoch": 2.975365037931316, "grad_norm": 1.1190423965454102, "learning_rate": 4.428904102318091e-05, "loss": 0.3969, "num_input_tokens_seen": 34841792, "step": 36475 }, { "epoch": 2.9757729015417245, "grad_norm": 0.27277398109436035, "learning_rate": 4.428677657454518e-05, "loss": 0.3588, "num_input_tokens_seen": 34847152, "step": 36480 }, { "epoch": 2.976180765152133, "grad_norm": 1.042629599571228, "learning_rate": 4.428451173497468e-05, "loss": 0.421, "num_input_tokens_seen": 34851360, "step": 36485 }, { "epoch": 2.976588628762542, "grad_norm": 0.3221130967140198, "learning_rate": 4.4282246504515314e-05, "loss": 0.3266, "num_input_tokens_seen": 34856608, "step": 36490 }, { "epoch": 2.9769964923729506, "grad_norm": 1.1904051303863525, "learning_rate": 4.427998088321301e-05, "loss": 0.4228, "num_input_tokens_seen": 34861184, "step": 36495 }, { "epoch": 2.977404355983359, "grad_norm": 0.5297917127609253, "learning_rate": 4.4277714871113684e-05, "loss": 0.3142, "num_input_tokens_seen": 34865760, "step": 36500 }, { "epoch": 2.9778122195937677, "grad_norm": 0.3969518840312958, "learning_rate": 4.427544846826327e-05, "loss": 0.3622, "num_input_tokens_seen": 34869904, "step": 36505 }, { "epoch": 2.9782200832041763, "grad_norm": 1.1569221019744873, "learning_rate": 4.427318167470771e-05, "loss": 0.451, "num_input_tokens_seen": 34874272, "step": 36510 }, { "epoch": 2.9786279468145853, "grad_norm": 0.31706473231315613, "learning_rate": 4.427091449049294e-05, "loss": 0.3539, "num_input_tokens_seen": 34878832, "step": 36515 }, { "epoch": 2.979035810424994, "grad_norm": 0.3717314302921295, "learning_rate": 4.426864691566493e-05, "loss": 0.3448, "num_input_tokens_seen": 34883776, "step": 36520 }, { "epoch": 2.9794436740354024, "grad_norm": 0.23327936232089996, "learning_rate": 4.426637895026963e-05, "loss": 0.333, "num_input_tokens_seen": 34888320, "step": 36525 }, { "epoch": 2.9798515376458115, "grad_norm": 0.6889618635177612, "learning_rate": 4.426411059435301e-05, "loss": 0.2993, "num_input_tokens_seen": 34893328, "step": 36530 }, { "epoch": 2.98025940125622, "grad_norm": 1.1746711730957031, "learning_rate": 4.4261841847961074e-05, "loss": 0.4551, "num_input_tokens_seen": 34897696, "step": 36535 }, { "epoch": 2.9806672648666286, "grad_norm": 0.25051000714302063, "learning_rate": 4.425957271113977e-05, "loss": 0.4139, "num_input_tokens_seen": 34902512, "step": 36540 }, { "epoch": 2.981075128477037, "grad_norm": 0.2509021759033203, "learning_rate": 4.425730318393512e-05, "loss": 0.3199, "num_input_tokens_seen": 34906928, "step": 36545 }, { "epoch": 2.9814829920874457, "grad_norm": 0.6822273135185242, "learning_rate": 4.425503326639311e-05, "loss": 0.2919, "num_input_tokens_seen": 34911056, "step": 36550 }, { "epoch": 2.9818908556978547, "grad_norm": 0.28328073024749756, "learning_rate": 4.4252762958559756e-05, "loss": 0.4424, "num_input_tokens_seen": 34915904, "step": 36555 }, { "epoch": 2.9822987193082633, "grad_norm": 0.58766108751297, "learning_rate": 4.425049226048109e-05, "loss": 0.3233, "num_input_tokens_seen": 34920496, "step": 36560 }, { "epoch": 2.982706582918672, "grad_norm": 0.7591635584831238, "learning_rate": 4.424822117220311e-05, "loss": 0.3798, "num_input_tokens_seen": 34924032, "step": 36565 }, { "epoch": 2.983114446529081, "grad_norm": 1.0855778455734253, "learning_rate": 4.424594969377188e-05, "loss": 0.3701, "num_input_tokens_seen": 34929216, "step": 36570 }, { "epoch": 2.9835223101394894, "grad_norm": 0.05759056657552719, "learning_rate": 4.424367782523342e-05, "loss": 0.3412, "num_input_tokens_seen": 34934400, "step": 36575 }, { "epoch": 2.983930173749898, "grad_norm": 0.574624240398407, "learning_rate": 4.424140556663379e-05, "loss": 0.2971, "num_input_tokens_seen": 34939536, "step": 36580 }, { "epoch": 2.9843380373603066, "grad_norm": 0.5891504287719727, "learning_rate": 4.423913291801905e-05, "loss": 0.3336, "num_input_tokens_seen": 34945744, "step": 36585 }, { "epoch": 2.984745900970715, "grad_norm": 0.41506823897361755, "learning_rate": 4.4236859879435255e-05, "loss": 0.4581, "num_input_tokens_seen": 34949808, "step": 36590 }, { "epoch": 2.985153764581124, "grad_norm": 0.8516028523445129, "learning_rate": 4.4234586450928486e-05, "loss": 0.4129, "num_input_tokens_seen": 34954240, "step": 36595 }, { "epoch": 2.9855616281915327, "grad_norm": 1.148991584777832, "learning_rate": 4.4232312632544824e-05, "loss": 0.3656, "num_input_tokens_seen": 34959536, "step": 36600 }, { "epoch": 2.9859694918019413, "grad_norm": 0.8477882742881775, "learning_rate": 4.4230038424330356e-05, "loss": 0.3868, "num_input_tokens_seen": 34964080, "step": 36605 }, { "epoch": 2.9863773554123503, "grad_norm": 0.3610319495201111, "learning_rate": 4.422776382633118e-05, "loss": 0.3448, "num_input_tokens_seen": 34967904, "step": 36610 }, { "epoch": 2.986785219022759, "grad_norm": 0.144682839512825, "learning_rate": 4.42254888385934e-05, "loss": 0.3492, "num_input_tokens_seen": 34973488, "step": 36615 }, { "epoch": 2.9871930826331674, "grad_norm": 0.21212337911128998, "learning_rate": 4.422321346116314e-05, "loss": 0.3578, "num_input_tokens_seen": 34978752, "step": 36620 }, { "epoch": 2.9876009462435764, "grad_norm": 0.24322068691253662, "learning_rate": 4.4220937694086495e-05, "loss": 0.3287, "num_input_tokens_seen": 34983024, "step": 36625 }, { "epoch": 2.988008809853985, "grad_norm": 0.255077600479126, "learning_rate": 4.4218661537409626e-05, "loss": 0.4434, "num_input_tokens_seen": 34987536, "step": 36630 }, { "epoch": 2.9884166734643935, "grad_norm": 0.17921878397464752, "learning_rate": 4.421638499117865e-05, "loss": 0.3316, "num_input_tokens_seen": 34991168, "step": 36635 }, { "epoch": 2.988824537074802, "grad_norm": 0.8446106910705566, "learning_rate": 4.4214108055439705e-05, "loss": 0.3769, "num_input_tokens_seen": 34995728, "step": 36640 }, { "epoch": 2.9892324006852107, "grad_norm": 0.1700354963541031, "learning_rate": 4.421183073023897e-05, "loss": 0.363, "num_input_tokens_seen": 35001696, "step": 36645 }, { "epoch": 2.9896402642956197, "grad_norm": 0.9623395204544067, "learning_rate": 4.4209553015622577e-05, "loss": 0.3694, "num_input_tokens_seen": 35006160, "step": 36650 }, { "epoch": 2.9900481279060283, "grad_norm": 0.8666142821311951, "learning_rate": 4.4207274911636714e-05, "loss": 0.3462, "num_input_tokens_seen": 35012032, "step": 36655 }, { "epoch": 2.990455991516437, "grad_norm": 0.3172004818916321, "learning_rate": 4.4204996418327547e-05, "loss": 0.3706, "num_input_tokens_seen": 35017072, "step": 36660 }, { "epoch": 2.990863855126846, "grad_norm": 1.4126189947128296, "learning_rate": 4.420271753574127e-05, "loss": 0.3718, "num_input_tokens_seen": 35021632, "step": 36665 }, { "epoch": 2.9912717187372544, "grad_norm": 0.22877034544944763, "learning_rate": 4.420043826392406e-05, "loss": 0.3526, "num_input_tokens_seen": 35026400, "step": 36670 }, { "epoch": 2.991679582347663, "grad_norm": 1.0981675386428833, "learning_rate": 4.419815860292213e-05, "loss": 0.3726, "num_input_tokens_seen": 35031440, "step": 36675 }, { "epoch": 2.9920874459580715, "grad_norm": 0.24944782257080078, "learning_rate": 4.419587855278168e-05, "loss": 0.352, "num_input_tokens_seen": 35036416, "step": 36680 }, { "epoch": 2.99249530956848, "grad_norm": 0.19343852996826172, "learning_rate": 4.419359811354893e-05, "loss": 0.3549, "num_input_tokens_seen": 35040512, "step": 36685 }, { "epoch": 2.992903173178889, "grad_norm": 0.35521289706230164, "learning_rate": 4.41913172852701e-05, "loss": 0.3517, "num_input_tokens_seen": 35045904, "step": 36690 }, { "epoch": 2.9933110367892977, "grad_norm": 0.4980853796005249, "learning_rate": 4.418903606799143e-05, "loss": 0.3594, "num_input_tokens_seen": 35051408, "step": 36695 }, { "epoch": 2.9937189003997062, "grad_norm": 1.027357816696167, "learning_rate": 4.418675446175915e-05, "loss": 0.3571, "num_input_tokens_seen": 35055360, "step": 36700 }, { "epoch": 2.9941267640101152, "grad_norm": 0.2161770612001419, "learning_rate": 4.418447246661951e-05, "loss": 0.3527, "num_input_tokens_seen": 35060032, "step": 36705 }, { "epoch": 2.994534627620524, "grad_norm": 0.10110737383365631, "learning_rate": 4.418219008261876e-05, "loss": 0.3533, "num_input_tokens_seen": 35065376, "step": 36710 }, { "epoch": 2.9949424912309324, "grad_norm": 0.9896323084831238, "learning_rate": 4.417990730980317e-05, "loss": 0.3801, "num_input_tokens_seen": 35069696, "step": 36715 }, { "epoch": 2.995350354841341, "grad_norm": 0.8064867854118347, "learning_rate": 4.417762414821901e-05, "loss": 0.3524, "num_input_tokens_seen": 35075104, "step": 36720 }, { "epoch": 2.9957582184517495, "grad_norm": 1.04532790184021, "learning_rate": 4.417534059791256e-05, "loss": 0.3955, "num_input_tokens_seen": 35079488, "step": 36725 }, { "epoch": 2.9961660820621585, "grad_norm": 0.26141417026519775, "learning_rate": 4.41730566589301e-05, "loss": 0.3891, "num_input_tokens_seen": 35084560, "step": 36730 }, { "epoch": 2.996573945672567, "grad_norm": 0.89240562915802, "learning_rate": 4.417077233131793e-05, "loss": 0.3593, "num_input_tokens_seen": 35089232, "step": 36735 }, { "epoch": 2.9969818092829756, "grad_norm": 0.7154017090797424, "learning_rate": 4.4168487615122353e-05, "loss": 0.3267, "num_input_tokens_seen": 35094048, "step": 36740 }, { "epoch": 2.9973896728933846, "grad_norm": 1.2349046468734741, "learning_rate": 4.416620251038967e-05, "loss": 0.4424, "num_input_tokens_seen": 35098144, "step": 36745 }, { "epoch": 2.997797536503793, "grad_norm": 0.19702915847301483, "learning_rate": 4.4163917017166226e-05, "loss": 0.3947, "num_input_tokens_seen": 35103056, "step": 36750 }, { "epoch": 2.998205400114202, "grad_norm": 0.8593635559082031, "learning_rate": 4.416163113549831e-05, "loss": 0.3495, "num_input_tokens_seen": 35107504, "step": 36755 }, { "epoch": 2.9986132637246103, "grad_norm": 0.8599703311920166, "learning_rate": 4.415934486543228e-05, "loss": 0.3731, "num_input_tokens_seen": 35110576, "step": 36760 }, { "epoch": 2.999021127335019, "grad_norm": 0.965849757194519, "learning_rate": 4.415705820701447e-05, "loss": 0.3257, "num_input_tokens_seen": 35114880, "step": 36765 }, { "epoch": 2.999428990945428, "grad_norm": 0.7364261746406555, "learning_rate": 4.415477116029122e-05, "loss": 0.3673, "num_input_tokens_seen": 35119040, "step": 36770 }, { "epoch": 2.9998368545558365, "grad_norm": 0.16590571403503418, "learning_rate": 4.415248372530891e-05, "loss": 0.3707, "num_input_tokens_seen": 35123696, "step": 36775 }, { "epoch": 3.000244718166245, "grad_norm": 0.25430065393447876, "learning_rate": 4.415019590211389e-05, "loss": 0.3393, "num_input_tokens_seen": 35127456, "step": 36780 }, { "epoch": 3.000244718166245, "eval_loss": 0.3402019441127777, "eval_runtime": 570.8211, "eval_samples_per_second": 4.774, "eval_steps_per_second": 2.388, "num_input_tokens_seen": 35127456, "step": 36780 }, { "epoch": 3.000652581776654, "grad_norm": 0.8579269051551819, "learning_rate": 4.414790769075253e-05, "loss": 0.3531, "num_input_tokens_seen": 35131856, "step": 36785 }, { "epoch": 3.0010604453870626, "grad_norm": 0.21986043453216553, "learning_rate": 4.414561909127123e-05, "loss": 0.3564, "num_input_tokens_seen": 35137104, "step": 36790 }, { "epoch": 3.001468308997471, "grad_norm": 0.8559483885765076, "learning_rate": 4.414333010371636e-05, "loss": 0.3443, "num_input_tokens_seen": 35142608, "step": 36795 }, { "epoch": 3.0018761726078798, "grad_norm": 0.9784379005432129, "learning_rate": 4.414104072813432e-05, "loss": 0.3493, "num_input_tokens_seen": 35147680, "step": 36800 }, { "epoch": 3.0022840362182888, "grad_norm": 0.17580753564834595, "learning_rate": 4.413875096457152e-05, "loss": 0.3458, "num_input_tokens_seen": 35153184, "step": 36805 }, { "epoch": 3.0026918998286973, "grad_norm": 0.926415205001831, "learning_rate": 4.413646081307438e-05, "loss": 0.3581, "num_input_tokens_seen": 35157360, "step": 36810 }, { "epoch": 3.003099763439106, "grad_norm": 0.1308974027633667, "learning_rate": 4.4134170273689294e-05, "loss": 0.3532, "num_input_tokens_seen": 35161600, "step": 36815 }, { "epoch": 3.0035076270495145, "grad_norm": 0.33034268021583557, "learning_rate": 4.413187934646272e-05, "loss": 0.3307, "num_input_tokens_seen": 35166320, "step": 36820 }, { "epoch": 3.0039154906599235, "grad_norm": 0.909136950969696, "learning_rate": 4.4129588031441076e-05, "loss": 0.3457, "num_input_tokens_seen": 35171536, "step": 36825 }, { "epoch": 3.004323354270332, "grad_norm": 0.12732478976249695, "learning_rate": 4.412729632867081e-05, "loss": 0.3424, "num_input_tokens_seen": 35176176, "step": 36830 }, { "epoch": 3.0047312178807406, "grad_norm": 0.20527254045009613, "learning_rate": 4.4125004238198377e-05, "loss": 0.3646, "num_input_tokens_seen": 35181248, "step": 36835 }, { "epoch": 3.005139081491149, "grad_norm": 0.36596378684043884, "learning_rate": 4.4122711760070237e-05, "loss": 0.359, "num_input_tokens_seen": 35185920, "step": 36840 }, { "epoch": 3.005546945101558, "grad_norm": 0.8481746912002563, "learning_rate": 4.4120418894332854e-05, "loss": 0.3332, "num_input_tokens_seen": 35191136, "step": 36845 }, { "epoch": 3.0059548087119667, "grad_norm": 1.2969616651535034, "learning_rate": 4.411812564103271e-05, "loss": 0.3139, "num_input_tokens_seen": 35196336, "step": 36850 }, { "epoch": 3.0063626723223753, "grad_norm": 0.4789494276046753, "learning_rate": 4.4115832000216274e-05, "loss": 0.4926, "num_input_tokens_seen": 35200528, "step": 36855 }, { "epoch": 3.006770535932784, "grad_norm": 0.4132167100906372, "learning_rate": 4.411353797193005e-05, "loss": 0.4622, "num_input_tokens_seen": 35205712, "step": 36860 }, { "epoch": 3.007178399543193, "grad_norm": 0.24410462379455566, "learning_rate": 4.4111243556220525e-05, "loss": 0.3127, "num_input_tokens_seen": 35210512, "step": 36865 }, { "epoch": 3.0075862631536014, "grad_norm": 0.17669931054115295, "learning_rate": 4.410894875313423e-05, "loss": 0.4137, "num_input_tokens_seen": 35215136, "step": 36870 }, { "epoch": 3.00799412676401, "grad_norm": 0.8057147860527039, "learning_rate": 4.4106653562717645e-05, "loss": 0.3158, "num_input_tokens_seen": 35220080, "step": 36875 }, { "epoch": 3.0084019903744186, "grad_norm": 0.78517746925354, "learning_rate": 4.410435798501733e-05, "loss": 0.347, "num_input_tokens_seen": 35225680, "step": 36880 }, { "epoch": 3.0088098539848276, "grad_norm": 0.26476407051086426, "learning_rate": 4.410206202007978e-05, "loss": 0.3522, "num_input_tokens_seen": 35230816, "step": 36885 }, { "epoch": 3.009217717595236, "grad_norm": 1.0785022974014282, "learning_rate": 4.4099765667951566e-05, "loss": 0.3593, "num_input_tokens_seen": 35235264, "step": 36890 }, { "epoch": 3.0096255812056447, "grad_norm": 0.3301120102405548, "learning_rate": 4.409746892867921e-05, "loss": 0.3304, "num_input_tokens_seen": 35240288, "step": 36895 }, { "epoch": 3.0100334448160537, "grad_norm": 0.3012673854827881, "learning_rate": 4.4095171802309274e-05, "loss": 0.3589, "num_input_tokens_seen": 35246448, "step": 36900 }, { "epoch": 3.0104413084264623, "grad_norm": 0.7246184945106506, "learning_rate": 4.409287428888832e-05, "loss": 0.3054, "num_input_tokens_seen": 35252048, "step": 36905 }, { "epoch": 3.010849172036871, "grad_norm": 0.27872157096862793, "learning_rate": 4.409057638846291e-05, "loss": 0.3819, "num_input_tokens_seen": 35256656, "step": 36910 }, { "epoch": 3.0112570356472794, "grad_norm": 0.8458014130592346, "learning_rate": 4.4088278101079636e-05, "loss": 0.3422, "num_input_tokens_seen": 35262096, "step": 36915 }, { "epoch": 3.0116648992576884, "grad_norm": 0.7902382612228394, "learning_rate": 4.408597942678507e-05, "loss": 0.3443, "num_input_tokens_seen": 35266992, "step": 36920 }, { "epoch": 3.012072762868097, "grad_norm": 0.3142741620540619, "learning_rate": 4.408368036562582e-05, "loss": 0.3511, "num_input_tokens_seen": 35272192, "step": 36925 }, { "epoch": 3.0124806264785056, "grad_norm": 0.8484954833984375, "learning_rate": 4.408138091764848e-05, "loss": 0.3363, "num_input_tokens_seen": 35277264, "step": 36930 }, { "epoch": 3.012888490088914, "grad_norm": 0.450804740190506, "learning_rate": 4.407908108289965e-05, "loss": 0.3298, "num_input_tokens_seen": 35281760, "step": 36935 }, { "epoch": 3.013296353699323, "grad_norm": 0.6596871018409729, "learning_rate": 4.407678086142596e-05, "loss": 0.3313, "num_input_tokens_seen": 35286352, "step": 36940 }, { "epoch": 3.0137042173097317, "grad_norm": 0.38866114616394043, "learning_rate": 4.407448025327402e-05, "loss": 0.4059, "num_input_tokens_seen": 35290912, "step": 36945 }, { "epoch": 3.0141120809201403, "grad_norm": 1.165386438369751, "learning_rate": 4.407217925849048e-05, "loss": 0.3996, "num_input_tokens_seen": 35296016, "step": 36950 }, { "epoch": 3.014519944530549, "grad_norm": 0.9313088655471802, "learning_rate": 4.4069877877121956e-05, "loss": 0.3186, "num_input_tokens_seen": 35300912, "step": 36955 }, { "epoch": 3.014927808140958, "grad_norm": 0.7661000490188599, "learning_rate": 4.406757610921512e-05, "loss": 0.327, "num_input_tokens_seen": 35305232, "step": 36960 }, { "epoch": 3.0153356717513664, "grad_norm": 0.2969258427619934, "learning_rate": 4.4065273954816615e-05, "loss": 0.3664, "num_input_tokens_seen": 35309088, "step": 36965 }, { "epoch": 3.015743535361775, "grad_norm": 0.545890748500824, "learning_rate": 4.406297141397311e-05, "loss": 0.3053, "num_input_tokens_seen": 35314384, "step": 36970 }, { "epoch": 3.0161513989721835, "grad_norm": 0.3101350665092468, "learning_rate": 4.406066848673128e-05, "loss": 0.3704, "num_input_tokens_seen": 35318592, "step": 36975 }, { "epoch": 3.0165592625825925, "grad_norm": 0.7201870679855347, "learning_rate": 4.405836517313779e-05, "loss": 0.3113, "num_input_tokens_seen": 35323776, "step": 36980 }, { "epoch": 3.016967126193001, "grad_norm": 0.30282163619995117, "learning_rate": 4.405606147323934e-05, "loss": 0.3203, "num_input_tokens_seen": 35329008, "step": 36985 }, { "epoch": 3.0173749898034097, "grad_norm": 0.486605703830719, "learning_rate": 4.405375738708263e-05, "loss": 0.366, "num_input_tokens_seen": 35333312, "step": 36990 }, { "epoch": 3.0177828534138182, "grad_norm": 0.3528309464454651, "learning_rate": 4.405145291471434e-05, "loss": 0.3639, "num_input_tokens_seen": 35338528, "step": 36995 }, { "epoch": 3.0181907170242273, "grad_norm": 0.840458333492279, "learning_rate": 4.4049148056181205e-05, "loss": 0.3394, "num_input_tokens_seen": 35343600, "step": 37000 }, { "epoch": 3.018598580634636, "grad_norm": 0.18362313508987427, "learning_rate": 4.404684281152993e-05, "loss": 0.3283, "num_input_tokens_seen": 35348736, "step": 37005 }, { "epoch": 3.0190064442450444, "grad_norm": 0.4039474427700043, "learning_rate": 4.404453718080724e-05, "loss": 0.3425, "num_input_tokens_seen": 35354000, "step": 37010 }, { "epoch": 3.019414307855453, "grad_norm": 0.3378358483314514, "learning_rate": 4.404223116405988e-05, "loss": 0.3172, "num_input_tokens_seen": 35359344, "step": 37015 }, { "epoch": 3.019822171465862, "grad_norm": 0.389608234167099, "learning_rate": 4.403992476133458e-05, "loss": 0.3572, "num_input_tokens_seen": 35365152, "step": 37020 }, { "epoch": 3.0202300350762705, "grad_norm": 0.8684053421020508, "learning_rate": 4.40376179726781e-05, "loss": 0.3641, "num_input_tokens_seen": 35369712, "step": 37025 }, { "epoch": 3.020637898686679, "grad_norm": 1.0880186557769775, "learning_rate": 4.40353107981372e-05, "loss": 0.3443, "num_input_tokens_seen": 35374736, "step": 37030 }, { "epoch": 3.0210457622970877, "grad_norm": 0.42221033573150635, "learning_rate": 4.403300323775863e-05, "loss": 0.3083, "num_input_tokens_seen": 35379136, "step": 37035 }, { "epoch": 3.0214536259074967, "grad_norm": 0.35612085461616516, "learning_rate": 4.403069529158917e-05, "loss": 0.3761, "num_input_tokens_seen": 35384416, "step": 37040 }, { "epoch": 3.0218614895179052, "grad_norm": 1.0752167701721191, "learning_rate": 4.402838695967562e-05, "loss": 0.3487, "num_input_tokens_seen": 35389488, "step": 37045 }, { "epoch": 3.022269353128314, "grad_norm": 0.18072421848773956, "learning_rate": 4.402607824206473e-05, "loss": 0.3344, "num_input_tokens_seen": 35394432, "step": 37050 }, { "epoch": 3.0226772167387224, "grad_norm": 0.2680372893810272, "learning_rate": 4.402376913880333e-05, "loss": 0.3317, "num_input_tokens_seen": 35400240, "step": 37055 }, { "epoch": 3.0230850803491314, "grad_norm": 0.25364547967910767, "learning_rate": 4.402145964993821e-05, "loss": 0.3302, "num_input_tokens_seen": 35405280, "step": 37060 }, { "epoch": 3.02349294395954, "grad_norm": 1.1514325141906738, "learning_rate": 4.401914977551619e-05, "loss": 0.3689, "num_input_tokens_seen": 35409632, "step": 37065 }, { "epoch": 3.0239008075699485, "grad_norm": 0.5750337839126587, "learning_rate": 4.401683951558408e-05, "loss": 0.3214, "num_input_tokens_seen": 35414768, "step": 37070 }, { "epoch": 3.024308671180357, "grad_norm": 0.4485838711261749, "learning_rate": 4.4014528870188714e-05, "loss": 0.4277, "num_input_tokens_seen": 35419808, "step": 37075 }, { "epoch": 3.024716534790766, "grad_norm": 1.3238778114318848, "learning_rate": 4.401221783937693e-05, "loss": 0.3608, "num_input_tokens_seen": 35424656, "step": 37080 }, { "epoch": 3.0251243984011746, "grad_norm": 0.4388733208179474, "learning_rate": 4.400990642319557e-05, "loss": 0.3344, "num_input_tokens_seen": 35430704, "step": 37085 }, { "epoch": 3.025532262011583, "grad_norm": 0.27958279848098755, "learning_rate": 4.4007594621691486e-05, "loss": 0.3709, "num_input_tokens_seen": 35435824, "step": 37090 }, { "epoch": 3.025940125621992, "grad_norm": 0.7265315055847168, "learning_rate": 4.4005282434911534e-05, "loss": 0.3823, "num_input_tokens_seen": 35439488, "step": 37095 }, { "epoch": 3.026347989232401, "grad_norm": 0.9462381601333618, "learning_rate": 4.400296986290258e-05, "loss": 0.3655, "num_input_tokens_seen": 35444528, "step": 37100 }, { "epoch": 3.0267558528428093, "grad_norm": 0.9843860268592834, "learning_rate": 4.40006569057115e-05, "loss": 0.3556, "num_input_tokens_seen": 35448944, "step": 37105 }, { "epoch": 3.027163716453218, "grad_norm": 0.3051297664642334, "learning_rate": 4.399834356338519e-05, "loss": 0.3061, "num_input_tokens_seen": 35453264, "step": 37110 }, { "epoch": 3.027571580063627, "grad_norm": 1.1494457721710205, "learning_rate": 4.399602983597052e-05, "loss": 0.2709, "num_input_tokens_seen": 35457760, "step": 37115 }, { "epoch": 3.0279794436740355, "grad_norm": 2.2258331775665283, "learning_rate": 4.3993715723514394e-05, "loss": 0.6085, "num_input_tokens_seen": 35462048, "step": 37120 }, { "epoch": 3.028387307284444, "grad_norm": 0.43956124782562256, "learning_rate": 4.3991401226063725e-05, "loss": 0.4515, "num_input_tokens_seen": 35466352, "step": 37125 }, { "epoch": 3.0287951708948526, "grad_norm": 1.2091127634048462, "learning_rate": 4.398908634366542e-05, "loss": 0.3347, "num_input_tokens_seen": 35470944, "step": 37130 }, { "epoch": 3.0292030345052616, "grad_norm": 0.3387048542499542, "learning_rate": 4.3986771076366415e-05, "loss": 0.3512, "num_input_tokens_seen": 35476656, "step": 37135 }, { "epoch": 3.02961089811567, "grad_norm": 0.8765100240707397, "learning_rate": 4.3984455424213625e-05, "loss": 0.3806, "num_input_tokens_seen": 35482144, "step": 37140 }, { "epoch": 3.0300187617260788, "grad_norm": 0.884364128112793, "learning_rate": 4.398213938725398e-05, "loss": 0.315, "num_input_tokens_seen": 35487104, "step": 37145 }, { "epoch": 3.0304266253364873, "grad_norm": 0.7686148881912231, "learning_rate": 4.3979822965534444e-05, "loss": 0.2771, "num_input_tokens_seen": 35491824, "step": 37150 }, { "epoch": 3.0308344889468963, "grad_norm": 0.5523636341094971, "learning_rate": 4.397750615910195e-05, "loss": 0.3831, "num_input_tokens_seen": 35497136, "step": 37155 }, { "epoch": 3.031242352557305, "grad_norm": 0.7428667545318604, "learning_rate": 4.397518896800349e-05, "loss": 0.3013, "num_input_tokens_seen": 35502384, "step": 37160 }, { "epoch": 3.0316502161677135, "grad_norm": 0.8021681308746338, "learning_rate": 4.3972871392286e-05, "loss": 0.5067, "num_input_tokens_seen": 35507632, "step": 37165 }, { "epoch": 3.032058079778122, "grad_norm": 0.22531166672706604, "learning_rate": 4.3970553431996466e-05, "loss": 0.4032, "num_input_tokens_seen": 35512720, "step": 37170 }, { "epoch": 3.032465943388531, "grad_norm": 0.737099289894104, "learning_rate": 4.396823508718189e-05, "loss": 0.3338, "num_input_tokens_seen": 35517856, "step": 37175 }, { "epoch": 3.0328738069989396, "grad_norm": 0.9007844924926758, "learning_rate": 4.396591635788923e-05, "loss": 0.3443, "num_input_tokens_seen": 35522976, "step": 37180 }, { "epoch": 3.033281670609348, "grad_norm": 0.32969629764556885, "learning_rate": 4.396359724416551e-05, "loss": 0.3345, "num_input_tokens_seen": 35527248, "step": 37185 }, { "epoch": 3.0336895342197567, "grad_norm": 0.9800190925598145, "learning_rate": 4.3961277746057735e-05, "loss": 0.3402, "num_input_tokens_seen": 35532208, "step": 37190 }, { "epoch": 3.0340973978301657, "grad_norm": 0.3486305773258209, "learning_rate": 4.3958957863612925e-05, "loss": 0.317, "num_input_tokens_seen": 35537904, "step": 37195 }, { "epoch": 3.0345052614405743, "grad_norm": 1.376060962677002, "learning_rate": 4.395663759687808e-05, "loss": 0.3502, "num_input_tokens_seen": 35542704, "step": 37200 }, { "epoch": 3.034913125050983, "grad_norm": 0.4450071454048157, "learning_rate": 4.3954316945900257e-05, "loss": 0.317, "num_input_tokens_seen": 35547312, "step": 37205 }, { "epoch": 3.0353209886613914, "grad_norm": 0.543765664100647, "learning_rate": 4.3951995910726486e-05, "loss": 0.3599, "num_input_tokens_seen": 35552480, "step": 37210 }, { "epoch": 3.0357288522718004, "grad_norm": 1.5543440580368042, "learning_rate": 4.3949674491403805e-05, "loss": 0.4359, "num_input_tokens_seen": 35557824, "step": 37215 }, { "epoch": 3.036136715882209, "grad_norm": 1.239380955696106, "learning_rate": 4.394735268797927e-05, "loss": 0.3881, "num_input_tokens_seen": 35562832, "step": 37220 }, { "epoch": 3.0365445794926176, "grad_norm": 0.11811277270317078, "learning_rate": 4.394503050049995e-05, "loss": 0.372, "num_input_tokens_seen": 35567088, "step": 37225 }, { "epoch": 3.036952443103026, "grad_norm": 0.2314377725124359, "learning_rate": 4.394270792901292e-05, "loss": 0.3489, "num_input_tokens_seen": 35572896, "step": 37230 }, { "epoch": 3.037360306713435, "grad_norm": 1.119722843170166, "learning_rate": 4.394038497356524e-05, "loss": 0.3668, "num_input_tokens_seen": 35577904, "step": 37235 }, { "epoch": 3.0377681703238437, "grad_norm": 0.54348224401474, "learning_rate": 4.3938061634204005e-05, "loss": 0.3525, "num_input_tokens_seen": 35582496, "step": 37240 }, { "epoch": 3.0381760339342523, "grad_norm": 0.2568415105342865, "learning_rate": 4.393573791097631e-05, "loss": 0.451, "num_input_tokens_seen": 35586832, "step": 37245 }, { "epoch": 3.038583897544661, "grad_norm": 0.778991162776947, "learning_rate": 4.393341380392926e-05, "loss": 0.3295, "num_input_tokens_seen": 35592032, "step": 37250 }, { "epoch": 3.03899176115507, "grad_norm": 1.2013051509857178, "learning_rate": 4.393108931310995e-05, "loss": 0.4104, "num_input_tokens_seen": 35596480, "step": 37255 }, { "epoch": 3.0393996247654784, "grad_norm": 0.32136452198028564, "learning_rate": 4.392876443856551e-05, "loss": 0.3474, "num_input_tokens_seen": 35601808, "step": 37260 }, { "epoch": 3.039807488375887, "grad_norm": 0.36380916833877563, "learning_rate": 4.392643918034306e-05, "loss": 0.3728, "num_input_tokens_seen": 35606960, "step": 37265 }, { "epoch": 3.040215351986296, "grad_norm": 0.8077266812324524, "learning_rate": 4.392411353848972e-05, "loss": 0.3578, "num_input_tokens_seen": 35610960, "step": 37270 }, { "epoch": 3.0406232155967046, "grad_norm": 0.2518707811832428, "learning_rate": 4.392178751305265e-05, "loss": 0.3353, "num_input_tokens_seen": 35615824, "step": 37275 }, { "epoch": 3.041031079207113, "grad_norm": 0.9810946583747864, "learning_rate": 4.391946110407898e-05, "loss": 0.3696, "num_input_tokens_seen": 35620016, "step": 37280 }, { "epoch": 3.0414389428175217, "grad_norm": 0.81849205493927, "learning_rate": 4.391713431161587e-05, "loss": 0.3279, "num_input_tokens_seen": 35624880, "step": 37285 }, { "epoch": 3.0418468064279307, "grad_norm": 0.7540446519851685, "learning_rate": 4.39148071357105e-05, "loss": 0.3234, "num_input_tokens_seen": 35630192, "step": 37290 }, { "epoch": 3.0422546700383393, "grad_norm": 0.3571825325489044, "learning_rate": 4.391247957641001e-05, "loss": 0.3528, "num_input_tokens_seen": 35635184, "step": 37295 }, { "epoch": 3.042662533648748, "grad_norm": 0.25626707077026367, "learning_rate": 4.391015163376161e-05, "loss": 0.3424, "num_input_tokens_seen": 35640480, "step": 37300 }, { "epoch": 3.0430703972591564, "grad_norm": 0.6909327507019043, "learning_rate": 4.390782330781246e-05, "loss": 0.3202, "num_input_tokens_seen": 35645152, "step": 37305 }, { "epoch": 3.0434782608695654, "grad_norm": 0.8801909685134888, "learning_rate": 4.3905494598609777e-05, "loss": 0.3549, "num_input_tokens_seen": 35650704, "step": 37310 }, { "epoch": 3.043886124479974, "grad_norm": 0.5317019820213318, "learning_rate": 4.390316550620074e-05, "loss": 0.3661, "num_input_tokens_seen": 35655168, "step": 37315 }, { "epoch": 3.0442939880903825, "grad_norm": 1.0498628616333008, "learning_rate": 4.390083603063258e-05, "loss": 0.3698, "num_input_tokens_seen": 35658752, "step": 37320 }, { "epoch": 3.044701851700791, "grad_norm": 0.8320038914680481, "learning_rate": 4.38985061719525e-05, "loss": 0.3195, "num_input_tokens_seen": 35663664, "step": 37325 }, { "epoch": 3.0451097153112, "grad_norm": 0.7436873912811279, "learning_rate": 4.389617593020773e-05, "loss": 0.3119, "num_input_tokens_seen": 35668144, "step": 37330 }, { "epoch": 3.0455175789216087, "grad_norm": 0.28243115544319153, "learning_rate": 4.38938453054455e-05, "loss": 0.3211, "num_input_tokens_seen": 35672896, "step": 37335 }, { "epoch": 3.0459254425320172, "grad_norm": 0.8411362171173096, "learning_rate": 4.389151429771306e-05, "loss": 0.3871, "num_input_tokens_seen": 35678144, "step": 37340 }, { "epoch": 3.046333306142426, "grad_norm": 0.9085074067115784, "learning_rate": 4.3889182907057656e-05, "loss": 0.3243, "num_input_tokens_seen": 35683584, "step": 37345 }, { "epoch": 3.046741169752835, "grad_norm": 0.3967115879058838, "learning_rate": 4.388685113352654e-05, "loss": 0.3155, "num_input_tokens_seen": 35689056, "step": 37350 }, { "epoch": 3.0471490333632434, "grad_norm": 1.0005385875701904, "learning_rate": 4.3884518977166975e-05, "loss": 0.3657, "num_input_tokens_seen": 35693776, "step": 37355 }, { "epoch": 3.047556896973652, "grad_norm": 1.1363110542297363, "learning_rate": 4.3882186438026236e-05, "loss": 0.3713, "num_input_tokens_seen": 35699296, "step": 37360 }, { "epoch": 3.0479647605840605, "grad_norm": 0.7063040137290955, "learning_rate": 4.38798535161516e-05, "loss": 0.3002, "num_input_tokens_seen": 35705088, "step": 37365 }, { "epoch": 3.0483726241944695, "grad_norm": 0.970696747303009, "learning_rate": 4.387752021159035e-05, "loss": 0.3623, "num_input_tokens_seen": 35709728, "step": 37370 }, { "epoch": 3.048780487804878, "grad_norm": 0.5617082715034485, "learning_rate": 4.3875186524389795e-05, "loss": 0.3496, "num_input_tokens_seen": 35715344, "step": 37375 }, { "epoch": 3.0491883514152867, "grad_norm": 0.781144380569458, "learning_rate": 4.387285245459722e-05, "loss": 0.3107, "num_input_tokens_seen": 35720528, "step": 37380 }, { "epoch": 3.0495962150256952, "grad_norm": 0.7001383304595947, "learning_rate": 4.387051800225996e-05, "loss": 0.2905, "num_input_tokens_seen": 35724688, "step": 37385 }, { "epoch": 3.0500040786361042, "grad_norm": 0.6125872135162354, "learning_rate": 4.3868183167425306e-05, "loss": 0.4856, "num_input_tokens_seen": 35729840, "step": 37390 }, { "epoch": 3.050411942246513, "grad_norm": 0.5690719485282898, "learning_rate": 4.3865847950140606e-05, "loss": 0.4026, "num_input_tokens_seen": 35734080, "step": 37395 }, { "epoch": 3.0508198058569214, "grad_norm": 0.25584787130355835, "learning_rate": 4.386351235045318e-05, "loss": 0.3629, "num_input_tokens_seen": 35738736, "step": 37400 }, { "epoch": 3.05122766946733, "grad_norm": 0.716751217842102, "learning_rate": 4.3861176368410374e-05, "loss": 0.3363, "num_input_tokens_seen": 35744208, "step": 37405 }, { "epoch": 3.051635533077739, "grad_norm": 0.5805932283401489, "learning_rate": 4.3858840004059535e-05, "loss": 0.3594, "num_input_tokens_seen": 35748832, "step": 37410 }, { "epoch": 3.0520433966881475, "grad_norm": 0.34055283665657043, "learning_rate": 4.385650325744803e-05, "loss": 0.4211, "num_input_tokens_seen": 35754048, "step": 37415 }, { "epoch": 3.052451260298556, "grad_norm": 0.26135897636413574, "learning_rate": 4.385416612862321e-05, "loss": 0.3799, "num_input_tokens_seen": 35758080, "step": 37420 }, { "epoch": 3.0528591239089646, "grad_norm": 1.0280462503433228, "learning_rate": 4.385182861763246e-05, "loss": 0.399, "num_input_tokens_seen": 35763104, "step": 37425 }, { "epoch": 3.0532669875193736, "grad_norm": 0.30620816349983215, "learning_rate": 4.3849490724523143e-05, "loss": 0.3199, "num_input_tokens_seen": 35768672, "step": 37430 }, { "epoch": 3.053674851129782, "grad_norm": 0.38663503527641296, "learning_rate": 4.384715244934267e-05, "loss": 0.3253, "num_input_tokens_seen": 35774256, "step": 37435 }, { "epoch": 3.0540827147401908, "grad_norm": 1.0675941705703735, "learning_rate": 4.384481379213843e-05, "loss": 0.4118, "num_input_tokens_seen": 35778448, "step": 37440 }, { "epoch": 3.0544905783506, "grad_norm": 0.7683162689208984, "learning_rate": 4.384247475295781e-05, "loss": 0.3396, "num_input_tokens_seen": 35782400, "step": 37445 }, { "epoch": 3.0548984419610083, "grad_norm": 0.790883481502533, "learning_rate": 4.384013533184823e-05, "loss": 0.3648, "num_input_tokens_seen": 35787408, "step": 37450 }, { "epoch": 3.055306305571417, "grad_norm": 0.26021409034729004, "learning_rate": 4.383779552885713e-05, "loss": 0.3338, "num_input_tokens_seen": 35792208, "step": 37455 }, { "epoch": 3.0557141691818255, "grad_norm": 0.9101487398147583, "learning_rate": 4.3835455344031906e-05, "loss": 0.349, "num_input_tokens_seen": 35796464, "step": 37460 }, { "epoch": 3.0561220327922345, "grad_norm": 0.8024227619171143, "learning_rate": 4.383311477742001e-05, "loss": 0.3192, "num_input_tokens_seen": 35801136, "step": 37465 }, { "epoch": 3.056529896402643, "grad_norm": 0.3634832501411438, "learning_rate": 4.3830773829068876e-05, "loss": 0.3971, "num_input_tokens_seen": 35805744, "step": 37470 }, { "epoch": 3.0569377600130516, "grad_norm": 0.5142381191253662, "learning_rate": 4.382843249902596e-05, "loss": 0.4342, "num_input_tokens_seen": 35810976, "step": 37475 }, { "epoch": 3.05734562362346, "grad_norm": 0.3341263234615326, "learning_rate": 4.3826090787338724e-05, "loss": 0.2785, "num_input_tokens_seen": 35816560, "step": 37480 }, { "epoch": 3.057753487233869, "grad_norm": 0.5021449327468872, "learning_rate": 4.3823748694054626e-05, "loss": 0.3919, "num_input_tokens_seen": 35820880, "step": 37485 }, { "epoch": 3.0581613508442778, "grad_norm": 0.7913618683815002, "learning_rate": 4.3821406219221136e-05, "loss": 0.3791, "num_input_tokens_seen": 35825744, "step": 37490 }, { "epoch": 3.0585692144546863, "grad_norm": 0.4373278021812439, "learning_rate": 4.3819063362885734e-05, "loss": 0.3572, "num_input_tokens_seen": 35830336, "step": 37495 }, { "epoch": 3.058977078065095, "grad_norm": 1.0211323499679565, "learning_rate": 4.381672012509593e-05, "loss": 0.399, "num_input_tokens_seen": 35835376, "step": 37500 }, { "epoch": 3.059384941675504, "grad_norm": 0.9119577407836914, "learning_rate": 4.381437650589919e-05, "loss": 0.3583, "num_input_tokens_seen": 35840400, "step": 37505 }, { "epoch": 3.0597928052859125, "grad_norm": 0.3670867681503296, "learning_rate": 4.381203250534304e-05, "loss": 0.3699, "num_input_tokens_seen": 35845696, "step": 37510 }, { "epoch": 3.060200668896321, "grad_norm": 0.6438929438591003, "learning_rate": 4.380968812347499e-05, "loss": 0.2842, "num_input_tokens_seen": 35850848, "step": 37515 }, { "epoch": 3.0606085325067296, "grad_norm": 0.686275064945221, "learning_rate": 4.380734336034255e-05, "loss": 0.2962, "num_input_tokens_seen": 35855584, "step": 37520 }, { "epoch": 3.0610163961171386, "grad_norm": 0.6277687549591064, "learning_rate": 4.3804998215993245e-05, "loss": 0.4383, "num_input_tokens_seen": 35859728, "step": 37525 }, { "epoch": 3.061424259727547, "grad_norm": 1.2474528551101685, "learning_rate": 4.380265269047462e-05, "loss": 0.4008, "num_input_tokens_seen": 35864528, "step": 37530 }, { "epoch": 3.0618321233379557, "grad_norm": 0.724254310131073, "learning_rate": 4.3800306783834224e-05, "loss": 0.3748, "num_input_tokens_seen": 35868656, "step": 37535 }, { "epoch": 3.0622399869483643, "grad_norm": 0.2737792432308197, "learning_rate": 4.3797960496119585e-05, "loss": 0.3403, "num_input_tokens_seen": 35873120, "step": 37540 }, { "epoch": 3.0626478505587733, "grad_norm": 0.2242756187915802, "learning_rate": 4.3795613827378286e-05, "loss": 0.3144, "num_input_tokens_seen": 35878176, "step": 37545 }, { "epoch": 3.063055714169182, "grad_norm": 0.36232802271842957, "learning_rate": 4.379326677765788e-05, "loss": 0.326, "num_input_tokens_seen": 35883296, "step": 37550 }, { "epoch": 3.0634635777795904, "grad_norm": 0.3576432764530182, "learning_rate": 4.379091934700593e-05, "loss": 0.3393, "num_input_tokens_seen": 35888048, "step": 37555 }, { "epoch": 3.063871441389999, "grad_norm": 0.4385671615600586, "learning_rate": 4.378857153547004e-05, "loss": 0.4326, "num_input_tokens_seen": 35892960, "step": 37560 }, { "epoch": 3.064279305000408, "grad_norm": 0.3300941288471222, "learning_rate": 4.378622334309779e-05, "loss": 0.3665, "num_input_tokens_seen": 35897664, "step": 37565 }, { "epoch": 3.0646871686108166, "grad_norm": 0.4317229986190796, "learning_rate": 4.3783874769936776e-05, "loss": 0.3627, "num_input_tokens_seen": 35902176, "step": 37570 }, { "epoch": 3.065095032221225, "grad_norm": 0.14993247389793396, "learning_rate": 4.378152581603459e-05, "loss": 0.3738, "num_input_tokens_seen": 35907024, "step": 37575 }, { "epoch": 3.0655028958316337, "grad_norm": 0.2311803549528122, "learning_rate": 4.377917648143887e-05, "loss": 0.373, "num_input_tokens_seen": 35911680, "step": 37580 }, { "epoch": 3.0659107594420427, "grad_norm": 0.26996049284935, "learning_rate": 4.377682676619721e-05, "loss": 0.3624, "num_input_tokens_seen": 35917216, "step": 37585 }, { "epoch": 3.0663186230524513, "grad_norm": 0.923563539981842, "learning_rate": 4.3774476670357263e-05, "loss": 0.3869, "num_input_tokens_seen": 35922496, "step": 37590 }, { "epoch": 3.06672648666286, "grad_norm": 0.6661641001701355, "learning_rate": 4.377212619396665e-05, "loss": 0.3222, "num_input_tokens_seen": 35927024, "step": 37595 }, { "epoch": 3.0671343502732684, "grad_norm": 1.0077894926071167, "learning_rate": 4.376977533707301e-05, "loss": 0.3887, "num_input_tokens_seen": 35932304, "step": 37600 }, { "epoch": 3.0675422138836774, "grad_norm": 0.6978349089622498, "learning_rate": 4.3767424099724e-05, "loss": 0.2857, "num_input_tokens_seen": 35937280, "step": 37605 }, { "epoch": 3.067950077494086, "grad_norm": 1.1044219732284546, "learning_rate": 4.376507248196728e-05, "loss": 0.3725, "num_input_tokens_seen": 35941600, "step": 37610 }, { "epoch": 3.0683579411044946, "grad_norm": 0.1875421404838562, "learning_rate": 4.376272048385052e-05, "loss": 0.4085, "num_input_tokens_seen": 35945200, "step": 37615 }, { "epoch": 3.068765804714903, "grad_norm": 0.17868591845035553, "learning_rate": 4.3760368105421376e-05, "loss": 0.3444, "num_input_tokens_seen": 35949760, "step": 37620 }, { "epoch": 3.069173668325312, "grad_norm": 0.23256827890872955, "learning_rate": 4.375801534672755e-05, "loss": 0.3576, "num_input_tokens_seen": 35954496, "step": 37625 }, { "epoch": 3.0695815319357207, "grad_norm": 0.247033029794693, "learning_rate": 4.375566220781672e-05, "loss": 0.3758, "num_input_tokens_seen": 35959648, "step": 37630 }, { "epoch": 3.0699893955461293, "grad_norm": 0.21433648467063904, "learning_rate": 4.375330868873659e-05, "loss": 0.3369, "num_input_tokens_seen": 35964784, "step": 37635 }, { "epoch": 3.070397259156538, "grad_norm": 0.7886608242988586, "learning_rate": 4.375095478953486e-05, "loss": 0.3717, "num_input_tokens_seen": 35968448, "step": 37640 }, { "epoch": 3.070805122766947, "grad_norm": 0.32227587699890137, "learning_rate": 4.374860051025925e-05, "loss": 0.3725, "num_input_tokens_seen": 35973568, "step": 37645 }, { "epoch": 3.0712129863773554, "grad_norm": 0.8034085035324097, "learning_rate": 4.374624585095747e-05, "loss": 0.3465, "num_input_tokens_seen": 35978288, "step": 37650 }, { "epoch": 3.071620849987764, "grad_norm": 0.1654025912284851, "learning_rate": 4.374389081167725e-05, "loss": 0.3394, "num_input_tokens_seen": 35982320, "step": 37655 }, { "epoch": 3.072028713598173, "grad_norm": 0.8465822339057922, "learning_rate": 4.374153539246633e-05, "loss": 0.3588, "num_input_tokens_seen": 35986336, "step": 37660 }, { "epoch": 3.0724365772085815, "grad_norm": 0.9822971820831299, "learning_rate": 4.373917959337246e-05, "loss": 0.3378, "num_input_tokens_seen": 35991040, "step": 37665 }, { "epoch": 3.07284444081899, "grad_norm": 0.2720835506916046, "learning_rate": 4.373682341444337e-05, "loss": 0.3456, "num_input_tokens_seen": 35995856, "step": 37670 }, { "epoch": 3.0732523044293987, "grad_norm": 0.9471209049224854, "learning_rate": 4.373446685572683e-05, "loss": 0.3712, "num_input_tokens_seen": 35999856, "step": 37675 }, { "epoch": 3.0736601680398077, "grad_norm": 0.8004488348960876, "learning_rate": 4.373210991727061e-05, "loss": 0.3522, "num_input_tokens_seen": 36004528, "step": 37680 }, { "epoch": 3.0740680316502162, "grad_norm": 0.3456135094165802, "learning_rate": 4.3729752599122495e-05, "loss": 0.3786, "num_input_tokens_seen": 36010160, "step": 37685 }, { "epoch": 3.074475895260625, "grad_norm": 0.4262235164642334, "learning_rate": 4.372739490133024e-05, "loss": 0.3452, "num_input_tokens_seen": 36015392, "step": 37690 }, { "epoch": 3.0748837588710334, "grad_norm": 0.3339865505695343, "learning_rate": 4.372503682394165e-05, "loss": 0.4038, "num_input_tokens_seen": 36020400, "step": 37695 }, { "epoch": 3.0752916224814424, "grad_norm": 0.3181331753730774, "learning_rate": 4.372267836700452e-05, "loss": 0.3404, "num_input_tokens_seen": 36024624, "step": 37700 }, { "epoch": 3.075699486091851, "grad_norm": 0.21230585873126984, "learning_rate": 4.372031953056665e-05, "loss": 0.3429, "num_input_tokens_seen": 36029456, "step": 37705 }, { "epoch": 3.0761073497022595, "grad_norm": 0.21680708229541779, "learning_rate": 4.371796031467588e-05, "loss": 0.3398, "num_input_tokens_seen": 36033856, "step": 37710 }, { "epoch": 3.076515213312668, "grad_norm": 0.767387330532074, "learning_rate": 4.371560071937999e-05, "loss": 0.3282, "num_input_tokens_seen": 36038064, "step": 37715 }, { "epoch": 3.076923076923077, "grad_norm": 0.24301986396312714, "learning_rate": 4.371324074472683e-05, "loss": 0.3688, "num_input_tokens_seen": 36043024, "step": 37720 }, { "epoch": 3.0773309405334857, "grad_norm": 0.31099840998649597, "learning_rate": 4.371088039076423e-05, "loss": 0.3287, "num_input_tokens_seen": 36046688, "step": 37725 }, { "epoch": 3.0777388041438942, "grad_norm": 0.5943452715873718, "learning_rate": 4.370851965754003e-05, "loss": 0.2539, "num_input_tokens_seen": 36051552, "step": 37730 }, { "epoch": 3.078146667754303, "grad_norm": 1.493316411972046, "learning_rate": 4.3706158545102094e-05, "loss": 0.3617, "num_input_tokens_seen": 36056832, "step": 37735 }, { "epoch": 3.078554531364712, "grad_norm": 1.4327195882797241, "learning_rate": 4.3703797053498274e-05, "loss": 0.5756, "num_input_tokens_seen": 36062144, "step": 37740 }, { "epoch": 3.0789623949751204, "grad_norm": 0.5750217437744141, "learning_rate": 4.3701435182776426e-05, "loss": 0.2394, "num_input_tokens_seen": 36066848, "step": 37745 }, { "epoch": 3.079370258585529, "grad_norm": 0.5510174036026001, "learning_rate": 4.369907293298444e-05, "loss": 0.2438, "num_input_tokens_seen": 36071488, "step": 37750 }, { "epoch": 3.0797781221959375, "grad_norm": 1.4002387523651123, "learning_rate": 4.3696710304170194e-05, "loss": 0.3135, "num_input_tokens_seen": 36075232, "step": 37755 }, { "epoch": 3.0801859858063465, "grad_norm": 0.4404261112213135, "learning_rate": 4.369434729638158e-05, "loss": 0.1853, "num_input_tokens_seen": 36080592, "step": 37760 }, { "epoch": 3.080593849416755, "grad_norm": 0.4197846055030823, "learning_rate": 4.369198390966648e-05, "loss": 0.248, "num_input_tokens_seen": 36084384, "step": 37765 }, { "epoch": 3.0810017130271636, "grad_norm": 0.40484070777893066, "learning_rate": 4.368962014407281e-05, "loss": 0.4824, "num_input_tokens_seen": 36089136, "step": 37770 }, { "epoch": 3.081409576637572, "grad_norm": 1.3509306907653809, "learning_rate": 4.368725599964848e-05, "loss": 0.4154, "num_input_tokens_seen": 36094176, "step": 37775 }, { "epoch": 3.081817440247981, "grad_norm": 0.3851287066936493, "learning_rate": 4.368489147644141e-05, "loss": 0.4323, "num_input_tokens_seen": 36098048, "step": 37780 }, { "epoch": 3.0822253038583898, "grad_norm": 0.9215186834335327, "learning_rate": 4.3682526574499546e-05, "loss": 0.3111, "num_input_tokens_seen": 36102016, "step": 37785 }, { "epoch": 3.0826331674687983, "grad_norm": 0.7440876960754395, "learning_rate": 4.368016129387079e-05, "loss": 0.3536, "num_input_tokens_seen": 36107456, "step": 37790 }, { "epoch": 3.083041031079207, "grad_norm": 0.37640324234962463, "learning_rate": 4.367779563460311e-05, "loss": 0.3407, "num_input_tokens_seen": 36112512, "step": 37795 }, { "epoch": 3.083448894689616, "grad_norm": 0.12266603112220764, "learning_rate": 4.367542959674445e-05, "loss": 0.3588, "num_input_tokens_seen": 36117488, "step": 37800 }, { "epoch": 3.0838567583000245, "grad_norm": 1.231828212738037, "learning_rate": 4.367306318034276e-05, "loss": 0.3652, "num_input_tokens_seen": 36121920, "step": 37805 }, { "epoch": 3.084264621910433, "grad_norm": 1.031939148902893, "learning_rate": 4.367069638544602e-05, "loss": 0.3503, "num_input_tokens_seen": 36127696, "step": 37810 }, { "epoch": 3.0846724855208416, "grad_norm": 0.7943031191825867, "learning_rate": 4.3668329212102196e-05, "loss": 0.3628, "num_input_tokens_seen": 36131872, "step": 37815 }, { "epoch": 3.0850803491312506, "grad_norm": 0.8849295377731323, "learning_rate": 4.3665961660359264e-05, "loss": 0.3603, "num_input_tokens_seen": 36137152, "step": 37820 }, { "epoch": 3.085488212741659, "grad_norm": 0.22987182438373566, "learning_rate": 4.3663593730265235e-05, "loss": 0.3732, "num_input_tokens_seen": 36141248, "step": 37825 }, { "epoch": 3.0858960763520678, "grad_norm": 0.2693611681461334, "learning_rate": 4.3661225421868076e-05, "loss": 0.3399, "num_input_tokens_seen": 36147008, "step": 37830 }, { "epoch": 3.0863039399624768, "grad_norm": 0.27967390418052673, "learning_rate": 4.3658856735215826e-05, "loss": 0.3936, "num_input_tokens_seen": 36152304, "step": 37835 }, { "epoch": 3.0867118035728853, "grad_norm": 0.24099820852279663, "learning_rate": 4.3656487670356466e-05, "loss": 0.3817, "num_input_tokens_seen": 36155920, "step": 37840 }, { "epoch": 3.087119667183294, "grad_norm": 0.8115445971488953, "learning_rate": 4.3654118227338034e-05, "loss": 0.3435, "num_input_tokens_seen": 36160224, "step": 37845 }, { "epoch": 3.0875275307937025, "grad_norm": 0.6371379494667053, "learning_rate": 4.365174840620855e-05, "loss": 0.3071, "num_input_tokens_seen": 36164528, "step": 37850 }, { "epoch": 3.0879353944041115, "grad_norm": 0.4742496609687805, "learning_rate": 4.364937820701605e-05, "loss": 0.3343, "num_input_tokens_seen": 36169312, "step": 37855 }, { "epoch": 3.08834325801452, "grad_norm": 0.4006587266921997, "learning_rate": 4.364700762980858e-05, "loss": 0.4566, "num_input_tokens_seen": 36173984, "step": 37860 }, { "epoch": 3.0887511216249286, "grad_norm": 0.310091108083725, "learning_rate": 4.364463667463419e-05, "loss": 0.3684, "num_input_tokens_seen": 36178272, "step": 37865 }, { "epoch": 3.089158985235337, "grad_norm": 0.3036665618419647, "learning_rate": 4.3642265341540935e-05, "loss": 0.4211, "num_input_tokens_seen": 36182880, "step": 37870 }, { "epoch": 3.089566848845746, "grad_norm": 0.4806022346019745, "learning_rate": 4.363989363057688e-05, "loss": 0.3738, "num_input_tokens_seen": 36187776, "step": 37875 }, { "epoch": 3.0899747124561547, "grad_norm": 0.850380539894104, "learning_rate": 4.3637521541790104e-05, "loss": 0.3875, "num_input_tokens_seen": 36192688, "step": 37880 }, { "epoch": 3.0903825760665633, "grad_norm": 0.12317255139350891, "learning_rate": 4.363514907522869e-05, "loss": 0.3347, "num_input_tokens_seen": 36197072, "step": 37885 }, { "epoch": 3.090790439676972, "grad_norm": 0.9488558769226074, "learning_rate": 4.363277623094072e-05, "loss": 0.3554, "num_input_tokens_seen": 36201024, "step": 37890 }, { "epoch": 3.091198303287381, "grad_norm": 0.7916227579116821, "learning_rate": 4.36304030089743e-05, "loss": 0.3621, "num_input_tokens_seen": 36206272, "step": 37895 }, { "epoch": 3.0916061668977894, "grad_norm": 0.9251986742019653, "learning_rate": 4.362802940937751e-05, "loss": 0.3694, "num_input_tokens_seen": 36211872, "step": 37900 }, { "epoch": 3.092014030508198, "grad_norm": 0.20028340816497803, "learning_rate": 4.3625655432198495e-05, "loss": 0.3635, "num_input_tokens_seen": 36216912, "step": 37905 }, { "epoch": 3.0924218941186066, "grad_norm": 0.8661041855812073, "learning_rate": 4.3623281077485355e-05, "loss": 0.3691, "num_input_tokens_seen": 36221696, "step": 37910 }, { "epoch": 3.0928297577290156, "grad_norm": 0.18472100794315338, "learning_rate": 4.3620906345286216e-05, "loss": 0.3475, "num_input_tokens_seen": 36226080, "step": 37915 }, { "epoch": 3.093237621339424, "grad_norm": 0.5395455956459045, "learning_rate": 4.361853123564923e-05, "loss": 0.3173, "num_input_tokens_seen": 36229936, "step": 37920 }, { "epoch": 3.0936454849498327, "grad_norm": 0.5017979145050049, "learning_rate": 4.361615574862252e-05, "loss": 0.2103, "num_input_tokens_seen": 36235248, "step": 37925 }, { "epoch": 3.0940533485602413, "grad_norm": 0.5545333623886108, "learning_rate": 4.361377988425423e-05, "loss": 0.4769, "num_input_tokens_seen": 36240464, "step": 37930 }, { "epoch": 3.0944612121706503, "grad_norm": 0.4537814259529114, "learning_rate": 4.361140364259255e-05, "loss": 0.2551, "num_input_tokens_seen": 36244960, "step": 37935 }, { "epoch": 3.094869075781059, "grad_norm": 0.46452581882476807, "learning_rate": 4.3609027023685626e-05, "loss": 0.3246, "num_input_tokens_seen": 36249136, "step": 37940 }, { "epoch": 3.0952769393914674, "grad_norm": 1.268829584121704, "learning_rate": 4.360665002758162e-05, "loss": 0.4698, "num_input_tokens_seen": 36253872, "step": 37945 }, { "epoch": 3.095684803001876, "grad_norm": 1.0405722856521606, "learning_rate": 4.360427265432874e-05, "loss": 0.431, "num_input_tokens_seen": 36259120, "step": 37950 }, { "epoch": 3.096092666612285, "grad_norm": 0.8541163206100464, "learning_rate": 4.360189490397515e-05, "loss": 0.326, "num_input_tokens_seen": 36264864, "step": 37955 }, { "epoch": 3.0965005302226936, "grad_norm": 0.833182156085968, "learning_rate": 4.359951677656905e-05, "loss": 0.3608, "num_input_tokens_seen": 36269712, "step": 37960 }, { "epoch": 3.096908393833102, "grad_norm": 0.30073282122612, "learning_rate": 4.359713827215865e-05, "loss": 0.377, "num_input_tokens_seen": 36274480, "step": 37965 }, { "epoch": 3.0973162574435107, "grad_norm": 0.7985087037086487, "learning_rate": 4.359475939079217e-05, "loss": 0.3531, "num_input_tokens_seen": 36279440, "step": 37970 }, { "epoch": 3.0977241210539197, "grad_norm": 0.9216703176498413, "learning_rate": 4.359238013251781e-05, "loss": 0.3597, "num_input_tokens_seen": 36284000, "step": 37975 }, { "epoch": 3.0981319846643283, "grad_norm": 0.8865138292312622, "learning_rate": 4.35900004973838e-05, "loss": 0.3588, "num_input_tokens_seen": 36288976, "step": 37980 }, { "epoch": 3.098539848274737, "grad_norm": 0.6465641856193542, "learning_rate": 4.358762048543839e-05, "loss": 0.3204, "num_input_tokens_seen": 36294432, "step": 37985 }, { "epoch": 3.0989477118851454, "grad_norm": 1.1495368480682373, "learning_rate": 4.358524009672981e-05, "loss": 0.4434, "num_input_tokens_seen": 36298832, "step": 37990 }, { "epoch": 3.0993555754955544, "grad_norm": 0.552023708820343, "learning_rate": 4.3582859331306306e-05, "loss": 0.338, "num_input_tokens_seen": 36303536, "step": 37995 }, { "epoch": 3.099763439105963, "grad_norm": 0.27154654264450073, "learning_rate": 4.3580478189216136e-05, "loss": 0.3049, "num_input_tokens_seen": 36308896, "step": 38000 }, { "epoch": 3.1001713027163715, "grad_norm": 0.255912184715271, "learning_rate": 4.357809667050757e-05, "loss": 0.4056, "num_input_tokens_seen": 36313216, "step": 38005 }, { "epoch": 3.1005791663267805, "grad_norm": 0.13533496856689453, "learning_rate": 4.357571477522888e-05, "loss": 0.3605, "num_input_tokens_seen": 36318032, "step": 38010 }, { "epoch": 3.100987029937189, "grad_norm": 0.8690183162689209, "learning_rate": 4.357333250342835e-05, "loss": 0.3497, "num_input_tokens_seen": 36323056, "step": 38015 }, { "epoch": 3.1013948935475977, "grad_norm": 0.17708252370357513, "learning_rate": 4.3570949855154266e-05, "loss": 0.3447, "num_input_tokens_seen": 36327264, "step": 38020 }, { "epoch": 3.1018027571580062, "grad_norm": 1.0079200267791748, "learning_rate": 4.35685668304549e-05, "loss": 0.321, "num_input_tokens_seen": 36332176, "step": 38025 }, { "epoch": 3.1022106207684153, "grad_norm": 0.22089728713035583, "learning_rate": 4.3566183429378595e-05, "loss": 0.3765, "num_input_tokens_seen": 36336800, "step": 38030 }, { "epoch": 3.102618484378824, "grad_norm": 0.2553706765174866, "learning_rate": 4.3563799651973635e-05, "loss": 0.3274, "num_input_tokens_seen": 36341088, "step": 38035 }, { "epoch": 3.1030263479892324, "grad_norm": 0.9903526306152344, "learning_rate": 4.356141549828834e-05, "loss": 0.3626, "num_input_tokens_seen": 36346048, "step": 38040 }, { "epoch": 3.103434211599641, "grad_norm": 0.21540939807891846, "learning_rate": 4.355903096837104e-05, "loss": 0.3312, "num_input_tokens_seen": 36351264, "step": 38045 }, { "epoch": 3.10384207521005, "grad_norm": 1.0126066207885742, "learning_rate": 4.3556646062270076e-05, "loss": 0.3032, "num_input_tokens_seen": 36355840, "step": 38050 }, { "epoch": 3.1042499388204585, "grad_norm": 0.7018945813179016, "learning_rate": 4.355426078003378e-05, "loss": 0.334, "num_input_tokens_seen": 36360992, "step": 38055 }, { "epoch": 3.104657802430867, "grad_norm": 1.1704096794128418, "learning_rate": 4.355187512171049e-05, "loss": 0.39, "num_input_tokens_seen": 36365456, "step": 38060 }, { "epoch": 3.1050656660412757, "grad_norm": 0.7445017099380493, "learning_rate": 4.3549489087348596e-05, "loss": 0.3776, "num_input_tokens_seen": 36370640, "step": 38065 }, { "epoch": 3.1054735296516847, "grad_norm": 0.9526610374450684, "learning_rate": 4.3547102676996423e-05, "loss": 0.3561, "num_input_tokens_seen": 36375632, "step": 38070 }, { "epoch": 3.1058813932620932, "grad_norm": 0.8191004991531372, "learning_rate": 4.3544715890702365e-05, "loss": 0.3429, "num_input_tokens_seen": 36381408, "step": 38075 }, { "epoch": 3.106289256872502, "grad_norm": 0.35170111060142517, "learning_rate": 4.354232872851479e-05, "loss": 0.3279, "num_input_tokens_seen": 36386592, "step": 38080 }, { "epoch": 3.1066971204829104, "grad_norm": 1.1170159578323364, "learning_rate": 4.35399411904821e-05, "loss": 0.3723, "num_input_tokens_seen": 36391088, "step": 38085 }, { "epoch": 3.1071049840933194, "grad_norm": 0.6682947874069214, "learning_rate": 4.3537553276652676e-05, "loss": 0.3453, "num_input_tokens_seen": 36396240, "step": 38090 }, { "epoch": 3.107512847703728, "grad_norm": 0.6778727173805237, "learning_rate": 4.353516498707493e-05, "loss": 0.3213, "num_input_tokens_seen": 36401328, "step": 38095 }, { "epoch": 3.1079207113141365, "grad_norm": 0.3508234918117523, "learning_rate": 4.353277632179726e-05, "loss": 0.3608, "num_input_tokens_seen": 36405504, "step": 38100 }, { "epoch": 3.108328574924545, "grad_norm": 1.076683521270752, "learning_rate": 4.353038728086809e-05, "loss": 0.3843, "num_input_tokens_seen": 36409328, "step": 38105 }, { "epoch": 3.108736438534954, "grad_norm": 0.9194771647453308, "learning_rate": 4.3527997864335844e-05, "loss": 0.3233, "num_input_tokens_seen": 36413568, "step": 38110 }, { "epoch": 3.1091443021453626, "grad_norm": 1.143434762954712, "learning_rate": 4.352560807224895e-05, "loss": 0.3454, "num_input_tokens_seen": 36418880, "step": 38115 }, { "epoch": 3.109552165755771, "grad_norm": 0.38475775718688965, "learning_rate": 4.352321790465586e-05, "loss": 0.3876, "num_input_tokens_seen": 36423296, "step": 38120 }, { "epoch": 3.1099600293661798, "grad_norm": 0.7362242341041565, "learning_rate": 4.352082736160501e-05, "loss": 0.3403, "num_input_tokens_seen": 36428704, "step": 38125 }, { "epoch": 3.1103678929765888, "grad_norm": 0.36023804545402527, "learning_rate": 4.351843644314486e-05, "loss": 0.318, "num_input_tokens_seen": 36431872, "step": 38130 }, { "epoch": 3.1107757565869973, "grad_norm": 1.477124810218811, "learning_rate": 4.351604514932387e-05, "loss": 0.362, "num_input_tokens_seen": 36436640, "step": 38135 }, { "epoch": 3.111183620197406, "grad_norm": 0.45613768696784973, "learning_rate": 4.351365348019052e-05, "loss": 0.3464, "num_input_tokens_seen": 36441856, "step": 38140 }, { "epoch": 3.1115914838078145, "grad_norm": 0.5395607948303223, "learning_rate": 4.351126143579327e-05, "loss": 0.3443, "num_input_tokens_seen": 36447488, "step": 38145 }, { "epoch": 3.1119993474182235, "grad_norm": 0.6235482096672058, "learning_rate": 4.350886901618063e-05, "loss": 0.3036, "num_input_tokens_seen": 36451856, "step": 38150 }, { "epoch": 3.112407211028632, "grad_norm": 0.6382150053977966, "learning_rate": 4.3506476221401065e-05, "loss": 0.3481, "num_input_tokens_seen": 36456512, "step": 38155 }, { "epoch": 3.1128150746390406, "grad_norm": 0.7269455194473267, "learning_rate": 4.3504083051503096e-05, "loss": 0.3737, "num_input_tokens_seen": 36460304, "step": 38160 }, { "epoch": 3.113222938249449, "grad_norm": 1.4960330724716187, "learning_rate": 4.350168950653524e-05, "loss": 0.3836, "num_input_tokens_seen": 36464896, "step": 38165 }, { "epoch": 3.113630801859858, "grad_norm": 0.36781755089759827, "learning_rate": 4.349929558654599e-05, "loss": 0.3222, "num_input_tokens_seen": 36469456, "step": 38170 }, { "epoch": 3.1140386654702668, "grad_norm": 0.30164334177970886, "learning_rate": 4.349690129158387e-05, "loss": 0.3335, "num_input_tokens_seen": 36474096, "step": 38175 }, { "epoch": 3.1144465290806753, "grad_norm": 0.21906134486198425, "learning_rate": 4.349450662169743e-05, "loss": 0.3118, "num_input_tokens_seen": 36478640, "step": 38180 }, { "epoch": 3.1148543926910843, "grad_norm": 0.3555414378643036, "learning_rate": 4.349211157693519e-05, "loss": 0.3707, "num_input_tokens_seen": 36484560, "step": 38185 }, { "epoch": 3.115262256301493, "grad_norm": 0.23537079989910126, "learning_rate": 4.3489716157345716e-05, "loss": 0.3246, "num_input_tokens_seen": 36489088, "step": 38190 }, { "epoch": 3.1156701199119015, "grad_norm": 0.8332579731941223, "learning_rate": 4.348732036297755e-05, "loss": 0.3422, "num_input_tokens_seen": 36493040, "step": 38195 }, { "epoch": 3.11607798352231, "grad_norm": 0.78733891248703, "learning_rate": 4.3484924193879255e-05, "loss": 0.3504, "num_input_tokens_seen": 36498576, "step": 38200 }, { "epoch": 3.116485847132719, "grad_norm": 1.1519501209259033, "learning_rate": 4.34825276500994e-05, "loss": 0.3766, "num_input_tokens_seen": 36503136, "step": 38205 }, { "epoch": 3.1168937107431276, "grad_norm": 0.4129520058631897, "learning_rate": 4.348013073168657e-05, "loss": 0.3645, "num_input_tokens_seen": 36507488, "step": 38210 }, { "epoch": 3.117301574353536, "grad_norm": 0.671769917011261, "learning_rate": 4.3477733438689337e-05, "loss": 0.3475, "num_input_tokens_seen": 36511904, "step": 38215 }, { "epoch": 3.1177094379639447, "grad_norm": 0.9517691731452942, "learning_rate": 4.34753357711563e-05, "loss": 0.3463, "num_input_tokens_seen": 36516288, "step": 38220 }, { "epoch": 3.1181173015743537, "grad_norm": 0.22804342210292816, "learning_rate": 4.3472937729136054e-05, "loss": 0.3611, "num_input_tokens_seen": 36521376, "step": 38225 }, { "epoch": 3.1185251651847623, "grad_norm": 0.9322283267974854, "learning_rate": 4.3470539312677215e-05, "loss": 0.3614, "num_input_tokens_seen": 36525920, "step": 38230 }, { "epoch": 3.118933028795171, "grad_norm": 0.8086922764778137, "learning_rate": 4.346814052182839e-05, "loss": 0.3335, "num_input_tokens_seen": 36530720, "step": 38235 }, { "epoch": 3.1193408924055794, "grad_norm": 0.36136552691459656, "learning_rate": 4.346574135663821e-05, "loss": 0.3564, "num_input_tokens_seen": 36536064, "step": 38240 }, { "epoch": 3.1197487560159884, "grad_norm": 0.9372590184211731, "learning_rate": 4.346334181715529e-05, "loss": 0.3427, "num_input_tokens_seen": 36540736, "step": 38245 }, { "epoch": 3.120156619626397, "grad_norm": 0.6953800916671753, "learning_rate": 4.346094190342829e-05, "loss": 0.3078, "num_input_tokens_seen": 36546176, "step": 38250 }, { "epoch": 3.1205644832368056, "grad_norm": 0.5799208879470825, "learning_rate": 4.345854161550583e-05, "loss": 0.308, "num_input_tokens_seen": 36550640, "step": 38255 }, { "epoch": 3.120972346847214, "grad_norm": 0.644144594669342, "learning_rate": 4.345614095343658e-05, "loss": 0.3019, "num_input_tokens_seen": 36555648, "step": 38260 }, { "epoch": 3.121380210457623, "grad_norm": 0.5359276533126831, "learning_rate": 4.345373991726919e-05, "loss": 0.3307, "num_input_tokens_seen": 36560240, "step": 38265 }, { "epoch": 3.1217880740680317, "grad_norm": 0.47222018241882324, "learning_rate": 4.345133850705234e-05, "loss": 0.4366, "num_input_tokens_seen": 36564880, "step": 38270 }, { "epoch": 3.1221959376784403, "grad_norm": 0.7258188724517822, "learning_rate": 4.34489367228347e-05, "loss": 0.3176, "num_input_tokens_seen": 36569648, "step": 38275 }, { "epoch": 3.122603801288849, "grad_norm": 0.5754956007003784, "learning_rate": 4.3446534564664944e-05, "loss": 0.3698, "num_input_tokens_seen": 36574816, "step": 38280 }, { "epoch": 3.123011664899258, "grad_norm": 0.5586532354354858, "learning_rate": 4.3444132032591775e-05, "loss": 0.3593, "num_input_tokens_seen": 36579712, "step": 38285 }, { "epoch": 3.1234195285096664, "grad_norm": 0.4533784091472626, "learning_rate": 4.344172912666389e-05, "loss": 0.3626, "num_input_tokens_seen": 36584192, "step": 38290 }, { "epoch": 3.123827392120075, "grad_norm": 0.259806364774704, "learning_rate": 4.343932584692999e-05, "loss": 0.3336, "num_input_tokens_seen": 36588704, "step": 38295 }, { "epoch": 3.1242352557304836, "grad_norm": 0.7637419700622559, "learning_rate": 4.343692219343879e-05, "loss": 0.3919, "num_input_tokens_seen": 36593248, "step": 38300 }, { "epoch": 3.1246431193408926, "grad_norm": 0.9091692566871643, "learning_rate": 4.343451816623901e-05, "loss": 0.3032, "num_input_tokens_seen": 36597648, "step": 38305 }, { "epoch": 3.125050982951301, "grad_norm": 1.2761555910110474, "learning_rate": 4.343211376537938e-05, "loss": 0.3383, "num_input_tokens_seen": 36602112, "step": 38310 }, { "epoch": 3.1254588465617097, "grad_norm": 0.6386421322822571, "learning_rate": 4.342970899090863e-05, "loss": 0.2246, "num_input_tokens_seen": 36607024, "step": 38315 }, { "epoch": 3.1258667101721183, "grad_norm": 0.7652361392974854, "learning_rate": 4.3427303842875525e-05, "loss": 0.4683, "num_input_tokens_seen": 36611488, "step": 38320 }, { "epoch": 3.1262745737825273, "grad_norm": 0.49892348051071167, "learning_rate": 4.342489832132879e-05, "loss": 0.2849, "num_input_tokens_seen": 36616016, "step": 38325 }, { "epoch": 3.126682437392936, "grad_norm": 0.6304554343223572, "learning_rate": 4.34224924263172e-05, "loss": 0.3018, "num_input_tokens_seen": 36620720, "step": 38330 }, { "epoch": 3.1270903010033444, "grad_norm": 0.6713236570358276, "learning_rate": 4.342008615788952e-05, "loss": 0.3544, "num_input_tokens_seen": 36625360, "step": 38335 }, { "epoch": 3.127498164613753, "grad_norm": 3.1134939193725586, "learning_rate": 4.341767951609451e-05, "loss": 0.3761, "num_input_tokens_seen": 36630400, "step": 38340 }, { "epoch": 3.127906028224162, "grad_norm": 0.6696015000343323, "learning_rate": 4.341527250098097e-05, "loss": 0.3662, "num_input_tokens_seen": 36635648, "step": 38345 }, { "epoch": 3.1283138918345705, "grad_norm": 0.6055328249931335, "learning_rate": 4.3412865112597685e-05, "loss": 0.343, "num_input_tokens_seen": 36640048, "step": 38350 }, { "epoch": 3.128721755444979, "grad_norm": 1.1899266242980957, "learning_rate": 4.341045735099344e-05, "loss": 0.3463, "num_input_tokens_seen": 36644208, "step": 38355 }, { "epoch": 3.129129619055388, "grad_norm": 1.190333604812622, "learning_rate": 4.340804921621705e-05, "loss": 0.3346, "num_input_tokens_seen": 36648976, "step": 38360 }, { "epoch": 3.1295374826657967, "grad_norm": 0.6694270372390747, "learning_rate": 4.340564070831733e-05, "loss": 0.3571, "num_input_tokens_seen": 36654608, "step": 38365 }, { "epoch": 3.1299453462762052, "grad_norm": 0.48039186000823975, "learning_rate": 4.3403231827343087e-05, "loss": 0.3314, "num_input_tokens_seen": 36659456, "step": 38370 }, { "epoch": 3.130353209886614, "grad_norm": 0.5461257696151733, "learning_rate": 4.340082257334316e-05, "loss": 0.3382, "num_input_tokens_seen": 36663504, "step": 38375 }, { "epoch": 3.1307610734970224, "grad_norm": 0.6179638504981995, "learning_rate": 4.339841294636637e-05, "loss": 0.3866, "num_input_tokens_seen": 36668832, "step": 38380 }, { "epoch": 3.1311689371074314, "grad_norm": 1.0917223691940308, "learning_rate": 4.339600294646158e-05, "loss": 0.3217, "num_input_tokens_seen": 36674160, "step": 38385 }, { "epoch": 3.13157680071784, "grad_norm": 0.7241861820220947, "learning_rate": 4.339359257367762e-05, "loss": 0.3501, "num_input_tokens_seen": 36678400, "step": 38390 }, { "epoch": 3.1319846643282485, "grad_norm": 0.9935885667800903, "learning_rate": 4.339118182806335e-05, "loss": 0.342, "num_input_tokens_seen": 36682368, "step": 38395 }, { "epoch": 3.1323925279386575, "grad_norm": 0.7716063857078552, "learning_rate": 4.338877070966765e-05, "loss": 0.3576, "num_input_tokens_seen": 36686448, "step": 38400 }, { "epoch": 3.132800391549066, "grad_norm": 0.5396578907966614, "learning_rate": 4.3386359218539376e-05, "loss": 0.3317, "num_input_tokens_seen": 36691424, "step": 38405 }, { "epoch": 3.1332082551594747, "grad_norm": 0.9402621984481812, "learning_rate": 4.3383947354727415e-05, "loss": 0.3776, "num_input_tokens_seen": 36696160, "step": 38410 }, { "epoch": 3.133616118769883, "grad_norm": 0.8125070333480835, "learning_rate": 4.338153511828065e-05, "loss": 0.2994, "num_input_tokens_seen": 36700464, "step": 38415 }, { "epoch": 3.1340239823802922, "grad_norm": 0.4136066138744354, "learning_rate": 4.337912250924798e-05, "loss": 0.3648, "num_input_tokens_seen": 36704944, "step": 38420 }, { "epoch": 3.134431845990701, "grad_norm": 0.3612215518951416, "learning_rate": 4.33767095276783e-05, "loss": 0.2869, "num_input_tokens_seen": 36709024, "step": 38425 }, { "epoch": 3.1348397096011094, "grad_norm": 0.6072381734848022, "learning_rate": 4.3374296173620536e-05, "loss": 0.379, "num_input_tokens_seen": 36713552, "step": 38430 }, { "epoch": 3.135247573211518, "grad_norm": 0.9144548177719116, "learning_rate": 4.337188244712359e-05, "loss": 0.3026, "num_input_tokens_seen": 36717584, "step": 38435 }, { "epoch": 3.135655436821927, "grad_norm": 0.34103602170944214, "learning_rate": 4.3369468348236405e-05, "loss": 0.3215, "num_input_tokens_seen": 36722912, "step": 38440 }, { "epoch": 3.1360633004323355, "grad_norm": 0.5989651679992676, "learning_rate": 4.336705387700789e-05, "loss": 0.348, "num_input_tokens_seen": 36727296, "step": 38445 }, { "epoch": 3.136471164042744, "grad_norm": 1.3975930213928223, "learning_rate": 4.336463903348701e-05, "loss": 0.4127, "num_input_tokens_seen": 36731904, "step": 38450 }, { "epoch": 3.1368790276531526, "grad_norm": 0.25686296820640564, "learning_rate": 4.3362223817722686e-05, "loss": 0.367, "num_input_tokens_seen": 36736816, "step": 38455 }, { "epoch": 3.1372868912635616, "grad_norm": 0.44606882333755493, "learning_rate": 4.3359808229763894e-05, "loss": 0.3416, "num_input_tokens_seen": 36741984, "step": 38460 }, { "epoch": 3.13769475487397, "grad_norm": 0.8971692323684692, "learning_rate": 4.335739226965959e-05, "loss": 0.3522, "num_input_tokens_seen": 36746400, "step": 38465 }, { "epoch": 3.1381026184843788, "grad_norm": 0.9014601707458496, "learning_rate": 4.335497593745875e-05, "loss": 0.3459, "num_input_tokens_seen": 36751584, "step": 38470 }, { "epoch": 3.1385104820947873, "grad_norm": 0.4653792083263397, "learning_rate": 4.3352559233210346e-05, "loss": 0.3495, "num_input_tokens_seen": 36756384, "step": 38475 }, { "epoch": 3.1389183457051963, "grad_norm": 0.6955975890159607, "learning_rate": 4.3350142156963355e-05, "loss": 0.3364, "num_input_tokens_seen": 36760896, "step": 38480 }, { "epoch": 3.139326209315605, "grad_norm": 1.330439567565918, "learning_rate": 4.334772470876679e-05, "loss": 0.4574, "num_input_tokens_seen": 36766752, "step": 38485 }, { "epoch": 3.1397340729260135, "grad_norm": 1.0105459690093994, "learning_rate": 4.3345306888669634e-05, "loss": 0.3639, "num_input_tokens_seen": 36772384, "step": 38490 }, { "epoch": 3.140141936536422, "grad_norm": 1.1735790967941284, "learning_rate": 4.334288869672091e-05, "loss": 0.4309, "num_input_tokens_seen": 36777520, "step": 38495 }, { "epoch": 3.140549800146831, "grad_norm": 0.39741697907447815, "learning_rate": 4.3340470132969625e-05, "loss": 0.3695, "num_input_tokens_seen": 36782432, "step": 38500 }, { "epoch": 3.1409576637572396, "grad_norm": 0.9033865332603455, "learning_rate": 4.3338051197464803e-05, "loss": 0.349, "num_input_tokens_seen": 36787216, "step": 38505 }, { "epoch": 3.141365527367648, "grad_norm": 0.3931337594985962, "learning_rate": 4.333563189025548e-05, "loss": 0.3221, "num_input_tokens_seen": 36792176, "step": 38510 }, { "epoch": 3.1417733909780567, "grad_norm": 0.5997521877288818, "learning_rate": 4.333321221139068e-05, "loss": 0.3465, "num_input_tokens_seen": 36795984, "step": 38515 }, { "epoch": 3.1421812545884658, "grad_norm": 0.5714671015739441, "learning_rate": 4.333079216091946e-05, "loss": 0.2913, "num_input_tokens_seen": 36800656, "step": 38520 }, { "epoch": 3.1425891181988743, "grad_norm": 1.3527787923812866, "learning_rate": 4.332837173889088e-05, "loss": 0.4021, "num_input_tokens_seen": 36805856, "step": 38525 }, { "epoch": 3.142996981809283, "grad_norm": 0.6661422252655029, "learning_rate": 4.3325950945353985e-05, "loss": 0.2685, "num_input_tokens_seen": 36810688, "step": 38530 }, { "epoch": 3.1434048454196915, "grad_norm": 0.647925078868866, "learning_rate": 4.332352978035785e-05, "loss": 0.4589, "num_input_tokens_seen": 36815008, "step": 38535 }, { "epoch": 3.1438127090301005, "grad_norm": 0.35837188363075256, "learning_rate": 4.332110824395157e-05, "loss": 0.3632, "num_input_tokens_seen": 36819760, "step": 38540 }, { "epoch": 3.144220572640509, "grad_norm": 0.1927933394908905, "learning_rate": 4.3318686336184196e-05, "loss": 0.3385, "num_input_tokens_seen": 36824688, "step": 38545 }, { "epoch": 3.1446284362509176, "grad_norm": 0.4599800407886505, "learning_rate": 4.331626405710484e-05, "loss": 0.3388, "num_input_tokens_seen": 36828880, "step": 38550 }, { "epoch": 3.145036299861326, "grad_norm": 0.8332132697105408, "learning_rate": 4.331384140676259e-05, "loss": 0.3695, "num_input_tokens_seen": 36833296, "step": 38555 }, { "epoch": 3.145444163471735, "grad_norm": 0.4643748104572296, "learning_rate": 4.331141838520655e-05, "loss": 0.3765, "num_input_tokens_seen": 36838384, "step": 38560 }, { "epoch": 3.1458520270821437, "grad_norm": 0.24527044594287872, "learning_rate": 4.330899499248585e-05, "loss": 0.3415, "num_input_tokens_seen": 36843312, "step": 38565 }, { "epoch": 3.1462598906925523, "grad_norm": 0.26003018021583557, "learning_rate": 4.330657122864961e-05, "loss": 0.2928, "num_input_tokens_seen": 36847840, "step": 38570 }, { "epoch": 3.1466677543029613, "grad_norm": 0.6522624492645264, "learning_rate": 4.330414709374693e-05, "loss": 0.2889, "num_input_tokens_seen": 36852400, "step": 38575 }, { "epoch": 3.14707561791337, "grad_norm": 0.6002374887466431, "learning_rate": 4.330172258782697e-05, "loss": 0.4093, "num_input_tokens_seen": 36857584, "step": 38580 }, { "epoch": 3.1474834815237784, "grad_norm": 0.6869510412216187, "learning_rate": 4.329929771093888e-05, "loss": 0.3708, "num_input_tokens_seen": 36862704, "step": 38585 }, { "epoch": 3.147891345134187, "grad_norm": 0.557864248752594, "learning_rate": 4.32968724631318e-05, "loss": 0.3381, "num_input_tokens_seen": 36867568, "step": 38590 }, { "epoch": 3.148299208744596, "grad_norm": 0.6483858227729797, "learning_rate": 4.329444684445489e-05, "loss": 0.3313, "num_input_tokens_seen": 36872576, "step": 38595 }, { "epoch": 3.1487070723550046, "grad_norm": 0.718809962272644, "learning_rate": 4.329202085495731e-05, "loss": 0.3201, "num_input_tokens_seen": 36878064, "step": 38600 }, { "epoch": 3.149114935965413, "grad_norm": 0.3574664890766144, "learning_rate": 4.3289594494688244e-05, "loss": 0.3628, "num_input_tokens_seen": 36882848, "step": 38605 }, { "epoch": 3.1495227995758217, "grad_norm": 0.718244731426239, "learning_rate": 4.3287167763696876e-05, "loss": 0.3321, "num_input_tokens_seen": 36888464, "step": 38610 }, { "epoch": 3.1499306631862307, "grad_norm": 0.8290541768074036, "learning_rate": 4.328474066203238e-05, "loss": 0.3571, "num_input_tokens_seen": 36893744, "step": 38615 }, { "epoch": 3.1503385267966393, "grad_norm": 0.6591382026672363, "learning_rate": 4.328231318974396e-05, "loss": 0.3058, "num_input_tokens_seen": 36899360, "step": 38620 }, { "epoch": 3.150746390407048, "grad_norm": 0.5970455408096313, "learning_rate": 4.327988534688082e-05, "loss": 0.3456, "num_input_tokens_seen": 36903936, "step": 38625 }, { "epoch": 3.1511542540174564, "grad_norm": 1.1940834522247314, "learning_rate": 4.327745713349218e-05, "loss": 0.3982, "num_input_tokens_seen": 36909120, "step": 38630 }, { "epoch": 3.1515621176278654, "grad_norm": 0.9246308207511902, "learning_rate": 4.3275028549627236e-05, "loss": 0.3652, "num_input_tokens_seen": 36913552, "step": 38635 }, { "epoch": 3.151969981238274, "grad_norm": 0.24663031101226807, "learning_rate": 4.327259959533524e-05, "loss": 0.3541, "num_input_tokens_seen": 36918256, "step": 38640 }, { "epoch": 3.1523778448486826, "grad_norm": 0.8658726215362549, "learning_rate": 4.327017027066541e-05, "loss": 0.3517, "num_input_tokens_seen": 36922064, "step": 38645 }, { "epoch": 3.152785708459091, "grad_norm": 0.7181668877601624, "learning_rate": 4.3267740575667e-05, "loss": 0.3407, "num_input_tokens_seen": 36926592, "step": 38650 }, { "epoch": 3.1531935720695, "grad_norm": 0.33695918321609497, "learning_rate": 4.3265310510389245e-05, "loss": 0.346, "num_input_tokens_seen": 36930832, "step": 38655 }, { "epoch": 3.1536014356799087, "grad_norm": 0.6914902329444885, "learning_rate": 4.3262880074881406e-05, "loss": 0.3599, "num_input_tokens_seen": 36935792, "step": 38660 }, { "epoch": 3.1540092992903173, "grad_norm": 0.6854420304298401, "learning_rate": 4.3260449269192747e-05, "loss": 0.3382, "num_input_tokens_seen": 36939840, "step": 38665 }, { "epoch": 3.154417162900726, "grad_norm": 0.28114181756973267, "learning_rate": 4.325801809337254e-05, "loss": 0.3398, "num_input_tokens_seen": 36945168, "step": 38670 }, { "epoch": 3.154825026511135, "grad_norm": 0.3284986913204193, "learning_rate": 4.325558654747007e-05, "loss": 0.3528, "num_input_tokens_seen": 36950160, "step": 38675 }, { "epoch": 3.1552328901215434, "grad_norm": 0.24696838855743408, "learning_rate": 4.325315463153462e-05, "loss": 0.3809, "num_input_tokens_seen": 36955216, "step": 38680 }, { "epoch": 3.155640753731952, "grad_norm": 0.8783107399940491, "learning_rate": 4.325072234561547e-05, "loss": 0.3471, "num_input_tokens_seen": 36959328, "step": 38685 }, { "epoch": 3.1560486173423605, "grad_norm": 0.9056353569030762, "learning_rate": 4.324828968976194e-05, "loss": 0.3551, "num_input_tokens_seen": 36964000, "step": 38690 }, { "epoch": 3.1564564809527695, "grad_norm": 0.6276715993881226, "learning_rate": 4.324585666402334e-05, "loss": 0.3081, "num_input_tokens_seen": 36968576, "step": 38695 }, { "epoch": 3.156864344563178, "grad_norm": 0.5230270624160767, "learning_rate": 4.324342326844897e-05, "loss": 0.3865, "num_input_tokens_seen": 36972576, "step": 38700 }, { "epoch": 3.1572722081735867, "grad_norm": 0.515811562538147, "learning_rate": 4.324098950308816e-05, "loss": 0.2921, "num_input_tokens_seen": 36977616, "step": 38705 }, { "epoch": 3.1576800717839952, "grad_norm": 0.34340232610702515, "learning_rate": 4.323855536799025e-05, "loss": 0.3669, "num_input_tokens_seen": 36983040, "step": 38710 }, { "epoch": 3.1580879353944042, "grad_norm": 0.5166919231414795, "learning_rate": 4.3236120863204574e-05, "loss": 0.33, "num_input_tokens_seen": 36988112, "step": 38715 }, { "epoch": 3.158495799004813, "grad_norm": 0.5888382792472839, "learning_rate": 4.3233685988780473e-05, "loss": 0.2715, "num_input_tokens_seen": 36992608, "step": 38720 }, { "epoch": 3.1589036626152214, "grad_norm": 0.546756386756897, "learning_rate": 4.32312507447673e-05, "loss": 0.2372, "num_input_tokens_seen": 36997680, "step": 38725 }, { "epoch": 3.15931152622563, "grad_norm": 1.4310981035232544, "learning_rate": 4.3228815131214426e-05, "loss": 0.4453, "num_input_tokens_seen": 37003184, "step": 38730 }, { "epoch": 3.159719389836039, "grad_norm": 1.2196402549743652, "learning_rate": 4.322637914817122e-05, "loss": 0.4339, "num_input_tokens_seen": 37007184, "step": 38735 }, { "epoch": 3.1601272534464475, "grad_norm": 0.5608165264129639, "learning_rate": 4.322394279568704e-05, "loss": 0.2628, "num_input_tokens_seen": 37011952, "step": 38740 }, { "epoch": 3.160535117056856, "grad_norm": 0.6093389987945557, "learning_rate": 4.32215060738113e-05, "loss": 0.3774, "num_input_tokens_seen": 37016528, "step": 38745 }, { "epoch": 3.160942980667265, "grad_norm": 1.0648186206817627, "learning_rate": 4.321906898259337e-05, "loss": 0.391, "num_input_tokens_seen": 37020688, "step": 38750 }, { "epoch": 3.1613508442776737, "grad_norm": 0.8460085391998291, "learning_rate": 4.321663152208265e-05, "loss": 0.3655, "num_input_tokens_seen": 37025040, "step": 38755 }, { "epoch": 3.161758707888082, "grad_norm": 0.775270402431488, "learning_rate": 4.321419369232855e-05, "loss": 0.3553, "num_input_tokens_seen": 37029504, "step": 38760 }, { "epoch": 3.162166571498491, "grad_norm": 0.4122271239757538, "learning_rate": 4.3211755493380485e-05, "loss": 0.3724, "num_input_tokens_seen": 37034608, "step": 38765 }, { "epoch": 3.1625744351088994, "grad_norm": 0.20850886404514313, "learning_rate": 4.320931692528787e-05, "loss": 0.3584, "num_input_tokens_seen": 37038560, "step": 38770 }, { "epoch": 3.1629822987193084, "grad_norm": 0.846916913986206, "learning_rate": 4.320687798810015e-05, "loss": 0.3496, "num_input_tokens_seen": 37043728, "step": 38775 }, { "epoch": 3.163390162329717, "grad_norm": 0.741871178150177, "learning_rate": 4.320443868186674e-05, "loss": 0.3367, "num_input_tokens_seen": 37048768, "step": 38780 }, { "epoch": 3.1637980259401255, "grad_norm": 0.22078989446163177, "learning_rate": 4.3201999006637094e-05, "loss": 0.3738, "num_input_tokens_seen": 37053120, "step": 38785 }, { "epoch": 3.1642058895505345, "grad_norm": 0.33937039971351624, "learning_rate": 4.3199558962460665e-05, "loss": 0.3823, "num_input_tokens_seen": 37057856, "step": 38790 }, { "epoch": 3.164613753160943, "grad_norm": 0.2648318111896515, "learning_rate": 4.319711854938691e-05, "loss": 0.3693, "num_input_tokens_seen": 37062208, "step": 38795 }, { "epoch": 3.1650216167713516, "grad_norm": 0.2539639472961426, "learning_rate": 4.319467776746529e-05, "loss": 0.3613, "num_input_tokens_seen": 37066352, "step": 38800 }, { "epoch": 3.16542948038176, "grad_norm": 0.342436283826828, "learning_rate": 4.319223661674528e-05, "loss": 0.3141, "num_input_tokens_seen": 37071552, "step": 38805 }, { "epoch": 3.165837343992169, "grad_norm": 0.25560328364372253, "learning_rate": 4.318979509727638e-05, "loss": 0.326, "num_input_tokens_seen": 37075824, "step": 38810 }, { "epoch": 3.1662452076025778, "grad_norm": 0.6964346170425415, "learning_rate": 4.318735320910805e-05, "loss": 0.3733, "num_input_tokens_seen": 37081456, "step": 38815 }, { "epoch": 3.1666530712129863, "grad_norm": 0.6821196675300598, "learning_rate": 4.31849109522898e-05, "loss": 0.2891, "num_input_tokens_seen": 37086848, "step": 38820 }, { "epoch": 3.167060934823395, "grad_norm": 0.524493396282196, "learning_rate": 4.318246832687114e-05, "loss": 0.3999, "num_input_tokens_seen": 37092096, "step": 38825 }, { "epoch": 3.167468798433804, "grad_norm": 0.4548640251159668, "learning_rate": 4.3180025332901565e-05, "loss": 0.3382, "num_input_tokens_seen": 37097232, "step": 38830 }, { "epoch": 3.1678766620442125, "grad_norm": 0.31479284167289734, "learning_rate": 4.3177581970430604e-05, "loss": 0.4153, "num_input_tokens_seen": 37102288, "step": 38835 }, { "epoch": 3.168284525654621, "grad_norm": 0.2608211636543274, "learning_rate": 4.317513823950778e-05, "loss": 0.363, "num_input_tokens_seen": 37107408, "step": 38840 }, { "epoch": 3.1686923892650296, "grad_norm": 0.306631475687027, "learning_rate": 4.317269414018262e-05, "loss": 0.3425, "num_input_tokens_seen": 37112288, "step": 38845 }, { "epoch": 3.1691002528754386, "grad_norm": 0.2522590458393097, "learning_rate": 4.3170249672504684e-05, "loss": 0.3444, "num_input_tokens_seen": 37116752, "step": 38850 }, { "epoch": 3.169508116485847, "grad_norm": 0.3803669512271881, "learning_rate": 4.3167804836523506e-05, "loss": 0.3642, "num_input_tokens_seen": 37121120, "step": 38855 }, { "epoch": 3.1699159800962557, "grad_norm": 0.7893136143684387, "learning_rate": 4.316535963228864e-05, "loss": 0.346, "num_input_tokens_seen": 37124848, "step": 38860 }, { "epoch": 3.1703238437066643, "grad_norm": 0.7769095301628113, "learning_rate": 4.316291405984966e-05, "loss": 0.2981, "num_input_tokens_seen": 37129984, "step": 38865 }, { "epoch": 3.1707317073170733, "grad_norm": 1.2151620388031006, "learning_rate": 4.316046811925612e-05, "loss": 0.327, "num_input_tokens_seen": 37135040, "step": 38870 }, { "epoch": 3.171139570927482, "grad_norm": 0.41765928268432617, "learning_rate": 4.3158021810557615e-05, "loss": 0.3538, "num_input_tokens_seen": 37140448, "step": 38875 }, { "epoch": 3.1715474345378905, "grad_norm": 1.3038384914398193, "learning_rate": 4.315557513380372e-05, "loss": 0.4771, "num_input_tokens_seen": 37144624, "step": 38880 }, { "epoch": 3.171955298148299, "grad_norm": 1.061133861541748, "learning_rate": 4.315312808904403e-05, "loss": 0.2622, "num_input_tokens_seen": 37149648, "step": 38885 }, { "epoch": 3.172363161758708, "grad_norm": 0.4409525394439697, "learning_rate": 4.3150680676328146e-05, "loss": 0.3505, "num_input_tokens_seen": 37154448, "step": 38890 }, { "epoch": 3.1727710253691166, "grad_norm": 0.9051975607872009, "learning_rate": 4.3148232895705685e-05, "loss": 0.3654, "num_input_tokens_seen": 37159312, "step": 38895 }, { "epoch": 3.173178888979525, "grad_norm": 0.8912102580070496, "learning_rate": 4.3145784747226255e-05, "loss": 0.3459, "num_input_tokens_seen": 37164464, "step": 38900 }, { "epoch": 3.1735867525899337, "grad_norm": 0.24372684955596924, "learning_rate": 4.314333623093948e-05, "loss": 0.3581, "num_input_tokens_seen": 37169872, "step": 38905 }, { "epoch": 3.1739946162003427, "grad_norm": 0.2306535840034485, "learning_rate": 4.3140887346894974e-05, "loss": 0.3489, "num_input_tokens_seen": 37174464, "step": 38910 }, { "epoch": 3.1744024798107513, "grad_norm": 0.9528875946998596, "learning_rate": 4.313843809514241e-05, "loss": 0.3751, "num_input_tokens_seen": 37178544, "step": 38915 }, { "epoch": 3.17481034342116, "grad_norm": 0.23405644297599792, "learning_rate": 4.3135988475731395e-05, "loss": 0.3371, "num_input_tokens_seen": 37183040, "step": 38920 }, { "epoch": 3.175218207031569, "grad_norm": 0.8095076084136963, "learning_rate": 4.313353848871161e-05, "loss": 0.3433, "num_input_tokens_seen": 37188064, "step": 38925 }, { "epoch": 3.1756260706419774, "grad_norm": 0.31822165846824646, "learning_rate": 4.313108813413271e-05, "loss": 0.3547, "num_input_tokens_seen": 37193440, "step": 38930 }, { "epoch": 3.176033934252386, "grad_norm": 0.42699071764945984, "learning_rate": 4.312863741204435e-05, "loss": 0.3422, "num_input_tokens_seen": 37198240, "step": 38935 }, { "epoch": 3.1764417978627946, "grad_norm": 0.8803888559341431, "learning_rate": 4.312618632249621e-05, "loss": 0.3459, "num_input_tokens_seen": 37202832, "step": 38940 }, { "epoch": 3.176849661473203, "grad_norm": 1.0071860551834106, "learning_rate": 4.3123734865537976e-05, "loss": 0.3218, "num_input_tokens_seen": 37207808, "step": 38945 }, { "epoch": 3.177257525083612, "grad_norm": 0.6978252530097961, "learning_rate": 4.312128304121935e-05, "loss": 0.3255, "num_input_tokens_seen": 37212336, "step": 38950 }, { "epoch": 3.1776653886940207, "grad_norm": 0.6155045032501221, "learning_rate": 4.311883084959001e-05, "loss": 0.3235, "num_input_tokens_seen": 37217824, "step": 38955 }, { "epoch": 3.1780732523044293, "grad_norm": 0.1708209067583084, "learning_rate": 4.311637829069967e-05, "loss": 0.4255, "num_input_tokens_seen": 37222480, "step": 38960 }, { "epoch": 3.1784811159148383, "grad_norm": 0.3986735939979553, "learning_rate": 4.311392536459803e-05, "loss": 0.3337, "num_input_tokens_seen": 37227600, "step": 38965 }, { "epoch": 3.178888979525247, "grad_norm": 0.3263281583786011, "learning_rate": 4.311147207133483e-05, "loss": 0.3175, "num_input_tokens_seen": 37232752, "step": 38970 }, { "epoch": 3.1792968431356554, "grad_norm": 0.7647449970245361, "learning_rate": 4.3109018410959785e-05, "loss": 0.3555, "num_input_tokens_seen": 37237808, "step": 38975 }, { "epoch": 3.179704706746064, "grad_norm": 0.22046826779842377, "learning_rate": 4.310656438352263e-05, "loss": 0.3193, "num_input_tokens_seen": 37242896, "step": 38980 }, { "epoch": 3.180112570356473, "grad_norm": 0.8149587512016296, "learning_rate": 4.3104109989073106e-05, "loss": 0.3538, "num_input_tokens_seen": 37247600, "step": 38985 }, { "epoch": 3.1805204339668816, "grad_norm": 0.7240839600563049, "learning_rate": 4.310165522766097e-05, "loss": 0.3244, "num_input_tokens_seen": 37251216, "step": 38990 }, { "epoch": 3.18092829757729, "grad_norm": 0.3104144334793091, "learning_rate": 4.309920009933598e-05, "loss": 0.3468, "num_input_tokens_seen": 37256176, "step": 38995 }, { "epoch": 3.1813361611876987, "grad_norm": 0.7571372389793396, "learning_rate": 4.309674460414788e-05, "loss": 0.3556, "num_input_tokens_seen": 37261088, "step": 39000 }, { "epoch": 3.1817440247981077, "grad_norm": 0.30981042981147766, "learning_rate": 4.309428874214647e-05, "loss": 0.3671, "num_input_tokens_seen": 37265264, "step": 39005 }, { "epoch": 3.1821518884085163, "grad_norm": 0.38289839029312134, "learning_rate": 4.309183251338151e-05, "loss": 0.3591, "num_input_tokens_seen": 37270032, "step": 39010 }, { "epoch": 3.182559752018925, "grad_norm": 0.14080819487571716, "learning_rate": 4.30893759179028e-05, "loss": 0.3443, "num_input_tokens_seen": 37274448, "step": 39015 }, { "epoch": 3.1829676156293334, "grad_norm": 0.1724461019039154, "learning_rate": 4.3086918955760114e-05, "loss": 0.3684, "num_input_tokens_seen": 37278176, "step": 39020 }, { "epoch": 3.1833754792397424, "grad_norm": 0.27599766850471497, "learning_rate": 4.308446162700327e-05, "loss": 0.3564, "num_input_tokens_seen": 37283472, "step": 39025 }, { "epoch": 3.183783342850151, "grad_norm": 0.22729800641536713, "learning_rate": 4.308200393168208e-05, "loss": 0.3609, "num_input_tokens_seen": 37288416, "step": 39030 }, { "epoch": 3.1841912064605595, "grad_norm": 0.09145534038543701, "learning_rate": 4.3079545869846346e-05, "loss": 0.3879, "num_input_tokens_seen": 37292768, "step": 39035 }, { "epoch": 3.184599070070968, "grad_norm": 0.8622198104858398, "learning_rate": 4.307708744154591e-05, "loss": 0.3126, "num_input_tokens_seen": 37298176, "step": 39040 }, { "epoch": 3.185006933681377, "grad_norm": 0.3487054705619812, "learning_rate": 4.3074628646830584e-05, "loss": 0.3435, "num_input_tokens_seen": 37304256, "step": 39045 }, { "epoch": 3.1854147972917857, "grad_norm": 1.205237865447998, "learning_rate": 4.307216948575021e-05, "loss": 0.4149, "num_input_tokens_seen": 37308880, "step": 39050 }, { "epoch": 3.1858226609021942, "grad_norm": 0.3497447073459625, "learning_rate": 4.3069709958354655e-05, "loss": 0.4031, "num_input_tokens_seen": 37313744, "step": 39055 }, { "epoch": 3.186230524512603, "grad_norm": 0.9649563431739807, "learning_rate": 4.306725006469375e-05, "loss": 0.3493, "num_input_tokens_seen": 37318928, "step": 39060 }, { "epoch": 3.186638388123012, "grad_norm": 0.17422382533550262, "learning_rate": 4.3064789804817365e-05, "loss": 0.3579, "num_input_tokens_seen": 37323936, "step": 39065 }, { "epoch": 3.1870462517334204, "grad_norm": 0.2887652516365051, "learning_rate": 4.3062329178775365e-05, "loss": 0.3417, "num_input_tokens_seen": 37328976, "step": 39070 }, { "epoch": 3.187454115343829, "grad_norm": 0.27778691053390503, "learning_rate": 4.3059868186617625e-05, "loss": 0.363, "num_input_tokens_seen": 37333760, "step": 39075 }, { "epoch": 3.1878619789542375, "grad_norm": 0.3440840244293213, "learning_rate": 4.3057406828394037e-05, "loss": 0.3434, "num_input_tokens_seen": 37339680, "step": 39080 }, { "epoch": 3.1882698425646465, "grad_norm": 0.30157017707824707, "learning_rate": 4.305494510415449e-05, "loss": 0.3115, "num_input_tokens_seen": 37345216, "step": 39085 }, { "epoch": 3.188677706175055, "grad_norm": 0.6852725744247437, "learning_rate": 4.3052483013948866e-05, "loss": 0.3133, "num_input_tokens_seen": 37349904, "step": 39090 }, { "epoch": 3.1890855697854636, "grad_norm": 0.41703540086746216, "learning_rate": 4.3050020557827084e-05, "loss": 0.2398, "num_input_tokens_seen": 37354224, "step": 39095 }, { "epoch": 3.189493433395872, "grad_norm": 0.6437690854072571, "learning_rate": 4.304755773583906e-05, "loss": 0.3565, "num_input_tokens_seen": 37359216, "step": 39100 }, { "epoch": 3.1899012970062812, "grad_norm": 0.44287580251693726, "learning_rate": 4.304509454803471e-05, "loss": 0.3073, "num_input_tokens_seen": 37364288, "step": 39105 }, { "epoch": 3.19030916061669, "grad_norm": 0.5699058771133423, "learning_rate": 4.304263099446396e-05, "loss": 0.5352, "num_input_tokens_seen": 37369408, "step": 39110 }, { "epoch": 3.1907170242270984, "grad_norm": 0.6324242949485779, "learning_rate": 4.304016707517674e-05, "loss": 0.4263, "num_input_tokens_seen": 37373776, "step": 39115 }, { "epoch": 3.191124887837507, "grad_norm": 0.6931511759757996, "learning_rate": 4.303770279022301e-05, "loss": 0.3489, "num_input_tokens_seen": 37378880, "step": 39120 }, { "epoch": 3.191532751447916, "grad_norm": 0.7464153170585632, "learning_rate": 4.30352381396527e-05, "loss": 0.3474, "num_input_tokens_seen": 37382640, "step": 39125 }, { "epoch": 3.1919406150583245, "grad_norm": 0.7533184289932251, "learning_rate": 4.303277312351578e-05, "loss": 0.3505, "num_input_tokens_seen": 37387456, "step": 39130 }, { "epoch": 3.192348478668733, "grad_norm": 0.7054044008255005, "learning_rate": 4.303030774186221e-05, "loss": 0.3663, "num_input_tokens_seen": 37391952, "step": 39135 }, { "epoch": 3.192756342279142, "grad_norm": 0.3315037786960602, "learning_rate": 4.302784199474197e-05, "loss": 0.3506, "num_input_tokens_seen": 37397184, "step": 39140 }, { "epoch": 3.1931642058895506, "grad_norm": 0.8025307655334473, "learning_rate": 4.302537588220503e-05, "loss": 0.3286, "num_input_tokens_seen": 37402000, "step": 39145 }, { "epoch": 3.193572069499959, "grad_norm": 0.6124973297119141, "learning_rate": 4.302290940430138e-05, "loss": 0.2962, "num_input_tokens_seen": 37407264, "step": 39150 }, { "epoch": 3.1939799331103678, "grad_norm": 0.3659718632698059, "learning_rate": 4.302044256108101e-05, "loss": 0.2733, "num_input_tokens_seen": 37411792, "step": 39155 }, { "epoch": 3.1943877967207768, "grad_norm": 1.2913508415222168, "learning_rate": 4.301797535259393e-05, "loss": 0.3748, "num_input_tokens_seen": 37415824, "step": 39160 }, { "epoch": 3.1947956603311853, "grad_norm": 0.44645553827285767, "learning_rate": 4.301550777889015e-05, "loss": 0.4077, "num_input_tokens_seen": 37420496, "step": 39165 }, { "epoch": 3.195203523941594, "grad_norm": 1.1801190376281738, "learning_rate": 4.301303984001967e-05, "loss": 0.3944, "num_input_tokens_seen": 37424832, "step": 39170 }, { "epoch": 3.1956113875520025, "grad_norm": 0.2636716365814209, "learning_rate": 4.301057153603254e-05, "loss": 0.3269, "num_input_tokens_seen": 37429840, "step": 39175 }, { "epoch": 3.1960192511624115, "grad_norm": 0.9563578963279724, "learning_rate": 4.3008102866978775e-05, "loss": 0.3756, "num_input_tokens_seen": 37433648, "step": 39180 }, { "epoch": 3.19642711477282, "grad_norm": 0.7055323123931885, "learning_rate": 4.300563383290842e-05, "loss": 0.3169, "num_input_tokens_seen": 37438432, "step": 39185 }, { "epoch": 3.1968349783832286, "grad_norm": 0.8983018398284912, "learning_rate": 4.300316443387151e-05, "loss": 0.3342, "num_input_tokens_seen": 37443344, "step": 39190 }, { "epoch": 3.197242841993637, "grad_norm": 0.2519015371799469, "learning_rate": 4.3000694669918114e-05, "loss": 0.2617, "num_input_tokens_seen": 37447584, "step": 39195 }, { "epoch": 3.197650705604046, "grad_norm": 0.4089900851249695, "learning_rate": 4.2998224541098284e-05, "loss": 0.3436, "num_input_tokens_seen": 37451616, "step": 39200 }, { "epoch": 3.1980585692144547, "grad_norm": 0.5419268608093262, "learning_rate": 4.299575404746209e-05, "loss": 0.3899, "num_input_tokens_seen": 37456496, "step": 39205 }, { "epoch": 3.1984664328248633, "grad_norm": 0.6767501831054688, "learning_rate": 4.2993283189059604e-05, "loss": 0.3184, "num_input_tokens_seen": 37461440, "step": 39210 }, { "epoch": 3.198874296435272, "grad_norm": 0.2036084532737732, "learning_rate": 4.299081196594092e-05, "loss": 0.3885, "num_input_tokens_seen": 37465088, "step": 39215 }, { "epoch": 3.199282160045681, "grad_norm": 0.384978324174881, "learning_rate": 4.298834037815612e-05, "loss": 0.3529, "num_input_tokens_seen": 37470256, "step": 39220 }, { "epoch": 3.1996900236560895, "grad_norm": 0.7599566578865051, "learning_rate": 4.298586842575531e-05, "loss": 0.3629, "num_input_tokens_seen": 37475440, "step": 39225 }, { "epoch": 3.200097887266498, "grad_norm": 0.27266812324523926, "learning_rate": 4.2983396108788577e-05, "loss": 0.347, "num_input_tokens_seen": 37480480, "step": 39230 }, { "epoch": 3.2005057508769066, "grad_norm": 0.855133593082428, "learning_rate": 4.2980923427306055e-05, "loss": 0.3435, "num_input_tokens_seen": 37485264, "step": 39235 }, { "epoch": 3.2009136144873156, "grad_norm": 0.0834730789065361, "learning_rate": 4.2978450381357847e-05, "loss": 0.3429, "num_input_tokens_seen": 37490016, "step": 39240 }, { "epoch": 3.201321478097724, "grad_norm": 0.8763267397880554, "learning_rate": 4.2975976970994096e-05, "loss": 0.3555, "num_input_tokens_seen": 37494480, "step": 39245 }, { "epoch": 3.2017293417081327, "grad_norm": 0.27901145815849304, "learning_rate": 4.297350319626493e-05, "loss": 0.3492, "num_input_tokens_seen": 37499984, "step": 39250 }, { "epoch": 3.2021372053185413, "grad_norm": 0.20973801612854004, "learning_rate": 4.297102905722049e-05, "loss": 0.3452, "num_input_tokens_seen": 37504720, "step": 39255 }, { "epoch": 3.2025450689289503, "grad_norm": 0.8102096915245056, "learning_rate": 4.2968554553910936e-05, "loss": 0.3469, "num_input_tokens_seen": 37509312, "step": 39260 }, { "epoch": 3.202952932539359, "grad_norm": 0.32443955540657043, "learning_rate": 4.2966079686386404e-05, "loss": 0.3468, "num_input_tokens_seen": 37513920, "step": 39265 }, { "epoch": 3.2033607961497674, "grad_norm": 0.9331774711608887, "learning_rate": 4.296360445469708e-05, "loss": 0.3485, "num_input_tokens_seen": 37518832, "step": 39270 }, { "epoch": 3.203768659760176, "grad_norm": 0.8939395546913147, "learning_rate": 4.2961128858893124e-05, "loss": 0.3587, "num_input_tokens_seen": 37523808, "step": 39275 }, { "epoch": 3.204176523370585, "grad_norm": 1.0007123947143555, "learning_rate": 4.295865289902472e-05, "loss": 0.3406, "num_input_tokens_seen": 37527296, "step": 39280 }, { "epoch": 3.2045843869809936, "grad_norm": 0.8165704011917114, "learning_rate": 4.295617657514205e-05, "loss": 0.332, "num_input_tokens_seen": 37532096, "step": 39285 }, { "epoch": 3.204992250591402, "grad_norm": 0.9918278455734253, "learning_rate": 4.295369988729531e-05, "loss": 0.3447, "num_input_tokens_seen": 37535760, "step": 39290 }, { "epoch": 3.2054001142018107, "grad_norm": 0.6129617094993591, "learning_rate": 4.29512228355347e-05, "loss": 0.3103, "num_input_tokens_seen": 37541152, "step": 39295 }, { "epoch": 3.2058079778122197, "grad_norm": 0.37445250153541565, "learning_rate": 4.294874541991044e-05, "loss": 0.3781, "num_input_tokens_seen": 37545680, "step": 39300 }, { "epoch": 3.2062158414226283, "grad_norm": 0.6657415628433228, "learning_rate": 4.2946267640472723e-05, "loss": 0.3809, "num_input_tokens_seen": 37549680, "step": 39305 }, { "epoch": 3.206623705033037, "grad_norm": 0.9220491647720337, "learning_rate": 4.2943789497271803e-05, "loss": 0.3791, "num_input_tokens_seen": 37554528, "step": 39310 }, { "epoch": 3.207031568643446, "grad_norm": 0.8825670480728149, "learning_rate": 4.2941310990357884e-05, "loss": 0.3441, "num_input_tokens_seen": 37559664, "step": 39315 }, { "epoch": 3.2074394322538544, "grad_norm": 0.8804749250411987, "learning_rate": 4.2938832119781217e-05, "loss": 0.3506, "num_input_tokens_seen": 37564960, "step": 39320 }, { "epoch": 3.207847295864263, "grad_norm": 1.0975310802459717, "learning_rate": 4.293635288559205e-05, "loss": 0.3509, "num_input_tokens_seen": 37568960, "step": 39325 }, { "epoch": 3.2082551594746715, "grad_norm": 0.22648493945598602, "learning_rate": 4.293387328784063e-05, "loss": 0.3291, "num_input_tokens_seen": 37573808, "step": 39330 }, { "epoch": 3.2086630230850806, "grad_norm": 0.6588918566703796, "learning_rate": 4.293139332657722e-05, "loss": 0.3354, "num_input_tokens_seen": 37578896, "step": 39335 }, { "epoch": 3.209070886695489, "grad_norm": 0.29071810841560364, "learning_rate": 4.292891300185208e-05, "loss": 0.3264, "num_input_tokens_seen": 37582768, "step": 39340 }, { "epoch": 3.2094787503058977, "grad_norm": 0.5746976733207703, "learning_rate": 4.2926432313715495e-05, "loss": 0.3326, "num_input_tokens_seen": 37586896, "step": 39345 }, { "epoch": 3.2098866139163063, "grad_norm": 1.1430511474609375, "learning_rate": 4.292395126221774e-05, "loss": 0.4263, "num_input_tokens_seen": 37592176, "step": 39350 }, { "epoch": 3.2102944775267153, "grad_norm": 0.25624170899391174, "learning_rate": 4.2921469847409126e-05, "loss": 0.3562, "num_input_tokens_seen": 37597664, "step": 39355 }, { "epoch": 3.210702341137124, "grad_norm": 0.8522005677223206, "learning_rate": 4.2918988069339914e-05, "loss": 0.3708, "num_input_tokens_seen": 37602576, "step": 39360 }, { "epoch": 3.2111102047475324, "grad_norm": 0.41524583101272583, "learning_rate": 4.291650592806044e-05, "loss": 0.3435, "num_input_tokens_seen": 37607376, "step": 39365 }, { "epoch": 3.211518068357941, "grad_norm": 0.379140168428421, "learning_rate": 4.2914023423621e-05, "loss": 0.3106, "num_input_tokens_seen": 37612512, "step": 39370 }, { "epoch": 3.21192593196835, "grad_norm": 0.5333443880081177, "learning_rate": 4.291154055607192e-05, "loss": 0.2954, "num_input_tokens_seen": 37617808, "step": 39375 }, { "epoch": 3.2123337955787585, "grad_norm": 1.3027558326721191, "learning_rate": 4.290905732546352e-05, "loss": 0.4257, "num_input_tokens_seen": 37622672, "step": 39380 }, { "epoch": 3.212741659189167, "grad_norm": 0.4619283080101013, "learning_rate": 4.290657373184614e-05, "loss": 0.3644, "num_input_tokens_seen": 37627392, "step": 39385 }, { "epoch": 3.2131495227995757, "grad_norm": 0.20032240450382233, "learning_rate": 4.290408977527012e-05, "loss": 0.3143, "num_input_tokens_seen": 37631456, "step": 39390 }, { "epoch": 3.2135573864099847, "grad_norm": 0.629351794719696, "learning_rate": 4.290160545578581e-05, "loss": 0.3855, "num_input_tokens_seen": 37636512, "step": 39395 }, { "epoch": 3.2139652500203932, "grad_norm": 0.2822156548500061, "learning_rate": 4.289912077344356e-05, "loss": 0.3617, "num_input_tokens_seen": 37641088, "step": 39400 }, { "epoch": 3.214373113630802, "grad_norm": 0.1718590259552002, "learning_rate": 4.2896635728293735e-05, "loss": 0.3538, "num_input_tokens_seen": 37645872, "step": 39405 }, { "epoch": 3.2147809772412104, "grad_norm": 0.1747942417860031, "learning_rate": 4.289415032038672e-05, "loss": 0.3308, "num_input_tokens_seen": 37650448, "step": 39410 }, { "epoch": 3.2151888408516194, "grad_norm": 0.7992618083953857, "learning_rate": 4.289166454977287e-05, "loss": 0.352, "num_input_tokens_seen": 37655760, "step": 39415 }, { "epoch": 3.215596704462028, "grad_norm": 0.23098142445087433, "learning_rate": 4.288917841650259e-05, "loss": 0.3465, "num_input_tokens_seen": 37660592, "step": 39420 }, { "epoch": 3.2160045680724365, "grad_norm": 1.1119494438171387, "learning_rate": 4.288669192062626e-05, "loss": 0.4214, "num_input_tokens_seen": 37665600, "step": 39425 }, { "epoch": 3.216412431682845, "grad_norm": 0.7944066524505615, "learning_rate": 4.288420506219429e-05, "loss": 0.3551, "num_input_tokens_seen": 37670368, "step": 39430 }, { "epoch": 3.216820295293254, "grad_norm": 0.2204604595899582, "learning_rate": 4.2881717841257077e-05, "loss": 0.3425, "num_input_tokens_seen": 37675712, "step": 39435 }, { "epoch": 3.2172281589036626, "grad_norm": 0.8109557032585144, "learning_rate": 4.287923025786504e-05, "loss": 0.3451, "num_input_tokens_seen": 37680704, "step": 39440 }, { "epoch": 3.217636022514071, "grad_norm": 0.8351357579231262, "learning_rate": 4.287674231206861e-05, "loss": 0.3557, "num_input_tokens_seen": 37685520, "step": 39445 }, { "epoch": 3.21804388612448, "grad_norm": 0.9299062490463257, "learning_rate": 4.287425400391821e-05, "loss": 0.3692, "num_input_tokens_seen": 37690384, "step": 39450 }, { "epoch": 3.218451749734889, "grad_norm": 0.9039412140846252, "learning_rate": 4.287176533346428e-05, "loss": 0.3608, "num_input_tokens_seen": 37695328, "step": 39455 }, { "epoch": 3.2188596133452974, "grad_norm": 0.8891710042953491, "learning_rate": 4.286927630075725e-05, "loss": 0.362, "num_input_tokens_seen": 37700240, "step": 39460 }, { "epoch": 3.219267476955706, "grad_norm": 0.37567809224128723, "learning_rate": 4.286678690584759e-05, "loss": 0.3518, "num_input_tokens_seen": 37704848, "step": 39465 }, { "epoch": 3.2196753405661145, "grad_norm": 0.7064650654792786, "learning_rate": 4.286429714878575e-05, "loss": 0.334, "num_input_tokens_seen": 37709856, "step": 39470 }, { "epoch": 3.2200832041765235, "grad_norm": 0.7621505856513977, "learning_rate": 4.2861807029622195e-05, "loss": 0.3352, "num_input_tokens_seen": 37715824, "step": 39475 }, { "epoch": 3.220491067786932, "grad_norm": 0.6040433049201965, "learning_rate": 4.2859316548407404e-05, "loss": 0.3798, "num_input_tokens_seen": 37721040, "step": 39480 }, { "epoch": 3.2208989313973406, "grad_norm": 0.41047602891921997, "learning_rate": 4.285682570519185e-05, "loss": 0.3809, "num_input_tokens_seen": 37726688, "step": 39485 }, { "epoch": 3.2213067950077496, "grad_norm": 0.937716543674469, "learning_rate": 4.285433450002603e-05, "loss": 0.385, "num_input_tokens_seen": 37731392, "step": 39490 }, { "epoch": 3.221714658618158, "grad_norm": 0.32850557565689087, "learning_rate": 4.285184293296044e-05, "loss": 0.3459, "num_input_tokens_seen": 37736384, "step": 39495 }, { "epoch": 3.2221225222285668, "grad_norm": 0.7610837817192078, "learning_rate": 4.284935100404558e-05, "loss": 0.3676, "num_input_tokens_seen": 37741136, "step": 39500 }, { "epoch": 3.2225303858389753, "grad_norm": 0.41099676489830017, "learning_rate": 4.284685871333196e-05, "loss": 0.352, "num_input_tokens_seen": 37745600, "step": 39505 }, { "epoch": 3.222938249449384, "grad_norm": 0.21266867220401764, "learning_rate": 4.284436606087009e-05, "loss": 0.3337, "num_input_tokens_seen": 37750496, "step": 39510 }, { "epoch": 3.223346113059793, "grad_norm": 0.6748851537704468, "learning_rate": 4.2841873046710506e-05, "loss": 0.3308, "num_input_tokens_seen": 37754832, "step": 39515 }, { "epoch": 3.2237539766702015, "grad_norm": 0.9087457656860352, "learning_rate": 4.283937967090374e-05, "loss": 0.378, "num_input_tokens_seen": 37759168, "step": 39520 }, { "epoch": 3.22416184028061, "grad_norm": 0.6688266396522522, "learning_rate": 4.283688593350033e-05, "loss": 0.3324, "num_input_tokens_seen": 37763344, "step": 39525 }, { "epoch": 3.224569703891019, "grad_norm": 0.9458792805671692, "learning_rate": 4.283439183455081e-05, "loss": 0.3717, "num_input_tokens_seen": 37767584, "step": 39530 }, { "epoch": 3.2249775675014276, "grad_norm": 0.9194567203521729, "learning_rate": 4.283189737410575e-05, "loss": 0.3472, "num_input_tokens_seen": 37771296, "step": 39535 }, { "epoch": 3.225385431111836, "grad_norm": 0.8298743963241577, "learning_rate": 4.2829402552215716e-05, "loss": 0.3475, "num_input_tokens_seen": 37776080, "step": 39540 }, { "epoch": 3.2257932947222447, "grad_norm": 0.7171640992164612, "learning_rate": 4.2826907368931266e-05, "loss": 0.3258, "num_input_tokens_seen": 37781056, "step": 39545 }, { "epoch": 3.2262011583326538, "grad_norm": 0.3651311993598938, "learning_rate": 4.2824411824302974e-05, "loss": 0.3203, "num_input_tokens_seen": 37785760, "step": 39550 }, { "epoch": 3.2266090219430623, "grad_norm": 0.4085868000984192, "learning_rate": 4.282191591838143e-05, "loss": 0.3099, "num_input_tokens_seen": 37790992, "step": 39555 }, { "epoch": 3.227016885553471, "grad_norm": 0.48366686701774597, "learning_rate": 4.281941965121723e-05, "loss": 0.4033, "num_input_tokens_seen": 37796080, "step": 39560 }, { "epoch": 3.2274247491638794, "grad_norm": 0.45144617557525635, "learning_rate": 4.2816923022860955e-05, "loss": 0.3536, "num_input_tokens_seen": 37801088, "step": 39565 }, { "epoch": 3.2278326127742885, "grad_norm": 0.5747580528259277, "learning_rate": 4.2814426033363224e-05, "loss": 0.3412, "num_input_tokens_seen": 37805840, "step": 39570 }, { "epoch": 3.228240476384697, "grad_norm": 0.22432757914066315, "learning_rate": 4.281192868277465e-05, "loss": 0.3893, "num_input_tokens_seen": 37810672, "step": 39575 }, { "epoch": 3.2286483399951056, "grad_norm": 0.8159048557281494, "learning_rate": 4.2809430971145845e-05, "loss": 0.3553, "num_input_tokens_seen": 37815440, "step": 39580 }, { "epoch": 3.229056203605514, "grad_norm": 0.788627028465271, "learning_rate": 4.2806932898527454e-05, "loss": 0.3648, "num_input_tokens_seen": 37819616, "step": 39585 }, { "epoch": 3.229464067215923, "grad_norm": 0.26009851694107056, "learning_rate": 4.280443446497009e-05, "loss": 0.3569, "num_input_tokens_seen": 37824640, "step": 39590 }, { "epoch": 3.2298719308263317, "grad_norm": 0.21767760813236237, "learning_rate": 4.280193567052441e-05, "loss": 0.341, "num_input_tokens_seen": 37830000, "step": 39595 }, { "epoch": 3.2302797944367403, "grad_norm": 0.16387253999710083, "learning_rate": 4.279943651524105e-05, "loss": 0.343, "num_input_tokens_seen": 37834384, "step": 39600 }, { "epoch": 3.230687658047149, "grad_norm": 0.7690457701683044, "learning_rate": 4.279693699917068e-05, "loss": 0.337, "num_input_tokens_seen": 37839488, "step": 39605 }, { "epoch": 3.231095521657558, "grad_norm": 0.9812577962875366, "learning_rate": 4.2794437122363965e-05, "loss": 0.3454, "num_input_tokens_seen": 37843872, "step": 39610 }, { "epoch": 3.2315033852679664, "grad_norm": 0.49723881483078003, "learning_rate": 4.279193688487157e-05, "loss": 0.3484, "num_input_tokens_seen": 37848432, "step": 39615 }, { "epoch": 3.231911248878375, "grad_norm": 0.3678784966468811, "learning_rate": 4.2789436286744165e-05, "loss": 0.3248, "num_input_tokens_seen": 37852720, "step": 39620 }, { "epoch": 3.2323191124887836, "grad_norm": 0.6255226135253906, "learning_rate": 4.278693532803245e-05, "loss": 0.3655, "num_input_tokens_seen": 37857696, "step": 39625 }, { "epoch": 3.2327269760991926, "grad_norm": 0.2822800576686859, "learning_rate": 4.278443400878711e-05, "loss": 0.3724, "num_input_tokens_seen": 37862320, "step": 39630 }, { "epoch": 3.233134839709601, "grad_norm": 0.5963888764381409, "learning_rate": 4.278193232905886e-05, "loss": 0.3133, "num_input_tokens_seen": 37866800, "step": 39635 }, { "epoch": 3.2335427033200097, "grad_norm": 0.5203970074653625, "learning_rate": 4.277943028889839e-05, "loss": 0.354, "num_input_tokens_seen": 37871520, "step": 39640 }, { "epoch": 3.2339505669304183, "grad_norm": 0.2581270635128021, "learning_rate": 4.277692788835642e-05, "loss": 0.3481, "num_input_tokens_seen": 37876256, "step": 39645 }, { "epoch": 3.2343584305408273, "grad_norm": 0.3774675130844116, "learning_rate": 4.2774425127483686e-05, "loss": 0.3824, "num_input_tokens_seen": 37881152, "step": 39650 }, { "epoch": 3.234766294151236, "grad_norm": 0.7770772576332092, "learning_rate": 4.27719220063309e-05, "loss": 0.3327, "num_input_tokens_seen": 37885584, "step": 39655 }, { "epoch": 3.2351741577616444, "grad_norm": 0.9456331133842468, "learning_rate": 4.276941852494881e-05, "loss": 0.3526, "num_input_tokens_seen": 37890000, "step": 39660 }, { "epoch": 3.2355820213720534, "grad_norm": 0.2092643529176712, "learning_rate": 4.276691468338816e-05, "loss": 0.36, "num_input_tokens_seen": 37895072, "step": 39665 }, { "epoch": 3.235989884982462, "grad_norm": 0.2370564192533493, "learning_rate": 4.276441048169969e-05, "loss": 0.34, "num_input_tokens_seen": 37900512, "step": 39670 }, { "epoch": 3.2363977485928705, "grad_norm": 0.33647021651268005, "learning_rate": 4.2761905919934174e-05, "loss": 0.3765, "num_input_tokens_seen": 37905568, "step": 39675 }, { "epoch": 3.236805612203279, "grad_norm": 1.1205919981002808, "learning_rate": 4.2759400998142374e-05, "loss": 0.3183, "num_input_tokens_seen": 37910432, "step": 39680 }, { "epoch": 3.2372134758136877, "grad_norm": 0.6950762867927551, "learning_rate": 4.275689571637506e-05, "loss": 0.3451, "num_input_tokens_seen": 37915264, "step": 39685 }, { "epoch": 3.2376213394240967, "grad_norm": 0.29560616612434387, "learning_rate": 4.2754390074683015e-05, "loss": 0.3665, "num_input_tokens_seen": 37920400, "step": 39690 }, { "epoch": 3.2380292030345053, "grad_norm": 0.6267878413200378, "learning_rate": 4.275188407311703e-05, "loss": 0.3399, "num_input_tokens_seen": 37925184, "step": 39695 }, { "epoch": 3.238437066644914, "grad_norm": 0.32582563161849976, "learning_rate": 4.2749377711727896e-05, "loss": 0.3287, "num_input_tokens_seen": 37930192, "step": 39700 }, { "epoch": 3.238844930255323, "grad_norm": 0.7660477161407471, "learning_rate": 4.2746870990566414e-05, "loss": 0.3209, "num_input_tokens_seen": 37935232, "step": 39705 }, { "epoch": 3.2392527938657314, "grad_norm": 0.30904680490493774, "learning_rate": 4.27443639096834e-05, "loss": 0.3095, "num_input_tokens_seen": 37939232, "step": 39710 }, { "epoch": 3.23966065747614, "grad_norm": 0.2794826924800873, "learning_rate": 4.2741856469129684e-05, "loss": 0.3897, "num_input_tokens_seen": 37943536, "step": 39715 }, { "epoch": 3.2400685210865485, "grad_norm": 0.9562666416168213, "learning_rate": 4.273934866895606e-05, "loss": 0.3909, "num_input_tokens_seen": 37948240, "step": 39720 }, { "epoch": 3.2404763846969575, "grad_norm": 0.19619157910346985, "learning_rate": 4.2736840509213375e-05, "loss": 0.3367, "num_input_tokens_seen": 37952432, "step": 39725 }, { "epoch": 3.240884248307366, "grad_norm": 0.41446352005004883, "learning_rate": 4.273433198995248e-05, "loss": 0.4207, "num_input_tokens_seen": 37957808, "step": 39730 }, { "epoch": 3.2412921119177747, "grad_norm": 0.819195032119751, "learning_rate": 4.27318231112242e-05, "loss": 0.362, "num_input_tokens_seen": 37962896, "step": 39735 }, { "epoch": 3.2416999755281832, "grad_norm": 0.7139835953712463, "learning_rate": 4.272931387307941e-05, "loss": 0.3415, "num_input_tokens_seen": 37967856, "step": 39740 }, { "epoch": 3.2421078391385922, "grad_norm": 0.198532834649086, "learning_rate": 4.272680427556895e-05, "loss": 0.3709, "num_input_tokens_seen": 37972432, "step": 39745 }, { "epoch": 3.242515702749001, "grad_norm": 0.19317181408405304, "learning_rate": 4.272429431874372e-05, "loss": 0.3427, "num_input_tokens_seen": 37977728, "step": 39750 }, { "epoch": 3.2429235663594094, "grad_norm": 0.0955946072936058, "learning_rate": 4.272178400265456e-05, "loss": 0.3732, "num_input_tokens_seen": 37982160, "step": 39755 }, { "epoch": 3.243331429969818, "grad_norm": 0.8252133131027222, "learning_rate": 4.271927332735237e-05, "loss": 0.3816, "num_input_tokens_seen": 37987232, "step": 39760 }, { "epoch": 3.243739293580227, "grad_norm": 0.8227450847625732, "learning_rate": 4.271676229288805e-05, "loss": 0.3349, "num_input_tokens_seen": 37992864, "step": 39765 }, { "epoch": 3.2441471571906355, "grad_norm": 0.6589339971542358, "learning_rate": 4.2714250899312466e-05, "loss": 0.3676, "num_input_tokens_seen": 37997120, "step": 39770 }, { "epoch": 3.244555020801044, "grad_norm": 0.3652181029319763, "learning_rate": 4.271173914667656e-05, "loss": 0.3434, "num_input_tokens_seen": 38002016, "step": 39775 }, { "epoch": 3.2449628844114526, "grad_norm": 0.2123366892337799, "learning_rate": 4.2709227035031224e-05, "loss": 0.2945, "num_input_tokens_seen": 38007024, "step": 39780 }, { "epoch": 3.2453707480218617, "grad_norm": 0.5868328809738159, "learning_rate": 4.270671456442738e-05, "loss": 0.3752, "num_input_tokens_seen": 38011648, "step": 39785 }, { "epoch": 3.24577861163227, "grad_norm": 0.4133256673812866, "learning_rate": 4.270420173491595e-05, "loss": 0.2998, "num_input_tokens_seen": 38017280, "step": 39790 }, { "epoch": 3.246186475242679, "grad_norm": 0.6109876036643982, "learning_rate": 4.2701688546547886e-05, "loss": 0.2428, "num_input_tokens_seen": 38022032, "step": 39795 }, { "epoch": 3.2465943388530873, "grad_norm": 0.4105919301509857, "learning_rate": 4.269917499937411e-05, "loss": 0.3273, "num_input_tokens_seen": 38026752, "step": 39800 }, { "epoch": 3.2470022024634964, "grad_norm": 0.5874294638633728, "learning_rate": 4.269666109344558e-05, "loss": 0.2939, "num_input_tokens_seen": 38031600, "step": 39805 }, { "epoch": 3.247410066073905, "grad_norm": 0.8390971422195435, "learning_rate": 4.269414682881325e-05, "loss": 0.3661, "num_input_tokens_seen": 38035840, "step": 39810 }, { "epoch": 3.2478179296843135, "grad_norm": 0.572318434715271, "learning_rate": 4.2691632205528076e-05, "loss": 0.4319, "num_input_tokens_seen": 38041120, "step": 39815 }, { "epoch": 3.248225793294722, "grad_norm": 0.533653974533081, "learning_rate": 4.268911722364104e-05, "loss": 0.2845, "num_input_tokens_seen": 38045568, "step": 39820 }, { "epoch": 3.248633656905131, "grad_norm": 0.3243155777454376, "learning_rate": 4.268660188320311e-05, "loss": 0.346, "num_input_tokens_seen": 38050176, "step": 39825 }, { "epoch": 3.2490415205155396, "grad_norm": 0.6778703927993774, "learning_rate": 4.2684086184265284e-05, "loss": 0.275, "num_input_tokens_seen": 38055424, "step": 39830 }, { "epoch": 3.249449384125948, "grad_norm": 1.5033990144729614, "learning_rate": 4.268157012687854e-05, "loss": 0.3764, "num_input_tokens_seen": 38060064, "step": 39835 }, { "epoch": 3.2498572477363568, "grad_norm": 0.5905841588973999, "learning_rate": 4.267905371109388e-05, "loss": 0.3064, "num_input_tokens_seen": 38064912, "step": 39840 }, { "epoch": 3.2502651113467658, "grad_norm": 0.4906918406486511, "learning_rate": 4.267653693696232e-05, "loss": 0.3637, "num_input_tokens_seen": 38069504, "step": 39845 }, { "epoch": 3.2506729749571743, "grad_norm": 1.3012946844100952, "learning_rate": 4.267401980453486e-05, "loss": 0.4794, "num_input_tokens_seen": 38074368, "step": 39850 }, { "epoch": 3.251080838567583, "grad_norm": 0.7778422236442566, "learning_rate": 4.2671502313862534e-05, "loss": 0.2936, "num_input_tokens_seen": 38079056, "step": 39855 }, { "epoch": 3.2514887021779915, "grad_norm": 0.37454214692115784, "learning_rate": 4.2668984464996365e-05, "loss": 0.3601, "num_input_tokens_seen": 38084432, "step": 39860 }, { "epoch": 3.2518965657884005, "grad_norm": 0.3505229651927948, "learning_rate": 4.2666466257987393e-05, "loss": 0.3266, "num_input_tokens_seen": 38089728, "step": 39865 }, { "epoch": 3.252304429398809, "grad_norm": 0.968533992767334, "learning_rate": 4.266394769288665e-05, "loss": 0.3835, "num_input_tokens_seen": 38094208, "step": 39870 }, { "epoch": 3.2527122930092176, "grad_norm": 0.2149844765663147, "learning_rate": 4.266142876974519e-05, "loss": 0.3463, "num_input_tokens_seen": 38099136, "step": 39875 }, { "epoch": 3.2531201566196266, "grad_norm": 0.7718108296394348, "learning_rate": 4.265890948861408e-05, "loss": 0.3496, "num_input_tokens_seen": 38104064, "step": 39880 }, { "epoch": 3.253528020230035, "grad_norm": 0.564180314540863, "learning_rate": 4.2656389849544377e-05, "loss": 0.3372, "num_input_tokens_seen": 38109136, "step": 39885 }, { "epoch": 3.2539358838404437, "grad_norm": 0.2154199331998825, "learning_rate": 4.2653869852587156e-05, "loss": 0.3252, "num_input_tokens_seen": 38112800, "step": 39890 }, { "epoch": 3.2543437474508523, "grad_norm": 1.0028396844863892, "learning_rate": 4.2651349497793494e-05, "loss": 0.3527, "num_input_tokens_seen": 38117072, "step": 39895 }, { "epoch": 3.254751611061261, "grad_norm": 0.8504912853240967, "learning_rate": 4.264882878521448e-05, "loss": 0.386, "num_input_tokens_seen": 38122144, "step": 39900 }, { "epoch": 3.25515947467167, "grad_norm": 0.38764265179634094, "learning_rate": 4.26463077149012e-05, "loss": 0.3412, "num_input_tokens_seen": 38126672, "step": 39905 }, { "epoch": 3.2555673382820784, "grad_norm": 0.2633782625198364, "learning_rate": 4.2643786286904765e-05, "loss": 0.3613, "num_input_tokens_seen": 38130128, "step": 39910 }, { "epoch": 3.255975201892487, "grad_norm": 0.2950441837310791, "learning_rate": 4.2641264501276276e-05, "loss": 0.3496, "num_input_tokens_seen": 38135216, "step": 39915 }, { "epoch": 3.256383065502896, "grad_norm": 0.7178293466567993, "learning_rate": 4.263874235806686e-05, "loss": 0.3152, "num_input_tokens_seen": 38140224, "step": 39920 }, { "epoch": 3.2567909291133046, "grad_norm": 0.371245414018631, "learning_rate": 4.2636219857327625e-05, "loss": 0.3772, "num_input_tokens_seen": 38145552, "step": 39925 }, { "epoch": 3.257198792723713, "grad_norm": 1.0887212753295898, "learning_rate": 4.2633696999109704e-05, "loss": 0.4009, "num_input_tokens_seen": 38150240, "step": 39930 }, { "epoch": 3.2576066563341217, "grad_norm": 0.6775001287460327, "learning_rate": 4.263117378346425e-05, "loss": 0.3375, "num_input_tokens_seen": 38155392, "step": 39935 }, { "epoch": 3.2580145199445307, "grad_norm": 0.35811734199523926, "learning_rate": 4.262865021044239e-05, "loss": 0.3358, "num_input_tokens_seen": 38160464, "step": 39940 }, { "epoch": 3.2584223835549393, "grad_norm": 0.8380607962608337, "learning_rate": 4.262612628009527e-05, "loss": 0.3209, "num_input_tokens_seen": 38165120, "step": 39945 }, { "epoch": 3.258830247165348, "grad_norm": 0.9533384442329407, "learning_rate": 4.2623601992474073e-05, "loss": 0.3239, "num_input_tokens_seen": 38169936, "step": 39950 }, { "epoch": 3.2592381107757564, "grad_norm": 0.4456333518028259, "learning_rate": 4.2621077347629946e-05, "loss": 0.3112, "num_input_tokens_seen": 38174608, "step": 39955 }, { "epoch": 3.2596459743861654, "grad_norm": 0.5059007406234741, "learning_rate": 4.261855234561407e-05, "loss": 0.3344, "num_input_tokens_seen": 38179120, "step": 39960 }, { "epoch": 3.260053837996574, "grad_norm": 0.22977852821350098, "learning_rate": 4.261602698647763e-05, "loss": 0.3333, "num_input_tokens_seen": 38183024, "step": 39965 }, { "epoch": 3.2604617016069826, "grad_norm": 0.6861744523048401, "learning_rate": 4.2613501270271805e-05, "loss": 0.3017, "num_input_tokens_seen": 38187712, "step": 39970 }, { "epoch": 3.260869565217391, "grad_norm": 0.6790561676025391, "learning_rate": 4.261097519704779e-05, "loss": 0.3612, "num_input_tokens_seen": 38193152, "step": 39975 }, { "epoch": 3.2612774288278, "grad_norm": 0.25730830430984497, "learning_rate": 4.260844876685679e-05, "loss": 0.3847, "num_input_tokens_seen": 38197712, "step": 39980 }, { "epoch": 3.2616852924382087, "grad_norm": 0.9465731978416443, "learning_rate": 4.260592197975002e-05, "loss": 0.3514, "num_input_tokens_seen": 38201296, "step": 39985 }, { "epoch": 3.2620931560486173, "grad_norm": 1.0998331308364868, "learning_rate": 4.26033948357787e-05, "loss": 0.3934, "num_input_tokens_seen": 38206608, "step": 39990 }, { "epoch": 3.262501019659026, "grad_norm": 0.3704943060874939, "learning_rate": 4.260086733499404e-05, "loss": 0.3096, "num_input_tokens_seen": 38211344, "step": 39995 }, { "epoch": 3.262908883269435, "grad_norm": 0.44147124886512756, "learning_rate": 4.2598339477447266e-05, "loss": 0.3304, "num_input_tokens_seen": 38215920, "step": 40000 }, { "epoch": 3.2633167468798434, "grad_norm": 0.4791169762611389, "learning_rate": 4.2595811263189643e-05, "loss": 0.3653, "num_input_tokens_seen": 38220944, "step": 40005 }, { "epoch": 3.263724610490252, "grad_norm": 0.47751301527023315, "learning_rate": 4.25932826922724e-05, "loss": 0.4151, "num_input_tokens_seen": 38225040, "step": 40010 }, { "epoch": 3.264132474100661, "grad_norm": 0.25451502203941345, "learning_rate": 4.25907537647468e-05, "loss": 0.3427, "num_input_tokens_seen": 38229776, "step": 40015 }, { "epoch": 3.2645403377110696, "grad_norm": 0.7996518611907959, "learning_rate": 4.2588224480664085e-05, "loss": 0.3675, "num_input_tokens_seen": 38234960, "step": 40020 }, { "epoch": 3.264948201321478, "grad_norm": 0.4087769091129303, "learning_rate": 4.258569484007554e-05, "loss": 0.3769, "num_input_tokens_seen": 38239856, "step": 40025 }, { "epoch": 3.2653560649318867, "grad_norm": 0.585620641708374, "learning_rate": 4.258316484303242e-05, "loss": 0.2665, "num_input_tokens_seen": 38244512, "step": 40030 }, { "epoch": 3.2657639285422952, "grad_norm": 0.4970394968986511, "learning_rate": 4.2580634489586036e-05, "loss": 0.3279, "num_input_tokens_seen": 38248432, "step": 40035 }, { "epoch": 3.2661717921527043, "grad_norm": 0.6411485075950623, "learning_rate": 4.257810377978766e-05, "loss": 0.4186, "num_input_tokens_seen": 38252992, "step": 40040 }, { "epoch": 3.266579655763113, "grad_norm": 0.29287561774253845, "learning_rate": 4.2575572713688585e-05, "loss": 0.312, "num_input_tokens_seen": 38257520, "step": 40045 }, { "epoch": 3.2669875193735214, "grad_norm": 0.5129377841949463, "learning_rate": 4.257304129134012e-05, "loss": 0.3527, "num_input_tokens_seen": 38262560, "step": 40050 }, { "epoch": 3.2673953829839304, "grad_norm": 0.5105006694793701, "learning_rate": 4.257050951279357e-05, "loss": 0.2624, "num_input_tokens_seen": 38267792, "step": 40055 }, { "epoch": 3.267803246594339, "grad_norm": 0.5521777272224426, "learning_rate": 4.256797737810027e-05, "loss": 0.4643, "num_input_tokens_seen": 38272480, "step": 40060 }, { "epoch": 3.2682111102047475, "grad_norm": 0.9406083226203918, "learning_rate": 4.256544488731152e-05, "loss": 0.4174, "num_input_tokens_seen": 38276944, "step": 40065 }, { "epoch": 3.268618973815156, "grad_norm": 0.2777777314186096, "learning_rate": 4.256291204047867e-05, "loss": 0.345, "num_input_tokens_seen": 38282368, "step": 40070 }, { "epoch": 3.2690268374255647, "grad_norm": 0.6225993633270264, "learning_rate": 4.256037883765307e-05, "loss": 0.3535, "num_input_tokens_seen": 38287520, "step": 40075 }, { "epoch": 3.2694347010359737, "grad_norm": 0.4601701498031616, "learning_rate": 4.2557845278886036e-05, "loss": 0.4059, "num_input_tokens_seen": 38292912, "step": 40080 }, { "epoch": 3.2698425646463822, "grad_norm": 0.23469145596027374, "learning_rate": 4.2555311364228944e-05, "loss": 0.325, "num_input_tokens_seen": 38297936, "step": 40085 }, { "epoch": 3.270250428256791, "grad_norm": 0.1487015336751938, "learning_rate": 4.2552777093733155e-05, "loss": 0.365, "num_input_tokens_seen": 38301952, "step": 40090 }, { "epoch": 3.2706582918672, "grad_norm": 0.3516722321510315, "learning_rate": 4.255024246745003e-05, "loss": 0.3494, "num_input_tokens_seen": 38306304, "step": 40095 }, { "epoch": 3.2710661554776084, "grad_norm": 0.4000169336795807, "learning_rate": 4.254770748543094e-05, "loss": 0.3447, "num_input_tokens_seen": 38311136, "step": 40100 }, { "epoch": 3.271474019088017, "grad_norm": 0.6502111554145813, "learning_rate": 4.2545172147727287e-05, "loss": 0.2921, "num_input_tokens_seen": 38315136, "step": 40105 }, { "epoch": 3.2718818826984255, "grad_norm": 0.5702220797538757, "learning_rate": 4.2542636454390445e-05, "loss": 0.3172, "num_input_tokens_seen": 38319216, "step": 40110 }, { "epoch": 3.2722897463088345, "grad_norm": 0.5491393208503723, "learning_rate": 4.2540100405471815e-05, "loss": 0.3528, "num_input_tokens_seen": 38323984, "step": 40115 }, { "epoch": 3.272697609919243, "grad_norm": 0.41818967461586, "learning_rate": 4.253756400102281e-05, "loss": 0.4011, "num_input_tokens_seen": 38328032, "step": 40120 }, { "epoch": 3.2731054735296516, "grad_norm": 0.5815364122390747, "learning_rate": 4.253502724109483e-05, "loss": 0.3059, "num_input_tokens_seen": 38333120, "step": 40125 }, { "epoch": 3.27351333714006, "grad_norm": 0.5019561052322388, "learning_rate": 4.2532490125739296e-05, "loss": 0.3783, "num_input_tokens_seen": 38337264, "step": 40130 }, { "epoch": 3.273921200750469, "grad_norm": 0.2777192294597626, "learning_rate": 4.252995265500764e-05, "loss": 0.3559, "num_input_tokens_seen": 38341792, "step": 40135 }, { "epoch": 3.274329064360878, "grad_norm": 0.2341867983341217, "learning_rate": 4.2527414828951295e-05, "loss": 0.3515, "num_input_tokens_seen": 38347024, "step": 40140 }, { "epoch": 3.2747369279712863, "grad_norm": 0.24263353645801544, "learning_rate": 4.252487664762169e-05, "loss": 0.3457, "num_input_tokens_seen": 38350480, "step": 40145 }, { "epoch": 3.275144791581695, "grad_norm": 0.23734480142593384, "learning_rate": 4.2522338111070294e-05, "loss": 0.3155, "num_input_tokens_seen": 38355088, "step": 40150 }, { "epoch": 3.275552655192104, "grad_norm": 0.9438433647155762, "learning_rate": 4.251979921934854e-05, "loss": 0.3842, "num_input_tokens_seen": 38360432, "step": 40155 }, { "epoch": 3.2759605188025125, "grad_norm": 0.24045635759830475, "learning_rate": 4.251725997250791e-05, "loss": 0.3421, "num_input_tokens_seen": 38365568, "step": 40160 }, { "epoch": 3.276368382412921, "grad_norm": 0.3796606659889221, "learning_rate": 4.251472037059986e-05, "loss": 0.3582, "num_input_tokens_seen": 38370720, "step": 40165 }, { "epoch": 3.2767762460233296, "grad_norm": 0.8415223360061646, "learning_rate": 4.251218041367587e-05, "loss": 0.3468, "num_input_tokens_seen": 38375712, "step": 40170 }, { "epoch": 3.2771841096337386, "grad_norm": 0.985028088092804, "learning_rate": 4.250964010178742e-05, "loss": 0.3834, "num_input_tokens_seen": 38380752, "step": 40175 }, { "epoch": 3.277591973244147, "grad_norm": 0.367206871509552, "learning_rate": 4.250709943498601e-05, "loss": 0.3249, "num_input_tokens_seen": 38385728, "step": 40180 }, { "epoch": 3.2779998368545558, "grad_norm": 0.32674187421798706, "learning_rate": 4.2504558413323135e-05, "loss": 0.3329, "num_input_tokens_seen": 38390816, "step": 40185 }, { "epoch": 3.2784077004649643, "grad_norm": 0.3408171832561493, "learning_rate": 4.25020170368503e-05, "loss": 0.3888, "num_input_tokens_seen": 38395728, "step": 40190 }, { "epoch": 3.2788155640753733, "grad_norm": 0.4169217348098755, "learning_rate": 4.2499475305619015e-05, "loss": 0.3199, "num_input_tokens_seen": 38400416, "step": 40195 }, { "epoch": 3.279223427685782, "grad_norm": 0.9282062649726868, "learning_rate": 4.24969332196808e-05, "loss": 0.3729, "num_input_tokens_seen": 38405424, "step": 40200 }, { "epoch": 3.2796312912961905, "grad_norm": 0.21591918170452118, "learning_rate": 4.2494390779087187e-05, "loss": 0.3457, "num_input_tokens_seen": 38410320, "step": 40205 }, { "epoch": 3.280039154906599, "grad_norm": 0.7420680522918701, "learning_rate": 4.24918479838897e-05, "loss": 0.3689, "num_input_tokens_seen": 38415280, "step": 40210 }, { "epoch": 3.280447018517008, "grad_norm": 0.37535277009010315, "learning_rate": 4.248930483413989e-05, "loss": 0.3577, "num_input_tokens_seen": 38420416, "step": 40215 }, { "epoch": 3.2808548821274166, "grad_norm": 0.7931438088417053, "learning_rate": 4.24867613298893e-05, "loss": 0.3856, "num_input_tokens_seen": 38424848, "step": 40220 }, { "epoch": 3.281262745737825, "grad_norm": 0.21456730365753174, "learning_rate": 4.2484217471189496e-05, "loss": 0.3576, "num_input_tokens_seen": 38430064, "step": 40225 }, { "epoch": 3.281670609348234, "grad_norm": 0.203004390001297, "learning_rate": 4.248167325809203e-05, "loss": 0.3392, "num_input_tokens_seen": 38434848, "step": 40230 }, { "epoch": 3.2820784729586427, "grad_norm": 0.46059462428092957, "learning_rate": 4.2479128690648474e-05, "loss": 0.3312, "num_input_tokens_seen": 38439680, "step": 40235 }, { "epoch": 3.2824863365690513, "grad_norm": 0.5009670853614807, "learning_rate": 4.2476583768910413e-05, "loss": 0.4106, "num_input_tokens_seen": 38444176, "step": 40240 }, { "epoch": 3.28289420017946, "grad_norm": 0.5794185996055603, "learning_rate": 4.247403849292941e-05, "loss": 0.285, "num_input_tokens_seen": 38449488, "step": 40245 }, { "epoch": 3.2833020637898684, "grad_norm": 0.5994337797164917, "learning_rate": 4.247149286275709e-05, "loss": 0.3687, "num_input_tokens_seen": 38453632, "step": 40250 }, { "epoch": 3.2837099274002775, "grad_norm": 0.5988553166389465, "learning_rate": 4.2468946878445024e-05, "loss": 0.3203, "num_input_tokens_seen": 38458544, "step": 40255 }, { "epoch": 3.284117791010686, "grad_norm": 0.3464987277984619, "learning_rate": 4.2466400540044826e-05, "loss": 0.3672, "num_input_tokens_seen": 38463568, "step": 40260 }, { "epoch": 3.2845256546210946, "grad_norm": 0.96531081199646, "learning_rate": 4.246385384760811e-05, "loss": 0.3615, "num_input_tokens_seen": 38467952, "step": 40265 }, { "epoch": 3.2849335182315036, "grad_norm": 0.9138544201850891, "learning_rate": 4.246130680118651e-05, "loss": 0.3516, "num_input_tokens_seen": 38472064, "step": 40270 }, { "epoch": 3.285341381841912, "grad_norm": 0.3301997184753418, "learning_rate": 4.245875940083163e-05, "loss": 0.3101, "num_input_tokens_seen": 38476640, "step": 40275 }, { "epoch": 3.2857492454523207, "grad_norm": 0.37810057401657104, "learning_rate": 4.245621164659512e-05, "loss": 0.2817, "num_input_tokens_seen": 38481440, "step": 40280 }, { "epoch": 3.2861571090627293, "grad_norm": 1.2526415586471558, "learning_rate": 4.2453663538528615e-05, "loss": 0.372, "num_input_tokens_seen": 38486576, "step": 40285 }, { "epoch": 3.2865649726731383, "grad_norm": 0.37451738119125366, "learning_rate": 4.245111507668377e-05, "loss": 0.2678, "num_input_tokens_seen": 38490608, "step": 40290 }, { "epoch": 3.286972836283547, "grad_norm": 0.526427149772644, "learning_rate": 4.244856626111223e-05, "loss": 0.3565, "num_input_tokens_seen": 38495488, "step": 40295 }, { "epoch": 3.2873806998939554, "grad_norm": 0.46268096566200256, "learning_rate": 4.2446017091865676e-05, "loss": 0.2958, "num_input_tokens_seen": 38499616, "step": 40300 }, { "epoch": 3.287788563504364, "grad_norm": 0.45209363102912903, "learning_rate": 4.244346756899576e-05, "loss": 0.3118, "num_input_tokens_seen": 38504016, "step": 40305 }, { "epoch": 3.288196427114773, "grad_norm": 0.6230591535568237, "learning_rate": 4.244091769255418e-05, "loss": 0.366, "num_input_tokens_seen": 38509296, "step": 40310 }, { "epoch": 3.2886042907251816, "grad_norm": 0.3558523654937744, "learning_rate": 4.24383674625926e-05, "loss": 0.3884, "num_input_tokens_seen": 38513456, "step": 40315 }, { "epoch": 3.28901215433559, "grad_norm": 0.41532102227211, "learning_rate": 4.2435816879162725e-05, "loss": 0.3602, "num_input_tokens_seen": 38518128, "step": 40320 }, { "epoch": 3.2894200179459987, "grad_norm": 0.8351285457611084, "learning_rate": 4.243326594231625e-05, "loss": 0.3676, "num_input_tokens_seen": 38522480, "step": 40325 }, { "epoch": 3.2898278815564077, "grad_norm": 0.8231003880500793, "learning_rate": 4.243071465210488e-05, "loss": 0.364, "num_input_tokens_seen": 38526928, "step": 40330 }, { "epoch": 3.2902357451668163, "grad_norm": 0.8986554145812988, "learning_rate": 4.2428163008580336e-05, "loss": 0.3785, "num_input_tokens_seen": 38531552, "step": 40335 }, { "epoch": 3.290643608777225, "grad_norm": 0.34341949224472046, "learning_rate": 4.242561101179433e-05, "loss": 0.3653, "num_input_tokens_seen": 38536144, "step": 40340 }, { "epoch": 3.2910514723876334, "grad_norm": 0.28753402829170227, "learning_rate": 4.2423058661798594e-05, "loss": 0.3429, "num_input_tokens_seen": 38540960, "step": 40345 }, { "epoch": 3.2914593359980424, "grad_norm": 0.8458188772201538, "learning_rate": 4.2420505958644864e-05, "loss": 0.3282, "num_input_tokens_seen": 38545776, "step": 40350 }, { "epoch": 3.291867199608451, "grad_norm": 0.21208880841732025, "learning_rate": 4.241795290238487e-05, "loss": 0.3958, "num_input_tokens_seen": 38550592, "step": 40355 }, { "epoch": 3.2922750632188595, "grad_norm": 0.36312779784202576, "learning_rate": 4.2415399493070385e-05, "loss": 0.3836, "num_input_tokens_seen": 38555344, "step": 40360 }, { "epoch": 3.292682926829268, "grad_norm": 0.17469465732574463, "learning_rate": 4.241284573075315e-05, "loss": 0.3588, "num_input_tokens_seen": 38560672, "step": 40365 }, { "epoch": 3.293090790439677, "grad_norm": 0.4847951829433441, "learning_rate": 4.241029161548493e-05, "loss": 0.3425, "num_input_tokens_seen": 38564816, "step": 40370 }, { "epoch": 3.2934986540500857, "grad_norm": 0.6873133182525635, "learning_rate": 4.240773714731749e-05, "loss": 0.3281, "num_input_tokens_seen": 38569840, "step": 40375 }, { "epoch": 3.2939065176604942, "grad_norm": 0.4907929003238678, "learning_rate": 4.240518232630263e-05, "loss": 0.3396, "num_input_tokens_seen": 38575568, "step": 40380 }, { "epoch": 3.294314381270903, "grad_norm": 0.9022118449211121, "learning_rate": 4.24026271524921e-05, "loss": 0.3988, "num_input_tokens_seen": 38581024, "step": 40385 }, { "epoch": 3.294722244881312, "grad_norm": 0.41324520111083984, "learning_rate": 4.240007162593773e-05, "loss": 0.3519, "num_input_tokens_seen": 38584864, "step": 40390 }, { "epoch": 3.2951301084917204, "grad_norm": 0.36177530884742737, "learning_rate": 4.23975157466913e-05, "loss": 0.3472, "num_input_tokens_seen": 38589408, "step": 40395 }, { "epoch": 3.295537972102129, "grad_norm": 0.42216381430625916, "learning_rate": 4.2394959514804614e-05, "loss": 0.3814, "num_input_tokens_seen": 38594768, "step": 40400 }, { "epoch": 3.295945835712538, "grad_norm": 0.6232860684394836, "learning_rate": 4.2392402930329486e-05, "loss": 0.326, "num_input_tokens_seen": 38599664, "step": 40405 }, { "epoch": 3.2963536993229465, "grad_norm": 0.41874557733535767, "learning_rate": 4.238984599331775e-05, "loss": 0.368, "num_input_tokens_seen": 38604432, "step": 40410 }, { "epoch": 3.296761562933355, "grad_norm": 0.4264260232448578, "learning_rate": 4.238728870382122e-05, "loss": 0.3608, "num_input_tokens_seen": 38608592, "step": 40415 }, { "epoch": 3.2971694265437637, "grad_norm": 0.2757355570793152, "learning_rate": 4.2384731061891734e-05, "loss": 0.3525, "num_input_tokens_seen": 38613888, "step": 40420 }, { "epoch": 3.2975772901541722, "grad_norm": 0.6821573376655579, "learning_rate": 4.238217306758114e-05, "loss": 0.3415, "num_input_tokens_seen": 38619168, "step": 40425 }, { "epoch": 3.2979851537645812, "grad_norm": 0.7413724064826965, "learning_rate": 4.2379614720941287e-05, "loss": 0.32, "num_input_tokens_seen": 38623600, "step": 40430 }, { "epoch": 3.29839301737499, "grad_norm": 0.4352215528488159, "learning_rate": 4.237705602202402e-05, "loss": 0.302, "num_input_tokens_seen": 38627872, "step": 40435 }, { "epoch": 3.2988008809853984, "grad_norm": 0.5570665597915649, "learning_rate": 4.2374496970881214e-05, "loss": 0.3501, "num_input_tokens_seen": 38632272, "step": 40440 }, { "epoch": 3.2992087445958074, "grad_norm": 0.5100211501121521, "learning_rate": 4.237193756756475e-05, "loss": 0.3043, "num_input_tokens_seen": 38637152, "step": 40445 }, { "epoch": 3.299616608206216, "grad_norm": 0.6561722755432129, "learning_rate": 4.2369377812126474e-05, "loss": 0.428, "num_input_tokens_seen": 38641888, "step": 40450 }, { "epoch": 3.3000244718166245, "grad_norm": 1.0078990459442139, "learning_rate": 4.23668177046183e-05, "loss": 0.3834, "num_input_tokens_seen": 38647584, "step": 40455 }, { "epoch": 3.300432335427033, "grad_norm": 0.444477915763855, "learning_rate": 4.236425724509211e-05, "loss": 0.4032, "num_input_tokens_seen": 38652752, "step": 40460 }, { "epoch": 3.3008401990374416, "grad_norm": 0.8205577731132507, "learning_rate": 4.2361696433599805e-05, "loss": 0.3497, "num_input_tokens_seen": 38657776, "step": 40465 }, { "epoch": 3.3012480626478506, "grad_norm": 0.7602643966674805, "learning_rate": 4.235913527019329e-05, "loss": 0.3623, "num_input_tokens_seen": 38662864, "step": 40470 }, { "epoch": 3.301655926258259, "grad_norm": 0.8911029696464539, "learning_rate": 4.2356573754924484e-05, "loss": 0.3809, "num_input_tokens_seen": 38667760, "step": 40475 }, { "epoch": 3.3020637898686678, "grad_norm": 0.9509114623069763, "learning_rate": 4.2354011887845303e-05, "loss": 0.3776, "num_input_tokens_seen": 38672912, "step": 40480 }, { "epoch": 3.302471653479077, "grad_norm": 0.7477750182151794, "learning_rate": 4.2351449669007665e-05, "loss": 0.3521, "num_input_tokens_seen": 38677488, "step": 40485 }, { "epoch": 3.3028795170894854, "grad_norm": 0.3691852390766144, "learning_rate": 4.234888709846352e-05, "loss": 0.3286, "num_input_tokens_seen": 38682624, "step": 40490 }, { "epoch": 3.303287380699894, "grad_norm": 1.0114247798919678, "learning_rate": 4.234632417626481e-05, "loss": 0.3597, "num_input_tokens_seen": 38687872, "step": 40495 }, { "epoch": 3.3036952443103025, "grad_norm": 0.5184757709503174, "learning_rate": 4.234376090246348e-05, "loss": 0.3668, "num_input_tokens_seen": 38692320, "step": 40500 }, { "epoch": 3.3041031079207115, "grad_norm": 0.7590670585632324, "learning_rate": 4.234119727711148e-05, "loss": 0.3515, "num_input_tokens_seen": 38698272, "step": 40505 }, { "epoch": 3.30451097153112, "grad_norm": 0.3526882231235504, "learning_rate": 4.2338633300260784e-05, "loss": 0.3224, "num_input_tokens_seen": 38703104, "step": 40510 }, { "epoch": 3.3049188351415286, "grad_norm": 0.26903924345970154, "learning_rate": 4.233606897196336e-05, "loss": 0.38, "num_input_tokens_seen": 38706992, "step": 40515 }, { "epoch": 3.305326698751937, "grad_norm": 0.35474497079849243, "learning_rate": 4.233350429227117e-05, "loss": 0.3342, "num_input_tokens_seen": 38711600, "step": 40520 }, { "epoch": 3.305734562362346, "grad_norm": 0.6508583426475525, "learning_rate": 4.2330939261236235e-05, "loss": 0.387, "num_input_tokens_seen": 38716144, "step": 40525 }, { "epoch": 3.3061424259727548, "grad_norm": 0.4431351125240326, "learning_rate": 4.2328373878910516e-05, "loss": 0.332, "num_input_tokens_seen": 38720864, "step": 40530 }, { "epoch": 3.3065502895831633, "grad_norm": 0.34815338253974915, "learning_rate": 4.2325808145346025e-05, "loss": 0.3569, "num_input_tokens_seen": 38725392, "step": 40535 }, { "epoch": 3.306958153193572, "grad_norm": 0.2432628571987152, "learning_rate": 4.232324206059476e-05, "loss": 0.3121, "num_input_tokens_seen": 38730416, "step": 40540 }, { "epoch": 3.307366016803981, "grad_norm": 0.5701522827148438, "learning_rate": 4.232067562470874e-05, "loss": 0.4139, "num_input_tokens_seen": 38734880, "step": 40545 }, { "epoch": 3.3077738804143895, "grad_norm": 0.6089634895324707, "learning_rate": 4.231810883773999e-05, "loss": 0.3386, "num_input_tokens_seen": 38739504, "step": 40550 }, { "epoch": 3.308181744024798, "grad_norm": 0.6811640858650208, "learning_rate": 4.231554169974052e-05, "loss": 0.3475, "num_input_tokens_seen": 38744848, "step": 40555 }, { "epoch": 3.3085896076352066, "grad_norm": 1.5816514492034912, "learning_rate": 4.231297421076239e-05, "loss": 0.3983, "num_input_tokens_seen": 38748896, "step": 40560 }, { "epoch": 3.3089974712456156, "grad_norm": 0.6497496366500854, "learning_rate": 4.231040637085763e-05, "loss": 0.3409, "num_input_tokens_seen": 38753504, "step": 40565 }, { "epoch": 3.309405334856024, "grad_norm": 0.495696485042572, "learning_rate": 4.230783818007828e-05, "loss": 0.3517, "num_input_tokens_seen": 38757824, "step": 40570 }, { "epoch": 3.3098131984664327, "grad_norm": 1.5179206132888794, "learning_rate": 4.230526963847641e-05, "loss": 0.3531, "num_input_tokens_seen": 38763616, "step": 40575 }, { "epoch": 3.3102210620768417, "grad_norm": 1.3944077491760254, "learning_rate": 4.230270074610408e-05, "loss": 0.3624, "num_input_tokens_seen": 38768736, "step": 40580 }, { "epoch": 3.3106289256872503, "grad_norm": 0.2960573136806488, "learning_rate": 4.230013150301336e-05, "loss": 0.3508, "num_input_tokens_seen": 38773888, "step": 40585 }, { "epoch": 3.311036789297659, "grad_norm": 0.8242174386978149, "learning_rate": 4.229756190925632e-05, "loss": 0.3433, "num_input_tokens_seen": 38778896, "step": 40590 }, { "epoch": 3.3114446529080674, "grad_norm": 0.32696333527565, "learning_rate": 4.2294991964885055e-05, "loss": 0.3377, "num_input_tokens_seen": 38783952, "step": 40595 }, { "epoch": 3.311852516518476, "grad_norm": 0.4243508577346802, "learning_rate": 4.229242166995164e-05, "loss": 0.3522, "num_input_tokens_seen": 38787616, "step": 40600 }, { "epoch": 3.312260380128885, "grad_norm": 0.719555675983429, "learning_rate": 4.2289851024508196e-05, "loss": 0.3255, "num_input_tokens_seen": 38792672, "step": 40605 }, { "epoch": 3.3126682437392936, "grad_norm": 0.5151528120040894, "learning_rate": 4.228728002860682e-05, "loss": 0.3824, "num_input_tokens_seen": 38797344, "step": 40610 }, { "epoch": 3.313076107349702, "grad_norm": 1.0029383897781372, "learning_rate": 4.228470868229962e-05, "loss": 0.3602, "num_input_tokens_seen": 38801840, "step": 40615 }, { "epoch": 3.313483970960111, "grad_norm": 0.8209736943244934, "learning_rate": 4.228213698563872e-05, "loss": 0.3555, "num_input_tokens_seen": 38806560, "step": 40620 }, { "epoch": 3.3138918345705197, "grad_norm": 0.4925220310688019, "learning_rate": 4.227956493867625e-05, "loss": 0.3219, "num_input_tokens_seen": 38810784, "step": 40625 }, { "epoch": 3.3142996981809283, "grad_norm": 0.4141109585762024, "learning_rate": 4.2276992541464334e-05, "loss": 0.3488, "num_input_tokens_seen": 38816208, "step": 40630 }, { "epoch": 3.314707561791337, "grad_norm": 0.3541558086872101, "learning_rate": 4.2274419794055124e-05, "loss": 0.3349, "num_input_tokens_seen": 38820720, "step": 40635 }, { "epoch": 3.3151154254017454, "grad_norm": 0.9025042057037354, "learning_rate": 4.227184669650077e-05, "loss": 0.3392, "num_input_tokens_seen": 38825136, "step": 40640 }, { "epoch": 3.3155232890121544, "grad_norm": 0.8848493099212646, "learning_rate": 4.226927324885342e-05, "loss": 0.3611, "num_input_tokens_seen": 38830672, "step": 40645 }, { "epoch": 3.315931152622563, "grad_norm": 0.8378477692604065, "learning_rate": 4.226669945116524e-05, "loss": 0.3858, "num_input_tokens_seen": 38835600, "step": 40650 }, { "epoch": 3.3163390162329716, "grad_norm": 1.19804847240448, "learning_rate": 4.22641253034884e-05, "loss": 0.3586, "num_input_tokens_seen": 38839984, "step": 40655 }, { "epoch": 3.3167468798433806, "grad_norm": 0.493479460477829, "learning_rate": 4.226155080587507e-05, "loss": 0.36, "num_input_tokens_seen": 38845200, "step": 40660 }, { "epoch": 3.317154743453789, "grad_norm": 0.4579969048500061, "learning_rate": 4.225897595837744e-05, "loss": 0.3595, "num_input_tokens_seen": 38850000, "step": 40665 }, { "epoch": 3.3175626070641977, "grad_norm": 0.7781844735145569, "learning_rate": 4.225640076104771e-05, "loss": 0.3405, "num_input_tokens_seen": 38855200, "step": 40670 }, { "epoch": 3.3179704706746063, "grad_norm": 0.5041456818580627, "learning_rate": 4.225382521393806e-05, "loss": 0.3605, "num_input_tokens_seen": 38859744, "step": 40675 }, { "epoch": 3.3183783342850153, "grad_norm": 0.5659904479980469, "learning_rate": 4.225124931710071e-05, "loss": 0.3798, "num_input_tokens_seen": 38864224, "step": 40680 }, { "epoch": 3.318786197895424, "grad_norm": 0.3971206247806549, "learning_rate": 4.2248673070587864e-05, "loss": 0.3487, "num_input_tokens_seen": 38868864, "step": 40685 }, { "epoch": 3.3191940615058324, "grad_norm": 0.7940398454666138, "learning_rate": 4.224609647445175e-05, "loss": 0.3649, "num_input_tokens_seen": 38873696, "step": 40690 }, { "epoch": 3.319601925116241, "grad_norm": 0.3126295506954193, "learning_rate": 4.224351952874458e-05, "loss": 0.3467, "num_input_tokens_seen": 38878400, "step": 40695 }, { "epoch": 3.32000978872665, "grad_norm": 0.4809815287590027, "learning_rate": 4.22409422335186e-05, "loss": 0.3279, "num_input_tokens_seen": 38883168, "step": 40700 }, { "epoch": 3.3204176523370585, "grad_norm": 0.416518896818161, "learning_rate": 4.223836458882605e-05, "loss": 0.3893, "num_input_tokens_seen": 38887808, "step": 40705 }, { "epoch": 3.320825515947467, "grad_norm": 0.35990649461746216, "learning_rate": 4.2235786594719174e-05, "loss": 0.3666, "num_input_tokens_seen": 38892720, "step": 40710 }, { "epoch": 3.3212333795578757, "grad_norm": 0.2764759957790375, "learning_rate": 4.223320825125022e-05, "loss": 0.3364, "num_input_tokens_seen": 38898144, "step": 40715 }, { "epoch": 3.3216412431682847, "grad_norm": 0.8514909744262695, "learning_rate": 4.223062955847147e-05, "loss": 0.3645, "num_input_tokens_seen": 38903088, "step": 40720 }, { "epoch": 3.3220491067786933, "grad_norm": 0.17478039860725403, "learning_rate": 4.222805051643517e-05, "loss": 0.3454, "num_input_tokens_seen": 38906528, "step": 40725 }, { "epoch": 3.322456970389102, "grad_norm": 0.7502983808517456, "learning_rate": 4.222547112519361e-05, "loss": 0.3342, "num_input_tokens_seen": 38911696, "step": 40730 }, { "epoch": 3.3228648339995104, "grad_norm": 0.2892874479293823, "learning_rate": 4.222289138479907e-05, "loss": 0.3494, "num_input_tokens_seen": 38916400, "step": 40735 }, { "epoch": 3.3232726976099194, "grad_norm": 1.1908776760101318, "learning_rate": 4.2220311295303844e-05, "loss": 0.3572, "num_input_tokens_seen": 38921376, "step": 40740 }, { "epoch": 3.323680561220328, "grad_norm": 0.7372767329216003, "learning_rate": 4.221773085676022e-05, "loss": 0.3811, "num_input_tokens_seen": 38926304, "step": 40745 }, { "epoch": 3.3240884248307365, "grad_norm": 0.9740437865257263, "learning_rate": 4.2215150069220513e-05, "loss": 0.3872, "num_input_tokens_seen": 38931760, "step": 40750 }, { "epoch": 3.3244962884411455, "grad_norm": 0.9475733041763306, "learning_rate": 4.221256893273703e-05, "loss": 0.3247, "num_input_tokens_seen": 38936112, "step": 40755 }, { "epoch": 3.324904152051554, "grad_norm": 0.38438814878463745, "learning_rate": 4.2209987447362086e-05, "loss": 0.3432, "num_input_tokens_seen": 38940816, "step": 40760 }, { "epoch": 3.3253120156619627, "grad_norm": 0.9322096109390259, "learning_rate": 4.2207405613148006e-05, "loss": 0.3437, "num_input_tokens_seen": 38945568, "step": 40765 }, { "epoch": 3.3257198792723712, "grad_norm": 0.6710835695266724, "learning_rate": 4.2204823430147125e-05, "loss": 0.3879, "num_input_tokens_seen": 38949920, "step": 40770 }, { "epoch": 3.32612774288278, "grad_norm": 0.7427403330802917, "learning_rate": 4.2202240898411794e-05, "loss": 0.3487, "num_input_tokens_seen": 38953952, "step": 40775 }, { "epoch": 3.326535606493189, "grad_norm": 0.9864561557769775, "learning_rate": 4.219965801799434e-05, "loss": 0.3396, "num_input_tokens_seen": 38958304, "step": 40780 }, { "epoch": 3.3269434701035974, "grad_norm": 0.4865664541721344, "learning_rate": 4.219707478894713e-05, "loss": 0.3368, "num_input_tokens_seen": 38963200, "step": 40785 }, { "epoch": 3.327351333714006, "grad_norm": 0.8588403463363647, "learning_rate": 4.2194491211322516e-05, "loss": 0.3462, "num_input_tokens_seen": 38967408, "step": 40790 }, { "epoch": 3.327759197324415, "grad_norm": 0.14802780747413635, "learning_rate": 4.2191907285172884e-05, "loss": 0.3215, "num_input_tokens_seen": 38971680, "step": 40795 }, { "epoch": 3.3281670609348235, "grad_norm": 0.5278199315071106, "learning_rate": 4.2189323010550584e-05, "loss": 0.3455, "num_input_tokens_seen": 38975744, "step": 40800 }, { "epoch": 3.328574924545232, "grad_norm": 0.7787951231002808, "learning_rate": 4.2186738387508017e-05, "loss": 0.3405, "num_input_tokens_seen": 38981072, "step": 40805 }, { "epoch": 3.3289827881556406, "grad_norm": 0.38441529870033264, "learning_rate": 4.218415341609757e-05, "loss": 0.3466, "num_input_tokens_seen": 38986272, "step": 40810 }, { "epoch": 3.329390651766049, "grad_norm": 0.3455394208431244, "learning_rate": 4.218156809637163e-05, "loss": 0.3184, "num_input_tokens_seen": 38991104, "step": 40815 }, { "epoch": 3.329798515376458, "grad_norm": 0.7715456485748291, "learning_rate": 4.2178982428382605e-05, "loss": 0.35, "num_input_tokens_seen": 38996112, "step": 40820 }, { "epoch": 3.3302063789868668, "grad_norm": 0.634696364402771, "learning_rate": 4.217639641218291e-05, "loss": 0.3426, "num_input_tokens_seen": 39000720, "step": 40825 }, { "epoch": 3.3306142425972753, "grad_norm": 0.6093066930770874, "learning_rate": 4.2173810047824944e-05, "loss": 0.2633, "num_input_tokens_seen": 39005792, "step": 40830 }, { "epoch": 3.3310221062076844, "grad_norm": 1.3443443775177002, "learning_rate": 4.2171223335361164e-05, "loss": 0.4829, "num_input_tokens_seen": 39010896, "step": 40835 }, { "epoch": 3.331429969818093, "grad_norm": 0.9418342113494873, "learning_rate": 4.216863627484398e-05, "loss": 0.4544, "num_input_tokens_seen": 39015360, "step": 40840 }, { "epoch": 3.3318378334285015, "grad_norm": 0.7344167828559875, "learning_rate": 4.216604886632583e-05, "loss": 0.2978, "num_input_tokens_seen": 39020400, "step": 40845 }, { "epoch": 3.33224569703891, "grad_norm": 0.36334821581840515, "learning_rate": 4.216346110985917e-05, "loss": 0.3266, "num_input_tokens_seen": 39025584, "step": 40850 }, { "epoch": 3.332653560649319, "grad_norm": 1.9736604690551758, "learning_rate": 4.2160873005496435e-05, "loss": 0.3882, "num_input_tokens_seen": 39030448, "step": 40855 }, { "epoch": 3.3330614242597276, "grad_norm": 0.4163562059402466, "learning_rate": 4.215828455329011e-05, "loss": 0.3552, "num_input_tokens_seen": 39035504, "step": 40860 }, { "epoch": 3.333469287870136, "grad_norm": 0.7333728671073914, "learning_rate": 4.215569575329264e-05, "loss": 0.3276, "num_input_tokens_seen": 39040608, "step": 40865 }, { "epoch": 3.3338771514805448, "grad_norm": 0.3562096655368805, "learning_rate": 4.21531066055565e-05, "loss": 0.3385, "num_input_tokens_seen": 39045136, "step": 40870 }, { "epoch": 3.3342850150909538, "grad_norm": 1.2589614391326904, "learning_rate": 4.215051711013419e-05, "loss": 0.4182, "num_input_tokens_seen": 39050000, "step": 40875 }, { "epoch": 3.3346928787013623, "grad_norm": 0.39198750257492065, "learning_rate": 4.214792726707818e-05, "loss": 0.3247, "num_input_tokens_seen": 39054592, "step": 40880 }, { "epoch": 3.335100742311771, "grad_norm": 0.6109887957572937, "learning_rate": 4.214533707644097e-05, "loss": 0.2737, "num_input_tokens_seen": 39059904, "step": 40885 }, { "epoch": 3.3355086059221795, "grad_norm": 0.26012569665908813, "learning_rate": 4.214274653827507e-05, "loss": 0.3181, "num_input_tokens_seen": 39064448, "step": 40890 }, { "epoch": 3.3359164695325885, "grad_norm": 0.5796302556991577, "learning_rate": 4.2140155652632974e-05, "loss": 0.3143, "num_input_tokens_seen": 39068928, "step": 40895 }, { "epoch": 3.336324333142997, "grad_norm": 2.600557804107666, "learning_rate": 4.213756441956721e-05, "loss": 0.3976, "num_input_tokens_seen": 39072976, "step": 40900 }, { "epoch": 3.3367321967534056, "grad_norm": 0.5432190895080566, "learning_rate": 4.213497283913029e-05, "loss": 0.3608, "num_input_tokens_seen": 39077936, "step": 40905 }, { "epoch": 3.337140060363814, "grad_norm": 0.35715246200561523, "learning_rate": 4.2132380911374765e-05, "loss": 0.2778, "num_input_tokens_seen": 39082912, "step": 40910 }, { "epoch": 3.337547923974223, "grad_norm": 0.2770105302333832, "learning_rate": 4.212978863635315e-05, "loss": 0.338, "num_input_tokens_seen": 39087792, "step": 40915 }, { "epoch": 3.3379557875846317, "grad_norm": 1.0821285247802734, "learning_rate": 4.2127196014118e-05, "loss": 0.3557, "num_input_tokens_seen": 39093344, "step": 40920 }, { "epoch": 3.3383636511950403, "grad_norm": 0.5584695339202881, "learning_rate": 4.2124603044721864e-05, "loss": 0.3828, "num_input_tokens_seen": 39098480, "step": 40925 }, { "epoch": 3.338771514805449, "grad_norm": 0.86641925573349, "learning_rate": 4.2122009728217304e-05, "loss": 0.3517, "num_input_tokens_seen": 39103088, "step": 40930 }, { "epoch": 3.339179378415858, "grad_norm": 0.794974148273468, "learning_rate": 4.2119416064656883e-05, "loss": 0.3462, "num_input_tokens_seen": 39107696, "step": 40935 }, { "epoch": 3.3395872420262664, "grad_norm": 0.9812759160995483, "learning_rate": 4.211682205409317e-05, "loss": 0.3477, "num_input_tokens_seen": 39112672, "step": 40940 }, { "epoch": 3.339995105636675, "grad_norm": 0.27380385994911194, "learning_rate": 4.2114227696578745e-05, "loss": 0.3248, "num_input_tokens_seen": 39117280, "step": 40945 }, { "epoch": 3.3404029692470836, "grad_norm": 0.48591747879981995, "learning_rate": 4.211163299216621e-05, "loss": 0.3428, "num_input_tokens_seen": 39122576, "step": 40950 }, { "epoch": 3.3408108328574926, "grad_norm": 0.5639603734016418, "learning_rate": 4.2109037940908126e-05, "loss": 0.3168, "num_input_tokens_seen": 39127648, "step": 40955 }, { "epoch": 3.341218696467901, "grad_norm": 0.27513235807418823, "learning_rate": 4.210644254285713e-05, "loss": 0.2966, "num_input_tokens_seen": 39132176, "step": 40960 }, { "epoch": 3.3416265600783097, "grad_norm": 0.42824822664260864, "learning_rate": 4.210384679806579e-05, "loss": 0.359, "num_input_tokens_seen": 39136912, "step": 40965 }, { "epoch": 3.3420344236887187, "grad_norm": 0.5593432784080505, "learning_rate": 4.210125070658677e-05, "loss": 0.3174, "num_input_tokens_seen": 39142192, "step": 40970 }, { "epoch": 3.3424422872991273, "grad_norm": 0.45125260949134827, "learning_rate": 4.209865426847265e-05, "loss": 0.3627, "num_input_tokens_seen": 39147152, "step": 40975 }, { "epoch": 3.342850150909536, "grad_norm": 0.3226574957370758, "learning_rate": 4.209605748377607e-05, "loss": 0.5127, "num_input_tokens_seen": 39152080, "step": 40980 }, { "epoch": 3.3432580145199444, "grad_norm": 0.17186158895492554, "learning_rate": 4.209346035254968e-05, "loss": 0.3791, "num_input_tokens_seen": 39156864, "step": 40985 }, { "epoch": 3.343665878130353, "grad_norm": 0.538068413734436, "learning_rate": 4.209086287484611e-05, "loss": 0.3526, "num_input_tokens_seen": 39161744, "step": 40990 }, { "epoch": 3.344073741740762, "grad_norm": 0.8916652798652649, "learning_rate": 4.2088265050718015e-05, "loss": 0.3695, "num_input_tokens_seen": 39166704, "step": 40995 }, { "epoch": 3.3444816053511706, "grad_norm": 0.6255947351455688, "learning_rate": 4.208566688021804e-05, "loss": 0.3262, "num_input_tokens_seen": 39171296, "step": 41000 }, { "epoch": 3.344889468961579, "grad_norm": 0.831148087978363, "learning_rate": 4.208306836339886e-05, "loss": 0.3822, "num_input_tokens_seen": 39176288, "step": 41005 }, { "epoch": 3.345297332571988, "grad_norm": 0.492103636264801, "learning_rate": 4.208046950031314e-05, "loss": 0.337, "num_input_tokens_seen": 39181696, "step": 41010 }, { "epoch": 3.3457051961823967, "grad_norm": 0.7119290828704834, "learning_rate": 4.207787029101357e-05, "loss": 0.3585, "num_input_tokens_seen": 39186976, "step": 41015 }, { "epoch": 3.3461130597928053, "grad_norm": 0.633263349533081, "learning_rate": 4.207527073555282e-05, "loss": 0.3405, "num_input_tokens_seen": 39192016, "step": 41020 }, { "epoch": 3.346520923403214, "grad_norm": 0.35367417335510254, "learning_rate": 4.207267083398359e-05, "loss": 0.3592, "num_input_tokens_seen": 39196896, "step": 41025 }, { "epoch": 3.3469287870136224, "grad_norm": 0.7301613688468933, "learning_rate": 4.207007058635858e-05, "loss": 0.3649, "num_input_tokens_seen": 39201824, "step": 41030 }, { "epoch": 3.3473366506240314, "grad_norm": 0.2611100673675537, "learning_rate": 4.206746999273049e-05, "loss": 0.3491, "num_input_tokens_seen": 39206688, "step": 41035 }, { "epoch": 3.34774451423444, "grad_norm": 0.2992889881134033, "learning_rate": 4.206486905315203e-05, "loss": 0.3455, "num_input_tokens_seen": 39211248, "step": 41040 }, { "epoch": 3.3481523778448485, "grad_norm": 0.31779778003692627, "learning_rate": 4.206226776767593e-05, "loss": 0.3403, "num_input_tokens_seen": 39215568, "step": 41045 }, { "epoch": 3.3485602414552575, "grad_norm": 0.42478618025779724, "learning_rate": 4.205966613635492e-05, "loss": 0.3266, "num_input_tokens_seen": 39220336, "step": 41050 }, { "epoch": 3.348968105065666, "grad_norm": 0.5827497839927673, "learning_rate": 4.205706415924171e-05, "loss": 0.3369, "num_input_tokens_seen": 39225280, "step": 41055 }, { "epoch": 3.3493759686760747, "grad_norm": 0.5607886910438538, "learning_rate": 4.205446183638907e-05, "loss": 0.2967, "num_input_tokens_seen": 39229968, "step": 41060 }, { "epoch": 3.3497838322864832, "grad_norm": 0.5423524975776672, "learning_rate": 4.205185916784973e-05, "loss": 0.339, "num_input_tokens_seen": 39234832, "step": 41065 }, { "epoch": 3.3501916958968923, "grad_norm": 0.41925913095474243, "learning_rate": 4.204925615367645e-05, "loss": 0.2988, "num_input_tokens_seen": 39239520, "step": 41070 }, { "epoch": 3.350599559507301, "grad_norm": 0.39619898796081543, "learning_rate": 4.2046652793921997e-05, "loss": 0.3929, "num_input_tokens_seen": 39245104, "step": 41075 }, { "epoch": 3.3510074231177094, "grad_norm": 0.5984968543052673, "learning_rate": 4.204404908863914e-05, "loss": 0.2492, "num_input_tokens_seen": 39250240, "step": 41080 }, { "epoch": 3.351415286728118, "grad_norm": 0.7514188885688782, "learning_rate": 4.204144503788064e-05, "loss": 0.374, "num_input_tokens_seen": 39255264, "step": 41085 }, { "epoch": 3.351823150338527, "grad_norm": 0.3882436752319336, "learning_rate": 4.20388406416993e-05, "loss": 0.4925, "num_input_tokens_seen": 39259552, "step": 41090 }, { "epoch": 3.3522310139489355, "grad_norm": 0.972339391708374, "learning_rate": 4.203623590014789e-05, "loss": 0.3295, "num_input_tokens_seen": 39264880, "step": 41095 }, { "epoch": 3.352638877559344, "grad_norm": 0.15286536514759064, "learning_rate": 4.2033630813279225e-05, "loss": 0.3257, "num_input_tokens_seen": 39269664, "step": 41100 }, { "epoch": 3.3530467411697527, "grad_norm": 0.8841352462768555, "learning_rate": 4.2031025381146096e-05, "loss": 0.3418, "num_input_tokens_seen": 39275136, "step": 41105 }, { "epoch": 3.3534546047801617, "grad_norm": 0.15182563662528992, "learning_rate": 4.2028419603801315e-05, "loss": 0.3383, "num_input_tokens_seen": 39280000, "step": 41110 }, { "epoch": 3.3538624683905702, "grad_norm": 0.9950265884399414, "learning_rate": 4.2025813481297716e-05, "loss": 0.3636, "num_input_tokens_seen": 39285232, "step": 41115 }, { "epoch": 3.354270332000979, "grad_norm": 0.6277857422828674, "learning_rate": 4.202320701368811e-05, "loss": 0.3612, "num_input_tokens_seen": 39289552, "step": 41120 }, { "epoch": 3.3546781956113874, "grad_norm": 0.18485920131206512, "learning_rate": 4.202060020102533e-05, "loss": 0.3803, "num_input_tokens_seen": 39294928, "step": 41125 }, { "epoch": 3.3550860592217964, "grad_norm": 0.31152600049972534, "learning_rate": 4.2017993043362216e-05, "loss": 0.357, "num_input_tokens_seen": 39300048, "step": 41130 }, { "epoch": 3.355493922832205, "grad_norm": 0.15928632020950317, "learning_rate": 4.201538554075162e-05, "loss": 0.3462, "num_input_tokens_seen": 39304960, "step": 41135 }, { "epoch": 3.3559017864426135, "grad_norm": 0.1533280611038208, "learning_rate": 4.201277769324637e-05, "loss": 0.3459, "num_input_tokens_seen": 39309504, "step": 41140 }, { "epoch": 3.3563096500530225, "grad_norm": 0.24754664301872253, "learning_rate": 4.2010169500899366e-05, "loss": 0.3455, "num_input_tokens_seen": 39313808, "step": 41145 }, { "epoch": 3.356717513663431, "grad_norm": 0.31996437907218933, "learning_rate": 4.200756096376345e-05, "loss": 0.3405, "num_input_tokens_seen": 39318912, "step": 41150 }, { "epoch": 3.3571253772738396, "grad_norm": 0.2648009657859802, "learning_rate": 4.200495208189149e-05, "loss": 0.3343, "num_input_tokens_seen": 39323584, "step": 41155 }, { "epoch": 3.357533240884248, "grad_norm": 0.5888620018959045, "learning_rate": 4.200234285533639e-05, "loss": 0.3186, "num_input_tokens_seen": 39328960, "step": 41160 }, { "epoch": 3.3579411044946568, "grad_norm": 0.5796914100646973, "learning_rate": 4.199973328415102e-05, "loss": 0.2989, "num_input_tokens_seen": 39334560, "step": 41165 }, { "epoch": 3.358348968105066, "grad_norm": 0.30350103974342346, "learning_rate": 4.199712336838828e-05, "loss": 0.3365, "num_input_tokens_seen": 39339632, "step": 41170 }, { "epoch": 3.3587568317154743, "grad_norm": 1.1029000282287598, "learning_rate": 4.199451310810107e-05, "loss": 0.4284, "num_input_tokens_seen": 39343936, "step": 41175 }, { "epoch": 3.359164695325883, "grad_norm": 0.2222800850868225, "learning_rate": 4.19919025033423e-05, "loss": 0.2607, "num_input_tokens_seen": 39348624, "step": 41180 }, { "epoch": 3.359572558936292, "grad_norm": 0.9894067049026489, "learning_rate": 4.198929155416489e-05, "loss": 0.4319, "num_input_tokens_seen": 39354320, "step": 41185 }, { "epoch": 3.3599804225467005, "grad_norm": 0.1017957329750061, "learning_rate": 4.198668026062176e-05, "loss": 0.3565, "num_input_tokens_seen": 39358384, "step": 41190 }, { "epoch": 3.360388286157109, "grad_norm": 0.263690710067749, "learning_rate": 4.198406862276584e-05, "loss": 0.3486, "num_input_tokens_seen": 39363024, "step": 41195 }, { "epoch": 3.3607961497675176, "grad_norm": 0.17239217460155487, "learning_rate": 4.198145664065006e-05, "loss": 0.3633, "num_input_tokens_seen": 39368176, "step": 41200 }, { "epoch": 3.361204013377926, "grad_norm": 0.15323542058467865, "learning_rate": 4.197884431432737e-05, "loss": 0.3936, "num_input_tokens_seen": 39372176, "step": 41205 }, { "epoch": 3.361611876988335, "grad_norm": 0.6834115982055664, "learning_rate": 4.197623164385073e-05, "loss": 0.337, "num_input_tokens_seen": 39377472, "step": 41210 }, { "epoch": 3.3620197405987438, "grad_norm": 0.9456017017364502, "learning_rate": 4.197361862927308e-05, "loss": 0.377, "num_input_tokens_seen": 39382704, "step": 41215 }, { "epoch": 3.3624276042091523, "grad_norm": 0.6053563952445984, "learning_rate": 4.197100527064739e-05, "loss": 0.3299, "num_input_tokens_seen": 39386592, "step": 41220 }, { "epoch": 3.3628354678195613, "grad_norm": 0.3477666974067688, "learning_rate": 4.196839156802664e-05, "loss": 0.3281, "num_input_tokens_seen": 39390640, "step": 41225 }, { "epoch": 3.36324333142997, "grad_norm": 0.42740076780319214, "learning_rate": 4.19657775214638e-05, "loss": 0.2876, "num_input_tokens_seen": 39396352, "step": 41230 }, { "epoch": 3.3636511950403785, "grad_norm": 1.206263780593872, "learning_rate": 4.1963163131011864e-05, "loss": 0.457, "num_input_tokens_seen": 39400704, "step": 41235 }, { "epoch": 3.364059058650787, "grad_norm": 0.569320559501648, "learning_rate": 4.196054839672382e-05, "loss": 0.3349, "num_input_tokens_seen": 39406240, "step": 41240 }, { "epoch": 3.364466922261196, "grad_norm": 0.1937137246131897, "learning_rate": 4.195793331865267e-05, "loss": 0.3492, "num_input_tokens_seen": 39411504, "step": 41245 }, { "epoch": 3.3648747858716046, "grad_norm": 0.18989168107509613, "learning_rate": 4.19553178968514e-05, "loss": 0.3395, "num_input_tokens_seen": 39416992, "step": 41250 }, { "epoch": 3.365282649482013, "grad_norm": 0.7295031547546387, "learning_rate": 4.1952702131373056e-05, "loss": 0.3738, "num_input_tokens_seen": 39421440, "step": 41255 }, { "epoch": 3.3656905130924217, "grad_norm": 0.17920030653476715, "learning_rate": 4.195008602227064e-05, "loss": 0.3308, "num_input_tokens_seen": 39425904, "step": 41260 }, { "epoch": 3.3660983767028307, "grad_norm": 0.2594693899154663, "learning_rate": 4.194746956959718e-05, "loss": 0.3428, "num_input_tokens_seen": 39430736, "step": 41265 }, { "epoch": 3.3665062403132393, "grad_norm": 1.013218879699707, "learning_rate": 4.194485277340572e-05, "loss": 0.3416, "num_input_tokens_seen": 39435616, "step": 41270 }, { "epoch": 3.366914103923648, "grad_norm": 0.6132299304008484, "learning_rate": 4.194223563374928e-05, "loss": 0.3151, "num_input_tokens_seen": 39441248, "step": 41275 }, { "epoch": 3.3673219675340564, "grad_norm": 1.0495102405548096, "learning_rate": 4.193961815068094e-05, "loss": 0.3433, "num_input_tokens_seen": 39445760, "step": 41280 }, { "epoch": 3.3677298311444654, "grad_norm": 0.5529658794403076, "learning_rate": 4.193700032425373e-05, "loss": 0.2756, "num_input_tokens_seen": 39450064, "step": 41285 }, { "epoch": 3.368137694754874, "grad_norm": 0.5433979034423828, "learning_rate": 4.1934382154520724e-05, "loss": 0.278, "num_input_tokens_seen": 39454512, "step": 41290 }, { "epoch": 3.3685455583652826, "grad_norm": 0.4301670789718628, "learning_rate": 4.1931763641534985e-05, "loss": 0.3459, "num_input_tokens_seen": 39459392, "step": 41295 }, { "epoch": 3.368953421975691, "grad_norm": 0.47449612617492676, "learning_rate": 4.1929144785349594e-05, "loss": 0.2689, "num_input_tokens_seen": 39463840, "step": 41300 }, { "epoch": 3.3693612855861, "grad_norm": 0.4558459222316742, "learning_rate": 4.192652558601763e-05, "loss": 0.4286, "num_input_tokens_seen": 39469072, "step": 41305 }, { "epoch": 3.3697691491965087, "grad_norm": 0.41824737191200256, "learning_rate": 4.192390604359218e-05, "loss": 0.4229, "num_input_tokens_seen": 39474080, "step": 41310 }, { "epoch": 3.3701770128069173, "grad_norm": 0.8750072717666626, "learning_rate": 4.1921286158126357e-05, "loss": 0.3248, "num_input_tokens_seen": 39478912, "step": 41315 }, { "epoch": 3.3705848764173263, "grad_norm": 0.6803897619247437, "learning_rate": 4.191866592967324e-05, "loss": 0.3426, "num_input_tokens_seen": 39483488, "step": 41320 }, { "epoch": 3.370992740027735, "grad_norm": 0.8789042830467224, "learning_rate": 4.1916045358285965e-05, "loss": 0.353, "num_input_tokens_seen": 39488560, "step": 41325 }, { "epoch": 3.3714006036381434, "grad_norm": 0.17080046236515045, "learning_rate": 4.1913424444017635e-05, "loss": 0.3687, "num_input_tokens_seen": 39492704, "step": 41330 }, { "epoch": 3.371808467248552, "grad_norm": 0.7611717581748962, "learning_rate": 4.1910803186921374e-05, "loss": 0.3306, "num_input_tokens_seen": 39497744, "step": 41335 }, { "epoch": 3.3722163308589606, "grad_norm": 0.18161368370056152, "learning_rate": 4.190818158705032e-05, "loss": 0.3727, "num_input_tokens_seen": 39502704, "step": 41340 }, { "epoch": 3.3726241944693696, "grad_norm": 0.23663277924060822, "learning_rate": 4.190555964445762e-05, "loss": 0.3403, "num_input_tokens_seen": 39507696, "step": 41345 }, { "epoch": 3.373032058079778, "grad_norm": 0.311026006937027, "learning_rate": 4.1902937359196406e-05, "loss": 0.3021, "num_input_tokens_seen": 39512176, "step": 41350 }, { "epoch": 3.3734399216901867, "grad_norm": 0.3155125379562378, "learning_rate": 4.1900314731319825e-05, "loss": 0.3918, "num_input_tokens_seen": 39516816, "step": 41355 }, { "epoch": 3.3738477853005957, "grad_norm": 0.5685480833053589, "learning_rate": 4.1897691760881056e-05, "loss": 0.2763, "num_input_tokens_seen": 39522000, "step": 41360 }, { "epoch": 3.3742556489110043, "grad_norm": 0.57169508934021, "learning_rate": 4.189506844793325e-05, "loss": 0.3322, "num_input_tokens_seen": 39527552, "step": 41365 }, { "epoch": 3.374663512521413, "grad_norm": 0.3515194356441498, "learning_rate": 4.189244479252958e-05, "loss": 0.3007, "num_input_tokens_seen": 39532400, "step": 41370 }, { "epoch": 3.3750713761318214, "grad_norm": 0.3773258626461029, "learning_rate": 4.1889820794723236e-05, "loss": 0.3426, "num_input_tokens_seen": 39538032, "step": 41375 }, { "epoch": 3.37547923974223, "grad_norm": 0.4284553527832031, "learning_rate": 4.188719645456741e-05, "loss": 0.35, "num_input_tokens_seen": 39543520, "step": 41380 }, { "epoch": 3.375887103352639, "grad_norm": 0.43439728021621704, "learning_rate": 4.188457177211528e-05, "loss": 0.4284, "num_input_tokens_seen": 39548656, "step": 41385 }, { "epoch": 3.3762949669630475, "grad_norm": 0.8022930026054382, "learning_rate": 4.188194674742005e-05, "loss": 0.3493, "num_input_tokens_seen": 39553376, "step": 41390 }, { "epoch": 3.376702830573456, "grad_norm": 0.7451594471931458, "learning_rate": 4.187932138053494e-05, "loss": 0.3186, "num_input_tokens_seen": 39558576, "step": 41395 }, { "epoch": 3.377110694183865, "grad_norm": 0.9467217326164246, "learning_rate": 4.187669567151316e-05, "loss": 0.411, "num_input_tokens_seen": 39562960, "step": 41400 }, { "epoch": 3.3775185577942737, "grad_norm": 0.9685454964637756, "learning_rate": 4.187406962040793e-05, "loss": 0.3606, "num_input_tokens_seen": 39567936, "step": 41405 }, { "epoch": 3.3779264214046822, "grad_norm": 0.7204654812812805, "learning_rate": 4.1871443227272477e-05, "loss": 0.3663, "num_input_tokens_seen": 39572784, "step": 41410 }, { "epoch": 3.378334285015091, "grad_norm": 0.7810231447219849, "learning_rate": 4.186881649216004e-05, "loss": 0.3414, "num_input_tokens_seen": 39577312, "step": 41415 }, { "epoch": 3.3787421486255, "grad_norm": 1.0293073654174805, "learning_rate": 4.186618941512386e-05, "loss": 0.385, "num_input_tokens_seen": 39582624, "step": 41420 }, { "epoch": 3.3791500122359084, "grad_norm": 0.33508098125457764, "learning_rate": 4.1863561996217184e-05, "loss": 0.3243, "num_input_tokens_seen": 39587040, "step": 41425 }, { "epoch": 3.379557875846317, "grad_norm": 0.6394016146659851, "learning_rate": 4.1860934235493276e-05, "loss": 0.2834, "num_input_tokens_seen": 39591440, "step": 41430 }, { "epoch": 3.3799657394567255, "grad_norm": 0.2850809693336487, "learning_rate": 4.185830613300539e-05, "loss": 0.391, "num_input_tokens_seen": 39595920, "step": 41435 }, { "epoch": 3.3803736030671345, "grad_norm": 0.37273213267326355, "learning_rate": 4.18556776888068e-05, "loss": 0.4032, "num_input_tokens_seen": 39600336, "step": 41440 }, { "epoch": 3.380781466677543, "grad_norm": 0.3676096796989441, "learning_rate": 4.1853048902950795e-05, "loss": 0.3215, "num_input_tokens_seen": 39605376, "step": 41445 }, { "epoch": 3.3811893302879517, "grad_norm": 0.36239078640937805, "learning_rate": 4.1850419775490654e-05, "loss": 0.3519, "num_input_tokens_seen": 39610304, "step": 41450 }, { "epoch": 3.38159719389836, "grad_norm": 0.30193763971328735, "learning_rate": 4.1847790306479657e-05, "loss": 0.3221, "num_input_tokens_seen": 39615760, "step": 41455 }, { "epoch": 3.3820050575087692, "grad_norm": 0.293953537940979, "learning_rate": 4.184516049597112e-05, "loss": 0.3497, "num_input_tokens_seen": 39620480, "step": 41460 }, { "epoch": 3.382412921119178, "grad_norm": 0.6846731305122375, "learning_rate": 4.184253034401832e-05, "loss": 0.3366, "num_input_tokens_seen": 39625824, "step": 41465 }, { "epoch": 3.3828207847295864, "grad_norm": 0.646594226360321, "learning_rate": 4.183989985067459e-05, "loss": 0.3495, "num_input_tokens_seen": 39630192, "step": 41470 }, { "epoch": 3.383228648339995, "grad_norm": 0.8963201642036438, "learning_rate": 4.183726901599325e-05, "loss": 0.3892, "num_input_tokens_seen": 39634976, "step": 41475 }, { "epoch": 3.383636511950404, "grad_norm": 0.7586123943328857, "learning_rate": 4.183463784002762e-05, "loss": 0.3483, "num_input_tokens_seen": 39639344, "step": 41480 }, { "epoch": 3.3840443755608125, "grad_norm": 0.38257211446762085, "learning_rate": 4.183200632283104e-05, "loss": 0.3523, "num_input_tokens_seen": 39643344, "step": 41485 }, { "epoch": 3.384452239171221, "grad_norm": 0.32043182849884033, "learning_rate": 4.182937446445684e-05, "loss": 0.3325, "num_input_tokens_seen": 39648464, "step": 41490 }, { "epoch": 3.3848601027816296, "grad_norm": 0.38495925068855286, "learning_rate": 4.1826742264958376e-05, "loss": 0.3218, "num_input_tokens_seen": 39652848, "step": 41495 }, { "epoch": 3.3852679663920386, "grad_norm": 0.5854383111000061, "learning_rate": 4.182410972438899e-05, "loss": 0.2908, "num_input_tokens_seen": 39656800, "step": 41500 }, { "epoch": 3.385675830002447, "grad_norm": 0.5457459688186646, "learning_rate": 4.182147684280205e-05, "loss": 0.2618, "num_input_tokens_seen": 39662384, "step": 41505 }, { "epoch": 3.3860836936128558, "grad_norm": 0.9666649103164673, "learning_rate": 4.1818843620250925e-05, "loss": 0.3154, "num_input_tokens_seen": 39667344, "step": 41510 }, { "epoch": 3.3864915572232643, "grad_norm": 0.4666718542575836, "learning_rate": 4.181621005678899e-05, "loss": 0.3214, "num_input_tokens_seen": 39672080, "step": 41515 }, { "epoch": 3.3868994208336733, "grad_norm": 0.5328689813613892, "learning_rate": 4.1813576152469607e-05, "loss": 0.4071, "num_input_tokens_seen": 39677360, "step": 41520 }, { "epoch": 3.387307284444082, "grad_norm": 0.6548384428024292, "learning_rate": 4.181094190734619e-05, "loss": 0.3606, "num_input_tokens_seen": 39681664, "step": 41525 }, { "epoch": 3.3877151480544905, "grad_norm": 0.717082679271698, "learning_rate": 4.180830732147213e-05, "loss": 0.3406, "num_input_tokens_seen": 39686272, "step": 41530 }, { "epoch": 3.3881230116648995, "grad_norm": 0.9046743512153625, "learning_rate": 4.1805672394900816e-05, "loss": 0.3198, "num_input_tokens_seen": 39690416, "step": 41535 }, { "epoch": 3.388530875275308, "grad_norm": 0.6079533696174622, "learning_rate": 4.1803037127685664e-05, "loss": 0.2988, "num_input_tokens_seen": 39694896, "step": 41540 }, { "epoch": 3.3889387388857166, "grad_norm": 0.5050548911094666, "learning_rate": 4.180040151988008e-05, "loss": 0.3766, "num_input_tokens_seen": 39699408, "step": 41545 }, { "epoch": 3.389346602496125, "grad_norm": 0.26524972915649414, "learning_rate": 4.179776557153752e-05, "loss": 0.3247, "num_input_tokens_seen": 39703472, "step": 41550 }, { "epoch": 3.3897544661065337, "grad_norm": 0.6612886786460876, "learning_rate": 4.179512928271136e-05, "loss": 0.3206, "num_input_tokens_seen": 39707200, "step": 41555 }, { "epoch": 3.3901623297169428, "grad_norm": 0.5945093631744385, "learning_rate": 4.1792492653455095e-05, "loss": 0.2997, "num_input_tokens_seen": 39712624, "step": 41560 }, { "epoch": 3.3905701933273513, "grad_norm": 0.25545865297317505, "learning_rate": 4.178985568382212e-05, "loss": 0.4019, "num_input_tokens_seen": 39716064, "step": 41565 }, { "epoch": 3.39097805693776, "grad_norm": 1.0810778141021729, "learning_rate": 4.178721837386592e-05, "loss": 0.3503, "num_input_tokens_seen": 39720752, "step": 41570 }, { "epoch": 3.391385920548169, "grad_norm": 1.0480597019195557, "learning_rate": 4.1784580723639923e-05, "loss": 0.3805, "num_input_tokens_seen": 39726384, "step": 41575 }, { "epoch": 3.3917937841585775, "grad_norm": 0.6338064670562744, "learning_rate": 4.178194273319761e-05, "loss": 0.3445, "num_input_tokens_seen": 39730576, "step": 41580 }, { "epoch": 3.392201647768986, "grad_norm": 0.8359847664833069, "learning_rate": 4.177930440259245e-05, "loss": 0.3396, "num_input_tokens_seen": 39735312, "step": 41585 }, { "epoch": 3.3926095113793946, "grad_norm": 0.5133441686630249, "learning_rate": 4.1776665731877924e-05, "loss": 0.3649, "num_input_tokens_seen": 39739856, "step": 41590 }, { "epoch": 3.3930173749898036, "grad_norm": 0.6082059741020203, "learning_rate": 4.177402672110751e-05, "loss": 0.3301, "num_input_tokens_seen": 39744752, "step": 41595 }, { "epoch": 3.393425238600212, "grad_norm": 0.31964370608329773, "learning_rate": 4.17713873703347e-05, "loss": 0.3805, "num_input_tokens_seen": 39749056, "step": 41600 }, { "epoch": 3.3938331022106207, "grad_norm": 0.29308605194091797, "learning_rate": 4.1768747679612996e-05, "loss": 0.3184, "num_input_tokens_seen": 39753632, "step": 41605 }, { "epoch": 3.3942409658210293, "grad_norm": 0.6901655197143555, "learning_rate": 4.17661076489959e-05, "loss": 0.2735, "num_input_tokens_seen": 39758448, "step": 41610 }, { "epoch": 3.3946488294314383, "grad_norm": 0.6829401254653931, "learning_rate": 4.176346727853693e-05, "loss": 0.2911, "num_input_tokens_seen": 39763712, "step": 41615 }, { "epoch": 3.395056693041847, "grad_norm": 0.3770563304424286, "learning_rate": 4.17608265682896e-05, "loss": 0.4362, "num_input_tokens_seen": 39767984, "step": 41620 }, { "epoch": 3.3954645566522554, "grad_norm": 0.5660653710365295, "learning_rate": 4.175818551830744e-05, "loss": 0.4123, "num_input_tokens_seen": 39772720, "step": 41625 }, { "epoch": 3.395872420262664, "grad_norm": 0.31425219774246216, "learning_rate": 4.175554412864397e-05, "loss": 0.3565, "num_input_tokens_seen": 39777568, "step": 41630 }, { "epoch": 3.396280283873073, "grad_norm": 0.7934860587120056, "learning_rate": 4.1752902399352755e-05, "loss": 0.3995, "num_input_tokens_seen": 39782384, "step": 41635 }, { "epoch": 3.3966881474834816, "grad_norm": 0.41397374868392944, "learning_rate": 4.175026033048731e-05, "loss": 0.3208, "num_input_tokens_seen": 39787792, "step": 41640 }, { "epoch": 3.39709601109389, "grad_norm": 0.6139947772026062, "learning_rate": 4.174761792210122e-05, "loss": 0.3144, "num_input_tokens_seen": 39792832, "step": 41645 }, { "epoch": 3.3975038747042987, "grad_norm": 0.33299097418785095, "learning_rate": 4.1744975174248026e-05, "loss": 0.3669, "num_input_tokens_seen": 39796960, "step": 41650 }, { "epoch": 3.3979117383147077, "grad_norm": 0.5122479200363159, "learning_rate": 4.174233208698131e-05, "loss": 0.4058, "num_input_tokens_seen": 39802576, "step": 41655 }, { "epoch": 3.3983196019251163, "grad_norm": 0.3698669373989105, "learning_rate": 4.173968866035463e-05, "loss": 0.3623, "num_input_tokens_seen": 39807824, "step": 41660 }, { "epoch": 3.398727465535525, "grad_norm": 0.6007484793663025, "learning_rate": 4.1737044894421565e-05, "loss": 0.376, "num_input_tokens_seen": 39812176, "step": 41665 }, { "epoch": 3.3991353291459334, "grad_norm": 0.3043089807033539, "learning_rate": 4.173440078923572e-05, "loss": 0.3428, "num_input_tokens_seen": 39815760, "step": 41670 }, { "epoch": 3.3995431927563424, "grad_norm": 0.7587214708328247, "learning_rate": 4.173175634485068e-05, "loss": 0.3411, "num_input_tokens_seen": 39820288, "step": 41675 }, { "epoch": 3.399951056366751, "grad_norm": 0.7097640633583069, "learning_rate": 4.172911156132006e-05, "loss": 0.3319, "num_input_tokens_seen": 39825008, "step": 41680 }, { "epoch": 3.4003589199771596, "grad_norm": 0.48646774888038635, "learning_rate": 4.172646643869744e-05, "loss": 0.3311, "num_input_tokens_seen": 39829936, "step": 41685 }, { "epoch": 3.400766783587568, "grad_norm": 0.6413142681121826, "learning_rate": 4.172382097703646e-05, "loss": 0.372, "num_input_tokens_seen": 39834848, "step": 41690 }, { "epoch": 3.401174647197977, "grad_norm": 0.5423712134361267, "learning_rate": 4.1721175176390736e-05, "loss": 0.3252, "num_input_tokens_seen": 39839600, "step": 41695 }, { "epoch": 3.4015825108083857, "grad_norm": 0.43416038155555725, "learning_rate": 4.171852903681389e-05, "loss": 0.338, "num_input_tokens_seen": 39844064, "step": 41700 }, { "epoch": 3.4019903744187943, "grad_norm": 1.2151626348495483, "learning_rate": 4.1715882558359574e-05, "loss": 0.4244, "num_input_tokens_seen": 39848656, "step": 41705 }, { "epoch": 3.4023982380292033, "grad_norm": 0.7143798470497131, "learning_rate": 4.171323574108142e-05, "loss": 0.3048, "num_input_tokens_seen": 39853808, "step": 41710 }, { "epoch": 3.402806101639612, "grad_norm": 0.46569734811782837, "learning_rate": 4.171058858503308e-05, "loss": 0.3101, "num_input_tokens_seen": 39857872, "step": 41715 }, { "epoch": 3.4032139652500204, "grad_norm": 0.48978498578071594, "learning_rate": 4.1707941090268206e-05, "loss": 0.3071, "num_input_tokens_seen": 39862448, "step": 41720 }, { "epoch": 3.403621828860429, "grad_norm": 0.5522122383117676, "learning_rate": 4.1705293256840464e-05, "loss": 0.2653, "num_input_tokens_seen": 39867872, "step": 41725 }, { "epoch": 3.4040296924708375, "grad_norm": 0.8398959636688232, "learning_rate": 4.170264508480353e-05, "loss": 0.4757, "num_input_tokens_seen": 39872064, "step": 41730 }, { "epoch": 3.4044375560812465, "grad_norm": 0.4357706308364868, "learning_rate": 4.1699996574211075e-05, "loss": 0.4, "num_input_tokens_seen": 39876336, "step": 41735 }, { "epoch": 3.404845419691655, "grad_norm": 0.9358804225921631, "learning_rate": 4.1697347725116785e-05, "loss": 0.3298, "num_input_tokens_seen": 39880976, "step": 41740 }, { "epoch": 3.4052532833020637, "grad_norm": 0.437571257352829, "learning_rate": 4.169469853757436e-05, "loss": 0.3635, "num_input_tokens_seen": 39886240, "step": 41745 }, { "epoch": 3.4056611469124727, "grad_norm": 1.0079705715179443, "learning_rate": 4.169204901163748e-05, "loss": 0.3781, "num_input_tokens_seen": 39891344, "step": 41750 }, { "epoch": 3.4060690105228812, "grad_norm": 0.8811396360397339, "learning_rate": 4.1689399147359865e-05, "loss": 0.3528, "num_input_tokens_seen": 39896176, "step": 41755 }, { "epoch": 3.40647687413329, "grad_norm": 0.7561772465705872, "learning_rate": 4.168674894479522e-05, "loss": 0.3824, "num_input_tokens_seen": 39900768, "step": 41760 }, { "epoch": 3.4068847377436984, "grad_norm": 0.2195419818162918, "learning_rate": 4.168409840399726e-05, "loss": 0.3407, "num_input_tokens_seen": 39905488, "step": 41765 }, { "epoch": 3.407292601354107, "grad_norm": 0.37441322207450867, "learning_rate": 4.1681447525019724e-05, "loss": 0.3693, "num_input_tokens_seen": 39909616, "step": 41770 }, { "epoch": 3.407700464964516, "grad_norm": 0.6975545287132263, "learning_rate": 4.167879630791633e-05, "loss": 0.3391, "num_input_tokens_seen": 39915216, "step": 41775 }, { "epoch": 3.4081083285749245, "grad_norm": 0.7640516757965088, "learning_rate": 4.167614475274082e-05, "loss": 0.3396, "num_input_tokens_seen": 39919376, "step": 41780 }, { "epoch": 3.408516192185333, "grad_norm": 0.5744535326957703, "learning_rate": 4.167349285954695e-05, "loss": 0.3715, "num_input_tokens_seen": 39924704, "step": 41785 }, { "epoch": 3.408924055795742, "grad_norm": 0.39374667406082153, "learning_rate": 4.167084062838845e-05, "loss": 0.3361, "num_input_tokens_seen": 39929808, "step": 41790 }, { "epoch": 3.4093319194061507, "grad_norm": 1.0719597339630127, "learning_rate": 4.16681880593191e-05, "loss": 0.3491, "num_input_tokens_seen": 39935152, "step": 41795 }, { "epoch": 3.4097397830165592, "grad_norm": 0.7798879146575928, "learning_rate": 4.166553515239267e-05, "loss": 0.3532, "num_input_tokens_seen": 39941040, "step": 41800 }, { "epoch": 3.410147646626968, "grad_norm": 0.5162121057510376, "learning_rate": 4.166288190766292e-05, "loss": 0.3491, "num_input_tokens_seen": 39945504, "step": 41805 }, { "epoch": 3.410555510237377, "grad_norm": 0.7222340106964111, "learning_rate": 4.1660228325183634e-05, "loss": 0.342, "num_input_tokens_seen": 39949680, "step": 41810 }, { "epoch": 3.4109633738477854, "grad_norm": 0.7847175598144531, "learning_rate": 4.1657574405008595e-05, "loss": 0.3346, "num_input_tokens_seen": 39954336, "step": 41815 }, { "epoch": 3.411371237458194, "grad_norm": 0.23465467989444733, "learning_rate": 4.16549201471916e-05, "loss": 0.3458, "num_input_tokens_seen": 39958896, "step": 41820 }, { "epoch": 3.4117791010686025, "grad_norm": 0.6520318984985352, "learning_rate": 4.165226555178646e-05, "loss": 0.2817, "num_input_tokens_seen": 39964176, "step": 41825 }, { "epoch": 3.4121869646790115, "grad_norm": 0.6164824962615967, "learning_rate": 4.1649610618846956e-05, "loss": 0.2682, "num_input_tokens_seen": 39969744, "step": 41830 }, { "epoch": 3.41259482828942, "grad_norm": 0.6793348789215088, "learning_rate": 4.1646955348426935e-05, "loss": 0.4135, "num_input_tokens_seen": 39974736, "step": 41835 }, { "epoch": 3.4130026918998286, "grad_norm": 0.4463076889514923, "learning_rate": 4.16442997405802e-05, "loss": 0.2895, "num_input_tokens_seen": 39978848, "step": 41840 }, { "epoch": 3.413410555510237, "grad_norm": 1.504154920578003, "learning_rate": 4.1641643795360585e-05, "loss": 0.3824, "num_input_tokens_seen": 39983440, "step": 41845 }, { "epoch": 3.413818419120646, "grad_norm": 0.5347998738288879, "learning_rate": 4.163898751282191e-05, "loss": 0.3608, "num_input_tokens_seen": 39988928, "step": 41850 }, { "epoch": 3.4142262827310548, "grad_norm": 0.9030652642250061, "learning_rate": 4.1636330893018036e-05, "loss": 0.4183, "num_input_tokens_seen": 39993760, "step": 41855 }, { "epoch": 3.4146341463414633, "grad_norm": 0.6773232221603394, "learning_rate": 4.163367393600281e-05, "loss": 0.3388, "num_input_tokens_seen": 39998976, "step": 41860 }, { "epoch": 3.415042009951872, "grad_norm": 0.7741321325302124, "learning_rate": 4.1631016641830067e-05, "loss": 0.366, "num_input_tokens_seen": 40004096, "step": 41865 }, { "epoch": 3.415449873562281, "grad_norm": 0.21434634923934937, "learning_rate": 4.1628359010553696e-05, "loss": 0.3181, "num_input_tokens_seen": 40008816, "step": 41870 }, { "epoch": 3.4158577371726895, "grad_norm": 1.0929205417633057, "learning_rate": 4.1625701042227547e-05, "loss": 0.3768, "num_input_tokens_seen": 40012720, "step": 41875 }, { "epoch": 3.416265600783098, "grad_norm": 0.8090188503265381, "learning_rate": 4.162304273690551e-05, "loss": 0.3722, "num_input_tokens_seen": 40017184, "step": 41880 }, { "epoch": 3.416673464393507, "grad_norm": 0.30374911427497864, "learning_rate": 4.162038409464145e-05, "loss": 0.3633, "num_input_tokens_seen": 40022400, "step": 41885 }, { "epoch": 3.4170813280039156, "grad_norm": 0.7947192788124084, "learning_rate": 4.161772511548928e-05, "loss": 0.3516, "num_input_tokens_seen": 40027136, "step": 41890 }, { "epoch": 3.417489191614324, "grad_norm": 0.6704819202423096, "learning_rate": 4.1615065799502864e-05, "loss": 0.3664, "num_input_tokens_seen": 40031424, "step": 41895 }, { "epoch": 3.4178970552247327, "grad_norm": 0.5861326456069946, "learning_rate": 4.161240614673614e-05, "loss": 0.3049, "num_input_tokens_seen": 40036256, "step": 41900 }, { "epoch": 3.4183049188351413, "grad_norm": 0.3701530694961548, "learning_rate": 4.1609746157243e-05, "loss": 0.322, "num_input_tokens_seen": 40041424, "step": 41905 }, { "epoch": 3.4187127824455503, "grad_norm": 0.8431318402290344, "learning_rate": 4.160708583107736e-05, "loss": 0.383, "num_input_tokens_seen": 40045392, "step": 41910 }, { "epoch": 3.419120646055959, "grad_norm": 0.34313100576400757, "learning_rate": 4.160442516829315e-05, "loss": 0.3773, "num_input_tokens_seen": 40050800, "step": 41915 }, { "epoch": 3.4195285096663675, "grad_norm": 1.3962321281433105, "learning_rate": 4.16017641689443e-05, "loss": 0.3332, "num_input_tokens_seen": 40055232, "step": 41920 }, { "epoch": 3.4199363732767765, "grad_norm": 0.6885004639625549, "learning_rate": 4.159910283308473e-05, "loss": 0.3703, "num_input_tokens_seen": 40060080, "step": 41925 }, { "epoch": 3.420344236887185, "grad_norm": 0.7157101035118103, "learning_rate": 4.159644116076842e-05, "loss": 0.3231, "num_input_tokens_seen": 40065184, "step": 41930 }, { "epoch": 3.4207521004975936, "grad_norm": 0.377817839384079, "learning_rate": 4.159377915204928e-05, "loss": 0.3939, "num_input_tokens_seen": 40070288, "step": 41935 }, { "epoch": 3.421159964108002, "grad_norm": 0.40593501925468445, "learning_rate": 4.15911168069813e-05, "loss": 0.3973, "num_input_tokens_seen": 40074048, "step": 41940 }, { "epoch": 3.4215678277184107, "grad_norm": 0.4885813593864441, "learning_rate": 4.158845412561844e-05, "loss": 0.3523, "num_input_tokens_seen": 40079792, "step": 41945 }, { "epoch": 3.4219756913288197, "grad_norm": 0.7205221056938171, "learning_rate": 4.1585791108014644e-05, "loss": 0.3569, "num_input_tokens_seen": 40083936, "step": 41950 }, { "epoch": 3.4223835549392283, "grad_norm": 0.4981955289840698, "learning_rate": 4.158312775422392e-05, "loss": 0.3543, "num_input_tokens_seen": 40089600, "step": 41955 }, { "epoch": 3.422791418549637, "grad_norm": 0.829330325126648, "learning_rate": 4.1580464064300244e-05, "loss": 0.3221, "num_input_tokens_seen": 40094608, "step": 41960 }, { "epoch": 3.423199282160046, "grad_norm": 0.6151643991470337, "learning_rate": 4.1577800038297606e-05, "loss": 0.3342, "num_input_tokens_seen": 40100144, "step": 41965 }, { "epoch": 3.4236071457704544, "grad_norm": 0.3821113109588623, "learning_rate": 4.1575135676270004e-05, "loss": 0.3546, "num_input_tokens_seen": 40104960, "step": 41970 }, { "epoch": 3.424015009380863, "grad_norm": 0.3308934271335602, "learning_rate": 4.1572470978271446e-05, "loss": 0.3536, "num_input_tokens_seen": 40109856, "step": 41975 }, { "epoch": 3.4244228729912716, "grad_norm": 0.4909589886665344, "learning_rate": 4.1569805944355945e-05, "loss": 0.3297, "num_input_tokens_seen": 40115008, "step": 41980 }, { "epoch": 3.4248307366016806, "grad_norm": 0.28885912895202637, "learning_rate": 4.1567140574577515e-05, "loss": 0.366, "num_input_tokens_seen": 40119712, "step": 41985 }, { "epoch": 3.425238600212089, "grad_norm": 0.12646442651748657, "learning_rate": 4.156447486899019e-05, "loss": 0.3474, "num_input_tokens_seen": 40123424, "step": 41990 }, { "epoch": 3.4256464638224977, "grad_norm": 0.8781242370605469, "learning_rate": 4.156180882764798e-05, "loss": 0.3599, "num_input_tokens_seen": 40128848, "step": 41995 }, { "epoch": 3.4260543274329063, "grad_norm": 0.49225300550460815, "learning_rate": 4.155914245060497e-05, "loss": 0.3578, "num_input_tokens_seen": 40133440, "step": 42000 }, { "epoch": 3.4264621910433153, "grad_norm": 0.9290586113929749, "learning_rate": 4.155647573791517e-05, "loss": 0.3596, "num_input_tokens_seen": 40138640, "step": 42005 }, { "epoch": 3.426870054653724, "grad_norm": 0.9730007648468018, "learning_rate": 4.1553808689632633e-05, "loss": 0.3558, "num_input_tokens_seen": 40142400, "step": 42010 }, { "epoch": 3.4272779182641324, "grad_norm": 2.456815242767334, "learning_rate": 4.1551141305811435e-05, "loss": 0.3523, "num_input_tokens_seen": 40146672, "step": 42015 }, { "epoch": 3.427685781874541, "grad_norm": 0.8666085600852966, "learning_rate": 4.154847358650563e-05, "loss": 0.3372, "num_input_tokens_seen": 40152000, "step": 42020 }, { "epoch": 3.42809364548495, "grad_norm": 0.3244079649448395, "learning_rate": 4.15458055317693e-05, "loss": 0.3435, "num_input_tokens_seen": 40156992, "step": 42025 }, { "epoch": 3.4285015090953586, "grad_norm": 0.5950202345848083, "learning_rate": 4.154313714165653e-05, "loss": 0.3277, "num_input_tokens_seen": 40160800, "step": 42030 }, { "epoch": 3.428909372705767, "grad_norm": 0.3778311610221863, "learning_rate": 4.154046841622139e-05, "loss": 0.3602, "num_input_tokens_seen": 40166208, "step": 42035 }, { "epoch": 3.4293172363161757, "grad_norm": 0.8983297348022461, "learning_rate": 4.1537799355517995e-05, "loss": 0.3682, "num_input_tokens_seen": 40170848, "step": 42040 }, { "epoch": 3.4297250999265847, "grad_norm": 1.5795069932937622, "learning_rate": 4.153512995960043e-05, "loss": 0.2983, "num_input_tokens_seen": 40174848, "step": 42045 }, { "epoch": 3.4301329635369933, "grad_norm": 0.6828103065490723, "learning_rate": 4.1532460228522794e-05, "loss": 0.3519, "num_input_tokens_seen": 40179744, "step": 42050 }, { "epoch": 3.430540827147402, "grad_norm": 1.0166302919387817, "learning_rate": 4.152979016233923e-05, "loss": 0.421, "num_input_tokens_seen": 40183520, "step": 42055 }, { "epoch": 3.430948690757811, "grad_norm": 1.0858105421066284, "learning_rate": 4.1527119761103836e-05, "loss": 0.3165, "num_input_tokens_seen": 40188224, "step": 42060 }, { "epoch": 3.4313565543682194, "grad_norm": 0.473617821931839, "learning_rate": 4.152444902487075e-05, "loss": 0.3855, "num_input_tokens_seen": 40193744, "step": 42065 }, { "epoch": 3.431764417978628, "grad_norm": 0.5698316693305969, "learning_rate": 4.1521777953694104e-05, "loss": 0.3497, "num_input_tokens_seen": 40198416, "step": 42070 }, { "epoch": 3.4321722815890365, "grad_norm": 0.6386497616767883, "learning_rate": 4.1519106547628034e-05, "loss": 0.3563, "num_input_tokens_seen": 40202016, "step": 42075 }, { "epoch": 3.432580145199445, "grad_norm": 0.7880424857139587, "learning_rate": 4.15164348067267e-05, "loss": 0.3699, "num_input_tokens_seen": 40207264, "step": 42080 }, { "epoch": 3.432988008809854, "grad_norm": 0.8750429749488831, "learning_rate": 4.1513762731044245e-05, "loss": 0.3317, "num_input_tokens_seen": 40212816, "step": 42085 }, { "epoch": 3.4333958724202627, "grad_norm": 0.7218442559242249, "learning_rate": 4.1511090320634845e-05, "loss": 0.3134, "num_input_tokens_seen": 40217744, "step": 42090 }, { "epoch": 3.4338037360306712, "grad_norm": 0.8055790066719055, "learning_rate": 4.150841757555265e-05, "loss": 0.3674, "num_input_tokens_seen": 40221968, "step": 42095 }, { "epoch": 3.4342115996410802, "grad_norm": 0.3840171992778778, "learning_rate": 4.150574449585185e-05, "loss": 0.3415, "num_input_tokens_seen": 40226784, "step": 42100 }, { "epoch": 3.434619463251489, "grad_norm": 1.1193896532058716, "learning_rate": 4.150307108158663e-05, "loss": 0.4006, "num_input_tokens_seen": 40232080, "step": 42105 }, { "epoch": 3.4350273268618974, "grad_norm": 0.3678113520145416, "learning_rate": 4.1500397332811156e-05, "loss": 0.3007, "num_input_tokens_seen": 40236480, "step": 42110 }, { "epoch": 3.435435190472306, "grad_norm": 0.3720681965351105, "learning_rate": 4.149772324957966e-05, "loss": 0.334, "num_input_tokens_seen": 40242144, "step": 42115 }, { "epoch": 3.4358430540827145, "grad_norm": 0.6447439193725586, "learning_rate": 4.14950488319463e-05, "loss": 0.3362, "num_input_tokens_seen": 40247072, "step": 42120 }, { "epoch": 3.4362509176931235, "grad_norm": 0.3376706540584564, "learning_rate": 4.1492374079965336e-05, "loss": 0.314, "num_input_tokens_seen": 40251920, "step": 42125 }, { "epoch": 3.436658781303532, "grad_norm": 0.38309773802757263, "learning_rate": 4.148969899369094e-05, "loss": 0.3425, "num_input_tokens_seen": 40255888, "step": 42130 }, { "epoch": 3.4370666449139406, "grad_norm": 0.8039177656173706, "learning_rate": 4.148702357317735e-05, "loss": 0.3706, "num_input_tokens_seen": 40261280, "step": 42135 }, { "epoch": 3.4374745085243497, "grad_norm": 0.4168512225151062, "learning_rate": 4.148434781847881e-05, "loss": 0.3435, "num_input_tokens_seen": 40265616, "step": 42140 }, { "epoch": 3.4378823721347582, "grad_norm": 0.3716789186000824, "learning_rate": 4.1481671729649536e-05, "loss": 0.3536, "num_input_tokens_seen": 40269536, "step": 42145 }, { "epoch": 3.438290235745167, "grad_norm": 1.5552752017974854, "learning_rate": 4.1478995306743787e-05, "loss": 0.3607, "num_input_tokens_seen": 40273808, "step": 42150 }, { "epoch": 3.4386980993555754, "grad_norm": 0.8825110793113708, "learning_rate": 4.14763185498158e-05, "loss": 0.3457, "num_input_tokens_seen": 40278224, "step": 42155 }, { "epoch": 3.4391059629659844, "grad_norm": 1.165001392364502, "learning_rate": 4.147364145891983e-05, "loss": 0.3459, "num_input_tokens_seen": 40283392, "step": 42160 }, { "epoch": 3.439513826576393, "grad_norm": 0.427518755197525, "learning_rate": 4.147096403411016e-05, "loss": 0.323, "num_input_tokens_seen": 40288048, "step": 42165 }, { "epoch": 3.4399216901868015, "grad_norm": 0.7378564476966858, "learning_rate": 4.1468286275441046e-05, "loss": 0.3666, "num_input_tokens_seen": 40292032, "step": 42170 }, { "epoch": 3.44032955379721, "grad_norm": 1.0970996618270874, "learning_rate": 4.146560818296676e-05, "loss": 0.3285, "num_input_tokens_seen": 40297344, "step": 42175 }, { "epoch": 3.440737417407619, "grad_norm": 1.0758655071258545, "learning_rate": 4.14629297567416e-05, "loss": 0.3634, "num_input_tokens_seen": 40301392, "step": 42180 }, { "epoch": 3.4411452810180276, "grad_norm": 0.3610902726650238, "learning_rate": 4.146025099681984e-05, "loss": 0.3283, "num_input_tokens_seen": 40305872, "step": 42185 }, { "epoch": 3.441553144628436, "grad_norm": 0.7697528004646301, "learning_rate": 4.14575719032558e-05, "loss": 0.3265, "num_input_tokens_seen": 40311136, "step": 42190 }, { "epoch": 3.4419610082388448, "grad_norm": 0.5898803472518921, "learning_rate": 4.1454892476103755e-05, "loss": 0.3613, "num_input_tokens_seen": 40316352, "step": 42195 }, { "epoch": 3.4423688718492538, "grad_norm": 0.4875484108924866, "learning_rate": 4.145221271541805e-05, "loss": 0.3716, "num_input_tokens_seen": 40320992, "step": 42200 }, { "epoch": 3.4427767354596623, "grad_norm": 0.46777889132499695, "learning_rate": 4.144953262125297e-05, "loss": 0.3606, "num_input_tokens_seen": 40326288, "step": 42205 }, { "epoch": 3.443184599070071, "grad_norm": 0.9220752716064453, "learning_rate": 4.1446852193662855e-05, "loss": 0.3347, "num_input_tokens_seen": 40331216, "step": 42210 }, { "epoch": 3.4435924626804795, "grad_norm": 0.6986306309700012, "learning_rate": 4.144417143270204e-05, "loss": 0.3291, "num_input_tokens_seen": 40336144, "step": 42215 }, { "epoch": 3.4440003262908885, "grad_norm": 0.3668423593044281, "learning_rate": 4.144149033842486e-05, "loss": 0.3731, "num_input_tokens_seen": 40340800, "step": 42220 }, { "epoch": 3.444408189901297, "grad_norm": 0.3206997215747833, "learning_rate": 4.1438808910885654e-05, "loss": 0.3539, "num_input_tokens_seen": 40346240, "step": 42225 }, { "epoch": 3.4448160535117056, "grad_norm": 0.7147256731987, "learning_rate": 4.1436127150138774e-05, "loss": 0.3471, "num_input_tokens_seen": 40351664, "step": 42230 }, { "epoch": 3.445223917122114, "grad_norm": 1.0435831546783447, "learning_rate": 4.143344505623858e-05, "loss": 0.3434, "num_input_tokens_seen": 40357616, "step": 42235 }, { "epoch": 3.445631780732523, "grad_norm": 0.631011962890625, "learning_rate": 4.143076262923944e-05, "loss": 0.3042, "num_input_tokens_seen": 40362912, "step": 42240 }, { "epoch": 3.4460396443429318, "grad_norm": 0.7813494205474854, "learning_rate": 4.142807986919573e-05, "loss": 0.3538, "num_input_tokens_seen": 40367680, "step": 42245 }, { "epoch": 3.4464475079533403, "grad_norm": 0.44401517510414124, "learning_rate": 4.142539677616181e-05, "loss": 0.3734, "num_input_tokens_seen": 40371968, "step": 42250 }, { "epoch": 3.446855371563749, "grad_norm": 0.9363979697227478, "learning_rate": 4.142271335019208e-05, "loss": 0.3837, "num_input_tokens_seen": 40375856, "step": 42255 }, { "epoch": 3.447263235174158, "grad_norm": 0.5924482941627502, "learning_rate": 4.1420029591340933e-05, "loss": 0.3588, "num_input_tokens_seen": 40380976, "step": 42260 }, { "epoch": 3.4476710987845665, "grad_norm": 1.1010854244232178, "learning_rate": 4.141734549966276e-05, "loss": 0.3646, "num_input_tokens_seen": 40386160, "step": 42265 }, { "epoch": 3.448078962394975, "grad_norm": 0.7963592410087585, "learning_rate": 4.141466107521197e-05, "loss": 0.3209, "num_input_tokens_seen": 40391616, "step": 42270 }, { "epoch": 3.448486826005384, "grad_norm": 0.431489497423172, "learning_rate": 4.141197631804298e-05, "loss": 0.3818, "num_input_tokens_seen": 40396816, "step": 42275 }, { "epoch": 3.4488946896157926, "grad_norm": 0.76097571849823, "learning_rate": 4.14092912282102e-05, "loss": 0.3412, "num_input_tokens_seen": 40401056, "step": 42280 }, { "epoch": 3.449302553226201, "grad_norm": 1.244803547859192, "learning_rate": 4.140660580576805e-05, "loss": 0.3582, "num_input_tokens_seen": 40406192, "step": 42285 }, { "epoch": 3.4497104168366097, "grad_norm": 0.7686602473258972, "learning_rate": 4.140392005077099e-05, "loss": 0.3436, "num_input_tokens_seen": 40409824, "step": 42290 }, { "epoch": 3.4501182804470183, "grad_norm": 0.8126835823059082, "learning_rate": 4.140123396327343e-05, "loss": 0.3267, "num_input_tokens_seen": 40414464, "step": 42295 }, { "epoch": 3.4505261440574273, "grad_norm": 0.5149424076080322, "learning_rate": 4.139854754332983e-05, "loss": 0.3612, "num_input_tokens_seen": 40419472, "step": 42300 }, { "epoch": 3.450934007667836, "grad_norm": 0.7544217109680176, "learning_rate": 4.1395860790994635e-05, "loss": 0.2759, "num_input_tokens_seen": 40424240, "step": 42305 }, { "epoch": 3.4513418712782444, "grad_norm": 0.555300772190094, "learning_rate": 4.139317370632231e-05, "loss": 0.4279, "num_input_tokens_seen": 40429072, "step": 42310 }, { "epoch": 3.4517497348886534, "grad_norm": 0.4910545349121094, "learning_rate": 4.1390486289367324e-05, "loss": 0.3253, "num_input_tokens_seen": 40433952, "step": 42315 }, { "epoch": 3.452157598499062, "grad_norm": 0.6976273655891418, "learning_rate": 4.138779854018415e-05, "loss": 0.4125, "num_input_tokens_seen": 40438416, "step": 42320 }, { "epoch": 3.4525654621094706, "grad_norm": 0.4804689586162567, "learning_rate": 4.1385110458827256e-05, "loss": 0.3613, "num_input_tokens_seen": 40443744, "step": 42325 }, { "epoch": 3.452973325719879, "grad_norm": 0.43142643570899963, "learning_rate": 4.138242204535113e-05, "loss": 0.3994, "num_input_tokens_seen": 40448608, "step": 42330 }, { "epoch": 3.4533811893302877, "grad_norm": 0.47137466073036194, "learning_rate": 4.137973329981028e-05, "loss": 0.3689, "num_input_tokens_seen": 40453616, "step": 42335 }, { "epoch": 3.4537890529406967, "grad_norm": 0.8118708729743958, "learning_rate": 4.1377044222259196e-05, "loss": 0.34, "num_input_tokens_seen": 40458704, "step": 42340 }, { "epoch": 3.4541969165511053, "grad_norm": 0.4823445677757263, "learning_rate": 4.137435481275238e-05, "loss": 0.3474, "num_input_tokens_seen": 40463712, "step": 42345 }, { "epoch": 3.454604780161514, "grad_norm": 0.349599689245224, "learning_rate": 4.137166507134436e-05, "loss": 0.3629, "num_input_tokens_seen": 40467776, "step": 42350 }, { "epoch": 3.455012643771923, "grad_norm": 0.8252474665641785, "learning_rate": 4.136897499808963e-05, "loss": 0.3512, "num_input_tokens_seen": 40472704, "step": 42355 }, { "epoch": 3.4554205073823314, "grad_norm": 0.7878984808921814, "learning_rate": 4.1366284593042746e-05, "loss": 0.341, "num_input_tokens_seen": 40477712, "step": 42360 }, { "epoch": 3.45582837099274, "grad_norm": 0.33855539560317993, "learning_rate": 4.136359385625822e-05, "loss": 0.3409, "num_input_tokens_seen": 40482320, "step": 42365 }, { "epoch": 3.4562362346031485, "grad_norm": 0.8907378911972046, "learning_rate": 4.13609027877906e-05, "loss": 0.3746, "num_input_tokens_seen": 40487312, "step": 42370 }, { "epoch": 3.4566440982135576, "grad_norm": 0.6081799864768982, "learning_rate": 4.135821138769444e-05, "loss": 0.339, "num_input_tokens_seen": 40491184, "step": 42375 }, { "epoch": 3.457051961823966, "grad_norm": 0.22181174159049988, "learning_rate": 4.135551965602428e-05, "loss": 0.3256, "num_input_tokens_seen": 40495872, "step": 42380 }, { "epoch": 3.4574598254343747, "grad_norm": 0.3155931234359741, "learning_rate": 4.135282759283468e-05, "loss": 0.3278, "num_input_tokens_seen": 40500592, "step": 42385 }, { "epoch": 3.4578676890447833, "grad_norm": 0.6930621266365051, "learning_rate": 4.135013519818022e-05, "loss": 0.3664, "num_input_tokens_seen": 40505328, "step": 42390 }, { "epoch": 3.4582755526551923, "grad_norm": 0.5660051703453064, "learning_rate": 4.1347442472115475e-05, "loss": 0.2953, "num_input_tokens_seen": 40509952, "step": 42395 }, { "epoch": 3.458683416265601, "grad_norm": 0.40406355261802673, "learning_rate": 4.1344749414695e-05, "loss": 0.4042, "num_input_tokens_seen": 40514400, "step": 42400 }, { "epoch": 3.4590912798760094, "grad_norm": 1.2705163955688477, "learning_rate": 4.1342056025973416e-05, "loss": 0.3925, "num_input_tokens_seen": 40519840, "step": 42405 }, { "epoch": 3.459499143486418, "grad_norm": 0.32139888405799866, "learning_rate": 4.133936230600529e-05, "loss": 0.3583, "num_input_tokens_seen": 40523984, "step": 42410 }, { "epoch": 3.459907007096827, "grad_norm": 0.7915074825286865, "learning_rate": 4.133666825484524e-05, "loss": 0.3406, "num_input_tokens_seen": 40529088, "step": 42415 }, { "epoch": 3.4603148707072355, "grad_norm": 0.11480747163295746, "learning_rate": 4.1333973872547864e-05, "loss": 0.368, "num_input_tokens_seen": 40533792, "step": 42420 }, { "epoch": 3.460722734317644, "grad_norm": 0.3311174511909485, "learning_rate": 4.133127915916778e-05, "loss": 0.3341, "num_input_tokens_seen": 40538896, "step": 42425 }, { "epoch": 3.4611305979280527, "grad_norm": 0.39721474051475525, "learning_rate": 4.132858411475961e-05, "loss": 0.3385, "num_input_tokens_seen": 40543216, "step": 42430 }, { "epoch": 3.4615384615384617, "grad_norm": 0.7991058826446533, "learning_rate": 4.132588873937797e-05, "loss": 0.3503, "num_input_tokens_seen": 40548336, "step": 42435 }, { "epoch": 3.4619463251488702, "grad_norm": 0.4109898805618286, "learning_rate": 4.13231930330775e-05, "loss": 0.3406, "num_input_tokens_seen": 40553904, "step": 42440 }, { "epoch": 3.462354188759279, "grad_norm": 0.5928276777267456, "learning_rate": 4.132049699591285e-05, "loss": 0.3153, "num_input_tokens_seen": 40557744, "step": 42445 }, { "epoch": 3.462762052369688, "grad_norm": 0.7449840903282166, "learning_rate": 4.1317800627938666e-05, "loss": 0.3427, "num_input_tokens_seen": 40562592, "step": 42450 }, { "epoch": 3.4631699159800964, "grad_norm": 0.81461501121521, "learning_rate": 4.1315103929209585e-05, "loss": 0.3456, "num_input_tokens_seen": 40566800, "step": 42455 }, { "epoch": 3.463577779590505, "grad_norm": 0.6420068740844727, "learning_rate": 4.131240689978029e-05, "loss": 0.4665, "num_input_tokens_seen": 40571200, "step": 42460 }, { "epoch": 3.4639856432009135, "grad_norm": 0.12644219398498535, "learning_rate": 4.130970953970543e-05, "loss": 0.3857, "num_input_tokens_seen": 40575184, "step": 42465 }, { "epoch": 3.464393506811322, "grad_norm": 0.609920859336853, "learning_rate": 4.13070118490397e-05, "loss": 0.3399, "num_input_tokens_seen": 40578880, "step": 42470 }, { "epoch": 3.464801370421731, "grad_norm": 0.7333993315696716, "learning_rate": 4.130431382783776e-05, "loss": 0.3565, "num_input_tokens_seen": 40583584, "step": 42475 }, { "epoch": 3.4652092340321397, "grad_norm": 0.6804333329200745, "learning_rate": 4.130161547615431e-05, "loss": 0.2979, "num_input_tokens_seen": 40588192, "step": 42480 }, { "epoch": 3.465617097642548, "grad_norm": 0.5257817506790161, "learning_rate": 4.129891679404404e-05, "loss": 0.3477, "num_input_tokens_seen": 40593104, "step": 42485 }, { "epoch": 3.4660249612529572, "grad_norm": 0.6756802201271057, "learning_rate": 4.129621778156165e-05, "loss": 0.3443, "num_input_tokens_seen": 40597600, "step": 42490 }, { "epoch": 3.466432824863366, "grad_norm": 0.48207080364227295, "learning_rate": 4.129351843876186e-05, "loss": 0.3293, "num_input_tokens_seen": 40602464, "step": 42495 }, { "epoch": 3.4668406884737744, "grad_norm": 0.6681668758392334, "learning_rate": 4.1290818765699363e-05, "loss": 0.3688, "num_input_tokens_seen": 40608048, "step": 42500 }, { "epoch": 3.467248552084183, "grad_norm": 0.3991472125053406, "learning_rate": 4.1288118762428896e-05, "loss": 0.3335, "num_input_tokens_seen": 40612784, "step": 42505 }, { "epoch": 3.4676564156945915, "grad_norm": 0.47627559304237366, "learning_rate": 4.1285418429005185e-05, "loss": 0.3207, "num_input_tokens_seen": 40616976, "step": 42510 }, { "epoch": 3.4680642793050005, "grad_norm": 0.7079602479934692, "learning_rate": 4.1282717765482956e-05, "loss": 0.3146, "num_input_tokens_seen": 40622272, "step": 42515 }, { "epoch": 3.468472142915409, "grad_norm": 0.9761795401573181, "learning_rate": 4.128001677191696e-05, "loss": 0.439, "num_input_tokens_seen": 40626544, "step": 42520 }, { "epoch": 3.4688800065258176, "grad_norm": 1.0079401731491089, "learning_rate": 4.1277315448361955e-05, "loss": 0.3709, "num_input_tokens_seen": 40630704, "step": 42525 }, { "epoch": 3.4692878701362266, "grad_norm": 0.5391622185707092, "learning_rate": 4.127461379487266e-05, "loss": 0.3542, "num_input_tokens_seen": 40635792, "step": 42530 }, { "epoch": 3.469695733746635, "grad_norm": 0.5616292357444763, "learning_rate": 4.127191181150388e-05, "loss": 0.3546, "num_input_tokens_seen": 40640656, "step": 42535 }, { "epoch": 3.4701035973570438, "grad_norm": 0.7676205039024353, "learning_rate": 4.126920949831035e-05, "loss": 0.3541, "num_input_tokens_seen": 40645008, "step": 42540 }, { "epoch": 3.4705114609674523, "grad_norm": 0.9976746439933777, "learning_rate": 4.126650685534686e-05, "loss": 0.364, "num_input_tokens_seen": 40650336, "step": 42545 }, { "epoch": 3.4709193245778613, "grad_norm": 0.29865479469299316, "learning_rate": 4.1263803882668183e-05, "loss": 0.3327, "num_input_tokens_seen": 40654944, "step": 42550 }, { "epoch": 3.47132718818827, "grad_norm": 0.957030177116394, "learning_rate": 4.126110058032912e-05, "loss": 0.3844, "num_input_tokens_seen": 40659568, "step": 42555 }, { "epoch": 3.4717350517986785, "grad_norm": 1.1488929986953735, "learning_rate": 4.125839694838445e-05, "loss": 0.3319, "num_input_tokens_seen": 40664736, "step": 42560 }, { "epoch": 3.472142915409087, "grad_norm": 0.5280402898788452, "learning_rate": 4.1255692986888993e-05, "loss": 0.341, "num_input_tokens_seen": 40670384, "step": 42565 }, { "epoch": 3.472550779019496, "grad_norm": 0.35470861196517944, "learning_rate": 4.125298869589754e-05, "loss": 0.3171, "num_input_tokens_seen": 40675440, "step": 42570 }, { "epoch": 3.4729586426299046, "grad_norm": 0.6224591135978699, "learning_rate": 4.1250284075464906e-05, "loss": 0.2967, "num_input_tokens_seen": 40679216, "step": 42575 }, { "epoch": 3.473366506240313, "grad_norm": 0.503173291683197, "learning_rate": 4.124757912564593e-05, "loss": 0.2846, "num_input_tokens_seen": 40683088, "step": 42580 }, { "epoch": 3.4737743698507217, "grad_norm": 0.5886809825897217, "learning_rate": 4.124487384649542e-05, "loss": 0.3568, "num_input_tokens_seen": 40688320, "step": 42585 }, { "epoch": 3.4741822334611308, "grad_norm": 0.806226909160614, "learning_rate": 4.124216823806823e-05, "loss": 0.4127, "num_input_tokens_seen": 40692832, "step": 42590 }, { "epoch": 3.4745900970715393, "grad_norm": 0.3970155715942383, "learning_rate": 4.123946230041919e-05, "loss": 0.3185, "num_input_tokens_seen": 40697648, "step": 42595 }, { "epoch": 3.474997960681948, "grad_norm": 0.20208221673965454, "learning_rate": 4.123675603360314e-05, "loss": 0.4041, "num_input_tokens_seen": 40702864, "step": 42600 }, { "epoch": 3.4754058242923564, "grad_norm": 0.33768460154533386, "learning_rate": 4.123404943767495e-05, "loss": 0.3595, "num_input_tokens_seen": 40707968, "step": 42605 }, { "epoch": 3.4758136879027655, "grad_norm": 0.4306504428386688, "learning_rate": 4.123134251268948e-05, "loss": 0.3624, "num_input_tokens_seen": 40712864, "step": 42610 }, { "epoch": 3.476221551513174, "grad_norm": 1.062351107597351, "learning_rate": 4.122863525870159e-05, "loss": 0.3548, "num_input_tokens_seen": 40717712, "step": 42615 }, { "epoch": 3.4766294151235826, "grad_norm": 0.7464016675949097, "learning_rate": 4.122592767576616e-05, "loss": 0.35, "num_input_tokens_seen": 40722352, "step": 42620 }, { "epoch": 3.4770372787339916, "grad_norm": 0.8619683980941772, "learning_rate": 4.122321976393807e-05, "loss": 0.3371, "num_input_tokens_seen": 40727840, "step": 42625 }, { "epoch": 3.4774451423444, "grad_norm": 0.44160473346710205, "learning_rate": 4.1220511523272214e-05, "loss": 0.3399, "num_input_tokens_seen": 40731648, "step": 42630 }, { "epoch": 3.4778530059548087, "grad_norm": 0.3933650553226471, "learning_rate": 4.121780295382348e-05, "loss": 0.3079, "num_input_tokens_seen": 40736352, "step": 42635 }, { "epoch": 3.4782608695652173, "grad_norm": 0.6243126392364502, "learning_rate": 4.1215094055646766e-05, "loss": 0.3062, "num_input_tokens_seen": 40741616, "step": 42640 }, { "epoch": 3.478668733175626, "grad_norm": 0.6538200378417969, "learning_rate": 4.1212384828796994e-05, "loss": 0.3509, "num_input_tokens_seen": 40745264, "step": 42645 }, { "epoch": 3.479076596786035, "grad_norm": 0.5886659026145935, "learning_rate": 4.120967527332906e-05, "loss": 0.4313, "num_input_tokens_seen": 40750160, "step": 42650 }, { "epoch": 3.4794844603964434, "grad_norm": 0.5472545027732849, "learning_rate": 4.1206965389297906e-05, "loss": 0.3582, "num_input_tokens_seen": 40755232, "step": 42655 }, { "epoch": 3.479892324006852, "grad_norm": 1.069332480430603, "learning_rate": 4.120425517675845e-05, "loss": 0.3457, "num_input_tokens_seen": 40760768, "step": 42660 }, { "epoch": 3.480300187617261, "grad_norm": 0.24690115451812744, "learning_rate": 4.120154463576562e-05, "loss": 0.3323, "num_input_tokens_seen": 40766064, "step": 42665 }, { "epoch": 3.4807080512276696, "grad_norm": 0.6028220653533936, "learning_rate": 4.119883376637437e-05, "loss": 0.2872, "num_input_tokens_seen": 40770096, "step": 42670 }, { "epoch": 3.481115914838078, "grad_norm": 0.4567420780658722, "learning_rate": 4.119612256863964e-05, "loss": 0.3504, "num_input_tokens_seen": 40774192, "step": 42675 }, { "epoch": 3.4815237784484867, "grad_norm": 0.429368793964386, "learning_rate": 4.119341104261639e-05, "loss": 0.349, "num_input_tokens_seen": 40779440, "step": 42680 }, { "epoch": 3.4819316420588953, "grad_norm": 1.0634599924087524, "learning_rate": 4.119069918835958e-05, "loss": 0.4107, "num_input_tokens_seen": 40784176, "step": 42685 }, { "epoch": 3.4823395056693043, "grad_norm": 0.4037228524684906, "learning_rate": 4.118798700592417e-05, "loss": 0.3657, "num_input_tokens_seen": 40790032, "step": 42690 }, { "epoch": 3.482747369279713, "grad_norm": 0.3282396197319031, "learning_rate": 4.118527449536515e-05, "loss": 0.3352, "num_input_tokens_seen": 40795040, "step": 42695 }, { "epoch": 3.4831552328901214, "grad_norm": 0.932496964931488, "learning_rate": 4.1182561656737496e-05, "loss": 0.3673, "num_input_tokens_seen": 40800496, "step": 42700 }, { "epoch": 3.4835630965005304, "grad_norm": 0.4112337529659271, "learning_rate": 4.1179848490096183e-05, "loss": 0.3433, "num_input_tokens_seen": 40805328, "step": 42705 }, { "epoch": 3.483970960110939, "grad_norm": 0.4664585590362549, "learning_rate": 4.117713499549622e-05, "loss": 0.3204, "num_input_tokens_seen": 40809776, "step": 42710 }, { "epoch": 3.4843788237213476, "grad_norm": 0.8743367195129395, "learning_rate": 4.117442117299261e-05, "loss": 0.3622, "num_input_tokens_seen": 40814336, "step": 42715 }, { "epoch": 3.484786687331756, "grad_norm": 0.6624761819839478, "learning_rate": 4.117170702264035e-05, "loss": 0.3613, "num_input_tokens_seen": 40818496, "step": 42720 }, { "epoch": 3.485194550942165, "grad_norm": 0.25567176938056946, "learning_rate": 4.1168992544494466e-05, "loss": 0.3633, "num_input_tokens_seen": 40823568, "step": 42725 }, { "epoch": 3.4856024145525737, "grad_norm": 0.436922550201416, "learning_rate": 4.116627773860997e-05, "loss": 0.3882, "num_input_tokens_seen": 40827568, "step": 42730 }, { "epoch": 3.4860102781629823, "grad_norm": 0.20852242410182953, "learning_rate": 4.116356260504189e-05, "loss": 0.3421, "num_input_tokens_seen": 40832528, "step": 42735 }, { "epoch": 3.486418141773391, "grad_norm": 0.8541529178619385, "learning_rate": 4.116084714384527e-05, "loss": 0.3581, "num_input_tokens_seen": 40837904, "step": 42740 }, { "epoch": 3.4868260053838, "grad_norm": 0.4182984232902527, "learning_rate": 4.115813135507515e-05, "loss": 0.3578, "num_input_tokens_seen": 40843648, "step": 42745 }, { "epoch": 3.4872338689942084, "grad_norm": 0.9955458641052246, "learning_rate": 4.115541523878657e-05, "loss": 0.379, "num_input_tokens_seen": 40848288, "step": 42750 }, { "epoch": 3.487641732604617, "grad_norm": 0.6854072213172913, "learning_rate": 4.115269879503458e-05, "loss": 0.3398, "num_input_tokens_seen": 40853008, "step": 42755 }, { "epoch": 3.4880495962150255, "grad_norm": 0.860205352306366, "learning_rate": 4.1149982023874255e-05, "loss": 0.3365, "num_input_tokens_seen": 40857680, "step": 42760 }, { "epoch": 3.4884574598254345, "grad_norm": 0.83647620677948, "learning_rate": 4.114726492536066e-05, "loss": 0.3431, "num_input_tokens_seen": 40862288, "step": 42765 }, { "epoch": 3.488865323435843, "grad_norm": 0.7839386463165283, "learning_rate": 4.1144547499548855e-05, "loss": 0.3407, "num_input_tokens_seen": 40867088, "step": 42770 }, { "epoch": 3.4892731870462517, "grad_norm": 0.45129916071891785, "learning_rate": 4.1141829746493944e-05, "loss": 0.3645, "num_input_tokens_seen": 40871888, "step": 42775 }, { "epoch": 3.4896810506566602, "grad_norm": 0.2839913070201874, "learning_rate": 4.113911166625101e-05, "loss": 0.297, "num_input_tokens_seen": 40875552, "step": 42780 }, { "epoch": 3.4900889142670692, "grad_norm": 0.5700646042823792, "learning_rate": 4.113639325887512e-05, "loss": 0.3606, "num_input_tokens_seen": 40880608, "step": 42785 }, { "epoch": 3.490496777877478, "grad_norm": 1.1294187307357788, "learning_rate": 4.1133674524421406e-05, "loss": 0.3575, "num_input_tokens_seen": 40885184, "step": 42790 }, { "epoch": 3.4909046414878864, "grad_norm": 0.2804175019264221, "learning_rate": 4.113095546294496e-05, "loss": 0.4125, "num_input_tokens_seen": 40889632, "step": 42795 }, { "epoch": 3.491312505098295, "grad_norm": 0.27990788221359253, "learning_rate": 4.112823607450091e-05, "loss": 0.3851, "num_input_tokens_seen": 40893568, "step": 42800 }, { "epoch": 3.491720368708704, "grad_norm": 0.26622310280799866, "learning_rate": 4.112551635914437e-05, "loss": 0.3408, "num_input_tokens_seen": 40899120, "step": 42805 }, { "epoch": 3.4921282323191125, "grad_norm": 0.7509916424751282, "learning_rate": 4.1122796316930457e-05, "loss": 0.356, "num_input_tokens_seen": 40902768, "step": 42810 }, { "epoch": 3.492536095929521, "grad_norm": 1.3713512420654297, "learning_rate": 4.112007594791432e-05, "loss": 0.3463, "num_input_tokens_seen": 40907568, "step": 42815 }, { "epoch": 3.4929439595399296, "grad_norm": 0.7694910764694214, "learning_rate": 4.111735525215108e-05, "loss": 0.3627, "num_input_tokens_seen": 40911952, "step": 42820 }, { "epoch": 3.4933518231503387, "grad_norm": 0.4188533127307892, "learning_rate": 4.111463422969591e-05, "loss": 0.3355, "num_input_tokens_seen": 40916096, "step": 42825 }, { "epoch": 3.493759686760747, "grad_norm": 0.9581484794616699, "learning_rate": 4.111191288060394e-05, "loss": 0.3391, "num_input_tokens_seen": 40920320, "step": 42830 }, { "epoch": 3.494167550371156, "grad_norm": 0.6585201621055603, "learning_rate": 4.1109191204930356e-05, "loss": 0.3366, "num_input_tokens_seen": 40925072, "step": 42835 }, { "epoch": 3.494575413981565, "grad_norm": 0.34254738688468933, "learning_rate": 4.110646920273031e-05, "loss": 0.3525, "num_input_tokens_seen": 40929968, "step": 42840 }, { "epoch": 3.4949832775919734, "grad_norm": 0.6816509962081909, "learning_rate": 4.1103746874058973e-05, "loss": 0.2884, "num_input_tokens_seen": 40935344, "step": 42845 }, { "epoch": 3.495391141202382, "grad_norm": 0.5520750880241394, "learning_rate": 4.1101024218971526e-05, "loss": 0.3599, "num_input_tokens_seen": 40938784, "step": 42850 }, { "epoch": 3.4957990048127905, "grad_norm": 0.6504642367362976, "learning_rate": 4.1098301237523165e-05, "loss": 0.4088, "num_input_tokens_seen": 40944240, "step": 42855 }, { "epoch": 3.496206868423199, "grad_norm": 0.6494569182395935, "learning_rate": 4.109557792976908e-05, "loss": 0.2991, "num_input_tokens_seen": 40949168, "step": 42860 }, { "epoch": 3.496614732033608, "grad_norm": 0.5339776873588562, "learning_rate": 4.109285429576447e-05, "loss": 0.3638, "num_input_tokens_seen": 40953808, "step": 42865 }, { "epoch": 3.4970225956440166, "grad_norm": 1.0227893590927124, "learning_rate": 4.109013033556453e-05, "loss": 0.3477, "num_input_tokens_seen": 40959408, "step": 42870 }, { "epoch": 3.497430459254425, "grad_norm": 0.4145321846008301, "learning_rate": 4.1087406049224505e-05, "loss": 0.3428, "num_input_tokens_seen": 40964224, "step": 42875 }, { "epoch": 3.497838322864834, "grad_norm": 0.35288533568382263, "learning_rate": 4.108468143679959e-05, "loss": 0.3705, "num_input_tokens_seen": 40968928, "step": 42880 }, { "epoch": 3.4982461864752428, "grad_norm": 0.3453623354434967, "learning_rate": 4.1081956498345e-05, "loss": 0.338, "num_input_tokens_seen": 40973792, "step": 42885 }, { "epoch": 3.4986540500856513, "grad_norm": 0.6630244255065918, "learning_rate": 4.1079231233916005e-05, "loss": 0.3576, "num_input_tokens_seen": 40979168, "step": 42890 }, { "epoch": 3.49906191369606, "grad_norm": 0.6996614336967468, "learning_rate": 4.107650564356781e-05, "loss": 0.3644, "num_input_tokens_seen": 40983936, "step": 42895 }, { "epoch": 3.499469777306469, "grad_norm": 0.39972686767578125, "learning_rate": 4.1073779727355685e-05, "loss": 0.3578, "num_input_tokens_seen": 40989088, "step": 42900 }, { "epoch": 3.4998776409168775, "grad_norm": 0.3738403618335724, "learning_rate": 4.1071053485334875e-05, "loss": 0.3306, "num_input_tokens_seen": 40993584, "step": 42905 }, { "epoch": 3.500285504527286, "grad_norm": 0.25901874899864197, "learning_rate": 4.106832691756064e-05, "loss": 0.3145, "num_input_tokens_seen": 40997712, "step": 42910 }, { "epoch": 3.500285504527286, "eval_loss": 0.33907994627952576, "eval_runtime": 570.9494, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 40997712, "step": 42910 }, { "epoch": 3.5006933681376946, "grad_norm": 0.5764617323875427, "learning_rate": 4.106560002408825e-05, "loss": 0.3671, "num_input_tokens_seen": 41002192, "step": 42915 }, { "epoch": 3.5011012317481036, "grad_norm": 0.5131296515464783, "learning_rate": 4.106287280497297e-05, "loss": 0.3102, "num_input_tokens_seen": 41007312, "step": 42920 }, { "epoch": 3.501509095358512, "grad_norm": 1.0482947826385498, "learning_rate": 4.106014526027009e-05, "loss": 0.4218, "num_input_tokens_seen": 41010992, "step": 42925 }, { "epoch": 3.5019169589689207, "grad_norm": 0.43195071816444397, "learning_rate": 4.105741739003487e-05, "loss": 0.4019, "num_input_tokens_seen": 41016320, "step": 42930 }, { "epoch": 3.5023248225793293, "grad_norm": 0.6553485989570618, "learning_rate": 4.105468919432264e-05, "loss": 0.35, "num_input_tokens_seen": 41020544, "step": 42935 }, { "epoch": 3.502732686189738, "grad_norm": 0.8123399615287781, "learning_rate": 4.105196067318868e-05, "loss": 0.3567, "num_input_tokens_seen": 41026000, "step": 42940 }, { "epoch": 3.503140549800147, "grad_norm": 0.7689839005470276, "learning_rate": 4.10492318266883e-05, "loss": 0.3647, "num_input_tokens_seen": 41030304, "step": 42945 }, { "epoch": 3.5035484134105555, "grad_norm": 0.6521792411804199, "learning_rate": 4.104650265487681e-05, "loss": 0.3623, "num_input_tokens_seen": 41034912, "step": 42950 }, { "epoch": 3.503956277020964, "grad_norm": 0.8004834055900574, "learning_rate": 4.1043773157809524e-05, "loss": 0.3422, "num_input_tokens_seen": 41039808, "step": 42955 }, { "epoch": 3.504364140631373, "grad_norm": 0.3381124436855316, "learning_rate": 4.1041043335541775e-05, "loss": 0.3529, "num_input_tokens_seen": 41043216, "step": 42960 }, { "epoch": 3.5047720042417816, "grad_norm": 0.7107941508293152, "learning_rate": 4.1038313188128896e-05, "loss": 0.3635, "num_input_tokens_seen": 41048384, "step": 42965 }, { "epoch": 3.50517986785219, "grad_norm": 0.9663428068161011, "learning_rate": 4.103558271562622e-05, "loss": 0.3349, "num_input_tokens_seen": 41052848, "step": 42970 }, { "epoch": 3.505587731462599, "grad_norm": 0.3649806082248688, "learning_rate": 4.10328519180891e-05, "loss": 0.3527, "num_input_tokens_seen": 41058208, "step": 42975 }, { "epoch": 3.5059955950730077, "grad_norm": 0.859088659286499, "learning_rate": 4.103012079557289e-05, "loss": 0.3769, "num_input_tokens_seen": 41062736, "step": 42980 }, { "epoch": 3.5064034586834163, "grad_norm": 0.2484423667192459, "learning_rate": 4.1027389348132936e-05, "loss": 0.3473, "num_input_tokens_seen": 41067872, "step": 42985 }, { "epoch": 3.506811322293825, "grad_norm": 0.5527074933052063, "learning_rate": 4.102465757582461e-05, "loss": 0.3034, "num_input_tokens_seen": 41072768, "step": 42990 }, { "epoch": 3.5072191859042334, "grad_norm": 0.3648888170719147, "learning_rate": 4.102192547870328e-05, "loss": 0.3249, "num_input_tokens_seen": 41078512, "step": 42995 }, { "epoch": 3.5076270495146424, "grad_norm": 0.4228135645389557, "learning_rate": 4.101919305682433e-05, "loss": 0.3727, "num_input_tokens_seen": 41084224, "step": 43000 }, { "epoch": 3.508034913125051, "grad_norm": 0.9485263824462891, "learning_rate": 4.101646031024316e-05, "loss": 0.2843, "num_input_tokens_seen": 41088624, "step": 43005 }, { "epoch": 3.5084427767354596, "grad_norm": 0.47712385654449463, "learning_rate": 4.101372723901513e-05, "loss": 0.2631, "num_input_tokens_seen": 41093696, "step": 43010 }, { "epoch": 3.5088506403458686, "grad_norm": 0.5988559722900391, "learning_rate": 4.1010993843195656e-05, "loss": 0.3496, "num_input_tokens_seen": 41098800, "step": 43015 }, { "epoch": 3.509258503956277, "grad_norm": 1.2260286808013916, "learning_rate": 4.1008260122840136e-05, "loss": 0.3699, "num_input_tokens_seen": 41103088, "step": 43020 }, { "epoch": 3.5096663675666857, "grad_norm": 0.5012186765670776, "learning_rate": 4.1005526078004e-05, "loss": 0.4083, "num_input_tokens_seen": 41107824, "step": 43025 }, { "epoch": 3.5100742311770943, "grad_norm": 0.5043892860412598, "learning_rate": 4.100279170874264e-05, "loss": 0.3063, "num_input_tokens_seen": 41112640, "step": 43030 }, { "epoch": 3.510482094787503, "grad_norm": 0.32727864384651184, "learning_rate": 4.100005701511149e-05, "loss": 0.3101, "num_input_tokens_seen": 41117760, "step": 43035 }, { "epoch": 3.510889958397912, "grad_norm": 0.24460075795650482, "learning_rate": 4.0997321997165985e-05, "loss": 0.3713, "num_input_tokens_seen": 41121648, "step": 43040 }, { "epoch": 3.5112978220083204, "grad_norm": 0.2443220168352127, "learning_rate": 4.099458665496156e-05, "loss": 0.3274, "num_input_tokens_seen": 41126928, "step": 43045 }, { "epoch": 3.511705685618729, "grad_norm": 0.5291175246238708, "learning_rate": 4.0991850988553673e-05, "loss": 0.2884, "num_input_tokens_seen": 41131264, "step": 43050 }, { "epoch": 3.512113549229138, "grad_norm": 0.5101394653320312, "learning_rate": 4.098911499799775e-05, "loss": 0.3364, "num_input_tokens_seen": 41136032, "step": 43055 }, { "epoch": 3.5125214128395466, "grad_norm": 0.27969998121261597, "learning_rate": 4.0986378683349256e-05, "loss": 0.3034, "num_input_tokens_seen": 41141136, "step": 43060 }, { "epoch": 3.512929276449955, "grad_norm": 0.29422247409820557, "learning_rate": 4.098364204466367e-05, "loss": 0.3322, "num_input_tokens_seen": 41145776, "step": 43065 }, { "epoch": 3.5133371400603637, "grad_norm": 0.5357071757316589, "learning_rate": 4.098090508199645e-05, "loss": 0.297, "num_input_tokens_seen": 41150976, "step": 43070 }, { "epoch": 3.5137450036707722, "grad_norm": 0.525002658367157, "learning_rate": 4.097816779540307e-05, "loss": 0.341, "num_input_tokens_seen": 41155424, "step": 43075 }, { "epoch": 3.5141528672811813, "grad_norm": 0.3551223576068878, "learning_rate": 4.097543018493902e-05, "loss": 0.4343, "num_input_tokens_seen": 41160976, "step": 43080 }, { "epoch": 3.51456073089159, "grad_norm": 0.20069578289985657, "learning_rate": 4.097269225065978e-05, "loss": 0.3582, "num_input_tokens_seen": 41166208, "step": 43085 }, { "epoch": 3.5149685945019984, "grad_norm": 0.7544271945953369, "learning_rate": 4.096995399262087e-05, "loss": 0.357, "num_input_tokens_seen": 41171344, "step": 43090 }, { "epoch": 3.5153764581124074, "grad_norm": 0.23706094920635223, "learning_rate": 4.096721541087777e-05, "loss": 0.3492, "num_input_tokens_seen": 41175680, "step": 43095 }, { "epoch": 3.515784321722816, "grad_norm": 0.39534062147140503, "learning_rate": 4.0964476505486005e-05, "loss": 0.3347, "num_input_tokens_seen": 41180368, "step": 43100 }, { "epoch": 3.5161921853332245, "grad_norm": 0.3911008834838867, "learning_rate": 4.096173727650108e-05, "loss": 0.3837, "num_input_tokens_seen": 41185408, "step": 43105 }, { "epoch": 3.516600048943633, "grad_norm": 0.7998383641242981, "learning_rate": 4.095899772397852e-05, "loss": 0.3815, "num_input_tokens_seen": 41190192, "step": 43110 }, { "epoch": 3.5170079125540417, "grad_norm": 0.2807084918022156, "learning_rate": 4.095625784797387e-05, "loss": 0.3614, "num_input_tokens_seen": 41194576, "step": 43115 }, { "epoch": 3.5174157761644507, "grad_norm": 0.6300694942474365, "learning_rate": 4.095351764854264e-05, "loss": 0.3345, "num_input_tokens_seen": 41199904, "step": 43120 }, { "epoch": 3.5178236397748592, "grad_norm": 0.7911888360977173, "learning_rate": 4.095077712574039e-05, "loss": 0.3441, "num_input_tokens_seen": 41205072, "step": 43125 }, { "epoch": 3.518231503385268, "grad_norm": 0.22151367366313934, "learning_rate": 4.0948036279622666e-05, "loss": 0.3321, "num_input_tokens_seen": 41209552, "step": 43130 }, { "epoch": 3.518639366995677, "grad_norm": 0.5786325931549072, "learning_rate": 4.0945295110245027e-05, "loss": 0.2907, "num_input_tokens_seen": 41213600, "step": 43135 }, { "epoch": 3.5190472306060854, "grad_norm": 0.38174495100975037, "learning_rate": 4.094255361766303e-05, "loss": 0.3699, "num_input_tokens_seen": 41218544, "step": 43140 }, { "epoch": 3.519455094216494, "grad_norm": 1.2112250328063965, "learning_rate": 4.093981180193224e-05, "loss": 0.3811, "num_input_tokens_seen": 41224240, "step": 43145 }, { "epoch": 3.519862957826903, "grad_norm": 0.6481873393058777, "learning_rate": 4.093706966310824e-05, "loss": 0.4499, "num_input_tokens_seen": 41229136, "step": 43150 }, { "epoch": 3.5202708214373115, "grad_norm": 0.4067087769508362, "learning_rate": 4.093432720124661e-05, "loss": 0.3642, "num_input_tokens_seen": 41233920, "step": 43155 }, { "epoch": 3.52067868504772, "grad_norm": 0.36112087965011597, "learning_rate": 4.0931584416402944e-05, "loss": 0.3549, "num_input_tokens_seen": 41238112, "step": 43160 }, { "epoch": 3.5210865486581286, "grad_norm": 0.7867162823677063, "learning_rate": 4.0928841308632825e-05, "loss": 0.3309, "num_input_tokens_seen": 41242016, "step": 43165 }, { "epoch": 3.521494412268537, "grad_norm": 0.3074262738227844, "learning_rate": 4.092609787799187e-05, "loss": 0.3521, "num_input_tokens_seen": 41246672, "step": 43170 }, { "epoch": 3.521902275878946, "grad_norm": 0.4050713777542114, "learning_rate": 4.092335412453567e-05, "loss": 0.3368, "num_input_tokens_seen": 41251808, "step": 43175 }, { "epoch": 3.522310139489355, "grad_norm": 0.7117751240730286, "learning_rate": 4.092061004831985e-05, "loss": 0.3316, "num_input_tokens_seen": 41256848, "step": 43180 }, { "epoch": 3.5227180030997634, "grad_norm": 0.9703166484832764, "learning_rate": 4.0917865649400036e-05, "loss": 0.3483, "num_input_tokens_seen": 41262224, "step": 43185 }, { "epoch": 3.5231258667101724, "grad_norm": 1.0054994821548462, "learning_rate": 4.091512092783184e-05, "loss": 0.3539, "num_input_tokens_seen": 41267584, "step": 43190 }, { "epoch": 3.523533730320581, "grad_norm": 0.6683103442192078, "learning_rate": 4.0912375883670907e-05, "loss": 0.324, "num_input_tokens_seen": 41271776, "step": 43195 }, { "epoch": 3.5239415939309895, "grad_norm": 0.8633211851119995, "learning_rate": 4.0909630516972886e-05, "loss": 0.3665, "num_input_tokens_seen": 41276784, "step": 43200 }, { "epoch": 3.524349457541398, "grad_norm": 1.0136795043945312, "learning_rate": 4.09068848277934e-05, "loss": 0.3588, "num_input_tokens_seen": 41281328, "step": 43205 }, { "epoch": 3.5247573211518066, "grad_norm": 0.3500107228755951, "learning_rate": 4.090413881618813e-05, "loss": 0.3515, "num_input_tokens_seen": 41285216, "step": 43210 }, { "epoch": 3.5251651847622156, "grad_norm": 0.35434088110923767, "learning_rate": 4.090139248221272e-05, "loss": 0.3535, "num_input_tokens_seen": 41289248, "step": 43215 }, { "epoch": 3.525573048372624, "grad_norm": 0.9480071067810059, "learning_rate": 4.0898645825922845e-05, "loss": 0.3854, "num_input_tokens_seen": 41294992, "step": 43220 }, { "epoch": 3.5259809119830328, "grad_norm": 0.6969213485717773, "learning_rate": 4.089589884737417e-05, "loss": 0.3419, "num_input_tokens_seen": 41299120, "step": 43225 }, { "epoch": 3.5263887755934418, "grad_norm": 0.3157767951488495, "learning_rate": 4.0893151546622375e-05, "loss": 0.3247, "num_input_tokens_seen": 41303312, "step": 43230 }, { "epoch": 3.5267966392038503, "grad_norm": 0.3297811448574066, "learning_rate": 4.089040392372317e-05, "loss": 0.5015, "num_input_tokens_seen": 41307744, "step": 43235 }, { "epoch": 3.527204502814259, "grad_norm": 0.33288854360580444, "learning_rate": 4.088765597873221e-05, "loss": 0.2793, "num_input_tokens_seen": 41312656, "step": 43240 }, { "epoch": 3.5276123664246675, "grad_norm": 0.6034600734710693, "learning_rate": 4.0884907711705215e-05, "loss": 0.3364, "num_input_tokens_seen": 41317232, "step": 43245 }, { "epoch": 3.528020230035076, "grad_norm": 0.4733068645000458, "learning_rate": 4.08821591226979e-05, "loss": 0.3047, "num_input_tokens_seen": 41321904, "step": 43250 }, { "epoch": 3.528428093645485, "grad_norm": 1.0225683450698853, "learning_rate": 4.087941021176597e-05, "loss": 0.3494, "num_input_tokens_seen": 41326832, "step": 43255 }, { "epoch": 3.5288359572558936, "grad_norm": 0.394325315952301, "learning_rate": 4.087666097896513e-05, "loss": 0.2829, "num_input_tokens_seen": 41332144, "step": 43260 }, { "epoch": 3.529243820866302, "grad_norm": 0.3157953917980194, "learning_rate": 4.087391142435113e-05, "loss": 0.375, "num_input_tokens_seen": 41336016, "step": 43265 }, { "epoch": 3.529651684476711, "grad_norm": 0.35920602083206177, "learning_rate": 4.087116154797969e-05, "loss": 0.379, "num_input_tokens_seen": 41339920, "step": 43270 }, { "epoch": 3.5300595480871197, "grad_norm": 0.39521369338035583, "learning_rate": 4.0868411349906546e-05, "loss": 0.3497, "num_input_tokens_seen": 41345168, "step": 43275 }, { "epoch": 3.5304674116975283, "grad_norm": 0.6798554062843323, "learning_rate": 4.086566083018745e-05, "loss": 0.3275, "num_input_tokens_seen": 41350000, "step": 43280 }, { "epoch": 3.530875275307937, "grad_norm": 0.6126792430877686, "learning_rate": 4.086290998887814e-05, "loss": 0.3535, "num_input_tokens_seen": 41354992, "step": 43285 }, { "epoch": 3.5312831389183454, "grad_norm": 0.7417278289794922, "learning_rate": 4.08601588260344e-05, "loss": 0.3487, "num_input_tokens_seen": 41359456, "step": 43290 }, { "epoch": 3.5316910025287545, "grad_norm": 0.8825732469558716, "learning_rate": 4.085740734171197e-05, "loss": 0.4043, "num_input_tokens_seen": 41363792, "step": 43295 }, { "epoch": 3.532098866139163, "grad_norm": 0.7085745930671692, "learning_rate": 4.0854655535966634e-05, "loss": 0.3515, "num_input_tokens_seen": 41368480, "step": 43300 }, { "epoch": 3.5325067297495716, "grad_norm": 0.5710375905036926, "learning_rate": 4.0851903408854166e-05, "loss": 0.3317, "num_input_tokens_seen": 41374064, "step": 43305 }, { "epoch": 3.5329145933599806, "grad_norm": 0.7905879020690918, "learning_rate": 4.0849150960430356e-05, "loss": 0.4258, "num_input_tokens_seen": 41378656, "step": 43310 }, { "epoch": 3.533322456970389, "grad_norm": 0.3526478409767151, "learning_rate": 4.084639819075099e-05, "loss": 0.3426, "num_input_tokens_seen": 41384128, "step": 43315 }, { "epoch": 3.5337303205807977, "grad_norm": 0.6827874183654785, "learning_rate": 4.0843645099871865e-05, "loss": 0.3307, "num_input_tokens_seen": 41388960, "step": 43320 }, { "epoch": 3.5341381841912067, "grad_norm": 0.8916603326797485, "learning_rate": 4.0840891687848784e-05, "loss": 0.3932, "num_input_tokens_seen": 41393568, "step": 43325 }, { "epoch": 3.5345460478016153, "grad_norm": 0.6353691816329956, "learning_rate": 4.083813795473756e-05, "loss": 0.3747, "num_input_tokens_seen": 41398208, "step": 43330 }, { "epoch": 3.534953911412024, "grad_norm": 0.24121037125587463, "learning_rate": 4.083538390059402e-05, "loss": 0.3503, "num_input_tokens_seen": 41403632, "step": 43335 }, { "epoch": 3.5353617750224324, "grad_norm": 0.7547804117202759, "learning_rate": 4.083262952547395e-05, "loss": 0.3489, "num_input_tokens_seen": 41407936, "step": 43340 }, { "epoch": 3.535769638632841, "grad_norm": 0.7993738055229187, "learning_rate": 4.082987482943323e-05, "loss": 0.3614, "num_input_tokens_seen": 41413328, "step": 43345 }, { "epoch": 3.53617750224325, "grad_norm": 0.11229455471038818, "learning_rate": 4.082711981252767e-05, "loss": 0.3372, "num_input_tokens_seen": 41419168, "step": 43350 }, { "epoch": 3.5365853658536586, "grad_norm": 0.19331897795200348, "learning_rate": 4.082436447481312e-05, "loss": 0.3577, "num_input_tokens_seen": 41424272, "step": 43355 }, { "epoch": 3.536993229464067, "grad_norm": 0.6301857829093933, "learning_rate": 4.082160881634542e-05, "loss": 0.3273, "num_input_tokens_seen": 41428816, "step": 43360 }, { "epoch": 3.537401093074476, "grad_norm": 0.3229118883609772, "learning_rate": 4.0818852837180435e-05, "loss": 0.3522, "num_input_tokens_seen": 41433872, "step": 43365 }, { "epoch": 3.5378089566848847, "grad_norm": 0.4859757721424103, "learning_rate": 4.0816096537374025e-05, "loss": 0.2977, "num_input_tokens_seen": 41438768, "step": 43370 }, { "epoch": 3.5382168202952933, "grad_norm": 0.3876759111881256, "learning_rate": 4.081333991698206e-05, "loss": 0.2635, "num_input_tokens_seen": 41443904, "step": 43375 }, { "epoch": 3.538624683905702, "grad_norm": 0.39488714933395386, "learning_rate": 4.0810582976060415e-05, "loss": 0.3041, "num_input_tokens_seen": 41448368, "step": 43380 }, { "epoch": 3.5390325475161104, "grad_norm": 0.5313664078712463, "learning_rate": 4.080782571466497e-05, "loss": 0.4426, "num_input_tokens_seen": 41453744, "step": 43385 }, { "epoch": 3.5394404111265194, "grad_norm": 1.4146913290023804, "learning_rate": 4.080506813285161e-05, "loss": 0.5407, "num_input_tokens_seen": 41459040, "step": 43390 }, { "epoch": 3.539848274736928, "grad_norm": 0.8375306129455566, "learning_rate": 4.080231023067624e-05, "loss": 0.4024, "num_input_tokens_seen": 41463680, "step": 43395 }, { "epoch": 3.5402561383473365, "grad_norm": 0.44666028022766113, "learning_rate": 4.079955200819475e-05, "loss": 0.3508, "num_input_tokens_seen": 41469008, "step": 43400 }, { "epoch": 3.5406640019577456, "grad_norm": 1.2665112018585205, "learning_rate": 4.079679346546306e-05, "loss": 0.3433, "num_input_tokens_seen": 41473632, "step": 43405 }, { "epoch": 3.541071865568154, "grad_norm": 1.141292691230774, "learning_rate": 4.079403460253708e-05, "loss": 0.3687, "num_input_tokens_seen": 41478528, "step": 43410 }, { "epoch": 3.5414797291785627, "grad_norm": 0.8601322770118713, "learning_rate": 4.079127541947273e-05, "loss": 0.3806, "num_input_tokens_seen": 41482784, "step": 43415 }, { "epoch": 3.5418875927889713, "grad_norm": 0.8295078873634338, "learning_rate": 4.078851591632594e-05, "loss": 0.3631, "num_input_tokens_seen": 41488256, "step": 43420 }, { "epoch": 3.54229545639938, "grad_norm": 0.8734768033027649, "learning_rate": 4.0785756093152636e-05, "loss": 0.3891, "num_input_tokens_seen": 41493104, "step": 43425 }, { "epoch": 3.542703320009789, "grad_norm": 0.3892919421195984, "learning_rate": 4.078299595000876e-05, "loss": 0.3376, "num_input_tokens_seen": 41498016, "step": 43430 }, { "epoch": 3.5431111836201974, "grad_norm": 0.33324649930000305, "learning_rate": 4.078023548695027e-05, "loss": 0.3542, "num_input_tokens_seen": 41502480, "step": 43435 }, { "epoch": 3.543519047230606, "grad_norm": 0.4043474793434143, "learning_rate": 4.077747470403311e-05, "loss": 0.3625, "num_input_tokens_seen": 41507264, "step": 43440 }, { "epoch": 3.543926910841015, "grad_norm": 0.6638874411582947, "learning_rate": 4.077471360131324e-05, "loss": 0.3417, "num_input_tokens_seen": 41511472, "step": 43445 }, { "epoch": 3.5443347744514235, "grad_norm": 0.3299521803855896, "learning_rate": 4.077195217884663e-05, "loss": 0.3216, "num_input_tokens_seen": 41516272, "step": 43450 }, { "epoch": 3.544742638061832, "grad_norm": 0.721359372138977, "learning_rate": 4.0769190436689254e-05, "loss": 0.2853, "num_input_tokens_seen": 41521360, "step": 43455 }, { "epoch": 3.5451505016722407, "grad_norm": 0.440353125333786, "learning_rate": 4.076642837489708e-05, "loss": 0.2989, "num_input_tokens_seen": 41526096, "step": 43460 }, { "epoch": 3.5455583652826492, "grad_norm": 0.6810424327850342, "learning_rate": 4.076366599352611e-05, "loss": 0.3462, "num_input_tokens_seen": 41530784, "step": 43465 }, { "epoch": 3.5459662288930582, "grad_norm": 0.5189096331596375, "learning_rate": 4.076090329263233e-05, "loss": 0.3494, "num_input_tokens_seen": 41535968, "step": 43470 }, { "epoch": 3.546374092503467, "grad_norm": 0.4443656802177429, "learning_rate": 4.0758140272271736e-05, "loss": 0.3623, "num_input_tokens_seen": 41540464, "step": 43475 }, { "epoch": 3.5467819561138754, "grad_norm": 0.5702672004699707, "learning_rate": 4.0755376932500334e-05, "loss": 0.3843, "num_input_tokens_seen": 41545072, "step": 43480 }, { "epoch": 3.5471898197242844, "grad_norm": 0.22323133051395416, "learning_rate": 4.075261327337413e-05, "loss": 0.3652, "num_input_tokens_seen": 41550272, "step": 43485 }, { "epoch": 3.547597683334693, "grad_norm": 0.7381369471549988, "learning_rate": 4.074984929494916e-05, "loss": 0.3412, "num_input_tokens_seen": 41555440, "step": 43490 }, { "epoch": 3.5480055469451015, "grad_norm": 0.6158977746963501, "learning_rate": 4.074708499728143e-05, "loss": 0.3294, "num_input_tokens_seen": 41560448, "step": 43495 }, { "epoch": 3.54841341055551, "grad_norm": 0.9289566278457642, "learning_rate": 4.074432038042698e-05, "loss": 0.3621, "num_input_tokens_seen": 41564704, "step": 43500 }, { "epoch": 3.548821274165919, "grad_norm": 0.3065977990627289, "learning_rate": 4.0741555444441847e-05, "loss": 0.3479, "num_input_tokens_seen": 41569360, "step": 43505 }, { "epoch": 3.5492291377763276, "grad_norm": 0.24455130100250244, "learning_rate": 4.073879018938207e-05, "loss": 0.327, "num_input_tokens_seen": 41574416, "step": 43510 }, { "epoch": 3.549637001386736, "grad_norm": 0.4808310568332672, "learning_rate": 4.0736024615303717e-05, "loss": 0.3186, "num_input_tokens_seen": 41579440, "step": 43515 }, { "epoch": 3.5500448649971448, "grad_norm": 1.098860502243042, "learning_rate": 4.0733258722262816e-05, "loss": 0.3955, "num_input_tokens_seen": 41582960, "step": 43520 }, { "epoch": 3.550452728607554, "grad_norm": 0.276359885931015, "learning_rate": 4.0730492510315454e-05, "loss": 0.3353, "num_input_tokens_seen": 41587136, "step": 43525 }, { "epoch": 3.5508605922179624, "grad_norm": 0.2049950510263443, "learning_rate": 4.072772597951769e-05, "loss": 0.3593, "num_input_tokens_seen": 41592224, "step": 43530 }, { "epoch": 3.551268455828371, "grad_norm": 0.7499656677246094, "learning_rate": 4.0724959129925606e-05, "loss": 0.3366, "num_input_tokens_seen": 41597024, "step": 43535 }, { "epoch": 3.55167631943878, "grad_norm": 0.7381070256233215, "learning_rate": 4.072219196159528e-05, "loss": 0.3395, "num_input_tokens_seen": 41602368, "step": 43540 }, { "epoch": 3.5520841830491885, "grad_norm": 0.4971616566181183, "learning_rate": 4.07194244745828e-05, "loss": 0.3397, "num_input_tokens_seen": 41607328, "step": 43545 }, { "epoch": 3.552492046659597, "grad_norm": 0.4446444511413574, "learning_rate": 4.071665666894427e-05, "loss": 0.3394, "num_input_tokens_seen": 41611888, "step": 43550 }, { "epoch": 3.5528999102700056, "grad_norm": 0.8635849356651306, "learning_rate": 4.071388854473579e-05, "loss": 0.3477, "num_input_tokens_seen": 41616448, "step": 43555 }, { "epoch": 3.553307773880414, "grad_norm": 0.6593319177627563, "learning_rate": 4.071112010201347e-05, "loss": 0.3282, "num_input_tokens_seen": 41621920, "step": 43560 }, { "epoch": 3.553715637490823, "grad_norm": 0.8657279014587402, "learning_rate": 4.070835134083341e-05, "loss": 0.3323, "num_input_tokens_seen": 41627568, "step": 43565 }, { "epoch": 3.5541235011012318, "grad_norm": 0.8933783173561096, "learning_rate": 4.070558226125175e-05, "loss": 0.3754, "num_input_tokens_seen": 41632432, "step": 43570 }, { "epoch": 3.5545313647116403, "grad_norm": 0.7267831563949585, "learning_rate": 4.0702812863324615e-05, "loss": 0.3214, "num_input_tokens_seen": 41637408, "step": 43575 }, { "epoch": 3.5549392283220493, "grad_norm": 0.761086106300354, "learning_rate": 4.0700043147108126e-05, "loss": 0.3032, "num_input_tokens_seen": 41642736, "step": 43580 }, { "epoch": 3.555347091932458, "grad_norm": 0.7165766954421997, "learning_rate": 4.069727311265844e-05, "loss": 0.324, "num_input_tokens_seen": 41648000, "step": 43585 }, { "epoch": 3.5557549555428665, "grad_norm": 0.5615803599357605, "learning_rate": 4.069450276003169e-05, "loss": 0.3238, "num_input_tokens_seen": 41653328, "step": 43590 }, { "epoch": 3.556162819153275, "grad_norm": 0.6592139005661011, "learning_rate": 4.069173208928405e-05, "loss": 0.3288, "num_input_tokens_seen": 41658336, "step": 43595 }, { "epoch": 3.5565706827636836, "grad_norm": 0.5867514610290527, "learning_rate": 4.068896110047167e-05, "loss": 0.3359, "num_input_tokens_seen": 41663328, "step": 43600 }, { "epoch": 3.5569785463740926, "grad_norm": 0.5419966578483582, "learning_rate": 4.0686189793650695e-05, "loss": 0.3812, "num_input_tokens_seen": 41667888, "step": 43605 }, { "epoch": 3.557386409984501, "grad_norm": 0.7129815816879272, "learning_rate": 4.068341816887734e-05, "loss": 0.3537, "num_input_tokens_seen": 41673120, "step": 43610 }, { "epoch": 3.5577942735949097, "grad_norm": 0.9140099883079529, "learning_rate": 4.0680646226207756e-05, "loss": 0.3818, "num_input_tokens_seen": 41677824, "step": 43615 }, { "epoch": 3.5582021372053187, "grad_norm": 0.9827062487602234, "learning_rate": 4.0677873965698135e-05, "loss": 0.3627, "num_input_tokens_seen": 41683072, "step": 43620 }, { "epoch": 3.5586100008157273, "grad_norm": 0.6448184847831726, "learning_rate": 4.067510138740467e-05, "loss": 0.3635, "num_input_tokens_seen": 41688000, "step": 43625 }, { "epoch": 3.559017864426136, "grad_norm": 0.44916895031929016, "learning_rate": 4.067232849138356e-05, "loss": 0.4483, "num_input_tokens_seen": 41692288, "step": 43630 }, { "epoch": 3.5594257280365444, "grad_norm": 0.4416574239730835, "learning_rate": 4.0669555277691015e-05, "loss": 0.2803, "num_input_tokens_seen": 41696720, "step": 43635 }, { "epoch": 3.559833591646953, "grad_norm": 1.018092393875122, "learning_rate": 4.066678174638324e-05, "loss": 0.3555, "num_input_tokens_seen": 41701952, "step": 43640 }, { "epoch": 3.560241455257362, "grad_norm": 0.4620457887649536, "learning_rate": 4.066400789751645e-05, "loss": 0.3483, "num_input_tokens_seen": 41707296, "step": 43645 }, { "epoch": 3.5606493188677706, "grad_norm": 0.8603308200836182, "learning_rate": 4.0661233731146887e-05, "loss": 0.3742, "num_input_tokens_seen": 41712080, "step": 43650 }, { "epoch": 3.561057182478179, "grad_norm": 0.9038426876068115, "learning_rate": 4.0658459247330766e-05, "loss": 0.3652, "num_input_tokens_seen": 41716880, "step": 43655 }, { "epoch": 3.561465046088588, "grad_norm": 0.2726878225803375, "learning_rate": 4.065568444612433e-05, "loss": 0.359, "num_input_tokens_seen": 41721648, "step": 43660 }, { "epoch": 3.5618729096989967, "grad_norm": 0.4463573694229126, "learning_rate": 4.065290932758382e-05, "loss": 0.3502, "num_input_tokens_seen": 41726368, "step": 43665 }, { "epoch": 3.5622807733094053, "grad_norm": 0.47009897232055664, "learning_rate": 4.06501338917655e-05, "loss": 0.3099, "num_input_tokens_seen": 41731376, "step": 43670 }, { "epoch": 3.562688636919814, "grad_norm": 1.0555163621902466, "learning_rate": 4.0647358138725604e-05, "loss": 0.3949, "num_input_tokens_seen": 41736784, "step": 43675 }, { "epoch": 3.5630965005302224, "grad_norm": 0.7239214777946472, "learning_rate": 4.0644582068520416e-05, "loss": 0.3325, "num_input_tokens_seen": 41741648, "step": 43680 }, { "epoch": 3.5635043641406314, "grad_norm": 0.43598952889442444, "learning_rate": 4.064180568120619e-05, "loss": 0.3575, "num_input_tokens_seen": 41746544, "step": 43685 }, { "epoch": 3.56391222775104, "grad_norm": 0.9309713840484619, "learning_rate": 4.063902897683921e-05, "loss": 0.3596, "num_input_tokens_seen": 41750992, "step": 43690 }, { "epoch": 3.5643200913614486, "grad_norm": 0.393485963344574, "learning_rate": 4.0636251955475765e-05, "loss": 0.3562, "num_input_tokens_seen": 41754736, "step": 43695 }, { "epoch": 3.5647279549718576, "grad_norm": 0.3784031569957733, "learning_rate": 4.063347461717214e-05, "loss": 0.3195, "num_input_tokens_seen": 41759728, "step": 43700 }, { "epoch": 3.565135818582266, "grad_norm": 0.6447912454605103, "learning_rate": 4.0630696961984615e-05, "loss": 0.3264, "num_input_tokens_seen": 41764304, "step": 43705 }, { "epoch": 3.5655436821926747, "grad_norm": 0.3762803375720978, "learning_rate": 4.062791898996951e-05, "loss": 0.3494, "num_input_tokens_seen": 41770000, "step": 43710 }, { "epoch": 3.5659515458030837, "grad_norm": 0.5710321068763733, "learning_rate": 4.0625140701183124e-05, "loss": 0.3834, "num_input_tokens_seen": 41775088, "step": 43715 }, { "epoch": 3.5663594094134923, "grad_norm": 0.8249177932739258, "learning_rate": 4.062236209568178e-05, "loss": 0.3506, "num_input_tokens_seen": 41779600, "step": 43720 }, { "epoch": 3.566767273023901, "grad_norm": 0.8175354599952698, "learning_rate": 4.0619583173521794e-05, "loss": 0.3504, "num_input_tokens_seen": 41783936, "step": 43725 }, { "epoch": 3.5671751366343094, "grad_norm": 0.39338386058807373, "learning_rate": 4.0616803934759494e-05, "loss": 0.3285, "num_input_tokens_seen": 41789568, "step": 43730 }, { "epoch": 3.567583000244718, "grad_norm": 0.5160516500473022, "learning_rate": 4.061402437945122e-05, "loss": 0.3382, "num_input_tokens_seen": 41794288, "step": 43735 }, { "epoch": 3.567990863855127, "grad_norm": 0.617720901966095, "learning_rate": 4.0611244507653295e-05, "loss": 0.2765, "num_input_tokens_seen": 41798560, "step": 43740 }, { "epoch": 3.5683987274655355, "grad_norm": 0.45954641699790955, "learning_rate": 4.0608464319422085e-05, "loss": 0.3538, "num_input_tokens_seen": 41802752, "step": 43745 }, { "epoch": 3.568806591075944, "grad_norm": 0.6467002034187317, "learning_rate": 4.060568381481393e-05, "loss": 0.339, "num_input_tokens_seen": 41808112, "step": 43750 }, { "epoch": 3.569214454686353, "grad_norm": 1.1996058225631714, "learning_rate": 4.060290299388521e-05, "loss": 0.3375, "num_input_tokens_seen": 41813408, "step": 43755 }, { "epoch": 3.5696223182967617, "grad_norm": 0.4329128861427307, "learning_rate": 4.0600121856692255e-05, "loss": 0.3584, "num_input_tokens_seen": 41818416, "step": 43760 }, { "epoch": 3.5700301819071703, "grad_norm": 0.23908641934394836, "learning_rate": 4.0597340403291476e-05, "loss": 0.3479, "num_input_tokens_seen": 41823184, "step": 43765 }, { "epoch": 3.570438045517579, "grad_norm": 0.4452809691429138, "learning_rate": 4.0594558633739234e-05, "loss": 0.3735, "num_input_tokens_seen": 41828352, "step": 43770 }, { "epoch": 3.5708459091279874, "grad_norm": 0.946187436580658, "learning_rate": 4.05917765480919e-05, "loss": 0.3538, "num_input_tokens_seen": 41832528, "step": 43775 }, { "epoch": 3.5712537727383964, "grad_norm": 0.3134721517562866, "learning_rate": 4.058899414640589e-05, "loss": 0.3593, "num_input_tokens_seen": 41837568, "step": 43780 }, { "epoch": 3.571661636348805, "grad_norm": 0.3357318043708801, "learning_rate": 4.0586211428737596e-05, "loss": 0.3417, "num_input_tokens_seen": 41843168, "step": 43785 }, { "epoch": 3.5720694999592135, "grad_norm": 0.6502585411071777, "learning_rate": 4.0583428395143416e-05, "loss": 0.3242, "num_input_tokens_seen": 41847152, "step": 43790 }, { "epoch": 3.5724773635696225, "grad_norm": 0.7198231220245361, "learning_rate": 4.058064504567977e-05, "loss": 0.3165, "num_input_tokens_seen": 41852736, "step": 43795 }, { "epoch": 3.572885227180031, "grad_norm": 0.46617791056632996, "learning_rate": 4.057786138040306e-05, "loss": 0.3971, "num_input_tokens_seen": 41857120, "step": 43800 }, { "epoch": 3.5732930907904397, "grad_norm": 0.6557103395462036, "learning_rate": 4.057507739936972e-05, "loss": 0.3748, "num_input_tokens_seen": 41862160, "step": 43805 }, { "epoch": 3.5737009544008482, "grad_norm": 0.33886754512786865, "learning_rate": 4.057229310263618e-05, "loss": 0.3545, "num_input_tokens_seen": 41866752, "step": 43810 }, { "epoch": 3.574108818011257, "grad_norm": 0.8660191893577576, "learning_rate": 4.0569508490258875e-05, "loss": 0.347, "num_input_tokens_seen": 41871184, "step": 43815 }, { "epoch": 3.574516681621666, "grad_norm": 0.7462060451507568, "learning_rate": 4.056672356229426e-05, "loss": 0.3557, "num_input_tokens_seen": 41874928, "step": 43820 }, { "epoch": 3.5749245452320744, "grad_norm": 0.9221944212913513, "learning_rate": 4.056393831879877e-05, "loss": 0.3422, "num_input_tokens_seen": 41879728, "step": 43825 }, { "epoch": 3.575332408842483, "grad_norm": 0.8229893445968628, "learning_rate": 4.0561152759828855e-05, "loss": 0.3193, "num_input_tokens_seen": 41884688, "step": 43830 }, { "epoch": 3.575740272452892, "grad_norm": 0.7943950891494751, "learning_rate": 4.0558366885440994e-05, "loss": 0.3539, "num_input_tokens_seen": 41889120, "step": 43835 }, { "epoch": 3.5761481360633005, "grad_norm": 0.365660160779953, "learning_rate": 4.055558069569164e-05, "loss": 0.3739, "num_input_tokens_seen": 41893520, "step": 43840 }, { "epoch": 3.576555999673709, "grad_norm": 0.7325352430343628, "learning_rate": 4.055279419063728e-05, "loss": 0.347, "num_input_tokens_seen": 41898912, "step": 43845 }, { "epoch": 3.5769638632841176, "grad_norm": 0.3204878270626068, "learning_rate": 4.055000737033439e-05, "loss": 0.3593, "num_input_tokens_seen": 41903424, "step": 43850 }, { "epoch": 3.577371726894526, "grad_norm": 0.6577427387237549, "learning_rate": 4.0547220234839456e-05, "loss": 0.3755, "num_input_tokens_seen": 41908192, "step": 43855 }, { "epoch": 3.577779590504935, "grad_norm": 0.46734899282455444, "learning_rate": 4.054443278420897e-05, "loss": 0.3601, "num_input_tokens_seen": 41913424, "step": 43860 }, { "epoch": 3.578187454115344, "grad_norm": 0.6645910143852234, "learning_rate": 4.054164501849945e-05, "loss": 0.3227, "num_input_tokens_seen": 41917632, "step": 43865 }, { "epoch": 3.5785953177257523, "grad_norm": 0.529707133769989, "learning_rate": 4.053885693776738e-05, "loss": 0.3477, "num_input_tokens_seen": 41922400, "step": 43870 }, { "epoch": 3.5790031813361614, "grad_norm": 0.38913676142692566, "learning_rate": 4.0536068542069284e-05, "loss": 0.3233, "num_input_tokens_seen": 41927136, "step": 43875 }, { "epoch": 3.57941104494657, "grad_norm": 0.5229785442352295, "learning_rate": 4.053327983146168e-05, "loss": 0.3055, "num_input_tokens_seen": 41931376, "step": 43880 }, { "epoch": 3.5798189085569785, "grad_norm": 0.523111879825592, "learning_rate": 4.05304908060011e-05, "loss": 0.3311, "num_input_tokens_seen": 41935920, "step": 43885 }, { "epoch": 3.5802267721673875, "grad_norm": 1.3852324485778809, "learning_rate": 4.052770146574406e-05, "loss": 0.4857, "num_input_tokens_seen": 41938992, "step": 43890 }, { "epoch": 3.580634635777796, "grad_norm": 0.3552997410297394, "learning_rate": 4.052491181074712e-05, "loss": 0.3664, "num_input_tokens_seen": 41944400, "step": 43895 }, { "epoch": 3.5810424993882046, "grad_norm": 0.20113344490528107, "learning_rate": 4.052212184106681e-05, "loss": 0.3502, "num_input_tokens_seen": 41949792, "step": 43900 }, { "epoch": 3.581450362998613, "grad_norm": 0.626743495464325, "learning_rate": 4.0519331556759684e-05, "loss": 0.3311, "num_input_tokens_seen": 41954400, "step": 43905 }, { "epoch": 3.5818582266090218, "grad_norm": 0.21435916423797607, "learning_rate": 4.0516540957882306e-05, "loss": 0.3224, "num_input_tokens_seen": 41959264, "step": 43910 }, { "epoch": 3.5822660902194308, "grad_norm": 0.5779033303260803, "learning_rate": 4.051375004449123e-05, "loss": 0.3485, "num_input_tokens_seen": 41964160, "step": 43915 }, { "epoch": 3.5826739538298393, "grad_norm": 0.632064938545227, "learning_rate": 4.051095881664304e-05, "loss": 0.3554, "num_input_tokens_seen": 41968960, "step": 43920 }, { "epoch": 3.583081817440248, "grad_norm": 0.2703087031841278, "learning_rate": 4.0508167274394307e-05, "loss": 0.3705, "num_input_tokens_seen": 41973504, "step": 43925 }, { "epoch": 3.583489681050657, "grad_norm": 0.2461923062801361, "learning_rate": 4.050537541780161e-05, "loss": 0.3376, "num_input_tokens_seen": 41977904, "step": 43930 }, { "epoch": 3.5838975446610655, "grad_norm": 0.5952031016349792, "learning_rate": 4.050258324692153e-05, "loss": 0.3493, "num_input_tokens_seen": 41983168, "step": 43935 }, { "epoch": 3.584305408271474, "grad_norm": 0.6134963631629944, "learning_rate": 4.049979076181069e-05, "loss": 0.3456, "num_input_tokens_seen": 41988288, "step": 43940 }, { "epoch": 3.5847132718818826, "grad_norm": 0.27539631724357605, "learning_rate": 4.049699796252567e-05, "loss": 0.3284, "num_input_tokens_seen": 41992944, "step": 43945 }, { "epoch": 3.585121135492291, "grad_norm": 0.5718777775764465, "learning_rate": 4.049420484912309e-05, "loss": 0.3402, "num_input_tokens_seen": 41997536, "step": 43950 }, { "epoch": 3.5855289991027, "grad_norm": 0.912632167339325, "learning_rate": 4.049141142165957e-05, "loss": 0.3493, "num_input_tokens_seen": 42001424, "step": 43955 }, { "epoch": 3.5859368627131087, "grad_norm": 0.49882203340530396, "learning_rate": 4.048861768019171e-05, "loss": 0.3037, "num_input_tokens_seen": 42006640, "step": 43960 }, { "epoch": 3.5863447263235173, "grad_norm": 0.49363937973976135, "learning_rate": 4.048582362477615e-05, "loss": 0.2847, "num_input_tokens_seen": 42010240, "step": 43965 }, { "epoch": 3.5867525899339263, "grad_norm": 0.49294140934944153, "learning_rate": 4.048302925546954e-05, "loss": 0.3628, "num_input_tokens_seen": 42015328, "step": 43970 }, { "epoch": 3.587160453544335, "grad_norm": 0.4781080484390259, "learning_rate": 4.048023457232849e-05, "loss": 0.2911, "num_input_tokens_seen": 42020176, "step": 43975 }, { "epoch": 3.5875683171547434, "grad_norm": 0.40451544523239136, "learning_rate": 4.0477439575409674e-05, "loss": 0.3065, "num_input_tokens_seen": 42025200, "step": 43980 }, { "epoch": 3.587976180765152, "grad_norm": 0.5262895226478577, "learning_rate": 4.047464426476973e-05, "loss": 0.3494, "num_input_tokens_seen": 42029968, "step": 43985 }, { "epoch": 3.5883840443755606, "grad_norm": 0.452519953250885, "learning_rate": 4.047184864046533e-05, "loss": 0.4536, "num_input_tokens_seen": 42034112, "step": 43990 }, { "epoch": 3.5887919079859696, "grad_norm": 0.22084921598434448, "learning_rate": 4.046905270255312e-05, "loss": 0.3544, "num_input_tokens_seen": 42039136, "step": 43995 }, { "epoch": 3.589199771596378, "grad_norm": 0.709335446357727, "learning_rate": 4.046625645108979e-05, "loss": 0.3905, "num_input_tokens_seen": 42044368, "step": 44000 }, { "epoch": 3.5896076352067867, "grad_norm": 0.5968729853630066, "learning_rate": 4.046345988613201e-05, "loss": 0.35, "num_input_tokens_seen": 42049664, "step": 44005 }, { "epoch": 3.5900154988171957, "grad_norm": 0.5868810415267944, "learning_rate": 4.0460663007736476e-05, "loss": 0.3394, "num_input_tokens_seen": 42054240, "step": 44010 }, { "epoch": 3.5904233624276043, "grad_norm": 0.14250731468200684, "learning_rate": 4.045786581595987e-05, "loss": 0.3598, "num_input_tokens_seen": 42059040, "step": 44015 }, { "epoch": 3.590831226038013, "grad_norm": 0.5451875329017639, "learning_rate": 4.0455068310858894e-05, "loss": 0.3551, "num_input_tokens_seen": 42062864, "step": 44020 }, { "epoch": 3.5912390896484214, "grad_norm": 0.6847131848335266, "learning_rate": 4.0452270492490255e-05, "loss": 0.3303, "num_input_tokens_seen": 42068608, "step": 44025 }, { "epoch": 3.59164695325883, "grad_norm": 0.6553807258605957, "learning_rate": 4.0449472360910645e-05, "loss": 0.3354, "num_input_tokens_seen": 42073568, "step": 44030 }, { "epoch": 3.592054816869239, "grad_norm": 0.8924509882926941, "learning_rate": 4.04466739161768e-05, "loss": 0.3402, "num_input_tokens_seen": 42077792, "step": 44035 }, { "epoch": 3.5924626804796476, "grad_norm": 0.8806955218315125, "learning_rate": 4.044387515834544e-05, "loss": 0.355, "num_input_tokens_seen": 42081872, "step": 44040 }, { "epoch": 3.592870544090056, "grad_norm": 0.7938622236251831, "learning_rate": 4.04410760874733e-05, "loss": 0.4005, "num_input_tokens_seen": 42086656, "step": 44045 }, { "epoch": 3.593278407700465, "grad_norm": 0.705993115901947, "learning_rate": 4.04382767036171e-05, "loss": 0.3508, "num_input_tokens_seen": 42091664, "step": 44050 }, { "epoch": 3.5936862713108737, "grad_norm": 0.3294045627117157, "learning_rate": 4.04354770068336e-05, "loss": 0.3381, "num_input_tokens_seen": 42096816, "step": 44055 }, { "epoch": 3.5940941349212823, "grad_norm": 0.5750271081924438, "learning_rate": 4.043267699717953e-05, "loss": 0.292, "num_input_tokens_seen": 42101776, "step": 44060 }, { "epoch": 3.5945019985316913, "grad_norm": 0.5216583013534546, "learning_rate": 4.042987667471166e-05, "loss": 0.3019, "num_input_tokens_seen": 42106624, "step": 44065 }, { "epoch": 3.5949098621421, "grad_norm": 0.4365710914134979, "learning_rate": 4.042707603948675e-05, "loss": 0.3147, "num_input_tokens_seen": 42112032, "step": 44070 }, { "epoch": 3.5953177257525084, "grad_norm": 0.466475248336792, "learning_rate": 4.042427509156156e-05, "loss": 0.4072, "num_input_tokens_seen": 42117520, "step": 44075 }, { "epoch": 3.595725589362917, "grad_norm": 0.5020627975463867, "learning_rate": 4.0421473830992866e-05, "loss": 0.3, "num_input_tokens_seen": 42122320, "step": 44080 }, { "epoch": 3.5961334529733255, "grad_norm": 0.5301291942596436, "learning_rate": 4.0418672257837456e-05, "loss": 0.4771, "num_input_tokens_seen": 42127584, "step": 44085 }, { "epoch": 3.5965413165837345, "grad_norm": 0.8110411167144775, "learning_rate": 4.041587037215211e-05, "loss": 0.4338, "num_input_tokens_seen": 42132736, "step": 44090 }, { "epoch": 3.596949180194143, "grad_norm": 0.28299063444137573, "learning_rate": 4.041306817399361e-05, "loss": 0.3072, "num_input_tokens_seen": 42136480, "step": 44095 }, { "epoch": 3.5973570438045517, "grad_norm": 0.6518176794052124, "learning_rate": 4.041026566341878e-05, "loss": 0.3441, "num_input_tokens_seen": 42141216, "step": 44100 }, { "epoch": 3.5977649074149607, "grad_norm": 0.25308552384376526, "learning_rate": 4.040746284048441e-05, "loss": 0.3519, "num_input_tokens_seen": 42144816, "step": 44105 }, { "epoch": 3.5981727710253693, "grad_norm": 0.3480158746242523, "learning_rate": 4.0404659705247314e-05, "loss": 0.3488, "num_input_tokens_seen": 42149280, "step": 44110 }, { "epoch": 3.598580634635778, "grad_norm": 0.3207472860813141, "learning_rate": 4.040185625776431e-05, "loss": 0.3551, "num_input_tokens_seen": 42154272, "step": 44115 }, { "epoch": 3.5989884982461864, "grad_norm": 0.29391732811927795, "learning_rate": 4.0399052498092235e-05, "loss": 0.3437, "num_input_tokens_seen": 42159104, "step": 44120 }, { "epoch": 3.599396361856595, "grad_norm": 0.20403389632701874, "learning_rate": 4.039624842628791e-05, "loss": 0.3374, "num_input_tokens_seen": 42163728, "step": 44125 }, { "epoch": 3.599804225467004, "grad_norm": 0.6101232171058655, "learning_rate": 4.039344404240816e-05, "loss": 0.2711, "num_input_tokens_seen": 42168096, "step": 44130 }, { "epoch": 3.6002120890774125, "grad_norm": 0.5990392565727234, "learning_rate": 4.039063934650984e-05, "loss": 0.2973, "num_input_tokens_seen": 42172592, "step": 44135 }, { "epoch": 3.600619952687821, "grad_norm": 1.053847312927246, "learning_rate": 4.038783433864981e-05, "loss": 0.421, "num_input_tokens_seen": 42176432, "step": 44140 }, { "epoch": 3.60102781629823, "grad_norm": 0.40401318669319153, "learning_rate": 4.038502901888491e-05, "loss": 0.3165, "num_input_tokens_seen": 42181136, "step": 44145 }, { "epoch": 3.6014356799086387, "grad_norm": 0.5149515271186829, "learning_rate": 4.0382223387272006e-05, "loss": 0.2899, "num_input_tokens_seen": 42185600, "step": 44150 }, { "epoch": 3.6018435435190472, "grad_norm": 0.5119320750236511, "learning_rate": 4.037941744386798e-05, "loss": 0.2739, "num_input_tokens_seen": 42190320, "step": 44155 }, { "epoch": 3.602251407129456, "grad_norm": 0.4445854723453522, "learning_rate": 4.037661118872969e-05, "loss": 0.4176, "num_input_tokens_seen": 42195280, "step": 44160 }, { "epoch": 3.6026592707398644, "grad_norm": 0.5828553438186646, "learning_rate": 4.037380462191403e-05, "loss": 0.2895, "num_input_tokens_seen": 42200624, "step": 44165 }, { "epoch": 3.6030671343502734, "grad_norm": 0.6402345299720764, "learning_rate": 4.037099774347788e-05, "loss": 0.3957, "num_input_tokens_seen": 42205072, "step": 44170 }, { "epoch": 3.603474997960682, "grad_norm": 0.6289605498313904, "learning_rate": 4.036819055347814e-05, "loss": 0.251, "num_input_tokens_seen": 42210368, "step": 44175 }, { "epoch": 3.6038828615710905, "grad_norm": 0.9259887337684631, "learning_rate": 4.0365383051971704e-05, "loss": 0.3577, "num_input_tokens_seen": 42215808, "step": 44180 }, { "epoch": 3.6042907251814995, "grad_norm": 0.42074671387672424, "learning_rate": 4.0362575239015476e-05, "loss": 0.3012, "num_input_tokens_seen": 42220576, "step": 44185 }, { "epoch": 3.604698588791908, "grad_norm": 0.3997320830821991, "learning_rate": 4.035976711466639e-05, "loss": 0.3753, "num_input_tokens_seen": 42225104, "step": 44190 }, { "epoch": 3.6051064524023166, "grad_norm": 0.6274371147155762, "learning_rate": 4.035695867898134e-05, "loss": 0.3962, "num_input_tokens_seen": 42230368, "step": 44195 }, { "epoch": 3.605514316012725, "grad_norm": 0.4395498037338257, "learning_rate": 4.035414993201727e-05, "loss": 0.3724, "num_input_tokens_seen": 42234704, "step": 44200 }, { "epoch": 3.6059221796231338, "grad_norm": 0.15788154304027557, "learning_rate": 4.035134087383111e-05, "loss": 0.3451, "num_input_tokens_seen": 42239104, "step": 44205 }, { "epoch": 3.606330043233543, "grad_norm": 0.5153630375862122, "learning_rate": 4.034853150447978e-05, "loss": 0.3217, "num_input_tokens_seen": 42243152, "step": 44210 }, { "epoch": 3.6067379068439513, "grad_norm": 0.463409423828125, "learning_rate": 4.034572182402024e-05, "loss": 0.3347, "num_input_tokens_seen": 42247872, "step": 44215 }, { "epoch": 3.60714577045436, "grad_norm": 0.37936684489250183, "learning_rate": 4.0342911832509456e-05, "loss": 0.3683, "num_input_tokens_seen": 42252768, "step": 44220 }, { "epoch": 3.607553634064769, "grad_norm": 1.0411561727523804, "learning_rate": 4.034010153000436e-05, "loss": 0.3616, "num_input_tokens_seen": 42258192, "step": 44225 }, { "epoch": 3.6079614976751775, "grad_norm": 0.3146095871925354, "learning_rate": 4.033729091656192e-05, "loss": 0.401, "num_input_tokens_seen": 42262720, "step": 44230 }, { "epoch": 3.608369361285586, "grad_norm": 0.40206417441368103, "learning_rate": 4.033447999223911e-05, "loss": 0.3591, "num_input_tokens_seen": 42267584, "step": 44235 }, { "epoch": 3.6087772248959946, "grad_norm": 0.8295536041259766, "learning_rate": 4.033166875709291e-05, "loss": 0.3699, "num_input_tokens_seen": 42272336, "step": 44240 }, { "epoch": 3.609185088506403, "grad_norm": 0.6818982362747192, "learning_rate": 4.0328857211180305e-05, "loss": 0.3146, "num_input_tokens_seen": 42277040, "step": 44245 }, { "epoch": 3.609592952116812, "grad_norm": 0.5995487570762634, "learning_rate": 4.0326045354558264e-05, "loss": 0.319, "num_input_tokens_seen": 42281744, "step": 44250 }, { "epoch": 3.6100008157272208, "grad_norm": 0.9712069630622864, "learning_rate": 4.032323318728381e-05, "loss": 0.3734, "num_input_tokens_seen": 42286592, "step": 44255 }, { "epoch": 3.6104086793376293, "grad_norm": 0.5291368961334229, "learning_rate": 4.032042070941392e-05, "loss": 0.3679, "num_input_tokens_seen": 42291552, "step": 44260 }, { "epoch": 3.6108165429480383, "grad_norm": 0.30034691095352173, "learning_rate": 4.0317607921005616e-05, "loss": 0.3159, "num_input_tokens_seen": 42295968, "step": 44265 }, { "epoch": 3.611224406558447, "grad_norm": 0.275318443775177, "learning_rate": 4.031479482211591e-05, "loss": 0.3479, "num_input_tokens_seen": 42300880, "step": 44270 }, { "epoch": 3.6116322701688555, "grad_norm": 0.7218063473701477, "learning_rate": 4.0311981412801816e-05, "loss": 0.3444, "num_input_tokens_seen": 42305440, "step": 44275 }, { "epoch": 3.6120401337792645, "grad_norm": 0.2833515703678131, "learning_rate": 4.030916769312037e-05, "loss": 0.3581, "num_input_tokens_seen": 42310640, "step": 44280 }, { "epoch": 3.612447997389673, "grad_norm": 0.8494868874549866, "learning_rate": 4.030635366312859e-05, "loss": 0.4037, "num_input_tokens_seen": 42315552, "step": 44285 }, { "epoch": 3.6128558610000816, "grad_norm": 0.7459953427314758, "learning_rate": 4.030353932288354e-05, "loss": 0.356, "num_input_tokens_seen": 42320800, "step": 44290 }, { "epoch": 3.61326372461049, "grad_norm": 0.8762907981872559, "learning_rate": 4.030072467244225e-05, "loss": 0.3612, "num_input_tokens_seen": 42325936, "step": 44295 }, { "epoch": 3.6136715882208987, "grad_norm": 0.8627167344093323, "learning_rate": 4.029790971186176e-05, "loss": 0.3606, "num_input_tokens_seen": 42330512, "step": 44300 }, { "epoch": 3.6140794518313077, "grad_norm": 0.5967521071434021, "learning_rate": 4.029509444119915e-05, "loss": 0.3954, "num_input_tokens_seen": 42335584, "step": 44305 }, { "epoch": 3.6144873154417163, "grad_norm": 0.23080620169639587, "learning_rate": 4.029227886051147e-05, "loss": 0.35, "num_input_tokens_seen": 42340752, "step": 44310 }, { "epoch": 3.614895179052125, "grad_norm": 0.7579318284988403, "learning_rate": 4.02894629698558e-05, "loss": 0.35, "num_input_tokens_seen": 42345696, "step": 44315 }, { "epoch": 3.615303042662534, "grad_norm": 0.16786639392375946, "learning_rate": 4.0286646769289215e-05, "loss": 0.3632, "num_input_tokens_seen": 42351008, "step": 44320 }, { "epoch": 3.6157109062729424, "grad_norm": 0.39336785674095154, "learning_rate": 4.028383025886879e-05, "loss": 0.3665, "num_input_tokens_seen": 42354864, "step": 44325 }, { "epoch": 3.616118769883351, "grad_norm": 0.19737128913402557, "learning_rate": 4.0281013438651616e-05, "loss": 0.3358, "num_input_tokens_seen": 42359536, "step": 44330 }, { "epoch": 3.6165266334937596, "grad_norm": 0.6585108041763306, "learning_rate": 4.027819630869481e-05, "loss": 0.3407, "num_input_tokens_seen": 42364000, "step": 44335 }, { "epoch": 3.616934497104168, "grad_norm": 0.7746397852897644, "learning_rate": 4.0275378869055435e-05, "loss": 0.3645, "num_input_tokens_seen": 42369456, "step": 44340 }, { "epoch": 3.617342360714577, "grad_norm": 0.6516693234443665, "learning_rate": 4.027256111979063e-05, "loss": 0.3455, "num_input_tokens_seen": 42374144, "step": 44345 }, { "epoch": 3.6177502243249857, "grad_norm": 0.6900051236152649, "learning_rate": 4.026974306095751e-05, "loss": 0.3424, "num_input_tokens_seen": 42379120, "step": 44350 }, { "epoch": 3.6181580879353943, "grad_norm": 0.2084837406873703, "learning_rate": 4.026692469261318e-05, "loss": 0.3695, "num_input_tokens_seen": 42384736, "step": 44355 }, { "epoch": 3.6185659515458033, "grad_norm": 0.6888055205345154, "learning_rate": 4.0264106014814765e-05, "loss": 0.3467, "num_input_tokens_seen": 42389424, "step": 44360 }, { "epoch": 3.618973815156212, "grad_norm": 0.31693291664123535, "learning_rate": 4.026128702761942e-05, "loss": 0.3244, "num_input_tokens_seen": 42394704, "step": 44365 }, { "epoch": 3.6193816787666204, "grad_norm": 0.3262953758239746, "learning_rate": 4.025846773108426e-05, "loss": 0.3336, "num_input_tokens_seen": 42399360, "step": 44370 }, { "epoch": 3.619789542377029, "grad_norm": 0.2332686334848404, "learning_rate": 4.0255648125266455e-05, "loss": 0.3433, "num_input_tokens_seen": 42403552, "step": 44375 }, { "epoch": 3.6201974059874376, "grad_norm": 0.34737420082092285, "learning_rate": 4.0252828210223134e-05, "loss": 0.3225, "num_input_tokens_seen": 42408256, "step": 44380 }, { "epoch": 3.6206052695978466, "grad_norm": 0.6191734075546265, "learning_rate": 4.025000798601148e-05, "loss": 0.3763, "num_input_tokens_seen": 42412704, "step": 44385 }, { "epoch": 3.621013133208255, "grad_norm": 0.26012280583381653, "learning_rate": 4.024718745268864e-05, "loss": 0.3798, "num_input_tokens_seen": 42417440, "step": 44390 }, { "epoch": 3.6214209968186637, "grad_norm": 0.3006155490875244, "learning_rate": 4.024436661031178e-05, "loss": 0.3198, "num_input_tokens_seen": 42420960, "step": 44395 }, { "epoch": 3.6218288604290727, "grad_norm": 0.6188884377479553, "learning_rate": 4.024154545893809e-05, "loss": 0.3193, "num_input_tokens_seen": 42426736, "step": 44400 }, { "epoch": 3.6222367240394813, "grad_norm": 0.3839830458164215, "learning_rate": 4.023872399862475e-05, "loss": 0.3531, "num_input_tokens_seen": 42431280, "step": 44405 }, { "epoch": 3.62264458764989, "grad_norm": 0.3409855365753174, "learning_rate": 4.0235902229428954e-05, "loss": 0.3537, "num_input_tokens_seen": 42436160, "step": 44410 }, { "epoch": 3.6230524512602984, "grad_norm": 0.38083282113075256, "learning_rate": 4.0233080151407896e-05, "loss": 0.3725, "num_input_tokens_seen": 42441840, "step": 44415 }, { "epoch": 3.623460314870707, "grad_norm": 0.8360741138458252, "learning_rate": 4.023025776461877e-05, "loss": 0.3374, "num_input_tokens_seen": 42445632, "step": 44420 }, { "epoch": 3.623868178481116, "grad_norm": 0.754465639591217, "learning_rate": 4.02274350691188e-05, "loss": 0.3385, "num_input_tokens_seen": 42450320, "step": 44425 }, { "epoch": 3.6242760420915245, "grad_norm": 0.4132317900657654, "learning_rate": 4.0224612064965185e-05, "loss": 0.3374, "num_input_tokens_seen": 42455648, "step": 44430 }, { "epoch": 3.624683905701933, "grad_norm": 0.7715452909469604, "learning_rate": 4.022178875221515e-05, "loss": 0.358, "num_input_tokens_seen": 42459648, "step": 44435 }, { "epoch": 3.625091769312342, "grad_norm": 0.14670246839523315, "learning_rate": 4.021896513092592e-05, "loss": 0.3508, "num_input_tokens_seen": 42464640, "step": 44440 }, { "epoch": 3.6254996329227507, "grad_norm": 0.31232523918151855, "learning_rate": 4.021614120115474e-05, "loss": 0.3405, "num_input_tokens_seen": 42469360, "step": 44445 }, { "epoch": 3.6259074965331592, "grad_norm": 0.7232218980789185, "learning_rate": 4.021331696295885e-05, "loss": 0.3378, "num_input_tokens_seen": 42473520, "step": 44450 }, { "epoch": 3.6263153601435683, "grad_norm": 0.615250289440155, "learning_rate": 4.021049241639549e-05, "loss": 0.3474, "num_input_tokens_seen": 42478368, "step": 44455 }, { "epoch": 3.626723223753977, "grad_norm": 0.41135087609291077, "learning_rate": 4.02076675615219e-05, "loss": 0.3281, "num_input_tokens_seen": 42483472, "step": 44460 }, { "epoch": 3.6271310873643854, "grad_norm": 0.5690650343894958, "learning_rate": 4.020484239839536e-05, "loss": 0.3658, "num_input_tokens_seen": 42488464, "step": 44465 }, { "epoch": 3.627538950974794, "grad_norm": 0.7968064546585083, "learning_rate": 4.0202016927073125e-05, "loss": 0.3632, "num_input_tokens_seen": 42493872, "step": 44470 }, { "epoch": 3.6279468145852025, "grad_norm": 0.37087878584861755, "learning_rate": 4.019919114761246e-05, "loss": 0.3523, "num_input_tokens_seen": 42498976, "step": 44475 }, { "epoch": 3.6283546781956115, "grad_norm": 0.7225372195243835, "learning_rate": 4.019636506007066e-05, "loss": 0.3242, "num_input_tokens_seen": 42503456, "step": 44480 }, { "epoch": 3.62876254180602, "grad_norm": 0.8630270957946777, "learning_rate": 4.0193538664504995e-05, "loss": 0.3805, "num_input_tokens_seen": 42508736, "step": 44485 }, { "epoch": 3.6291704054164287, "grad_norm": 0.39490145444869995, "learning_rate": 4.019071196097275e-05, "loss": 0.3392, "num_input_tokens_seen": 42512528, "step": 44490 }, { "epoch": 3.6295782690268377, "grad_norm": 0.8798011541366577, "learning_rate": 4.018788494953123e-05, "loss": 0.3889, "num_input_tokens_seen": 42516256, "step": 44495 }, { "epoch": 3.6299861326372462, "grad_norm": 0.3385513424873352, "learning_rate": 4.0185057630237736e-05, "loss": 0.3507, "num_input_tokens_seen": 42520224, "step": 44500 }, { "epoch": 3.630393996247655, "grad_norm": 0.7303597927093506, "learning_rate": 4.018223000314958e-05, "loss": 0.3491, "num_input_tokens_seen": 42524496, "step": 44505 }, { "epoch": 3.6308018598580634, "grad_norm": 0.7338176965713501, "learning_rate": 4.0179402068324065e-05, "loss": 0.3483, "num_input_tokens_seen": 42527696, "step": 44510 }, { "epoch": 3.631209723468472, "grad_norm": 0.596578061580658, "learning_rate": 4.017657382581853e-05, "loss": 0.3245, "num_input_tokens_seen": 42533216, "step": 44515 }, { "epoch": 3.631617587078881, "grad_norm": 0.5068262815475464, "learning_rate": 4.017374527569029e-05, "loss": 0.3831, "num_input_tokens_seen": 42538496, "step": 44520 }, { "epoch": 3.6320254506892895, "grad_norm": 1.0442960262298584, "learning_rate": 4.017091641799669e-05, "loss": 0.4996, "num_input_tokens_seen": 42543312, "step": 44525 }, { "epoch": 3.632433314299698, "grad_norm": 0.6566271185874939, "learning_rate": 4.016808725279504e-05, "loss": 0.3198, "num_input_tokens_seen": 42547776, "step": 44530 }, { "epoch": 3.632841177910107, "grad_norm": 0.815498948097229, "learning_rate": 4.0165257780142724e-05, "loss": 0.35, "num_input_tokens_seen": 42552688, "step": 44535 }, { "epoch": 3.6332490415205156, "grad_norm": 0.4692671000957489, "learning_rate": 4.016242800009706e-05, "loss": 0.3348, "num_input_tokens_seen": 42557616, "step": 44540 }, { "epoch": 3.633656905130924, "grad_norm": 0.6910149455070496, "learning_rate": 4.015959791271544e-05, "loss": 0.3047, "num_input_tokens_seen": 42561968, "step": 44545 }, { "epoch": 3.6340647687413328, "grad_norm": 0.810273289680481, "learning_rate": 4.015676751805521e-05, "loss": 0.3763, "num_input_tokens_seen": 42566000, "step": 44550 }, { "epoch": 3.6344726323517413, "grad_norm": 0.26475149393081665, "learning_rate": 4.0153936816173735e-05, "loss": 0.3638, "num_input_tokens_seen": 42571328, "step": 44555 }, { "epoch": 3.6348804959621503, "grad_norm": 0.2744964063167572, "learning_rate": 4.01511058071284e-05, "loss": 0.3865, "num_input_tokens_seen": 42576048, "step": 44560 }, { "epoch": 3.635288359572559, "grad_norm": 0.7533156871795654, "learning_rate": 4.0148274490976595e-05, "loss": 0.332, "num_input_tokens_seen": 42580112, "step": 44565 }, { "epoch": 3.6356962231829675, "grad_norm": 0.7017633318901062, "learning_rate": 4.01454428677757e-05, "loss": 0.3494, "num_input_tokens_seen": 42585216, "step": 44570 }, { "epoch": 3.6361040867933765, "grad_norm": 0.4255537986755371, "learning_rate": 4.014261093758311e-05, "loss": 0.3616, "num_input_tokens_seen": 42588624, "step": 44575 }, { "epoch": 3.636511950403785, "grad_norm": 0.4254889190196991, "learning_rate": 4.0139778700456245e-05, "loss": 0.3268, "num_input_tokens_seen": 42593568, "step": 44580 }, { "epoch": 3.6369198140141936, "grad_norm": 0.827457845211029, "learning_rate": 4.013694615645248e-05, "loss": 0.3901, "num_input_tokens_seen": 42598464, "step": 44585 }, { "epoch": 3.637327677624602, "grad_norm": 0.7594587206840515, "learning_rate": 4.013411330562926e-05, "loss": 0.3617, "num_input_tokens_seen": 42602592, "step": 44590 }, { "epoch": 3.6377355412350107, "grad_norm": 0.689306378364563, "learning_rate": 4.013128014804399e-05, "loss": 0.3563, "num_input_tokens_seen": 42606432, "step": 44595 }, { "epoch": 3.6381434048454198, "grad_norm": 0.6496831178665161, "learning_rate": 4.0128446683754105e-05, "loss": 0.3582, "num_input_tokens_seen": 42611696, "step": 44600 }, { "epoch": 3.6385512684558283, "grad_norm": 0.37423175573349, "learning_rate": 4.012561291281703e-05, "loss": 0.3614, "num_input_tokens_seen": 42616720, "step": 44605 }, { "epoch": 3.638959132066237, "grad_norm": 0.3771088719367981, "learning_rate": 4.012277883529021e-05, "loss": 0.3429, "num_input_tokens_seen": 42620672, "step": 44610 }, { "epoch": 3.639366995676646, "grad_norm": 0.6470677852630615, "learning_rate": 4.011994445123108e-05, "loss": 0.35, "num_input_tokens_seen": 42625760, "step": 44615 }, { "epoch": 3.6397748592870545, "grad_norm": 0.41283395886421204, "learning_rate": 4.011710976069712e-05, "loss": 0.3692, "num_input_tokens_seen": 42631280, "step": 44620 }, { "epoch": 3.640182722897463, "grad_norm": 0.7747907638549805, "learning_rate": 4.0114274763745755e-05, "loss": 0.3413, "num_input_tokens_seen": 42636320, "step": 44625 }, { "epoch": 3.640590586507872, "grad_norm": 0.34449926018714905, "learning_rate": 4.011143946043447e-05, "loss": 0.3629, "num_input_tokens_seen": 42641296, "step": 44630 }, { "epoch": 3.6409984501182806, "grad_norm": 0.6716656684875488, "learning_rate": 4.010860385082072e-05, "loss": 0.3526, "num_input_tokens_seen": 42645936, "step": 44635 }, { "epoch": 3.641406313728689, "grad_norm": 0.5678203701972961, "learning_rate": 4.0105767934961995e-05, "loss": 0.2924, "num_input_tokens_seen": 42651120, "step": 44640 }, { "epoch": 3.6418141773390977, "grad_norm": 0.449542373418808, "learning_rate": 4.010293171291578e-05, "loss": 0.4109, "num_input_tokens_seen": 42655968, "step": 44645 }, { "epoch": 3.6422220409495063, "grad_norm": 0.4365334212779999, "learning_rate": 4.010009518473955e-05, "loss": 0.3001, "num_input_tokens_seen": 42660960, "step": 44650 }, { "epoch": 3.6426299045599153, "grad_norm": 0.9494619965553284, "learning_rate": 4.00972583504908e-05, "loss": 0.4311, "num_input_tokens_seen": 42666304, "step": 44655 }, { "epoch": 3.643037768170324, "grad_norm": 0.44243311882019043, "learning_rate": 4.009442121022704e-05, "loss": 0.4068, "num_input_tokens_seen": 42671680, "step": 44660 }, { "epoch": 3.6434456317807324, "grad_norm": 0.6610050797462463, "learning_rate": 4.009158376400579e-05, "loss": 0.3764, "num_input_tokens_seen": 42675920, "step": 44665 }, { "epoch": 3.6438534953911415, "grad_norm": 0.4136577546596527, "learning_rate": 4.008874601188454e-05, "loss": 0.3727, "num_input_tokens_seen": 42680720, "step": 44670 }, { "epoch": 3.64426135900155, "grad_norm": 0.6867879629135132, "learning_rate": 4.008590795392082e-05, "loss": 0.3526, "num_input_tokens_seen": 42685408, "step": 44675 }, { "epoch": 3.6446692226119586, "grad_norm": 0.4819355010986328, "learning_rate": 4.008306959017216e-05, "loss": 0.364, "num_input_tokens_seen": 42689104, "step": 44680 }, { "epoch": 3.645077086222367, "grad_norm": 0.4449988603591919, "learning_rate": 4.008023092069608e-05, "loss": 0.3518, "num_input_tokens_seen": 42693056, "step": 44685 }, { "epoch": 3.6454849498327757, "grad_norm": 0.18204565346240997, "learning_rate": 4.0077391945550135e-05, "loss": 0.3624, "num_input_tokens_seen": 42697360, "step": 44690 }, { "epoch": 3.6458928134431847, "grad_norm": 0.21108472347259521, "learning_rate": 4.007455266479186e-05, "loss": 0.388, "num_input_tokens_seen": 42702752, "step": 44695 }, { "epoch": 3.6463006770535933, "grad_norm": 0.27709123492240906, "learning_rate": 4.007171307847881e-05, "loss": 0.3371, "num_input_tokens_seen": 42707600, "step": 44700 }, { "epoch": 3.646708540664002, "grad_norm": 0.6135154366493225, "learning_rate": 4.0068873186668536e-05, "loss": 0.3458, "num_input_tokens_seen": 42711952, "step": 44705 }, { "epoch": 3.647116404274411, "grad_norm": 0.7935357093811035, "learning_rate": 4.006603298941861e-05, "loss": 0.3763, "num_input_tokens_seen": 42716544, "step": 44710 }, { "epoch": 3.6475242678848194, "grad_norm": 0.19900484383106232, "learning_rate": 4.006319248678659e-05, "loss": 0.3445, "num_input_tokens_seen": 42721840, "step": 44715 }, { "epoch": 3.647932131495228, "grad_norm": 0.7550017237663269, "learning_rate": 4.0060351678830066e-05, "loss": 0.3412, "num_input_tokens_seen": 42726752, "step": 44720 }, { "epoch": 3.6483399951056366, "grad_norm": 0.7034404873847961, "learning_rate": 4.005751056560662e-05, "loss": 0.3676, "num_input_tokens_seen": 42731712, "step": 44725 }, { "epoch": 3.648747858716045, "grad_norm": 0.6315490007400513, "learning_rate": 4.005466914717382e-05, "loss": 0.3458, "num_input_tokens_seen": 42736784, "step": 44730 }, { "epoch": 3.649155722326454, "grad_norm": 0.7037976980209351, "learning_rate": 4.005182742358928e-05, "loss": 0.3627, "num_input_tokens_seen": 42740432, "step": 44735 }, { "epoch": 3.6495635859368627, "grad_norm": 0.4504861533641815, "learning_rate": 4.0048985394910587e-05, "loss": 0.3398, "num_input_tokens_seen": 42745312, "step": 44740 }, { "epoch": 3.6499714495472713, "grad_norm": 0.2823925316333771, "learning_rate": 4.004614306119536e-05, "loss": 0.3455, "num_input_tokens_seen": 42750400, "step": 44745 }, { "epoch": 3.6503793131576803, "grad_norm": 0.6736828684806824, "learning_rate": 4.004330042250121e-05, "loss": 0.3393, "num_input_tokens_seen": 42755552, "step": 44750 }, { "epoch": 3.650787176768089, "grad_norm": 0.3465433716773987, "learning_rate": 4.004045747888574e-05, "loss": 0.3331, "num_input_tokens_seen": 42760240, "step": 44755 }, { "epoch": 3.6511950403784974, "grad_norm": 0.8054845333099365, "learning_rate": 4.0037614230406605e-05, "loss": 0.3516, "num_input_tokens_seen": 42764832, "step": 44760 }, { "epoch": 3.651602903988906, "grad_norm": 0.7787812948226929, "learning_rate": 4.0034770677121414e-05, "loss": 0.3581, "num_input_tokens_seen": 42770192, "step": 44765 }, { "epoch": 3.6520107675993145, "grad_norm": 0.7750496864318848, "learning_rate": 4.003192681908781e-05, "loss": 0.3336, "num_input_tokens_seen": 42775072, "step": 44770 }, { "epoch": 3.6524186312097235, "grad_norm": 0.6661461591720581, "learning_rate": 4.002908265636343e-05, "loss": 0.3425, "num_input_tokens_seen": 42779856, "step": 44775 }, { "epoch": 3.652826494820132, "grad_norm": 0.3512309193611145, "learning_rate": 4.002623818900594e-05, "loss": 0.3583, "num_input_tokens_seen": 42783680, "step": 44780 }, { "epoch": 3.6532343584305407, "grad_norm": 0.38212528824806213, "learning_rate": 4.002339341707297e-05, "loss": 0.3301, "num_input_tokens_seen": 42788432, "step": 44785 }, { "epoch": 3.6536422220409497, "grad_norm": 0.3826543986797333, "learning_rate": 4.002054834062222e-05, "loss": 0.3638, "num_input_tokens_seen": 42793216, "step": 44790 }, { "epoch": 3.6540500856513582, "grad_norm": 0.8049892783164978, "learning_rate": 4.001770295971132e-05, "loss": 0.3362, "num_input_tokens_seen": 42799008, "step": 44795 }, { "epoch": 3.654457949261767, "grad_norm": 0.41088905930519104, "learning_rate": 4.0014857274397974e-05, "loss": 0.3383, "num_input_tokens_seen": 42803520, "step": 44800 }, { "epoch": 3.6548658128721754, "grad_norm": 0.3185158669948578, "learning_rate": 4.001201128473985e-05, "loss": 0.3424, "num_input_tokens_seen": 42808416, "step": 44805 }, { "epoch": 3.6552736764825844, "grad_norm": 0.7729501724243164, "learning_rate": 4.0009164990794636e-05, "loss": 0.3285, "num_input_tokens_seen": 42813584, "step": 44810 }, { "epoch": 3.655681540092993, "grad_norm": 0.8382214307785034, "learning_rate": 4.0006318392620025e-05, "loss": 0.3364, "num_input_tokens_seen": 42817776, "step": 44815 }, { "epoch": 3.6560894037034015, "grad_norm": 0.6852116584777832, "learning_rate": 4.000347149027371e-05, "loss": 0.3706, "num_input_tokens_seen": 42821248, "step": 44820 }, { "epoch": 3.65649726731381, "grad_norm": 0.41297647356987, "learning_rate": 4.0000624283813416e-05, "loss": 0.3612, "num_input_tokens_seen": 42826176, "step": 44825 }, { "epoch": 3.656905130924219, "grad_norm": 0.6276145577430725, "learning_rate": 3.999777677329684e-05, "loss": 0.3469, "num_input_tokens_seen": 42831584, "step": 44830 }, { "epoch": 3.6573129945346277, "grad_norm": 0.5974755883216858, "learning_rate": 3.9994928958781694e-05, "loss": 0.3115, "num_input_tokens_seen": 42836080, "step": 44835 }, { "epoch": 3.6577208581450362, "grad_norm": 0.560992956161499, "learning_rate": 3.9992080840325705e-05, "loss": 0.3177, "num_input_tokens_seen": 42841040, "step": 44840 }, { "epoch": 3.6581287217554452, "grad_norm": 0.9128835797309875, "learning_rate": 3.998923241798662e-05, "loss": 0.4079, "num_input_tokens_seen": 42845616, "step": 44845 }, { "epoch": 3.658536585365854, "grad_norm": 0.3821152448654175, "learning_rate": 3.998638369182216e-05, "loss": 0.3565, "num_input_tokens_seen": 42851312, "step": 44850 }, { "epoch": 3.6589444489762624, "grad_norm": 0.42055171728134155, "learning_rate": 3.998353466189007e-05, "loss": 0.4317, "num_input_tokens_seen": 42856496, "step": 44855 }, { "epoch": 3.659352312586671, "grad_norm": 0.39697888493537903, "learning_rate": 3.9980685328248104e-05, "loss": 0.3787, "num_input_tokens_seen": 42861712, "step": 44860 }, { "epoch": 3.6597601761970795, "grad_norm": 0.7362351417541504, "learning_rate": 3.9977835690954e-05, "loss": 0.3399, "num_input_tokens_seen": 42866384, "step": 44865 }, { "epoch": 3.6601680398074885, "grad_norm": 0.5835257172584534, "learning_rate": 3.997498575006554e-05, "loss": 0.3533, "num_input_tokens_seen": 42871520, "step": 44870 }, { "epoch": 3.660575903417897, "grad_norm": 0.3088332414627075, "learning_rate": 3.9972135505640484e-05, "loss": 0.3509, "num_input_tokens_seen": 42876976, "step": 44875 }, { "epoch": 3.6609837670283056, "grad_norm": 0.6888879537582397, "learning_rate": 3.9969284957736596e-05, "loss": 0.3863, "num_input_tokens_seen": 42881760, "step": 44880 }, { "epoch": 3.6613916306387146, "grad_norm": 0.23409995436668396, "learning_rate": 3.996643410641167e-05, "loss": 0.3417, "num_input_tokens_seen": 42885872, "step": 44885 }, { "epoch": 3.661799494249123, "grad_norm": 0.6161282658576965, "learning_rate": 3.996358295172348e-05, "loss": 0.3401, "num_input_tokens_seen": 42891344, "step": 44890 }, { "epoch": 3.6622073578595318, "grad_norm": 0.22079390287399292, "learning_rate": 3.9960731493729816e-05, "loss": 0.3503, "num_input_tokens_seen": 42895376, "step": 44895 }, { "epoch": 3.6626152214699403, "grad_norm": 0.49326762557029724, "learning_rate": 3.995787973248849e-05, "loss": 0.3304, "num_input_tokens_seen": 42900256, "step": 44900 }, { "epoch": 3.663023085080349, "grad_norm": 0.30593210458755493, "learning_rate": 3.99550276680573e-05, "loss": 0.3217, "num_input_tokens_seen": 42905376, "step": 44905 }, { "epoch": 3.663430948690758, "grad_norm": 0.6282874941825867, "learning_rate": 3.9952175300494044e-05, "loss": 0.3043, "num_input_tokens_seen": 42910096, "step": 44910 }, { "epoch": 3.6638388123011665, "grad_norm": 0.8801169395446777, "learning_rate": 3.994932262985655e-05, "loss": 0.4015, "num_input_tokens_seen": 42914336, "step": 44915 }, { "epoch": 3.664246675911575, "grad_norm": 0.3575384318828583, "learning_rate": 3.994646965620265e-05, "loss": 0.4038, "num_input_tokens_seen": 42919776, "step": 44920 }, { "epoch": 3.664654539521984, "grad_norm": 0.9494069218635559, "learning_rate": 3.994361637959016e-05, "loss": 0.3544, "num_input_tokens_seen": 42924816, "step": 44925 }, { "epoch": 3.6650624031323926, "grad_norm": 0.6311779618263245, "learning_rate": 3.99407628000769e-05, "loss": 0.3491, "num_input_tokens_seen": 42928896, "step": 44930 }, { "epoch": 3.665470266742801, "grad_norm": 0.24832957983016968, "learning_rate": 3.9937908917720745e-05, "loss": 0.3298, "num_input_tokens_seen": 42933936, "step": 44935 }, { "epoch": 3.6658781303532098, "grad_norm": 0.4517379403114319, "learning_rate": 3.9935054732579515e-05, "loss": 0.3348, "num_input_tokens_seen": 42938224, "step": 44940 }, { "epoch": 3.6662859939636183, "grad_norm": 0.29722198843955994, "learning_rate": 3.993220024471107e-05, "loss": 0.3227, "num_input_tokens_seen": 42942048, "step": 44945 }, { "epoch": 3.6666938575740273, "grad_norm": 0.4535363018512726, "learning_rate": 3.992934545417327e-05, "loss": 0.3641, "num_input_tokens_seen": 42946784, "step": 44950 }, { "epoch": 3.667101721184436, "grad_norm": 0.7982372045516968, "learning_rate": 3.9926490361023986e-05, "loss": 0.3598, "num_input_tokens_seen": 42952224, "step": 44955 }, { "epoch": 3.6675095847948445, "grad_norm": 0.5373563170433044, "learning_rate": 3.992363496532109e-05, "loss": 0.3503, "num_input_tokens_seen": 42956352, "step": 44960 }, { "epoch": 3.6679174484052535, "grad_norm": 0.4454016089439392, "learning_rate": 3.9920779267122446e-05, "loss": 0.3292, "num_input_tokens_seen": 42961088, "step": 44965 }, { "epoch": 3.668325312015662, "grad_norm": 0.9430587291717529, "learning_rate": 3.9917923266485946e-05, "loss": 0.3364, "num_input_tokens_seen": 42966000, "step": 44970 }, { "epoch": 3.6687331756260706, "grad_norm": 1.056084156036377, "learning_rate": 3.9915066963469485e-05, "loss": 0.3873, "num_input_tokens_seen": 42970832, "step": 44975 }, { "epoch": 3.669141039236479, "grad_norm": 0.69671630859375, "learning_rate": 3.991221035813095e-05, "loss": 0.352, "num_input_tokens_seen": 42976144, "step": 44980 }, { "epoch": 3.6695489028468877, "grad_norm": 0.4018057584762573, "learning_rate": 3.990935345052825e-05, "loss": 0.3272, "num_input_tokens_seen": 42981520, "step": 44985 }, { "epoch": 3.6699567664572967, "grad_norm": 0.7715816497802734, "learning_rate": 3.990649624071929e-05, "loss": 0.3016, "num_input_tokens_seen": 42986320, "step": 44990 }, { "epoch": 3.6703646300677053, "grad_norm": 0.5974150896072388, "learning_rate": 3.9903638728761994e-05, "loss": 0.3089, "num_input_tokens_seen": 42990704, "step": 44995 }, { "epoch": 3.670772493678114, "grad_norm": 0.4888588488101959, "learning_rate": 3.990078091471426e-05, "loss": 0.3211, "num_input_tokens_seen": 42995344, "step": 45000 }, { "epoch": 3.671180357288523, "grad_norm": 0.6117628812789917, "learning_rate": 3.989792279863404e-05, "loss": 0.4112, "num_input_tokens_seen": 42999872, "step": 45005 }, { "epoch": 3.6715882208989314, "grad_norm": 0.9537333846092224, "learning_rate": 3.989506438057925e-05, "loss": 0.4001, "num_input_tokens_seen": 43005136, "step": 45010 }, { "epoch": 3.67199608450934, "grad_norm": 0.5324316620826721, "learning_rate": 3.989220566060784e-05, "loss": 0.378, "num_input_tokens_seen": 43009888, "step": 45015 }, { "epoch": 3.672403948119749, "grad_norm": 0.5174831748008728, "learning_rate": 3.988934663877774e-05, "loss": 0.3825, "num_input_tokens_seen": 43013936, "step": 45020 }, { "epoch": 3.6728118117301576, "grad_norm": 0.2996421158313751, "learning_rate": 3.988648731514691e-05, "loss": 0.3374, "num_input_tokens_seen": 43018896, "step": 45025 }, { "epoch": 3.673219675340566, "grad_norm": 0.5385184288024902, "learning_rate": 3.988362768977332e-05, "loss": 0.3576, "num_input_tokens_seen": 43023120, "step": 45030 }, { "epoch": 3.6736275389509747, "grad_norm": 0.9506929516792297, "learning_rate": 3.988076776271491e-05, "loss": 0.3658, "num_input_tokens_seen": 43027232, "step": 45035 }, { "epoch": 3.6740354025613833, "grad_norm": 1.1343673467636108, "learning_rate": 3.987790753402967e-05, "loss": 0.3054, "num_input_tokens_seen": 43032544, "step": 45040 }, { "epoch": 3.6744432661717923, "grad_norm": 0.6632674336433411, "learning_rate": 3.987504700377555e-05, "loss": 0.3259, "num_input_tokens_seen": 43037232, "step": 45045 }, { "epoch": 3.674851129782201, "grad_norm": 0.4121454060077667, "learning_rate": 3.987218617201056e-05, "loss": 0.3704, "num_input_tokens_seen": 43042432, "step": 45050 }, { "epoch": 3.6752589933926094, "grad_norm": 0.5152803063392639, "learning_rate": 3.986932503879268e-05, "loss": 0.3042, "num_input_tokens_seen": 43047680, "step": 45055 }, { "epoch": 3.6756668570030184, "grad_norm": 0.48990586400032043, "learning_rate": 3.9866463604179894e-05, "loss": 0.2714, "num_input_tokens_seen": 43052336, "step": 45060 }, { "epoch": 3.676074720613427, "grad_norm": 0.5305220484733582, "learning_rate": 3.98636018682302e-05, "loss": 0.3179, "num_input_tokens_seen": 43057856, "step": 45065 }, { "epoch": 3.6764825842238356, "grad_norm": 0.4519728720188141, "learning_rate": 3.9860739831001625e-05, "loss": 0.3789, "num_input_tokens_seen": 43062048, "step": 45070 }, { "epoch": 3.676890447834244, "grad_norm": 0.5248516798019409, "learning_rate": 3.9857877492552166e-05, "loss": 0.4073, "num_input_tokens_seen": 43067024, "step": 45075 }, { "epoch": 3.6772983114446527, "grad_norm": 0.8797906637191772, "learning_rate": 3.9855014852939845e-05, "loss": 0.3439, "num_input_tokens_seen": 43071712, "step": 45080 }, { "epoch": 3.6777061750550617, "grad_norm": 0.7192668914794922, "learning_rate": 3.985215191222268e-05, "loss": 0.3074, "num_input_tokens_seen": 43077072, "step": 45085 }, { "epoch": 3.6781140386654703, "grad_norm": 0.47641757130622864, "learning_rate": 3.984928867045871e-05, "loss": 0.3866, "num_input_tokens_seen": 43081280, "step": 45090 }, { "epoch": 3.678521902275879, "grad_norm": 1.0420845746994019, "learning_rate": 3.9846425127705965e-05, "loss": 0.409, "num_input_tokens_seen": 43086656, "step": 45095 }, { "epoch": 3.678929765886288, "grad_norm": 0.398561954498291, "learning_rate": 3.984356128402249e-05, "loss": 0.3543, "num_input_tokens_seen": 43091072, "step": 45100 }, { "epoch": 3.6793376294966964, "grad_norm": 0.6777587532997131, "learning_rate": 3.9840697139466344e-05, "loss": 0.3482, "num_input_tokens_seen": 43094688, "step": 45105 }, { "epoch": 3.679745493107105, "grad_norm": 0.7590689063072205, "learning_rate": 3.9837832694095566e-05, "loss": 0.361, "num_input_tokens_seen": 43099440, "step": 45110 }, { "epoch": 3.6801533567175135, "grad_norm": 0.733420729637146, "learning_rate": 3.983496794796823e-05, "loss": 0.3497, "num_input_tokens_seen": 43105168, "step": 45115 }, { "epoch": 3.680561220327922, "grad_norm": 0.7560504674911499, "learning_rate": 3.983210290114239e-05, "loss": 0.3435, "num_input_tokens_seen": 43110080, "step": 45120 }, { "epoch": 3.680969083938331, "grad_norm": 0.3697897493839264, "learning_rate": 3.9829237553676126e-05, "loss": 0.3464, "num_input_tokens_seen": 43114640, "step": 45125 }, { "epoch": 3.6813769475487397, "grad_norm": 0.5235660672187805, "learning_rate": 3.9826371905627525e-05, "loss": 0.2974, "num_input_tokens_seen": 43119344, "step": 45130 }, { "epoch": 3.6817848111591482, "grad_norm": 0.44510701298713684, "learning_rate": 3.982350595705466e-05, "loss": 0.2552, "num_input_tokens_seen": 43123696, "step": 45135 }, { "epoch": 3.6821926747695573, "grad_norm": 1.0537309646606445, "learning_rate": 3.982063970801564e-05, "loss": 0.4707, "num_input_tokens_seen": 43128400, "step": 45140 }, { "epoch": 3.682600538379966, "grad_norm": 0.36399778723716736, "learning_rate": 3.981777315856854e-05, "loss": 0.3626, "num_input_tokens_seen": 43133056, "step": 45145 }, { "epoch": 3.6830084019903744, "grad_norm": 0.509309709072113, "learning_rate": 3.981490630877147e-05, "loss": 0.3941, "num_input_tokens_seen": 43137424, "step": 45150 }, { "epoch": 3.683416265600783, "grad_norm": 0.575903594493866, "learning_rate": 3.9812039158682554e-05, "loss": 0.3286, "num_input_tokens_seen": 43143232, "step": 45155 }, { "epoch": 3.6838241292111915, "grad_norm": 0.8112636804580688, "learning_rate": 3.98091717083599e-05, "loss": 0.3232, "num_input_tokens_seen": 43148928, "step": 45160 }, { "epoch": 3.6842319928216005, "grad_norm": 0.5258557796478271, "learning_rate": 3.980630395786162e-05, "loss": 0.297, "num_input_tokens_seen": 43153936, "step": 45165 }, { "epoch": 3.684639856432009, "grad_norm": 0.5551691651344299, "learning_rate": 3.980343590724585e-05, "loss": 0.3803, "num_input_tokens_seen": 43159408, "step": 45170 }, { "epoch": 3.6850477200424177, "grad_norm": 0.3760682940483093, "learning_rate": 3.9800567556570736e-05, "loss": 0.3275, "num_input_tokens_seen": 43164992, "step": 45175 }, { "epoch": 3.6854555836528267, "grad_norm": 0.8875042200088501, "learning_rate": 3.97976989058944e-05, "loss": 0.3606, "num_input_tokens_seen": 43169872, "step": 45180 }, { "epoch": 3.6858634472632352, "grad_norm": 0.6411535143852234, "learning_rate": 3.979482995527499e-05, "loss": 0.3438, "num_input_tokens_seen": 43174656, "step": 45185 }, { "epoch": 3.686271310873644, "grad_norm": 0.8122122287750244, "learning_rate": 3.9791960704770665e-05, "loss": 0.3618, "num_input_tokens_seen": 43179456, "step": 45190 }, { "epoch": 3.686679174484053, "grad_norm": 0.3895588517189026, "learning_rate": 3.978909115443958e-05, "loss": 0.3486, "num_input_tokens_seen": 43184000, "step": 45195 }, { "epoch": 3.6870870380944614, "grad_norm": 0.7648611664772034, "learning_rate": 3.978622130433991e-05, "loss": 0.3607, "num_input_tokens_seen": 43187664, "step": 45200 }, { "epoch": 3.68749490170487, "grad_norm": 0.4701051712036133, "learning_rate": 3.978335115452981e-05, "loss": 0.3337, "num_input_tokens_seen": 43191328, "step": 45205 }, { "epoch": 3.6879027653152785, "grad_norm": 0.7778624892234802, "learning_rate": 3.978048070506747e-05, "loss": 0.3602, "num_input_tokens_seen": 43196576, "step": 45210 }, { "epoch": 3.688310628925687, "grad_norm": 0.4265401065349579, "learning_rate": 3.977760995601106e-05, "loss": 0.3383, "num_input_tokens_seen": 43201760, "step": 45215 }, { "epoch": 3.688718492536096, "grad_norm": 0.6862236857414246, "learning_rate": 3.977473890741879e-05, "loss": 0.3822, "num_input_tokens_seen": 43206992, "step": 45220 }, { "epoch": 3.6891263561465046, "grad_norm": 0.5292206406593323, "learning_rate": 3.977186755934882e-05, "loss": 0.3426, "num_input_tokens_seen": 43211424, "step": 45225 }, { "epoch": 3.689534219756913, "grad_norm": 0.45777368545532227, "learning_rate": 3.9768995911859385e-05, "loss": 0.3363, "num_input_tokens_seen": 43217088, "step": 45230 }, { "epoch": 3.689942083367322, "grad_norm": 0.38895338773727417, "learning_rate": 3.9766123965008674e-05, "loss": 0.3262, "num_input_tokens_seen": 43222208, "step": 45235 }, { "epoch": 3.6903499469777308, "grad_norm": 0.8651946783065796, "learning_rate": 3.976325171885491e-05, "loss": 0.3398, "num_input_tokens_seen": 43227312, "step": 45240 }, { "epoch": 3.6907578105881393, "grad_norm": 0.25848588347435, "learning_rate": 3.97603791734563e-05, "loss": 0.3349, "num_input_tokens_seen": 43231984, "step": 45245 }, { "epoch": 3.691165674198548, "grad_norm": 1.02462637424469, "learning_rate": 3.9757506328871075e-05, "loss": 0.3571, "num_input_tokens_seen": 43237456, "step": 45250 }, { "epoch": 3.6915735378089565, "grad_norm": 0.7519523501396179, "learning_rate": 3.9754633185157476e-05, "loss": 0.3291, "num_input_tokens_seen": 43242720, "step": 45255 }, { "epoch": 3.6919814014193655, "grad_norm": 0.27961230278015137, "learning_rate": 3.975175974237372e-05, "loss": 0.3865, "num_input_tokens_seen": 43246992, "step": 45260 }, { "epoch": 3.692389265029774, "grad_norm": 0.6838816404342651, "learning_rate": 3.974888600057808e-05, "loss": 0.364, "num_input_tokens_seen": 43251392, "step": 45265 }, { "epoch": 3.6927971286401826, "grad_norm": 0.5595845580101013, "learning_rate": 3.974601195982877e-05, "loss": 0.3456, "num_input_tokens_seen": 43256384, "step": 45270 }, { "epoch": 3.6932049922505916, "grad_norm": 0.8676221966743469, "learning_rate": 3.974313762018408e-05, "loss": 0.3457, "num_input_tokens_seen": 43261616, "step": 45275 }, { "epoch": 3.693612855861, "grad_norm": 0.6192408800125122, "learning_rate": 3.974026298170225e-05, "loss": 0.322, "num_input_tokens_seen": 43266384, "step": 45280 }, { "epoch": 3.6940207194714088, "grad_norm": 0.4447695314884186, "learning_rate": 3.973738804444154e-05, "loss": 0.3395, "num_input_tokens_seen": 43271760, "step": 45285 }, { "epoch": 3.6944285830818173, "grad_norm": 0.6379547119140625, "learning_rate": 3.973451280846024e-05, "loss": 0.367, "num_input_tokens_seen": 43276080, "step": 45290 }, { "epoch": 3.694836446692226, "grad_norm": 0.44017577171325684, "learning_rate": 3.973163727381663e-05, "loss": 0.3568, "num_input_tokens_seen": 43280688, "step": 45295 }, { "epoch": 3.695244310302635, "grad_norm": 0.5158473253250122, "learning_rate": 3.972876144056901e-05, "loss": 0.3, "num_input_tokens_seen": 43284992, "step": 45300 }, { "epoch": 3.6956521739130435, "grad_norm": 0.6456663012504578, "learning_rate": 3.972588530877563e-05, "loss": 0.3208, "num_input_tokens_seen": 43290304, "step": 45305 }, { "epoch": 3.696060037523452, "grad_norm": 0.7077009081840515, "learning_rate": 3.972300887849483e-05, "loss": 0.4131, "num_input_tokens_seen": 43294608, "step": 45310 }, { "epoch": 3.696467901133861, "grad_norm": 0.7237595915794373, "learning_rate": 3.972013214978488e-05, "loss": 0.3385, "num_input_tokens_seen": 43299648, "step": 45315 }, { "epoch": 3.6968757647442696, "grad_norm": 0.671010434627533, "learning_rate": 3.971725512270412e-05, "loss": 0.3268, "num_input_tokens_seen": 43304688, "step": 45320 }, { "epoch": 3.697283628354678, "grad_norm": 0.4186444878578186, "learning_rate": 3.9714377797310844e-05, "loss": 0.3607, "num_input_tokens_seen": 43309440, "step": 45325 }, { "epoch": 3.6976914919650867, "grad_norm": 0.8172647356987, "learning_rate": 3.9711500173663385e-05, "loss": 0.3747, "num_input_tokens_seen": 43314544, "step": 45330 }, { "epoch": 3.6980993555754953, "grad_norm": 0.38232240080833435, "learning_rate": 3.9708622251820064e-05, "loss": 0.3313, "num_input_tokens_seen": 43319152, "step": 45335 }, { "epoch": 3.6985072191859043, "grad_norm": 0.24030660092830658, "learning_rate": 3.9705744031839225e-05, "loss": 0.312, "num_input_tokens_seen": 43323648, "step": 45340 }, { "epoch": 3.698915082796313, "grad_norm": 0.46915584802627563, "learning_rate": 3.970286551377921e-05, "loss": 0.3484, "num_input_tokens_seen": 43328816, "step": 45345 }, { "epoch": 3.6993229464067214, "grad_norm": 0.8629525303840637, "learning_rate": 3.9699986697698346e-05, "loss": 0.3728, "num_input_tokens_seen": 43332224, "step": 45350 }, { "epoch": 3.6997308100171304, "grad_norm": 0.8760148286819458, "learning_rate": 3.9697107583655e-05, "loss": 0.3622, "num_input_tokens_seen": 43336848, "step": 45355 }, { "epoch": 3.700138673627539, "grad_norm": 0.24605676531791687, "learning_rate": 3.969422817170754e-05, "loss": 0.3422, "num_input_tokens_seen": 43342256, "step": 45360 }, { "epoch": 3.7005465372379476, "grad_norm": 0.7614725232124329, "learning_rate": 3.969134846191431e-05, "loss": 0.3681, "num_input_tokens_seen": 43346848, "step": 45365 }, { "epoch": 3.7009544008483566, "grad_norm": 0.822788655757904, "learning_rate": 3.9688468454333684e-05, "loss": 0.3782, "num_input_tokens_seen": 43351184, "step": 45370 }, { "epoch": 3.701362264458765, "grad_norm": 0.4674089550971985, "learning_rate": 3.9685588149024055e-05, "loss": 0.3419, "num_input_tokens_seen": 43355808, "step": 45375 }, { "epoch": 3.7017701280691737, "grad_norm": 0.7594358325004578, "learning_rate": 3.9682707546043785e-05, "loss": 0.3474, "num_input_tokens_seen": 43360272, "step": 45380 }, { "epoch": 3.7021779916795823, "grad_norm": 0.8862987160682678, "learning_rate": 3.967982664545128e-05, "loss": 0.3548, "num_input_tokens_seen": 43365760, "step": 45385 }, { "epoch": 3.702585855289991, "grad_norm": 0.889653742313385, "learning_rate": 3.967694544730492e-05, "loss": 0.3814, "num_input_tokens_seen": 43370496, "step": 45390 }, { "epoch": 3.7029937189004, "grad_norm": 0.3851240873336792, "learning_rate": 3.967406395166312e-05, "loss": 0.3242, "num_input_tokens_seen": 43375792, "step": 45395 }, { "epoch": 3.7034015825108084, "grad_norm": 0.4861111640930176, "learning_rate": 3.967118215858428e-05, "loss": 0.3586, "num_input_tokens_seen": 43381376, "step": 45400 }, { "epoch": 3.703809446121217, "grad_norm": 0.4121136963367462, "learning_rate": 3.96683000681268e-05, "loss": 0.3466, "num_input_tokens_seen": 43386656, "step": 45405 }, { "epoch": 3.704217309731626, "grad_norm": 0.8860535621643066, "learning_rate": 3.9665417680349116e-05, "loss": 0.356, "num_input_tokens_seen": 43391232, "step": 45410 }, { "epoch": 3.7046251733420346, "grad_norm": 0.5502749085426331, "learning_rate": 3.966253499530965e-05, "loss": 0.3324, "num_input_tokens_seen": 43396128, "step": 45415 }, { "epoch": 3.705033036952443, "grad_norm": 0.9864186644554138, "learning_rate": 3.965965201306683e-05, "loss": 0.3691, "num_input_tokens_seen": 43400624, "step": 45420 }, { "epoch": 3.7054409005628517, "grad_norm": 0.6167508363723755, "learning_rate": 3.965676873367909e-05, "loss": 0.3276, "num_input_tokens_seen": 43405472, "step": 45425 }, { "epoch": 3.7058487641732603, "grad_norm": 0.2875400185585022, "learning_rate": 3.965388515720488e-05, "loss": 0.3641, "num_input_tokens_seen": 43410416, "step": 45430 }, { "epoch": 3.7062566277836693, "grad_norm": 0.3204818069934845, "learning_rate": 3.965100128370264e-05, "loss": 0.3214, "num_input_tokens_seen": 43415808, "step": 45435 }, { "epoch": 3.706664491394078, "grad_norm": 0.3746607005596161, "learning_rate": 3.964811711323083e-05, "loss": 0.316, "num_input_tokens_seen": 43420256, "step": 45440 }, { "epoch": 3.7070723550044864, "grad_norm": 0.47457337379455566, "learning_rate": 3.9645232645847916e-05, "loss": 0.4356, "num_input_tokens_seen": 43424672, "step": 45445 }, { "epoch": 3.7074802186148954, "grad_norm": 0.38949131965637207, "learning_rate": 3.9642347881612356e-05, "loss": 0.382, "num_input_tokens_seen": 43429472, "step": 45450 }, { "epoch": 3.707888082225304, "grad_norm": 0.4152131974697113, "learning_rate": 3.963946282058263e-05, "loss": 0.3545, "num_input_tokens_seen": 43433840, "step": 45455 }, { "epoch": 3.7082959458357125, "grad_norm": 0.40181997418403625, "learning_rate": 3.9636577462817206e-05, "loss": 0.3672, "num_input_tokens_seen": 43438976, "step": 45460 }, { "epoch": 3.708703809446121, "grad_norm": 0.8687826991081238, "learning_rate": 3.963369180837458e-05, "loss": 0.3689, "num_input_tokens_seen": 43443232, "step": 45465 }, { "epoch": 3.7091116730565297, "grad_norm": 0.6502817273139954, "learning_rate": 3.963080585731324e-05, "loss": 0.3671, "num_input_tokens_seen": 43448432, "step": 45470 }, { "epoch": 3.7095195366669387, "grad_norm": 0.45783326029777527, "learning_rate": 3.962791960969168e-05, "loss": 0.3386, "num_input_tokens_seen": 43453376, "step": 45475 }, { "epoch": 3.7099274002773472, "grad_norm": 0.32333239912986755, "learning_rate": 3.9625033065568405e-05, "loss": 0.3518, "num_input_tokens_seen": 43457584, "step": 45480 }, { "epoch": 3.710335263887756, "grad_norm": 0.38066014647483826, "learning_rate": 3.962214622500193e-05, "loss": 0.3003, "num_input_tokens_seen": 43461696, "step": 45485 }, { "epoch": 3.710743127498165, "grad_norm": 0.35379523038864136, "learning_rate": 3.9619259088050765e-05, "loss": 0.4225, "num_input_tokens_seen": 43466960, "step": 45490 }, { "epoch": 3.7111509911085734, "grad_norm": 0.9117390513420105, "learning_rate": 3.961637165477342e-05, "loss": 0.4296, "num_input_tokens_seen": 43471600, "step": 45495 }, { "epoch": 3.711558854718982, "grad_norm": 0.7076725363731384, "learning_rate": 3.961348392522844e-05, "loss": 0.3423, "num_input_tokens_seen": 43475536, "step": 45500 }, { "epoch": 3.7119667183293905, "grad_norm": 0.4713990092277527, "learning_rate": 3.9610595899474346e-05, "loss": 0.3346, "num_input_tokens_seen": 43480672, "step": 45505 }, { "epoch": 3.712374581939799, "grad_norm": 0.8352357149124146, "learning_rate": 3.9607707577569686e-05, "loss": 0.3655, "num_input_tokens_seen": 43485616, "step": 45510 }, { "epoch": 3.712782445550208, "grad_norm": 0.7680601477622986, "learning_rate": 3.9604818959573e-05, "loss": 0.3634, "num_input_tokens_seen": 43489680, "step": 45515 }, { "epoch": 3.7131903091606167, "grad_norm": 0.5223230719566345, "learning_rate": 3.9601930045542834e-05, "loss": 0.3402, "num_input_tokens_seen": 43493344, "step": 45520 }, { "epoch": 3.713598172771025, "grad_norm": 0.7313873171806335, "learning_rate": 3.9599040835537756e-05, "loss": 0.3061, "num_input_tokens_seen": 43497696, "step": 45525 }, { "epoch": 3.7140060363814342, "grad_norm": 0.8081993460655212, "learning_rate": 3.9596151329616324e-05, "loss": 0.3598, "num_input_tokens_seen": 43502096, "step": 45530 }, { "epoch": 3.714413899991843, "grad_norm": 0.5848372578620911, "learning_rate": 3.9593261527837105e-05, "loss": 0.294, "num_input_tokens_seen": 43506880, "step": 45535 }, { "epoch": 3.7148217636022514, "grad_norm": 0.41109538078308105, "learning_rate": 3.959037143025868e-05, "loss": 0.3531, "num_input_tokens_seen": 43512048, "step": 45540 }, { "epoch": 3.71522962721266, "grad_norm": 0.3907230794429779, "learning_rate": 3.958748103693962e-05, "loss": 0.3814, "num_input_tokens_seen": 43517376, "step": 45545 }, { "epoch": 3.7156374908230685, "grad_norm": 0.6095905900001526, "learning_rate": 3.958459034793852e-05, "loss": 0.3632, "num_input_tokens_seen": 43522800, "step": 45550 }, { "epoch": 3.7160453544334775, "grad_norm": 0.6540012955665588, "learning_rate": 3.958169936331397e-05, "loss": 0.3609, "num_input_tokens_seen": 43526464, "step": 45555 }, { "epoch": 3.716453218043886, "grad_norm": 0.6246508359909058, "learning_rate": 3.957880808312458e-05, "loss": 0.3392, "num_input_tokens_seen": 43530864, "step": 45560 }, { "epoch": 3.7168610816542946, "grad_norm": 0.30218642950057983, "learning_rate": 3.957591650742893e-05, "loss": 0.3421, "num_input_tokens_seen": 43536192, "step": 45565 }, { "epoch": 3.7172689452647036, "grad_norm": 0.4535699486732483, "learning_rate": 3.957302463628565e-05, "loss": 0.3558, "num_input_tokens_seen": 43541552, "step": 45570 }, { "epoch": 3.717676808875112, "grad_norm": 0.3313058614730835, "learning_rate": 3.9570132469753356e-05, "loss": 0.3604, "num_input_tokens_seen": 43545184, "step": 45575 }, { "epoch": 3.7180846724855208, "grad_norm": 0.5833361148834229, "learning_rate": 3.956724000789067e-05, "loss": 0.3137, "num_input_tokens_seen": 43550112, "step": 45580 }, { "epoch": 3.71849253609593, "grad_norm": 0.8558218479156494, "learning_rate": 3.9564347250756215e-05, "loss": 0.4064, "num_input_tokens_seen": 43554848, "step": 45585 }, { "epoch": 3.7189003997063383, "grad_norm": 0.7053354978561401, "learning_rate": 3.956145419840863e-05, "loss": 0.3331, "num_input_tokens_seen": 43559296, "step": 45590 }, { "epoch": 3.719308263316747, "grad_norm": 0.5977599620819092, "learning_rate": 3.955856085090656e-05, "loss": 0.3225, "num_input_tokens_seen": 43564368, "step": 45595 }, { "epoch": 3.7197161269271555, "grad_norm": 0.5048174262046814, "learning_rate": 3.9555667208308645e-05, "loss": 0.331, "num_input_tokens_seen": 43569360, "step": 45600 }, { "epoch": 3.720123990537564, "grad_norm": 0.24481602013111115, "learning_rate": 3.955277327067354e-05, "loss": 0.3587, "num_input_tokens_seen": 43573952, "step": 45605 }, { "epoch": 3.720531854147973, "grad_norm": 0.6360541582107544, "learning_rate": 3.954987903805991e-05, "loss": 0.3638, "num_input_tokens_seen": 43578704, "step": 45610 }, { "epoch": 3.7209397177583816, "grad_norm": 0.5724374055862427, "learning_rate": 3.954698451052641e-05, "loss": 0.3534, "num_input_tokens_seen": 43583696, "step": 45615 }, { "epoch": 3.72134758136879, "grad_norm": 0.7710610628128052, "learning_rate": 3.954408968813171e-05, "loss": 0.3482, "num_input_tokens_seen": 43587936, "step": 45620 }, { "epoch": 3.721755444979199, "grad_norm": 0.6969549655914307, "learning_rate": 3.9541194570934505e-05, "loss": 0.3121, "num_input_tokens_seen": 43592544, "step": 45625 }, { "epoch": 3.7221633085896078, "grad_norm": 0.3629949390888214, "learning_rate": 3.9538299158993456e-05, "loss": 0.3116, "num_input_tokens_seen": 43597280, "step": 45630 }, { "epoch": 3.7225711722000163, "grad_norm": 0.28220120072364807, "learning_rate": 3.9535403452367265e-05, "loss": 0.3846, "num_input_tokens_seen": 43602336, "step": 45635 }, { "epoch": 3.722979035810425, "grad_norm": 0.6497306823730469, "learning_rate": 3.953250745111462e-05, "loss": 0.3118, "num_input_tokens_seen": 43606864, "step": 45640 }, { "epoch": 3.7233868994208335, "grad_norm": 0.4031538665294647, "learning_rate": 3.952961115529422e-05, "loss": 0.3395, "num_input_tokens_seen": 43612400, "step": 45645 }, { "epoch": 3.7237947630312425, "grad_norm": 0.2949315011501312, "learning_rate": 3.952671456496478e-05, "loss": 0.3281, "num_input_tokens_seen": 43617920, "step": 45650 }, { "epoch": 3.724202626641651, "grad_norm": 0.6399696469306946, "learning_rate": 3.9523817680185016e-05, "loss": 0.3127, "num_input_tokens_seen": 43623584, "step": 45655 }, { "epoch": 3.7246104902520596, "grad_norm": 0.48527660965919495, "learning_rate": 3.952092050101362e-05, "loss": 0.2532, "num_input_tokens_seen": 43628448, "step": 45660 }, { "epoch": 3.7250183538624686, "grad_norm": 1.060817003250122, "learning_rate": 3.951802302750935e-05, "loss": 0.4616, "num_input_tokens_seen": 43633312, "step": 45665 }, { "epoch": 3.725426217472877, "grad_norm": 1.0327157974243164, "learning_rate": 3.951512525973092e-05, "loss": 0.4028, "num_input_tokens_seen": 43638544, "step": 45670 }, { "epoch": 3.7258340810832857, "grad_norm": 0.6270872354507446, "learning_rate": 3.9512227197737075e-05, "loss": 0.3628, "num_input_tokens_seen": 43643376, "step": 45675 }, { "epoch": 3.7262419446936943, "grad_norm": 0.347674697637558, "learning_rate": 3.950932884158655e-05, "loss": 0.3347, "num_input_tokens_seen": 43649136, "step": 45680 }, { "epoch": 3.726649808304103, "grad_norm": 0.6500361561775208, "learning_rate": 3.950643019133808e-05, "loss": 0.342, "num_input_tokens_seen": 43653856, "step": 45685 }, { "epoch": 3.727057671914512, "grad_norm": 0.6194630861282349, "learning_rate": 3.950353124705045e-05, "loss": 0.3171, "num_input_tokens_seen": 43658512, "step": 45690 }, { "epoch": 3.7274655355249204, "grad_norm": 0.3721497356891632, "learning_rate": 3.9500632008782404e-05, "loss": 0.3394, "num_input_tokens_seen": 43663376, "step": 45695 }, { "epoch": 3.727873399135329, "grad_norm": 0.467570424079895, "learning_rate": 3.94977324765927e-05, "loss": 0.2589, "num_input_tokens_seen": 43667264, "step": 45700 }, { "epoch": 3.728281262745738, "grad_norm": 0.5299516916275024, "learning_rate": 3.949483265054012e-05, "loss": 0.2939, "num_input_tokens_seen": 43671712, "step": 45705 }, { "epoch": 3.7286891263561466, "grad_norm": 1.026336908340454, "learning_rate": 3.949193253068344e-05, "loss": 0.5383, "num_input_tokens_seen": 43677120, "step": 45710 }, { "epoch": 3.729096989966555, "grad_norm": 0.2979595959186554, "learning_rate": 3.9489032117081456e-05, "loss": 0.289, "num_input_tokens_seen": 43681600, "step": 45715 }, { "epoch": 3.7295048535769637, "grad_norm": 0.8845051527023315, "learning_rate": 3.948613140979294e-05, "loss": 0.3906, "num_input_tokens_seen": 43686304, "step": 45720 }, { "epoch": 3.7299127171873723, "grad_norm": 0.4904915392398834, "learning_rate": 3.948323040887669e-05, "loss": 0.3549, "num_input_tokens_seen": 43691424, "step": 45725 }, { "epoch": 3.7303205807977813, "grad_norm": 0.8696976900100708, "learning_rate": 3.948032911439153e-05, "loss": 0.3775, "num_input_tokens_seen": 43695824, "step": 45730 }, { "epoch": 3.73072844440819, "grad_norm": 0.5048597455024719, "learning_rate": 3.9477427526396245e-05, "loss": 0.3338, "num_input_tokens_seen": 43701648, "step": 45735 }, { "epoch": 3.7311363080185984, "grad_norm": 0.3892359435558319, "learning_rate": 3.947452564494965e-05, "loss": 0.3379, "num_input_tokens_seen": 43706128, "step": 45740 }, { "epoch": 3.7315441716290074, "grad_norm": 0.8739622831344604, "learning_rate": 3.9471623470110577e-05, "loss": 0.3188, "num_input_tokens_seen": 43710976, "step": 45745 }, { "epoch": 3.731952035239416, "grad_norm": 0.8884177803993225, "learning_rate": 3.946872100193785e-05, "loss": 0.3558, "num_input_tokens_seen": 43715584, "step": 45750 }, { "epoch": 3.7323598988498246, "grad_norm": 0.4573900103569031, "learning_rate": 3.946581824049028e-05, "loss": 0.3348, "num_input_tokens_seen": 43720288, "step": 45755 }, { "epoch": 3.7327677624602336, "grad_norm": 0.5555846691131592, "learning_rate": 3.9462915185826735e-05, "loss": 0.3069, "num_input_tokens_seen": 43724640, "step": 45760 }, { "epoch": 3.733175626070642, "grad_norm": 0.46057751774787903, "learning_rate": 3.946001183800604e-05, "loss": 0.3679, "num_input_tokens_seen": 43729088, "step": 45765 }, { "epoch": 3.7335834896810507, "grad_norm": 0.9144914746284485, "learning_rate": 3.9457108197087044e-05, "loss": 0.4038, "num_input_tokens_seen": 43733904, "step": 45770 }, { "epoch": 3.7339913532914593, "grad_norm": 0.5567351579666138, "learning_rate": 3.945420426312863e-05, "loss": 0.3503, "num_input_tokens_seen": 43738512, "step": 45775 }, { "epoch": 3.734399216901868, "grad_norm": 0.327039897441864, "learning_rate": 3.945130003618961e-05, "loss": 0.3683, "num_input_tokens_seen": 43743504, "step": 45780 }, { "epoch": 3.734807080512277, "grad_norm": 0.8882900476455688, "learning_rate": 3.9448395516328895e-05, "loss": 0.3827, "num_input_tokens_seen": 43746944, "step": 45785 }, { "epoch": 3.7352149441226854, "grad_norm": 0.587070882320404, "learning_rate": 3.944549070360534e-05, "loss": 0.3333, "num_input_tokens_seen": 43751936, "step": 45790 }, { "epoch": 3.735622807733094, "grad_norm": 0.46080657839775085, "learning_rate": 3.944258559807782e-05, "loss": 0.3407, "num_input_tokens_seen": 43756224, "step": 45795 }, { "epoch": 3.736030671343503, "grad_norm": 0.33175259828567505, "learning_rate": 3.943968019980524e-05, "loss": 0.3442, "num_input_tokens_seen": 43760864, "step": 45800 }, { "epoch": 3.7364385349539115, "grad_norm": 0.738524854183197, "learning_rate": 3.943677450884646e-05, "loss": 0.3477, "num_input_tokens_seen": 43765824, "step": 45805 }, { "epoch": 3.73684639856432, "grad_norm": 0.4412802457809448, "learning_rate": 3.943386852526041e-05, "loss": 0.3854, "num_input_tokens_seen": 43771104, "step": 45810 }, { "epoch": 3.7372542621747287, "grad_norm": 0.6353134512901306, "learning_rate": 3.943096224910597e-05, "loss": 0.3847, "num_input_tokens_seen": 43774752, "step": 45815 }, { "epoch": 3.7376621257851372, "grad_norm": 0.7157958149909973, "learning_rate": 3.9428055680442064e-05, "loss": 0.3394, "num_input_tokens_seen": 43779152, "step": 45820 }, { "epoch": 3.7380699893955462, "grad_norm": 0.4418368339538574, "learning_rate": 3.94251488193276e-05, "loss": 0.3816, "num_input_tokens_seen": 43784032, "step": 45825 }, { "epoch": 3.738477853005955, "grad_norm": 0.737373411655426, "learning_rate": 3.94222416658215e-05, "loss": 0.3533, "num_input_tokens_seen": 43788432, "step": 45830 }, { "epoch": 3.7388857166163634, "grad_norm": 0.7018035650253296, "learning_rate": 3.9419334219982686e-05, "loss": 0.3336, "num_input_tokens_seen": 43793328, "step": 45835 }, { "epoch": 3.7392935802267724, "grad_norm": 0.3461286127567291, "learning_rate": 3.9416426481870087e-05, "loss": 0.3248, "num_input_tokens_seen": 43797984, "step": 45840 }, { "epoch": 3.739701443837181, "grad_norm": 0.26264727115631104, "learning_rate": 3.941351845154267e-05, "loss": 0.3559, "num_input_tokens_seen": 43802992, "step": 45845 }, { "epoch": 3.7401093074475895, "grad_norm": 0.2780989110469818, "learning_rate": 3.941061012905933e-05, "loss": 0.3432, "num_input_tokens_seen": 43807232, "step": 45850 }, { "epoch": 3.740517171057998, "grad_norm": 0.6227392554283142, "learning_rate": 3.9407701514479066e-05, "loss": 0.3394, "num_input_tokens_seen": 43810880, "step": 45855 }, { "epoch": 3.7409250346684066, "grad_norm": 0.5893229842185974, "learning_rate": 3.9404792607860805e-05, "loss": 0.3117, "num_input_tokens_seen": 43815584, "step": 45860 }, { "epoch": 3.7413328982788157, "grad_norm": 0.36098459362983704, "learning_rate": 3.940188340926353e-05, "loss": 0.3993, "num_input_tokens_seen": 43820384, "step": 45865 }, { "epoch": 3.741740761889224, "grad_norm": 0.4613425135612488, "learning_rate": 3.939897391874619e-05, "loss": 0.3201, "num_input_tokens_seen": 43825504, "step": 45870 }, { "epoch": 3.742148625499633, "grad_norm": 0.6246297955513, "learning_rate": 3.939606413636777e-05, "loss": 0.3093, "num_input_tokens_seen": 43830160, "step": 45875 }, { "epoch": 3.742556489110042, "grad_norm": 0.35164839029312134, "learning_rate": 3.939315406218724e-05, "loss": 0.36, "num_input_tokens_seen": 43834384, "step": 45880 }, { "epoch": 3.7429643527204504, "grad_norm": 0.322086364030838, "learning_rate": 3.939024369626361e-05, "loss": 0.303, "num_input_tokens_seen": 43838912, "step": 45885 }, { "epoch": 3.743372216330859, "grad_norm": 0.4300415813922882, "learning_rate": 3.938733303865584e-05, "loss": 0.373, "num_input_tokens_seen": 43844352, "step": 45890 }, { "epoch": 3.7437800799412675, "grad_norm": 0.39471012353897095, "learning_rate": 3.9384422089422944e-05, "loss": 0.3178, "num_input_tokens_seen": 43848384, "step": 45895 }, { "epoch": 3.744187943551676, "grad_norm": 0.3864542841911316, "learning_rate": 3.9381510848623936e-05, "loss": 0.3592, "num_input_tokens_seen": 43852880, "step": 45900 }, { "epoch": 3.744595807162085, "grad_norm": 0.5657678246498108, "learning_rate": 3.93785993163178e-05, "loss": 0.3604, "num_input_tokens_seen": 43857456, "step": 45905 }, { "epoch": 3.7450036707724936, "grad_norm": 0.3332763910293579, "learning_rate": 3.937568749256357e-05, "loss": 0.3445, "num_input_tokens_seen": 43861792, "step": 45910 }, { "epoch": 3.745411534382902, "grad_norm": 0.6887235045433044, "learning_rate": 3.9372775377420265e-05, "loss": 0.3308, "num_input_tokens_seen": 43866368, "step": 45915 }, { "epoch": 3.745819397993311, "grad_norm": 0.32836541533470154, "learning_rate": 3.9369862970946915e-05, "loss": 0.3376, "num_input_tokens_seen": 43871488, "step": 45920 }, { "epoch": 3.7462272616037198, "grad_norm": 0.411221981048584, "learning_rate": 3.936695027320254e-05, "loss": 0.3407, "num_input_tokens_seen": 43876864, "step": 45925 }, { "epoch": 3.7466351252141283, "grad_norm": 0.29423126578330994, "learning_rate": 3.93640372842462e-05, "loss": 0.3781, "num_input_tokens_seen": 43881872, "step": 45930 }, { "epoch": 3.7470429888245373, "grad_norm": 0.6556030511856079, "learning_rate": 3.936112400413691e-05, "loss": 0.3344, "num_input_tokens_seen": 43886592, "step": 45935 }, { "epoch": 3.747450852434946, "grad_norm": 0.3543398082256317, "learning_rate": 3.935821043293375e-05, "loss": 0.3211, "num_input_tokens_seen": 43891360, "step": 45940 }, { "epoch": 3.7478587160453545, "grad_norm": 0.5370427966117859, "learning_rate": 3.935529657069577e-05, "loss": 0.3754, "num_input_tokens_seen": 43896432, "step": 45945 }, { "epoch": 3.748266579655763, "grad_norm": 0.9253732562065125, "learning_rate": 3.935238241748202e-05, "loss": 0.3716, "num_input_tokens_seen": 43901232, "step": 45950 }, { "epoch": 3.7486744432661716, "grad_norm": 0.6381860971450806, "learning_rate": 3.9349467973351584e-05, "loss": 0.3662, "num_input_tokens_seen": 43906496, "step": 45955 }, { "epoch": 3.7490823068765806, "grad_norm": 0.3661666512489319, "learning_rate": 3.9346553238363527e-05, "loss": 0.3358, "num_input_tokens_seen": 43910624, "step": 45960 }, { "epoch": 3.749490170486989, "grad_norm": 0.2917023301124573, "learning_rate": 3.934363821257693e-05, "loss": 0.3262, "num_input_tokens_seen": 43915600, "step": 45965 }, { "epoch": 3.7498980340973977, "grad_norm": 0.5575205087661743, "learning_rate": 3.9340722896050874e-05, "loss": 0.3683, "num_input_tokens_seen": 43920864, "step": 45970 }, { "epoch": 3.7503058977078068, "grad_norm": 0.3301696181297302, "learning_rate": 3.9337807288844476e-05, "loss": 0.324, "num_input_tokens_seen": 43924816, "step": 45975 }, { "epoch": 3.7507137613182153, "grad_norm": 0.4622928500175476, "learning_rate": 3.9334891391016806e-05, "loss": 0.3341, "num_input_tokens_seen": 43929904, "step": 45980 }, { "epoch": 3.751121624928624, "grad_norm": 0.35443538427352905, "learning_rate": 3.933197520262698e-05, "loss": 0.3287, "num_input_tokens_seen": 43934512, "step": 45985 }, { "epoch": 3.7515294885390325, "grad_norm": 0.4349018335342407, "learning_rate": 3.93290587237341e-05, "loss": 0.3417, "num_input_tokens_seen": 43939056, "step": 45990 }, { "epoch": 3.751937352149441, "grad_norm": 0.3782362639904022, "learning_rate": 3.93261419543973e-05, "loss": 0.3486, "num_input_tokens_seen": 43943760, "step": 45995 }, { "epoch": 3.75234521575985, "grad_norm": 0.6012098789215088, "learning_rate": 3.9323224894675686e-05, "loss": 0.3326, "num_input_tokens_seen": 43948432, "step": 46000 }, { "epoch": 3.7527530793702586, "grad_norm": 0.5322999954223633, "learning_rate": 3.9320307544628385e-05, "loss": 0.2868, "num_input_tokens_seen": 43952864, "step": 46005 }, { "epoch": 3.753160942980667, "grad_norm": 0.39765384793281555, "learning_rate": 3.931738990431455e-05, "loss": 0.3726, "num_input_tokens_seen": 43958112, "step": 46010 }, { "epoch": 3.753568806591076, "grad_norm": 0.4576823115348816, "learning_rate": 3.931447197379329e-05, "loss": 0.3258, "num_input_tokens_seen": 43963104, "step": 46015 }, { "epoch": 3.7539766702014847, "grad_norm": 0.5938472747802734, "learning_rate": 3.931155375312376e-05, "loss": 0.2788, "num_input_tokens_seen": 43967920, "step": 46020 }, { "epoch": 3.7543845338118933, "grad_norm": 0.5344898104667664, "learning_rate": 3.930863524236513e-05, "loss": 0.3917, "num_input_tokens_seen": 43973440, "step": 46025 }, { "epoch": 3.754792397422302, "grad_norm": 0.5955381989479065, "learning_rate": 3.930571644157655e-05, "loss": 0.3176, "num_input_tokens_seen": 43978560, "step": 46030 }, { "epoch": 3.7552002610327104, "grad_norm": 0.5928831100463867, "learning_rate": 3.930279735081716e-05, "loss": 0.309, "num_input_tokens_seen": 43983728, "step": 46035 }, { "epoch": 3.7556081246431194, "grad_norm": 1.27239191532135, "learning_rate": 3.929987797014616e-05, "loss": 0.3935, "num_input_tokens_seen": 43988816, "step": 46040 }, { "epoch": 3.756015988253528, "grad_norm": 1.0155102014541626, "learning_rate": 3.92969582996227e-05, "loss": 0.4407, "num_input_tokens_seen": 43993280, "step": 46045 }, { "epoch": 3.7564238518639366, "grad_norm": 0.944831907749176, "learning_rate": 3.929403833930597e-05, "loss": 0.3772, "num_input_tokens_seen": 43998112, "step": 46050 }, { "epoch": 3.7568317154743456, "grad_norm": 0.6668364405632019, "learning_rate": 3.9291118089255165e-05, "loss": 0.3409, "num_input_tokens_seen": 44002928, "step": 46055 }, { "epoch": 3.757239579084754, "grad_norm": 1.1284931898117065, "learning_rate": 3.928819754952946e-05, "loss": 0.3498, "num_input_tokens_seen": 44007632, "step": 46060 }, { "epoch": 3.7576474426951627, "grad_norm": 0.6952232122421265, "learning_rate": 3.928527672018807e-05, "loss": 0.3475, "num_input_tokens_seen": 44012320, "step": 46065 }, { "epoch": 3.7580553063055713, "grad_norm": 0.9898298382759094, "learning_rate": 3.928235560129018e-05, "loss": 0.3667, "num_input_tokens_seen": 44016352, "step": 46070 }, { "epoch": 3.75846316991598, "grad_norm": 0.7636564373970032, "learning_rate": 3.9279434192895025e-05, "loss": 0.3316, "num_input_tokens_seen": 44021360, "step": 46075 }, { "epoch": 3.758871033526389, "grad_norm": 1.0549495220184326, "learning_rate": 3.92765124950618e-05, "loss": 0.3341, "num_input_tokens_seen": 44025568, "step": 46080 }, { "epoch": 3.7592788971367974, "grad_norm": 0.9793262481689453, "learning_rate": 3.927359050784973e-05, "loss": 0.3658, "num_input_tokens_seen": 44031216, "step": 46085 }, { "epoch": 3.759686760747206, "grad_norm": 0.843064546585083, "learning_rate": 3.927066823131805e-05, "loss": 0.3311, "num_input_tokens_seen": 44036016, "step": 46090 }, { "epoch": 3.760094624357615, "grad_norm": 0.8139331340789795, "learning_rate": 3.926774566552599e-05, "loss": 0.3473, "num_input_tokens_seen": 44039920, "step": 46095 }, { "epoch": 3.7605024879680236, "grad_norm": 1.0135215520858765, "learning_rate": 3.9264822810532784e-05, "loss": 0.3444, "num_input_tokens_seen": 44044688, "step": 46100 }, { "epoch": 3.760910351578432, "grad_norm": 0.5737653970718384, "learning_rate": 3.9261899666397684e-05, "loss": 0.3949, "num_input_tokens_seen": 44050320, "step": 46105 }, { "epoch": 3.7613182151888407, "grad_norm": 0.8449029922485352, "learning_rate": 3.9258976233179936e-05, "loss": 0.325, "num_input_tokens_seen": 44054960, "step": 46110 }, { "epoch": 3.7617260787992497, "grad_norm": 0.7889896631240845, "learning_rate": 3.92560525109388e-05, "loss": 0.3274, "num_input_tokens_seen": 44059056, "step": 46115 }, { "epoch": 3.7621339424096583, "grad_norm": 0.8651776909828186, "learning_rate": 3.925312849973353e-05, "loss": 0.3092, "num_input_tokens_seen": 44062640, "step": 46120 }, { "epoch": 3.762541806020067, "grad_norm": 0.7983900904655457, "learning_rate": 3.925020419962341e-05, "loss": 0.3142, "num_input_tokens_seen": 44068288, "step": 46125 }, { "epoch": 3.7629496696304754, "grad_norm": 1.430749535560608, "learning_rate": 3.924727961066771e-05, "loss": 0.4206, "num_input_tokens_seen": 44072656, "step": 46130 }, { "epoch": 3.7633575332408844, "grad_norm": 0.5870509147644043, "learning_rate": 3.924435473292569e-05, "loss": 0.3595, "num_input_tokens_seen": 44077392, "step": 46135 }, { "epoch": 3.763765396851293, "grad_norm": 0.7320319414138794, "learning_rate": 3.924142956645667e-05, "loss": 0.2714, "num_input_tokens_seen": 44082592, "step": 46140 }, { "epoch": 3.7641732604617015, "grad_norm": 1.4359434843063354, "learning_rate": 3.923850411131992e-05, "loss": 0.3265, "num_input_tokens_seen": 44088240, "step": 46145 }, { "epoch": 3.7645811240721105, "grad_norm": 0.8578364849090576, "learning_rate": 3.923557836757474e-05, "loss": 0.3912, "num_input_tokens_seen": 44093552, "step": 46150 }, { "epoch": 3.764988987682519, "grad_norm": 1.056721568107605, "learning_rate": 3.923265233528043e-05, "loss": 0.2787, "num_input_tokens_seen": 44098496, "step": 46155 }, { "epoch": 3.7653968512929277, "grad_norm": 0.9649211168289185, "learning_rate": 3.922972601449632e-05, "loss": 0.3497, "num_input_tokens_seen": 44103968, "step": 46160 }, { "epoch": 3.7658047149033362, "grad_norm": 0.6002851724624634, "learning_rate": 3.9226799405281697e-05, "loss": 0.362, "num_input_tokens_seen": 44107984, "step": 46165 }, { "epoch": 3.766212578513745, "grad_norm": 0.7252333760261536, "learning_rate": 3.9223872507695894e-05, "loss": 0.3756, "num_input_tokens_seen": 44112416, "step": 46170 }, { "epoch": 3.766620442124154, "grad_norm": 0.7725163102149963, "learning_rate": 3.9220945321798244e-05, "loss": 0.454, "num_input_tokens_seen": 44117040, "step": 46175 }, { "epoch": 3.7670283057345624, "grad_norm": 0.7074533104896545, "learning_rate": 3.921801784764807e-05, "loss": 0.3218, "num_input_tokens_seen": 44121680, "step": 46180 }, { "epoch": 3.767436169344971, "grad_norm": 0.7846298813819885, "learning_rate": 3.921509008530473e-05, "loss": 0.3309, "num_input_tokens_seen": 44126656, "step": 46185 }, { "epoch": 3.76784403295538, "grad_norm": 0.6381621360778809, "learning_rate": 3.921216203482755e-05, "loss": 0.4359, "num_input_tokens_seen": 44131296, "step": 46190 }, { "epoch": 3.7682518965657885, "grad_norm": 0.7367428541183472, "learning_rate": 3.920923369627587e-05, "loss": 0.4035, "num_input_tokens_seen": 44137024, "step": 46195 }, { "epoch": 3.768659760176197, "grad_norm": 0.39091363549232483, "learning_rate": 3.9206305069709075e-05, "loss": 0.3186, "num_input_tokens_seen": 44141728, "step": 46200 }, { "epoch": 3.7690676237866056, "grad_norm": 0.4047004282474518, "learning_rate": 3.920337615518651e-05, "loss": 0.3538, "num_input_tokens_seen": 44147280, "step": 46205 }, { "epoch": 3.769475487397014, "grad_norm": 0.7956303358078003, "learning_rate": 3.9200446952767546e-05, "loss": 0.3412, "num_input_tokens_seen": 44152032, "step": 46210 }, { "epoch": 3.7698833510074232, "grad_norm": 0.3431151211261749, "learning_rate": 3.919751746251155e-05, "loss": 0.3065, "num_input_tokens_seen": 44156128, "step": 46215 }, { "epoch": 3.770291214617832, "grad_norm": 0.560554563999176, "learning_rate": 3.919458768447791e-05, "loss": 0.3106, "num_input_tokens_seen": 44161120, "step": 46220 }, { "epoch": 3.7706990782282404, "grad_norm": 0.5039939880371094, "learning_rate": 3.919165761872602e-05, "loss": 0.3139, "num_input_tokens_seen": 44165680, "step": 46225 }, { "epoch": 3.7711069418386494, "grad_norm": 0.8406514525413513, "learning_rate": 3.918872726531525e-05, "loss": 0.306, "num_input_tokens_seen": 44170176, "step": 46230 }, { "epoch": 3.771514805449058, "grad_norm": 0.6538684368133545, "learning_rate": 3.918579662430501e-05, "loss": 0.4324, "num_input_tokens_seen": 44175088, "step": 46235 }, { "epoch": 3.7719226690594665, "grad_norm": 0.5633978247642517, "learning_rate": 3.91828656957547e-05, "loss": 0.3891, "num_input_tokens_seen": 44180176, "step": 46240 }, { "epoch": 3.772330532669875, "grad_norm": 0.45595791935920715, "learning_rate": 3.917993447972374e-05, "loss": 0.3514, "num_input_tokens_seen": 44185008, "step": 46245 }, { "epoch": 3.7727383962802836, "grad_norm": 0.8234331011772156, "learning_rate": 3.917700297627151e-05, "loss": 0.3497, "num_input_tokens_seen": 44190288, "step": 46250 }, { "epoch": 3.7731462598906926, "grad_norm": 0.5520646572113037, "learning_rate": 3.917407118545747e-05, "loss": 0.3861, "num_input_tokens_seen": 44194896, "step": 46255 }, { "epoch": 3.773554123501101, "grad_norm": 0.45615798234939575, "learning_rate": 3.9171139107341025e-05, "loss": 0.3188, "num_input_tokens_seen": 44200112, "step": 46260 }, { "epoch": 3.7739619871115098, "grad_norm": 1.11292564868927, "learning_rate": 3.9168206741981616e-05, "loss": 0.3639, "num_input_tokens_seen": 44203840, "step": 46265 }, { "epoch": 3.7743698507219188, "grad_norm": 0.7416204810142517, "learning_rate": 3.9165274089438675e-05, "loss": 0.3044, "num_input_tokens_seen": 44207360, "step": 46270 }, { "epoch": 3.7747777143323273, "grad_norm": 0.36853963136672974, "learning_rate": 3.916234114977165e-05, "loss": 0.3741, "num_input_tokens_seen": 44212448, "step": 46275 }, { "epoch": 3.775185577942736, "grad_norm": 0.8305232524871826, "learning_rate": 3.915940792303998e-05, "loss": 0.3105, "num_input_tokens_seen": 44217552, "step": 46280 }, { "epoch": 3.7755934415531445, "grad_norm": 0.5715912580490112, "learning_rate": 3.915647440930313e-05, "loss": 0.4008, "num_input_tokens_seen": 44222560, "step": 46285 }, { "epoch": 3.776001305163553, "grad_norm": 0.8045297265052795, "learning_rate": 3.915354060862056e-05, "loss": 0.361, "num_input_tokens_seen": 44227264, "step": 46290 }, { "epoch": 3.776409168773962, "grad_norm": 0.9482769966125488, "learning_rate": 3.915060652105175e-05, "loss": 0.35, "num_input_tokens_seen": 44231776, "step": 46295 }, { "epoch": 3.7768170323843706, "grad_norm": 0.5485340356826782, "learning_rate": 3.9147672146656146e-05, "loss": 0.3852, "num_input_tokens_seen": 44236336, "step": 46300 }, { "epoch": 3.777224895994779, "grad_norm": 0.36208710074424744, "learning_rate": 3.914473748549323e-05, "loss": 0.3566, "num_input_tokens_seen": 44241232, "step": 46305 }, { "epoch": 3.777632759605188, "grad_norm": 0.37734177708625793, "learning_rate": 3.9141802537622515e-05, "loss": 0.3112, "num_input_tokens_seen": 44246592, "step": 46310 }, { "epoch": 3.7780406232155967, "grad_norm": 0.5698084235191345, "learning_rate": 3.913886730310346e-05, "loss": 0.3347, "num_input_tokens_seen": 44251696, "step": 46315 }, { "epoch": 3.7784484868260053, "grad_norm": 0.8882185220718384, "learning_rate": 3.913593178199557e-05, "loss": 0.4526, "num_input_tokens_seen": 44256464, "step": 46320 }, { "epoch": 3.7788563504364143, "grad_norm": 1.0917235612869263, "learning_rate": 3.9132995974358356e-05, "loss": 0.4034, "num_input_tokens_seen": 44261808, "step": 46325 }, { "epoch": 3.779264214046823, "grad_norm": 0.17703931033611298, "learning_rate": 3.9130059880251324e-05, "loss": 0.3581, "num_input_tokens_seen": 44266768, "step": 46330 }, { "epoch": 3.7796720776572315, "grad_norm": 0.41554951667785645, "learning_rate": 3.912712349973398e-05, "loss": 0.3657, "num_input_tokens_seen": 44271840, "step": 46335 }, { "epoch": 3.78007994126764, "grad_norm": 1.083344578742981, "learning_rate": 3.9124186832865836e-05, "loss": 0.388, "num_input_tokens_seen": 44276544, "step": 46340 }, { "epoch": 3.7804878048780486, "grad_norm": 0.505997896194458, "learning_rate": 3.912124987970643e-05, "loss": 0.3652, "num_input_tokens_seen": 44281968, "step": 46345 }, { "epoch": 3.7808956684884576, "grad_norm": 0.8329511880874634, "learning_rate": 3.911831264031529e-05, "loss": 0.3582, "num_input_tokens_seen": 44285696, "step": 46350 }, { "epoch": 3.781303532098866, "grad_norm": 0.8322145938873291, "learning_rate": 3.9115375114751965e-05, "loss": 0.3501, "num_input_tokens_seen": 44290688, "step": 46355 }, { "epoch": 3.7817113957092747, "grad_norm": 0.7185616493225098, "learning_rate": 3.911243730307597e-05, "loss": 0.3589, "num_input_tokens_seen": 44296000, "step": 46360 }, { "epoch": 3.7821192593196837, "grad_norm": 0.783840537071228, "learning_rate": 3.910949920534687e-05, "loss": 0.3455, "num_input_tokens_seen": 44300448, "step": 46365 }, { "epoch": 3.7825271229300923, "grad_norm": 0.27972501516342163, "learning_rate": 3.9106560821624215e-05, "loss": 0.3884, "num_input_tokens_seen": 44305216, "step": 46370 }, { "epoch": 3.782934986540501, "grad_norm": 0.8775591850280762, "learning_rate": 3.910362215196757e-05, "loss": 0.3681, "num_input_tokens_seen": 44309616, "step": 46375 }, { "epoch": 3.7833428501509094, "grad_norm": 0.8809223771095276, "learning_rate": 3.910068319643649e-05, "loss": 0.2859, "num_input_tokens_seen": 44313824, "step": 46380 }, { "epoch": 3.783750713761318, "grad_norm": 0.4396493136882782, "learning_rate": 3.909774395509057e-05, "loss": 0.3358, "num_input_tokens_seen": 44318960, "step": 46385 }, { "epoch": 3.784158577371727, "grad_norm": 0.44406458735466003, "learning_rate": 3.9094804427989365e-05, "loss": 0.3089, "num_input_tokens_seen": 44323456, "step": 46390 }, { "epoch": 3.7845664409821356, "grad_norm": 0.5351261496543884, "learning_rate": 3.909186461519245e-05, "loss": 0.3085, "num_input_tokens_seen": 44328576, "step": 46395 }, { "epoch": 3.784974304592544, "grad_norm": 0.5400693416595459, "learning_rate": 3.908892451675944e-05, "loss": 0.3267, "num_input_tokens_seen": 44333520, "step": 46400 }, { "epoch": 3.785382168202953, "grad_norm": 0.48408254981040955, "learning_rate": 3.908598413274991e-05, "loss": 0.3466, "num_input_tokens_seen": 44338096, "step": 46405 }, { "epoch": 3.7857900318133617, "grad_norm": 0.5364367365837097, "learning_rate": 3.908304346322347e-05, "loss": 0.4173, "num_input_tokens_seen": 44343552, "step": 46410 }, { "epoch": 3.7861978954237703, "grad_norm": 1.2982100248336792, "learning_rate": 3.908010250823972e-05, "loss": 0.341, "num_input_tokens_seen": 44348320, "step": 46415 }, { "epoch": 3.786605759034179, "grad_norm": 0.3727304935455322, "learning_rate": 3.907716126785828e-05, "loss": 0.3161, "num_input_tokens_seen": 44353120, "step": 46420 }, { "epoch": 3.7870136226445874, "grad_norm": 1.857617735862732, "learning_rate": 3.907421974213876e-05, "loss": 0.3775, "num_input_tokens_seen": 44357568, "step": 46425 }, { "epoch": 3.7874214862549964, "grad_norm": 1.2546030282974243, "learning_rate": 3.907127793114079e-05, "loss": 0.3394, "num_input_tokens_seen": 44362208, "step": 46430 }, { "epoch": 3.787829349865405, "grad_norm": 0.525852382183075, "learning_rate": 3.9068335834923984e-05, "loss": 0.356, "num_input_tokens_seen": 44367040, "step": 46435 }, { "epoch": 3.7882372134758135, "grad_norm": 0.8381422758102417, "learning_rate": 3.906539345354801e-05, "loss": 0.3849, "num_input_tokens_seen": 44371648, "step": 46440 }, { "epoch": 3.7886450770862226, "grad_norm": 0.5502311587333679, "learning_rate": 3.906245078707247e-05, "loss": 0.3325, "num_input_tokens_seen": 44376864, "step": 46445 }, { "epoch": 3.789052940696631, "grad_norm": 1.023052453994751, "learning_rate": 3.9059507835557027e-05, "loss": 0.3383, "num_input_tokens_seen": 44381568, "step": 46450 }, { "epoch": 3.7894608043070397, "grad_norm": 0.7089568972587585, "learning_rate": 3.9056564599061344e-05, "loss": 0.2845, "num_input_tokens_seen": 44386656, "step": 46455 }, { "epoch": 3.7898686679174483, "grad_norm": 1.3747583627700806, "learning_rate": 3.905362107764506e-05, "loss": 0.5246, "num_input_tokens_seen": 44391392, "step": 46460 }, { "epoch": 3.790276531527857, "grad_norm": 0.5486183762550354, "learning_rate": 3.905067727136786e-05, "loss": 0.3618, "num_input_tokens_seen": 44396464, "step": 46465 }, { "epoch": 3.790684395138266, "grad_norm": 0.5353322625160217, "learning_rate": 3.90477331802894e-05, "loss": 0.3721, "num_input_tokens_seen": 44402016, "step": 46470 }, { "epoch": 3.7910922587486744, "grad_norm": 1.0241155624389648, "learning_rate": 3.9044788804469355e-05, "loss": 0.3323, "num_input_tokens_seen": 44406864, "step": 46475 }, { "epoch": 3.791500122359083, "grad_norm": 0.6847905516624451, "learning_rate": 3.90418441439674e-05, "loss": 0.3324, "num_input_tokens_seen": 44411104, "step": 46480 }, { "epoch": 3.791907985969492, "grad_norm": 0.49717676639556885, "learning_rate": 3.903889919884324e-05, "loss": 0.3441, "num_input_tokens_seen": 44416096, "step": 46485 }, { "epoch": 3.7923158495799005, "grad_norm": 1.1105529069900513, "learning_rate": 3.903595396915656e-05, "loss": 0.3571, "num_input_tokens_seen": 44421504, "step": 46490 }, { "epoch": 3.792723713190309, "grad_norm": 0.879647433757782, "learning_rate": 3.903300845496706e-05, "loss": 0.3253, "num_input_tokens_seen": 44425792, "step": 46495 }, { "epoch": 3.793131576800718, "grad_norm": 0.7252090573310852, "learning_rate": 3.903006265633443e-05, "loss": 0.3224, "num_input_tokens_seen": 44430064, "step": 46500 }, { "epoch": 3.7935394404111267, "grad_norm": 0.6050965785980225, "learning_rate": 3.90271165733184e-05, "loss": 0.352, "num_input_tokens_seen": 44434912, "step": 46505 }, { "epoch": 3.7939473040215352, "grad_norm": 0.6618601679801941, "learning_rate": 3.9024170205978676e-05, "loss": 0.3098, "num_input_tokens_seen": 44439456, "step": 46510 }, { "epoch": 3.794355167631944, "grad_norm": 0.6373083591461182, "learning_rate": 3.902122355437499e-05, "loss": 0.2333, "num_input_tokens_seen": 44444096, "step": 46515 }, { "epoch": 3.7947630312423524, "grad_norm": 1.526033878326416, "learning_rate": 3.9018276618567046e-05, "loss": 0.5046, "num_input_tokens_seen": 44448736, "step": 46520 }, { "epoch": 3.7951708948527614, "grad_norm": 0.7899542450904846, "learning_rate": 3.90153293986146e-05, "loss": 0.3416, "num_input_tokens_seen": 44454064, "step": 46525 }, { "epoch": 3.79557875846317, "grad_norm": 0.7291316986083984, "learning_rate": 3.9012381894577375e-05, "loss": 0.312, "num_input_tokens_seen": 44458144, "step": 46530 }, { "epoch": 3.7959866220735785, "grad_norm": 0.6623045802116394, "learning_rate": 3.900943410651513e-05, "loss": 0.3716, "num_input_tokens_seen": 44463456, "step": 46535 }, { "epoch": 3.7963944856839875, "grad_norm": 1.0664348602294922, "learning_rate": 3.9006486034487605e-05, "loss": 0.3113, "num_input_tokens_seen": 44468480, "step": 46540 }, { "epoch": 3.796802349294396, "grad_norm": 0.703270673751831, "learning_rate": 3.9003537678554565e-05, "loss": 0.3926, "num_input_tokens_seen": 44473280, "step": 46545 }, { "epoch": 3.7972102129048046, "grad_norm": 1.080022931098938, "learning_rate": 3.9000589038775756e-05, "loss": 0.2754, "num_input_tokens_seen": 44477216, "step": 46550 }, { "epoch": 3.797618076515213, "grad_norm": 0.5973659753799438, "learning_rate": 3.899764011521097e-05, "loss": 0.3873, "num_input_tokens_seen": 44481776, "step": 46555 }, { "epoch": 3.798025940125622, "grad_norm": 0.3345741033554077, "learning_rate": 3.899469090791996e-05, "loss": 0.368, "num_input_tokens_seen": 44486208, "step": 46560 }, { "epoch": 3.798433803736031, "grad_norm": 0.7585247159004211, "learning_rate": 3.899174141696251e-05, "loss": 0.2879, "num_input_tokens_seen": 44491392, "step": 46565 }, { "epoch": 3.7988416673464394, "grad_norm": 0.582759439945221, "learning_rate": 3.898879164239841e-05, "loss": 0.3579, "num_input_tokens_seen": 44496336, "step": 46570 }, { "epoch": 3.799249530956848, "grad_norm": 0.6248022317886353, "learning_rate": 3.898584158428745e-05, "loss": 0.3642, "num_input_tokens_seen": 44501072, "step": 46575 }, { "epoch": 3.799657394567257, "grad_norm": 0.43128228187561035, "learning_rate": 3.898289124268942e-05, "loss": 0.3739, "num_input_tokens_seen": 44506080, "step": 46580 }, { "epoch": 3.8000652581776655, "grad_norm": 0.40832456946372986, "learning_rate": 3.897994061766412e-05, "loss": 0.3612, "num_input_tokens_seen": 44511184, "step": 46585 }, { "epoch": 3.800473121788074, "grad_norm": 0.4802083671092987, "learning_rate": 3.897698970927137e-05, "loss": 0.3371, "num_input_tokens_seen": 44515504, "step": 46590 }, { "epoch": 3.8008809853984826, "grad_norm": 0.9647719264030457, "learning_rate": 3.8974038517570985e-05, "loss": 0.3684, "num_input_tokens_seen": 44519824, "step": 46595 }, { "epoch": 3.801288849008891, "grad_norm": 0.9385762214660645, "learning_rate": 3.897108704262278e-05, "loss": 0.3833, "num_input_tokens_seen": 44524224, "step": 46600 }, { "epoch": 3.8016967126193, "grad_norm": 0.9973233938217163, "learning_rate": 3.896813528448656e-05, "loss": 0.365, "num_input_tokens_seen": 44528848, "step": 46605 }, { "epoch": 3.8021045762297088, "grad_norm": 0.7866073250770569, "learning_rate": 3.896518324322218e-05, "loss": 0.3519, "num_input_tokens_seen": 44534848, "step": 46610 }, { "epoch": 3.8025124398401173, "grad_norm": 1.349058985710144, "learning_rate": 3.8962230918889476e-05, "loss": 0.3046, "num_input_tokens_seen": 44539760, "step": 46615 }, { "epoch": 3.8029203034505263, "grad_norm": 1.019910216331482, "learning_rate": 3.8959278311548284e-05, "loss": 0.4017, "num_input_tokens_seen": 44544576, "step": 46620 }, { "epoch": 3.803328167060935, "grad_norm": 0.9793171286582947, "learning_rate": 3.8956325421258447e-05, "loss": 0.3965, "num_input_tokens_seen": 44548576, "step": 46625 }, { "epoch": 3.8037360306713435, "grad_norm": 0.8668566942214966, "learning_rate": 3.8953372248079825e-05, "loss": 0.3626, "num_input_tokens_seen": 44552912, "step": 46630 }, { "epoch": 3.804143894281752, "grad_norm": 0.38663986325263977, "learning_rate": 3.8950418792072274e-05, "loss": 0.3397, "num_input_tokens_seen": 44558384, "step": 46635 }, { "epoch": 3.8045517578921606, "grad_norm": 0.35660073161125183, "learning_rate": 3.894746505329566e-05, "loss": 0.3463, "num_input_tokens_seen": 44562768, "step": 46640 }, { "epoch": 3.8049596215025696, "grad_norm": 0.8675375580787659, "learning_rate": 3.8944511031809865e-05, "loss": 0.3378, "num_input_tokens_seen": 44567712, "step": 46645 }, { "epoch": 3.805367485112978, "grad_norm": 0.9758409261703491, "learning_rate": 3.8941556727674756e-05, "loss": 0.3463, "num_input_tokens_seen": 44572848, "step": 46650 }, { "epoch": 3.8057753487233867, "grad_norm": 0.6979299187660217, "learning_rate": 3.893860214095021e-05, "loss": 0.3428, "num_input_tokens_seen": 44576704, "step": 46655 }, { "epoch": 3.8061832123337958, "grad_norm": 0.48824164271354675, "learning_rate": 3.893564727169613e-05, "loss": 0.3446, "num_input_tokens_seen": 44581232, "step": 46660 }, { "epoch": 3.8065910759442043, "grad_norm": 0.5852833986282349, "learning_rate": 3.893269211997239e-05, "loss": 0.3704, "num_input_tokens_seen": 44586448, "step": 46665 }, { "epoch": 3.806998939554613, "grad_norm": 0.9462845325469971, "learning_rate": 3.8929736685838904e-05, "loss": 0.3533, "num_input_tokens_seen": 44590448, "step": 46670 }, { "epoch": 3.807406803165022, "grad_norm": 0.6760457754135132, "learning_rate": 3.8926780969355584e-05, "loss": 0.3516, "num_input_tokens_seen": 44595152, "step": 46675 }, { "epoch": 3.8078146667754305, "grad_norm": 0.5945481061935425, "learning_rate": 3.8923824970582325e-05, "loss": 0.3466, "num_input_tokens_seen": 44599808, "step": 46680 }, { "epoch": 3.808222530385839, "grad_norm": 0.581521213054657, "learning_rate": 3.892086868957905e-05, "loss": 0.3305, "num_input_tokens_seen": 44604480, "step": 46685 }, { "epoch": 3.8086303939962476, "grad_norm": 0.6611899733543396, "learning_rate": 3.8917912126405676e-05, "loss": 0.3108, "num_input_tokens_seen": 44608672, "step": 46690 }, { "epoch": 3.809038257606656, "grad_norm": 1.089917778968811, "learning_rate": 3.8914955281122144e-05, "loss": 0.3658, "num_input_tokens_seen": 44612304, "step": 46695 }, { "epoch": 3.809446121217065, "grad_norm": 0.7020634412765503, "learning_rate": 3.891199815378839e-05, "loss": 0.3781, "num_input_tokens_seen": 44617216, "step": 46700 }, { "epoch": 3.8098539848274737, "grad_norm": 0.38954466581344604, "learning_rate": 3.890904074446433e-05, "loss": 0.3609, "num_input_tokens_seen": 44622320, "step": 46705 }, { "epoch": 3.8102618484378823, "grad_norm": 0.7144309878349304, "learning_rate": 3.890608305320993e-05, "loss": 0.4005, "num_input_tokens_seen": 44627184, "step": 46710 }, { "epoch": 3.8106697120482913, "grad_norm": 0.9489406943321228, "learning_rate": 3.8903125080085134e-05, "loss": 0.3609, "num_input_tokens_seen": 44632128, "step": 46715 }, { "epoch": 3.8110775756587, "grad_norm": 0.9573136568069458, "learning_rate": 3.890016682514991e-05, "loss": 0.3397, "num_input_tokens_seen": 44637584, "step": 46720 }, { "epoch": 3.8114854392691084, "grad_norm": 0.7312304973602295, "learning_rate": 3.88972082884642e-05, "loss": 0.3485, "num_input_tokens_seen": 44642352, "step": 46725 }, { "epoch": 3.811893302879517, "grad_norm": 0.7068329453468323, "learning_rate": 3.889424947008798e-05, "loss": 0.3573, "num_input_tokens_seen": 44647296, "step": 46730 }, { "epoch": 3.8123011664899256, "grad_norm": 0.35858747363090515, "learning_rate": 3.8891290370081235e-05, "loss": 0.334, "num_input_tokens_seen": 44652240, "step": 46735 }, { "epoch": 3.8127090301003346, "grad_norm": 0.9126667976379395, "learning_rate": 3.888833098850394e-05, "loss": 0.3717, "num_input_tokens_seen": 44656240, "step": 46740 }, { "epoch": 3.813116893710743, "grad_norm": 0.9605855941772461, "learning_rate": 3.888537132541608e-05, "loss": 0.3792, "num_input_tokens_seen": 44660992, "step": 46745 }, { "epoch": 3.8135247573211517, "grad_norm": 0.6339707970619202, "learning_rate": 3.888241138087763e-05, "loss": 0.31, "num_input_tokens_seen": 44665664, "step": 46750 }, { "epoch": 3.8139326209315607, "grad_norm": 0.5998950004577637, "learning_rate": 3.887945115494861e-05, "loss": 0.349, "num_input_tokens_seen": 44670544, "step": 46755 }, { "epoch": 3.8143404845419693, "grad_norm": 0.7205291986465454, "learning_rate": 3.887649064768901e-05, "loss": 0.3708, "num_input_tokens_seen": 44674960, "step": 46760 }, { "epoch": 3.814748348152378, "grad_norm": 0.7599009275436401, "learning_rate": 3.887352985915884e-05, "loss": 0.3429, "num_input_tokens_seen": 44679344, "step": 46765 }, { "epoch": 3.8151562117627864, "grad_norm": 0.7238398194313049, "learning_rate": 3.887056878941812e-05, "loss": 0.3485, "num_input_tokens_seen": 44683936, "step": 46770 }, { "epoch": 3.815564075373195, "grad_norm": 0.45495590567588806, "learning_rate": 3.886760743852685e-05, "loss": 0.3292, "num_input_tokens_seen": 44688272, "step": 46775 }, { "epoch": 3.815971938983604, "grad_norm": 0.4269355833530426, "learning_rate": 3.886464580654509e-05, "loss": 0.3496, "num_input_tokens_seen": 44692864, "step": 46780 }, { "epoch": 3.8163798025940125, "grad_norm": 0.7803263068199158, "learning_rate": 3.886168389353284e-05, "loss": 0.3697, "num_input_tokens_seen": 44697792, "step": 46785 }, { "epoch": 3.816787666204421, "grad_norm": 0.3731858730316162, "learning_rate": 3.8858721699550144e-05, "loss": 0.3489, "num_input_tokens_seen": 44702288, "step": 46790 }, { "epoch": 3.81719552981483, "grad_norm": 0.8792861700057983, "learning_rate": 3.8855759224657054e-05, "loss": 0.3706, "num_input_tokens_seen": 44706288, "step": 46795 }, { "epoch": 3.8176033934252387, "grad_norm": 0.3780185282230377, "learning_rate": 3.885279646891361e-05, "loss": 0.3592, "num_input_tokens_seen": 44711184, "step": 46800 }, { "epoch": 3.8180112570356473, "grad_norm": 0.3527587950229645, "learning_rate": 3.8849833432379866e-05, "loss": 0.3368, "num_input_tokens_seen": 44715552, "step": 46805 }, { "epoch": 3.818419120646056, "grad_norm": 0.6365994215011597, "learning_rate": 3.88468701151159e-05, "loss": 0.3235, "num_input_tokens_seen": 44720208, "step": 46810 }, { "epoch": 3.8188269842564644, "grad_norm": 0.48210814595222473, "learning_rate": 3.884390651718174e-05, "loss": 0.2856, "num_input_tokens_seen": 44725152, "step": 46815 }, { "epoch": 3.8192348478668734, "grad_norm": 0.4812053143978119, "learning_rate": 3.884094263863749e-05, "loss": 0.419, "num_input_tokens_seen": 44730176, "step": 46820 }, { "epoch": 3.819642711477282, "grad_norm": 0.4718896746635437, "learning_rate": 3.88379784795432e-05, "loss": 0.3046, "num_input_tokens_seen": 44733824, "step": 46825 }, { "epoch": 3.8200505750876905, "grad_norm": 0.4541565775871277, "learning_rate": 3.883501403995898e-05, "loss": 0.3077, "num_input_tokens_seen": 44737216, "step": 46830 }, { "epoch": 3.8204584386980995, "grad_norm": 0.5726591944694519, "learning_rate": 3.8832049319944894e-05, "loss": 0.3842, "num_input_tokens_seen": 44741360, "step": 46835 }, { "epoch": 3.820866302308508, "grad_norm": 0.33089104294776917, "learning_rate": 3.8829084319561055e-05, "loss": 0.3417, "num_input_tokens_seen": 44745904, "step": 46840 }, { "epoch": 3.8212741659189167, "grad_norm": 0.6893137693405151, "learning_rate": 3.8826119038867546e-05, "loss": 0.361, "num_input_tokens_seen": 44750528, "step": 46845 }, { "epoch": 3.8216820295293252, "grad_norm": 0.7840582728385925, "learning_rate": 3.882315347792449e-05, "loss": 0.3311, "num_input_tokens_seen": 44755312, "step": 46850 }, { "epoch": 3.822089893139734, "grad_norm": 0.5587483644485474, "learning_rate": 3.882018763679198e-05, "loss": 0.3168, "num_input_tokens_seen": 44760560, "step": 46855 }, { "epoch": 3.822497756750143, "grad_norm": 1.0286365747451782, "learning_rate": 3.881722151553013e-05, "loss": 0.3415, "num_input_tokens_seen": 44765744, "step": 46860 }, { "epoch": 3.8229056203605514, "grad_norm": 0.7342175245285034, "learning_rate": 3.8814255114199085e-05, "loss": 0.3262, "num_input_tokens_seen": 44770704, "step": 46865 }, { "epoch": 3.82331348397096, "grad_norm": 0.6119855046272278, "learning_rate": 3.881128843285895e-05, "loss": 0.2862, "num_input_tokens_seen": 44776288, "step": 46870 }, { "epoch": 3.823721347581369, "grad_norm": 0.5509182214736938, "learning_rate": 3.8808321471569876e-05, "loss": 0.3455, "num_input_tokens_seen": 44781088, "step": 46875 }, { "epoch": 3.8241292111917775, "grad_norm": 0.692841649055481, "learning_rate": 3.8805354230391986e-05, "loss": 0.3529, "num_input_tokens_seen": 44787280, "step": 46880 }, { "epoch": 3.824537074802186, "grad_norm": 1.2328263521194458, "learning_rate": 3.880238670938543e-05, "loss": 0.4705, "num_input_tokens_seen": 44792208, "step": 46885 }, { "epoch": 3.824944938412595, "grad_norm": 0.5728160738945007, "learning_rate": 3.879941890861037e-05, "loss": 0.3387, "num_input_tokens_seen": 44797168, "step": 46890 }, { "epoch": 3.8253528020230037, "grad_norm": 0.7293134927749634, "learning_rate": 3.879645082812695e-05, "loss": 0.3471, "num_input_tokens_seen": 44801664, "step": 46895 }, { "epoch": 3.825760665633412, "grad_norm": 0.4478927254676819, "learning_rate": 3.879348246799534e-05, "loss": 0.3923, "num_input_tokens_seen": 44806496, "step": 46900 }, { "epoch": 3.826168529243821, "grad_norm": 0.758427619934082, "learning_rate": 3.8790513828275683e-05, "loss": 0.3482, "num_input_tokens_seen": 44811312, "step": 46905 }, { "epoch": 3.8265763928542293, "grad_norm": 0.2758535146713257, "learning_rate": 3.8787544909028184e-05, "loss": 0.3344, "num_input_tokens_seen": 44815200, "step": 46910 }, { "epoch": 3.8269842564646384, "grad_norm": 0.4919733703136444, "learning_rate": 3.8784575710313e-05, "loss": 0.389, "num_input_tokens_seen": 44819552, "step": 46915 }, { "epoch": 3.827392120075047, "grad_norm": 0.5578673481941223, "learning_rate": 3.878160623219033e-05, "loss": 0.3286, "num_input_tokens_seen": 44824928, "step": 46920 }, { "epoch": 3.8277999836854555, "grad_norm": 0.8878456950187683, "learning_rate": 3.8778636474720354e-05, "loss": 0.4069, "num_input_tokens_seen": 44829936, "step": 46925 }, { "epoch": 3.8282078472958645, "grad_norm": 0.341584712266922, "learning_rate": 3.8775666437963274e-05, "loss": 0.3496, "num_input_tokens_seen": 44835456, "step": 46930 }, { "epoch": 3.828615710906273, "grad_norm": 0.620219349861145, "learning_rate": 3.877269612197929e-05, "loss": 0.3567, "num_input_tokens_seen": 44840496, "step": 46935 }, { "epoch": 3.8290235745166816, "grad_norm": 0.45021823048591614, "learning_rate": 3.876972552682861e-05, "loss": 0.3393, "num_input_tokens_seen": 44844912, "step": 46940 }, { "epoch": 3.82943143812709, "grad_norm": 0.8782569766044617, "learning_rate": 3.876675465257144e-05, "loss": 0.343, "num_input_tokens_seen": 44849440, "step": 46945 }, { "epoch": 3.8298393017374988, "grad_norm": 0.6465655565261841, "learning_rate": 3.8763783499268e-05, "loss": 0.3499, "num_input_tokens_seen": 44854192, "step": 46950 }, { "epoch": 3.8302471653479078, "grad_norm": 0.6559826731681824, "learning_rate": 3.876081206697852e-05, "loss": 0.3363, "num_input_tokens_seen": 44858672, "step": 46955 }, { "epoch": 3.8306550289583163, "grad_norm": 0.362697958946228, "learning_rate": 3.8757840355763236e-05, "loss": 0.3047, "num_input_tokens_seen": 44862912, "step": 46960 }, { "epoch": 3.831062892568725, "grad_norm": 0.7766715884208679, "learning_rate": 3.875486836568237e-05, "loss": 0.3467, "num_input_tokens_seen": 44867328, "step": 46965 }, { "epoch": 3.831470756179134, "grad_norm": 0.5163026452064514, "learning_rate": 3.875189609679616e-05, "loss": 0.3424, "num_input_tokens_seen": 44873088, "step": 46970 }, { "epoch": 3.8318786197895425, "grad_norm": 0.3944879174232483, "learning_rate": 3.874892354916486e-05, "loss": 0.3102, "num_input_tokens_seen": 44877648, "step": 46975 }, { "epoch": 3.832286483399951, "grad_norm": 0.7104055285453796, "learning_rate": 3.874595072284872e-05, "loss": 0.3224, "num_input_tokens_seen": 44881904, "step": 46980 }, { "epoch": 3.8326943470103596, "grad_norm": 0.5655678510665894, "learning_rate": 3.8742977617908016e-05, "loss": 0.3682, "num_input_tokens_seen": 44886160, "step": 46985 }, { "epoch": 3.833102210620768, "grad_norm": 0.6374437212944031, "learning_rate": 3.874000423440298e-05, "loss": 0.3674, "num_input_tokens_seen": 44890528, "step": 46990 }, { "epoch": 3.833510074231177, "grad_norm": 0.44211480021476746, "learning_rate": 3.8737030572393906e-05, "loss": 0.3449, "num_input_tokens_seen": 44895936, "step": 46995 }, { "epoch": 3.8339179378415857, "grad_norm": 0.9432969689369202, "learning_rate": 3.873405663194105e-05, "loss": 0.3777, "num_input_tokens_seen": 44900848, "step": 47000 }, { "epoch": 3.8343258014519943, "grad_norm": 0.3989437520503998, "learning_rate": 3.873108241310471e-05, "loss": 0.3531, "num_input_tokens_seen": 44905280, "step": 47005 }, { "epoch": 3.8347336650624033, "grad_norm": 0.33538001775741577, "learning_rate": 3.872810791594516e-05, "loss": 0.3454, "num_input_tokens_seen": 44910080, "step": 47010 }, { "epoch": 3.835141528672812, "grad_norm": 0.6898452639579773, "learning_rate": 3.8725133140522695e-05, "loss": 0.3498, "num_input_tokens_seen": 44915008, "step": 47015 }, { "epoch": 3.8355493922832204, "grad_norm": 0.8654054999351501, "learning_rate": 3.872215808689762e-05, "loss": 0.3886, "num_input_tokens_seen": 44918816, "step": 47020 }, { "epoch": 3.835957255893629, "grad_norm": 0.7098336219787598, "learning_rate": 3.871918275513022e-05, "loss": 0.3674, "num_input_tokens_seen": 44923776, "step": 47025 }, { "epoch": 3.8363651195040376, "grad_norm": 0.6962679624557495, "learning_rate": 3.871620714528083e-05, "loss": 0.3024, "num_input_tokens_seen": 44928960, "step": 47030 }, { "epoch": 3.8367729831144466, "grad_norm": 0.4696900248527527, "learning_rate": 3.8713231257409734e-05, "loss": 0.3781, "num_input_tokens_seen": 44934144, "step": 47035 }, { "epoch": 3.837180846724855, "grad_norm": 0.5616625547409058, "learning_rate": 3.871025509157728e-05, "loss": 0.3374, "num_input_tokens_seen": 44938096, "step": 47040 }, { "epoch": 3.8375887103352637, "grad_norm": 0.333977073431015, "learning_rate": 3.870727864784377e-05, "loss": 0.3409, "num_input_tokens_seen": 44941760, "step": 47045 }, { "epoch": 3.8379965739456727, "grad_norm": 0.5985267162322998, "learning_rate": 3.8704301926269557e-05, "loss": 0.3385, "num_input_tokens_seen": 44945344, "step": 47050 }, { "epoch": 3.8384044375560813, "grad_norm": 0.6211604475975037, "learning_rate": 3.870132492691496e-05, "loss": 0.3113, "num_input_tokens_seen": 44950032, "step": 47055 }, { "epoch": 3.83881230116649, "grad_norm": 0.7137459516525269, "learning_rate": 3.8698347649840333e-05, "loss": 0.3454, "num_input_tokens_seen": 44954512, "step": 47060 }, { "epoch": 3.839220164776899, "grad_norm": 1.0253076553344727, "learning_rate": 3.869537009510602e-05, "loss": 0.3882, "num_input_tokens_seen": 44959984, "step": 47065 }, { "epoch": 3.8396280283873074, "grad_norm": 0.5597800612449646, "learning_rate": 3.869239226277237e-05, "loss": 0.3786, "num_input_tokens_seen": 44964960, "step": 47070 }, { "epoch": 3.840035891997716, "grad_norm": 0.8356392979621887, "learning_rate": 3.868941415289975e-05, "loss": 0.3189, "num_input_tokens_seen": 44969952, "step": 47075 }, { "epoch": 3.8404437556081246, "grad_norm": 0.4522091746330261, "learning_rate": 3.8686435765548525e-05, "loss": 0.3333, "num_input_tokens_seen": 44975008, "step": 47080 }, { "epoch": 3.840851619218533, "grad_norm": 0.908339262008667, "learning_rate": 3.8683457100779064e-05, "loss": 0.3707, "num_input_tokens_seen": 44980176, "step": 47085 }, { "epoch": 3.841259482828942, "grad_norm": 0.9286854267120361, "learning_rate": 3.8680478158651734e-05, "loss": 0.3385, "num_input_tokens_seen": 44984608, "step": 47090 }, { "epoch": 3.8416673464393507, "grad_norm": 0.5827764868736267, "learning_rate": 3.867749893922693e-05, "loss": 0.363, "num_input_tokens_seen": 44988656, "step": 47095 }, { "epoch": 3.8420752100497593, "grad_norm": 0.5866662859916687, "learning_rate": 3.867451944256504e-05, "loss": 0.3394, "num_input_tokens_seen": 44993200, "step": 47100 }, { "epoch": 3.8424830736601683, "grad_norm": 0.7761185765266418, "learning_rate": 3.867153966872644e-05, "loss": 0.3608, "num_input_tokens_seen": 44997488, "step": 47105 }, { "epoch": 3.842890937270577, "grad_norm": 0.6744217872619629, "learning_rate": 3.8668559617771546e-05, "loss": 0.3624, "num_input_tokens_seen": 45001056, "step": 47110 }, { "epoch": 3.8432988008809854, "grad_norm": 0.21935828030109406, "learning_rate": 3.866557928976075e-05, "loss": 0.3486, "num_input_tokens_seen": 45004864, "step": 47115 }, { "epoch": 3.843706664491394, "grad_norm": 0.5355769991874695, "learning_rate": 3.866259868475448e-05, "loss": 0.3593, "num_input_tokens_seen": 45010304, "step": 47120 }, { "epoch": 3.8441145281018025, "grad_norm": 0.2972467839717865, "learning_rate": 3.865961780281313e-05, "loss": 0.32, "num_input_tokens_seen": 45014608, "step": 47125 }, { "epoch": 3.8445223917122116, "grad_norm": 0.1493835598230362, "learning_rate": 3.865663664399714e-05, "loss": 0.3564, "num_input_tokens_seen": 45018720, "step": 47130 }, { "epoch": 3.84493025532262, "grad_norm": 0.4874756932258606, "learning_rate": 3.865365520836692e-05, "loss": 0.3324, "num_input_tokens_seen": 45023680, "step": 47135 }, { "epoch": 3.8453381189330287, "grad_norm": 0.5796132683753967, "learning_rate": 3.865067349598292e-05, "loss": 0.3485, "num_input_tokens_seen": 45028656, "step": 47140 }, { "epoch": 3.8457459825434377, "grad_norm": 0.5406408309936523, "learning_rate": 3.864769150690557e-05, "loss": 0.3742, "num_input_tokens_seen": 45032912, "step": 47145 }, { "epoch": 3.8461538461538463, "grad_norm": 0.6836053133010864, "learning_rate": 3.86447092411953e-05, "loss": 0.3286, "num_input_tokens_seen": 45037040, "step": 47150 }, { "epoch": 3.846561709764255, "grad_norm": 0.9790003299713135, "learning_rate": 3.864172669891258e-05, "loss": 0.322, "num_input_tokens_seen": 45042400, "step": 47155 }, { "epoch": 3.8469695733746634, "grad_norm": 0.4086150527000427, "learning_rate": 3.8638743880117855e-05, "loss": 0.3501, "num_input_tokens_seen": 45046880, "step": 47160 }, { "epoch": 3.847377436985072, "grad_norm": 0.650554895401001, "learning_rate": 3.863576078487159e-05, "loss": 0.2855, "num_input_tokens_seen": 45052352, "step": 47165 }, { "epoch": 3.847785300595481, "grad_norm": 0.571651041507721, "learning_rate": 3.8632777413234237e-05, "loss": 0.3983, "num_input_tokens_seen": 45056528, "step": 47170 }, { "epoch": 3.8481931642058895, "grad_norm": 0.5814691185951233, "learning_rate": 3.8629793765266283e-05, "loss": 0.3367, "num_input_tokens_seen": 45061120, "step": 47175 }, { "epoch": 3.848601027816298, "grad_norm": 0.8698330521583557, "learning_rate": 3.862680984102821e-05, "loss": 0.2811, "num_input_tokens_seen": 45066672, "step": 47180 }, { "epoch": 3.849008891426707, "grad_norm": 0.60734623670578, "learning_rate": 3.862382564058049e-05, "loss": 0.3261, "num_input_tokens_seen": 45072192, "step": 47185 }, { "epoch": 3.8494167550371157, "grad_norm": 0.6444486379623413, "learning_rate": 3.862084116398361e-05, "loss": 0.397, "num_input_tokens_seen": 45077696, "step": 47190 }, { "epoch": 3.8498246186475242, "grad_norm": 0.5321967601776123, "learning_rate": 3.8617856411298065e-05, "loss": 0.3694, "num_input_tokens_seen": 45082944, "step": 47195 }, { "epoch": 3.850232482257933, "grad_norm": 0.4014885425567627, "learning_rate": 3.861487138258436e-05, "loss": 0.3387, "num_input_tokens_seen": 45087328, "step": 47200 }, { "epoch": 3.8506403458683414, "grad_norm": 0.7996577620506287, "learning_rate": 3.861188607790299e-05, "loss": 0.3491, "num_input_tokens_seen": 45091984, "step": 47205 }, { "epoch": 3.8510482094787504, "grad_norm": 0.608212411403656, "learning_rate": 3.8608900497314486e-05, "loss": 0.3612, "num_input_tokens_seen": 45096416, "step": 47210 }, { "epoch": 3.851456073089159, "grad_norm": 0.8127476572990417, "learning_rate": 3.8605914640879345e-05, "loss": 0.3145, "num_input_tokens_seen": 45101792, "step": 47215 }, { "epoch": 3.8518639366995675, "grad_norm": 0.9652050733566284, "learning_rate": 3.86029285086581e-05, "loss": 0.372, "num_input_tokens_seen": 45106544, "step": 47220 }, { "epoch": 3.8522718003099765, "grad_norm": 0.9046639800071716, "learning_rate": 3.8599942100711266e-05, "loss": 0.3539, "num_input_tokens_seen": 45111104, "step": 47225 }, { "epoch": 3.852679663920385, "grad_norm": 0.6891274452209473, "learning_rate": 3.859695541709939e-05, "loss": 0.3605, "num_input_tokens_seen": 45116784, "step": 47230 }, { "epoch": 3.8530875275307936, "grad_norm": 0.8400041460990906, "learning_rate": 3.8593968457883e-05, "loss": 0.3469, "num_input_tokens_seen": 45121824, "step": 47235 }, { "epoch": 3.8534953911412027, "grad_norm": 0.5792241096496582, "learning_rate": 3.859098122312266e-05, "loss": 0.3691, "num_input_tokens_seen": 45125664, "step": 47240 }, { "epoch": 3.853903254751611, "grad_norm": 0.8593940734863281, "learning_rate": 3.858799371287889e-05, "loss": 0.3527, "num_input_tokens_seen": 45130816, "step": 47245 }, { "epoch": 3.85431111836202, "grad_norm": 0.8918827176094055, "learning_rate": 3.858500592721227e-05, "loss": 0.3686, "num_input_tokens_seen": 45135168, "step": 47250 }, { "epoch": 3.8547189819724283, "grad_norm": 0.7326500415802002, "learning_rate": 3.858201786618335e-05, "loss": 0.3711, "num_input_tokens_seen": 45140144, "step": 47255 }, { "epoch": 3.855126845582837, "grad_norm": 0.8511964678764343, "learning_rate": 3.85790295298527e-05, "loss": 0.3339, "num_input_tokens_seen": 45145392, "step": 47260 }, { "epoch": 3.855534709193246, "grad_norm": 0.7423393726348877, "learning_rate": 3.857604091828089e-05, "loss": 0.3395, "num_input_tokens_seen": 45149920, "step": 47265 }, { "epoch": 3.8559425728036545, "grad_norm": 0.7059102654457092, "learning_rate": 3.85730520315285e-05, "loss": 0.3434, "num_input_tokens_seen": 45154624, "step": 47270 }, { "epoch": 3.856350436414063, "grad_norm": 0.8226171731948853, "learning_rate": 3.8570062869656114e-05, "loss": 0.3626, "num_input_tokens_seen": 45159168, "step": 47275 }, { "epoch": 3.856758300024472, "grad_norm": 0.7701526880264282, "learning_rate": 3.8567073432724316e-05, "loss": 0.3428, "num_input_tokens_seen": 45164016, "step": 47280 }, { "epoch": 3.8571661636348806, "grad_norm": 0.5340381860733032, "learning_rate": 3.856408372079371e-05, "loss": 0.3493, "num_input_tokens_seen": 45168144, "step": 47285 }, { "epoch": 3.857574027245289, "grad_norm": 0.3755093216896057, "learning_rate": 3.856109373392489e-05, "loss": 0.3211, "num_input_tokens_seen": 45173088, "step": 47290 }, { "epoch": 3.8579818908556978, "grad_norm": 0.8302963972091675, "learning_rate": 3.8558103472178456e-05, "loss": 0.375, "num_input_tokens_seen": 45177888, "step": 47295 }, { "epoch": 3.8583897544661063, "grad_norm": 0.7362374067306519, "learning_rate": 3.8555112935615035e-05, "loss": 0.3438, "num_input_tokens_seen": 45182992, "step": 47300 }, { "epoch": 3.8587976180765153, "grad_norm": 0.7930573225021362, "learning_rate": 3.8552122124295234e-05, "loss": 0.3467, "num_input_tokens_seen": 45187696, "step": 47305 }, { "epoch": 3.859205481686924, "grad_norm": 0.766750156879425, "learning_rate": 3.854913103827967e-05, "loss": 0.3207, "num_input_tokens_seen": 45192352, "step": 47310 }, { "epoch": 3.8596133452973325, "grad_norm": 0.3546121120452881, "learning_rate": 3.854613967762898e-05, "loss": 0.337, "num_input_tokens_seen": 45197392, "step": 47315 }, { "epoch": 3.8600212089077415, "grad_norm": 0.5668861269950867, "learning_rate": 3.8543148042403796e-05, "loss": 0.3222, "num_input_tokens_seen": 45202064, "step": 47320 }, { "epoch": 3.86042907251815, "grad_norm": 0.5599772334098816, "learning_rate": 3.8540156132664754e-05, "loss": 0.3136, "num_input_tokens_seen": 45207328, "step": 47325 }, { "epoch": 3.8608369361285586, "grad_norm": 0.6421149969100952, "learning_rate": 3.85371639484725e-05, "loss": 0.3626, "num_input_tokens_seen": 45212208, "step": 47330 }, { "epoch": 3.861244799738967, "grad_norm": 0.944679856300354, "learning_rate": 3.853417148988768e-05, "loss": 0.3877, "num_input_tokens_seen": 45216944, "step": 47335 }, { "epoch": 3.8616526633493757, "grad_norm": 0.4416508078575134, "learning_rate": 3.8531178756970964e-05, "loss": 0.3774, "num_input_tokens_seen": 45222208, "step": 47340 }, { "epoch": 3.8620605269597847, "grad_norm": 0.7252520322799683, "learning_rate": 3.8528185749782994e-05, "loss": 0.3493, "num_input_tokens_seen": 45227728, "step": 47345 }, { "epoch": 3.8624683905701933, "grad_norm": 0.7001663446426392, "learning_rate": 3.8525192468384466e-05, "loss": 0.3404, "num_input_tokens_seen": 45232768, "step": 47350 }, { "epoch": 3.862876254180602, "grad_norm": 0.7906379699707031, "learning_rate": 3.852219891283602e-05, "loss": 0.3487, "num_input_tokens_seen": 45237152, "step": 47355 }, { "epoch": 3.863284117791011, "grad_norm": 0.4520410895347595, "learning_rate": 3.851920508319835e-05, "loss": 0.3313, "num_input_tokens_seen": 45242896, "step": 47360 }, { "epoch": 3.8636919814014195, "grad_norm": 0.410453200340271, "learning_rate": 3.851621097953214e-05, "loss": 0.3477, "num_input_tokens_seen": 45247552, "step": 47365 }, { "epoch": 3.864099845011828, "grad_norm": 0.2344045490026474, "learning_rate": 3.8513216601898075e-05, "loss": 0.3412, "num_input_tokens_seen": 45251792, "step": 47370 }, { "epoch": 3.8645077086222366, "grad_norm": 0.8373678922653198, "learning_rate": 3.851022195035685e-05, "loss": 0.3918, "num_input_tokens_seen": 45256640, "step": 47375 }, { "epoch": 3.864915572232645, "grad_norm": 0.8066696524620056, "learning_rate": 3.850722702496917e-05, "loss": 0.3567, "num_input_tokens_seen": 45261680, "step": 47380 }, { "epoch": 3.865323435843054, "grad_norm": 0.33292046189308167, "learning_rate": 3.850423182579574e-05, "loss": 0.3585, "num_input_tokens_seen": 45266176, "step": 47385 }, { "epoch": 3.8657312994534627, "grad_norm": 0.6840230822563171, "learning_rate": 3.8501236352897265e-05, "loss": 0.3566, "num_input_tokens_seen": 45271264, "step": 47390 }, { "epoch": 3.8661391630638713, "grad_norm": 0.32755520939826965, "learning_rate": 3.849824060633447e-05, "loss": 0.3371, "num_input_tokens_seen": 45275824, "step": 47395 }, { "epoch": 3.8665470266742803, "grad_norm": 0.8064569234848022, "learning_rate": 3.849524458616807e-05, "loss": 0.3287, "num_input_tokens_seen": 45280480, "step": 47400 }, { "epoch": 3.866954890284689, "grad_norm": 0.29475024342536926, "learning_rate": 3.849224829245879e-05, "loss": 0.3058, "num_input_tokens_seen": 45284576, "step": 47405 }, { "epoch": 3.8673627538950974, "grad_norm": 0.5609447956085205, "learning_rate": 3.8489251725267384e-05, "loss": 0.3122, "num_input_tokens_seen": 45289824, "step": 47410 }, { "epoch": 3.867770617505506, "grad_norm": 0.6191560626029968, "learning_rate": 3.8486254884654565e-05, "loss": 0.3742, "num_input_tokens_seen": 45294784, "step": 47415 }, { "epoch": 3.868178481115915, "grad_norm": 1.0054692029953003, "learning_rate": 3.84832577706811e-05, "loss": 0.3747, "num_input_tokens_seen": 45299856, "step": 47420 }, { "epoch": 3.8685863447263236, "grad_norm": 0.3066444396972656, "learning_rate": 3.848026038340772e-05, "loss": 0.3301, "num_input_tokens_seen": 45304352, "step": 47425 }, { "epoch": 3.868994208336732, "grad_norm": 1.0451868772506714, "learning_rate": 3.8477262722895195e-05, "loss": 0.3695, "num_input_tokens_seen": 45309024, "step": 47430 }, { "epoch": 3.8694020719471407, "grad_norm": 0.3401089012622833, "learning_rate": 3.847426478920429e-05, "loss": 0.3695, "num_input_tokens_seen": 45313392, "step": 47435 }, { "epoch": 3.8698099355575497, "grad_norm": 0.7953301668167114, "learning_rate": 3.847126658239575e-05, "loss": 0.4006, "num_input_tokens_seen": 45317552, "step": 47440 }, { "epoch": 3.8702177991679583, "grad_norm": 0.7728452682495117, "learning_rate": 3.846826810253037e-05, "loss": 0.3628, "num_input_tokens_seen": 45322208, "step": 47445 }, { "epoch": 3.870625662778367, "grad_norm": 0.38961535692214966, "learning_rate": 3.846526934966891e-05, "loss": 0.35, "num_input_tokens_seen": 45326608, "step": 47450 }, { "epoch": 3.871033526388776, "grad_norm": 0.34810036420822144, "learning_rate": 3.8462270323872165e-05, "loss": 0.3405, "num_input_tokens_seen": 45331680, "step": 47455 }, { "epoch": 3.8714413899991844, "grad_norm": 1.0115466117858887, "learning_rate": 3.8459271025200924e-05, "loss": 0.3692, "num_input_tokens_seen": 45337008, "step": 47460 }, { "epoch": 3.871849253609593, "grad_norm": 0.7482969164848328, "learning_rate": 3.8456271453715984e-05, "loss": 0.3446, "num_input_tokens_seen": 45341696, "step": 47465 }, { "epoch": 3.8722571172200015, "grad_norm": 0.47067907452583313, "learning_rate": 3.8453271609478126e-05, "loss": 0.3754, "num_input_tokens_seen": 45345936, "step": 47470 }, { "epoch": 3.87266498083041, "grad_norm": 0.49153071641921997, "learning_rate": 3.8450271492548176e-05, "loss": 0.3268, "num_input_tokens_seen": 45351024, "step": 47475 }, { "epoch": 3.873072844440819, "grad_norm": 0.792421817779541, "learning_rate": 3.8447271102986936e-05, "loss": 0.3759, "num_input_tokens_seen": 45355392, "step": 47480 }, { "epoch": 3.8734807080512277, "grad_norm": 0.6062566041946411, "learning_rate": 3.844427044085523e-05, "loss": 0.3712, "num_input_tokens_seen": 45360256, "step": 47485 }, { "epoch": 3.8738885716616362, "grad_norm": 0.20854710042476654, "learning_rate": 3.844126950621387e-05, "loss": 0.3786, "num_input_tokens_seen": 45365552, "step": 47490 }, { "epoch": 3.8742964352720453, "grad_norm": 0.7265231013298035, "learning_rate": 3.8438268299123695e-05, "loss": 0.3008, "num_input_tokens_seen": 45370256, "step": 47495 }, { "epoch": 3.874704298882454, "grad_norm": 0.8456414341926575, "learning_rate": 3.843526681964552e-05, "loss": 0.3937, "num_input_tokens_seen": 45374416, "step": 47500 }, { "epoch": 3.8751121624928624, "grad_norm": 0.8049399852752686, "learning_rate": 3.84322650678402e-05, "loss": 0.3591, "num_input_tokens_seen": 45380560, "step": 47505 }, { "epoch": 3.875520026103271, "grad_norm": 0.6727239489555359, "learning_rate": 3.8429263043768585e-05, "loss": 0.346, "num_input_tokens_seen": 45384688, "step": 47510 }, { "epoch": 3.8759278897136795, "grad_norm": 0.970392107963562, "learning_rate": 3.8426260747491504e-05, "loss": 0.3398, "num_input_tokens_seen": 45389680, "step": 47515 }, { "epoch": 3.8763357533240885, "grad_norm": 0.3483281135559082, "learning_rate": 3.8423258179069826e-05, "loss": 0.3473, "num_input_tokens_seen": 45394096, "step": 47520 }, { "epoch": 3.876743616934497, "grad_norm": 0.6489077806472778, "learning_rate": 3.8420255338564414e-05, "loss": 0.2988, "num_input_tokens_seen": 45398256, "step": 47525 }, { "epoch": 3.8771514805449057, "grad_norm": 0.5171151161193848, "learning_rate": 3.841725222603612e-05, "loss": 0.3736, "num_input_tokens_seen": 45403968, "step": 47530 }, { "epoch": 3.8775593441553147, "grad_norm": 0.981266438961029, "learning_rate": 3.841424884154583e-05, "loss": 0.3965, "num_input_tokens_seen": 45408512, "step": 47535 }, { "epoch": 3.8779672077657232, "grad_norm": 0.37121185660362244, "learning_rate": 3.841124518515441e-05, "loss": 0.3404, "num_input_tokens_seen": 45413536, "step": 47540 }, { "epoch": 3.878375071376132, "grad_norm": 0.6952897906303406, "learning_rate": 3.840824125692275e-05, "loss": 0.3692, "num_input_tokens_seen": 45418816, "step": 47545 }, { "epoch": 3.8787829349865404, "grad_norm": 0.6834164261817932, "learning_rate": 3.840523705691175e-05, "loss": 0.3518, "num_input_tokens_seen": 45422912, "step": 47550 }, { "epoch": 3.879190798596949, "grad_norm": 0.7808012366294861, "learning_rate": 3.840223258518228e-05, "loss": 0.3673, "num_input_tokens_seen": 45428608, "step": 47555 }, { "epoch": 3.879598662207358, "grad_norm": 0.5966105461120605, "learning_rate": 3.839922784179525e-05, "loss": 0.315, "num_input_tokens_seen": 45433184, "step": 47560 }, { "epoch": 3.8800065258177665, "grad_norm": 0.9752534627914429, "learning_rate": 3.839622282681157e-05, "loss": 0.3395, "num_input_tokens_seen": 45438016, "step": 47565 }, { "epoch": 3.880414389428175, "grad_norm": 0.20616371929645538, "learning_rate": 3.839321754029214e-05, "loss": 0.3519, "num_input_tokens_seen": 45442160, "step": 47570 }, { "epoch": 3.880822253038584, "grad_norm": 0.8862873315811157, "learning_rate": 3.839021198229789e-05, "loss": 0.3496, "num_input_tokens_seen": 45447472, "step": 47575 }, { "epoch": 3.8812301166489926, "grad_norm": 0.5179767608642578, "learning_rate": 3.838720615288972e-05, "loss": 0.3144, "num_input_tokens_seen": 45451792, "step": 47580 }, { "epoch": 3.881637980259401, "grad_norm": 0.8187613487243652, "learning_rate": 3.838420005212858e-05, "loss": 0.3975, "num_input_tokens_seen": 45456304, "step": 47585 }, { "epoch": 3.8820458438698098, "grad_norm": 0.3627205789089203, "learning_rate": 3.8381193680075386e-05, "loss": 0.3513, "num_input_tokens_seen": 45460672, "step": 47590 }, { "epoch": 3.8824537074802183, "grad_norm": 0.4011799991130829, "learning_rate": 3.837818703679109e-05, "loss": 0.3519, "num_input_tokens_seen": 45466144, "step": 47595 }, { "epoch": 3.8828615710906274, "grad_norm": 0.1785699427127838, "learning_rate": 3.8375180122336626e-05, "loss": 0.3676, "num_input_tokens_seen": 45471312, "step": 47600 }, { "epoch": 3.883269434701036, "grad_norm": 0.7206291556358337, "learning_rate": 3.837217293677294e-05, "loss": 0.3424, "num_input_tokens_seen": 45475904, "step": 47605 }, { "epoch": 3.8836772983114445, "grad_norm": 0.7624757289886475, "learning_rate": 3.836916548016099e-05, "loss": 0.335, "num_input_tokens_seen": 45480112, "step": 47610 }, { "epoch": 3.8840851619218535, "grad_norm": 0.523321270942688, "learning_rate": 3.836615775256174e-05, "loss": 0.3231, "num_input_tokens_seen": 45483840, "step": 47615 }, { "epoch": 3.884493025532262, "grad_norm": 0.6002041101455688, "learning_rate": 3.836314975403615e-05, "loss": 0.288, "num_input_tokens_seen": 45488448, "step": 47620 }, { "epoch": 3.8849008891426706, "grad_norm": 0.3385999798774719, "learning_rate": 3.836014148464519e-05, "loss": 0.4267, "num_input_tokens_seen": 45493920, "step": 47625 }, { "epoch": 3.8853087527530796, "grad_norm": 0.4633505940437317, "learning_rate": 3.835713294444985e-05, "loss": 0.3331, "num_input_tokens_seen": 45498928, "step": 47630 }, { "epoch": 3.885716616363488, "grad_norm": 1.0142720937728882, "learning_rate": 3.835412413351108e-05, "loss": 0.3784, "num_input_tokens_seen": 45503472, "step": 47635 }, { "epoch": 3.8861244799738968, "grad_norm": 0.35744887590408325, "learning_rate": 3.835111505188991e-05, "loss": 0.315, "num_input_tokens_seen": 45508256, "step": 47640 }, { "epoch": 3.8865323435843053, "grad_norm": 0.5462813973426819, "learning_rate": 3.83481056996473e-05, "loss": 0.3528, "num_input_tokens_seen": 45513328, "step": 47645 }, { "epoch": 3.886940207194714, "grad_norm": 0.6166942715644836, "learning_rate": 3.834509607684427e-05, "loss": 0.3664, "num_input_tokens_seen": 45517712, "step": 47650 }, { "epoch": 3.887348070805123, "grad_norm": 0.5696490406990051, "learning_rate": 3.83420861835418e-05, "loss": 0.3555, "num_input_tokens_seen": 45522848, "step": 47655 }, { "epoch": 3.8877559344155315, "grad_norm": 0.5586224794387817, "learning_rate": 3.833907601980092e-05, "loss": 0.3429, "num_input_tokens_seen": 45527744, "step": 47660 }, { "epoch": 3.88816379802594, "grad_norm": 0.9270633459091187, "learning_rate": 3.833606558568264e-05, "loss": 0.3403, "num_input_tokens_seen": 45532384, "step": 47665 }, { "epoch": 3.888571661636349, "grad_norm": 0.29817697405815125, "learning_rate": 3.833305488124797e-05, "loss": 0.3525, "num_input_tokens_seen": 45537440, "step": 47670 }, { "epoch": 3.8889795252467576, "grad_norm": 0.6836824417114258, "learning_rate": 3.833004390655794e-05, "loss": 0.3474, "num_input_tokens_seen": 45541664, "step": 47675 }, { "epoch": 3.889387388857166, "grad_norm": 0.35798582434654236, "learning_rate": 3.83270326616736e-05, "loss": 0.3896, "num_input_tokens_seen": 45547104, "step": 47680 }, { "epoch": 3.8897952524675747, "grad_norm": 0.24819061160087585, "learning_rate": 3.832402114665596e-05, "loss": 0.3837, "num_input_tokens_seen": 45551856, "step": 47685 }, { "epoch": 3.8902031160779833, "grad_norm": 0.535067617893219, "learning_rate": 3.8321009361566075e-05, "loss": 0.3519, "num_input_tokens_seen": 45556880, "step": 47690 }, { "epoch": 3.8906109796883923, "grad_norm": 0.18294553458690643, "learning_rate": 3.8317997306464984e-05, "loss": 0.3225, "num_input_tokens_seen": 45560592, "step": 47695 }, { "epoch": 3.891018843298801, "grad_norm": 0.3092637062072754, "learning_rate": 3.831498498141376e-05, "loss": 0.3213, "num_input_tokens_seen": 45564160, "step": 47700 }, { "epoch": 3.8914267069092094, "grad_norm": 0.3624913990497589, "learning_rate": 3.831197238647344e-05, "loss": 0.3371, "num_input_tokens_seen": 45568080, "step": 47705 }, { "epoch": 3.8918345705196185, "grad_norm": 0.5366337895393372, "learning_rate": 3.83089595217051e-05, "loss": 0.3914, "num_input_tokens_seen": 45572928, "step": 47710 }, { "epoch": 3.892242434130027, "grad_norm": 0.8142876625061035, "learning_rate": 3.83059463871698e-05, "loss": 0.3643, "num_input_tokens_seen": 45577840, "step": 47715 }, { "epoch": 3.8926502977404356, "grad_norm": 0.39302799105644226, "learning_rate": 3.830293298292862e-05, "loss": 0.345, "num_input_tokens_seen": 45583360, "step": 47720 }, { "epoch": 3.893058161350844, "grad_norm": 0.6391968727111816, "learning_rate": 3.829991930904265e-05, "loss": 0.3459, "num_input_tokens_seen": 45588112, "step": 47725 }, { "epoch": 3.8934660249612527, "grad_norm": 0.39141955971717834, "learning_rate": 3.8296905365572965e-05, "loss": 0.3543, "num_input_tokens_seen": 45592928, "step": 47730 }, { "epoch": 3.8938738885716617, "grad_norm": 0.29212528467178345, "learning_rate": 3.829389115258065e-05, "loss": 0.3531, "num_input_tokens_seen": 45597360, "step": 47735 }, { "epoch": 3.8942817521820703, "grad_norm": 0.3400290608406067, "learning_rate": 3.829087667012682e-05, "loss": 0.3286, "num_input_tokens_seen": 45602128, "step": 47740 }, { "epoch": 3.894689615792479, "grad_norm": 0.3382572531700134, "learning_rate": 3.828786191827256e-05, "loss": 0.3402, "num_input_tokens_seen": 45607232, "step": 47745 }, { "epoch": 3.895097479402888, "grad_norm": 0.5769382119178772, "learning_rate": 3.828484689707899e-05, "loss": 0.3537, "num_input_tokens_seen": 45611632, "step": 47750 }, { "epoch": 3.8955053430132964, "grad_norm": 0.9705279469490051, "learning_rate": 3.828183160660722e-05, "loss": 0.4231, "num_input_tokens_seen": 45615856, "step": 47755 }, { "epoch": 3.895913206623705, "grad_norm": 0.7968047857284546, "learning_rate": 3.827881604691836e-05, "loss": 0.3413, "num_input_tokens_seen": 45620848, "step": 47760 }, { "epoch": 3.8963210702341136, "grad_norm": 0.8539262413978577, "learning_rate": 3.827580021807355e-05, "loss": 0.3723, "num_input_tokens_seen": 45625264, "step": 47765 }, { "epoch": 3.896728933844522, "grad_norm": 0.8171996474266052, "learning_rate": 3.8272784120133904e-05, "loss": 0.3467, "num_input_tokens_seen": 45630320, "step": 47770 }, { "epoch": 3.897136797454931, "grad_norm": 0.685831606388092, "learning_rate": 3.826976775316057e-05, "loss": 0.3429, "num_input_tokens_seen": 45634688, "step": 47775 }, { "epoch": 3.8975446610653397, "grad_norm": 0.49995431303977966, "learning_rate": 3.826675111721467e-05, "loss": 0.3683, "num_input_tokens_seen": 45639328, "step": 47780 }, { "epoch": 3.8979525246757483, "grad_norm": 0.757872998714447, "learning_rate": 3.826373421235737e-05, "loss": 0.3371, "num_input_tokens_seen": 45644720, "step": 47785 }, { "epoch": 3.8983603882861573, "grad_norm": 0.6865023374557495, "learning_rate": 3.826071703864982e-05, "loss": 0.3297, "num_input_tokens_seen": 45649520, "step": 47790 }, { "epoch": 3.898768251896566, "grad_norm": 0.9105896949768066, "learning_rate": 3.825769959615317e-05, "loss": 0.3841, "num_input_tokens_seen": 45654320, "step": 47795 }, { "epoch": 3.8991761155069744, "grad_norm": 0.837566614151001, "learning_rate": 3.825468188492857e-05, "loss": 0.3323, "num_input_tokens_seen": 45659168, "step": 47800 }, { "epoch": 3.8995839791173834, "grad_norm": 0.38135623931884766, "learning_rate": 3.825166390503722e-05, "loss": 0.3472, "num_input_tokens_seen": 45662960, "step": 47805 }, { "epoch": 3.899991842727792, "grad_norm": 0.32455986738204956, "learning_rate": 3.824864565654025e-05, "loss": 0.3233, "num_input_tokens_seen": 45667280, "step": 47810 }, { "epoch": 3.9003997063382005, "grad_norm": 0.30765050649642944, "learning_rate": 3.824562713949887e-05, "loss": 0.3961, "num_input_tokens_seen": 45671456, "step": 47815 }, { "epoch": 3.900807569948609, "grad_norm": 0.7856479287147522, "learning_rate": 3.824260835397427e-05, "loss": 0.3514, "num_input_tokens_seen": 45676032, "step": 47820 }, { "epoch": 3.9012154335590177, "grad_norm": 0.3972533345222473, "learning_rate": 3.823958930002761e-05, "loss": 0.3361, "num_input_tokens_seen": 45681168, "step": 47825 }, { "epoch": 3.9016232971694267, "grad_norm": 0.5961962342262268, "learning_rate": 3.823656997772011e-05, "loss": 0.3218, "num_input_tokens_seen": 45684896, "step": 47830 }, { "epoch": 3.9020311607798353, "grad_norm": 0.8008838295936584, "learning_rate": 3.823355038711296e-05, "loss": 0.4188, "num_input_tokens_seen": 45689776, "step": 47835 }, { "epoch": 3.902439024390244, "grad_norm": 0.668332040309906, "learning_rate": 3.823053052826736e-05, "loss": 0.342, "num_input_tokens_seen": 45694368, "step": 47840 }, { "epoch": 3.902846888000653, "grad_norm": 0.39535051584243774, "learning_rate": 3.822751040124453e-05, "loss": 0.3885, "num_input_tokens_seen": 45698608, "step": 47845 }, { "epoch": 3.9032547516110614, "grad_norm": 0.30151697993278503, "learning_rate": 3.82244900061057e-05, "loss": 0.3472, "num_input_tokens_seen": 45703680, "step": 47850 }, { "epoch": 3.90366261522147, "grad_norm": 1.1014078855514526, "learning_rate": 3.822146934291206e-05, "loss": 0.3306, "num_input_tokens_seen": 45708608, "step": 47855 }, { "epoch": 3.9040704788318785, "grad_norm": 0.5869154930114746, "learning_rate": 3.821844841172486e-05, "loss": 0.3363, "num_input_tokens_seen": 45713248, "step": 47860 }, { "epoch": 3.904478342442287, "grad_norm": 0.8418706059455872, "learning_rate": 3.8215427212605325e-05, "loss": 0.345, "num_input_tokens_seen": 45717504, "step": 47865 }, { "epoch": 3.904886206052696, "grad_norm": 0.6464124321937561, "learning_rate": 3.82124057456147e-05, "loss": 0.3762, "num_input_tokens_seen": 45722944, "step": 47870 }, { "epoch": 3.9052940696631047, "grad_norm": 1.0290327072143555, "learning_rate": 3.820938401081422e-05, "loss": 0.3317, "num_input_tokens_seen": 45728400, "step": 47875 }, { "epoch": 3.9057019332735132, "grad_norm": 0.7637935280799866, "learning_rate": 3.820636200826514e-05, "loss": 0.3913, "num_input_tokens_seen": 45733024, "step": 47880 }, { "epoch": 3.9061097968839222, "grad_norm": 0.36307695508003235, "learning_rate": 3.820333973802872e-05, "loss": 0.3535, "num_input_tokens_seen": 45738448, "step": 47885 }, { "epoch": 3.906517660494331, "grad_norm": 0.5879579186439514, "learning_rate": 3.8200317200166205e-05, "loss": 0.3505, "num_input_tokens_seen": 45742640, "step": 47890 }, { "epoch": 3.9069255241047394, "grad_norm": 0.48688018321990967, "learning_rate": 3.819729439473888e-05, "loss": 0.3555, "num_input_tokens_seen": 45747456, "step": 47895 }, { "epoch": 3.907333387715148, "grad_norm": 0.3346247971057892, "learning_rate": 3.8194271321808e-05, "loss": 0.3336, "num_input_tokens_seen": 45752144, "step": 47900 }, { "epoch": 3.9077412513255565, "grad_norm": 0.32493919134140015, "learning_rate": 3.819124798143484e-05, "loss": 0.3125, "num_input_tokens_seen": 45756416, "step": 47905 }, { "epoch": 3.9081491149359655, "grad_norm": 0.3631923198699951, "learning_rate": 3.81882243736807e-05, "loss": 0.3766, "num_input_tokens_seen": 45761232, "step": 47910 }, { "epoch": 3.908556978546374, "grad_norm": 0.40348154306411743, "learning_rate": 3.818520049860685e-05, "loss": 0.3405, "num_input_tokens_seen": 45766752, "step": 47915 }, { "epoch": 3.9089648421567826, "grad_norm": 0.6315974593162537, "learning_rate": 3.818217635627459e-05, "loss": 0.368, "num_input_tokens_seen": 45770768, "step": 47920 }, { "epoch": 3.9093727057671916, "grad_norm": 0.3465573787689209, "learning_rate": 3.817915194674522e-05, "loss": 0.3676, "num_input_tokens_seen": 45775552, "step": 47925 }, { "epoch": 3.9097805693776, "grad_norm": 0.6637179851531982, "learning_rate": 3.817612727008003e-05, "loss": 0.3356, "num_input_tokens_seen": 45780480, "step": 47930 }, { "epoch": 3.9101884329880088, "grad_norm": 0.6534812450408936, "learning_rate": 3.817310232634035e-05, "loss": 0.3662, "num_input_tokens_seen": 45785536, "step": 47935 }, { "epoch": 3.9105962965984173, "grad_norm": 0.21789255738258362, "learning_rate": 3.8170077115587484e-05, "loss": 0.3677, "num_input_tokens_seen": 45790432, "step": 47940 }, { "epoch": 3.911004160208826, "grad_norm": 0.3380158841609955, "learning_rate": 3.816705163788275e-05, "loss": 0.3391, "num_input_tokens_seen": 45796208, "step": 47945 }, { "epoch": 3.911412023819235, "grad_norm": 0.9003346562385559, "learning_rate": 3.816402589328746e-05, "loss": 0.3302, "num_input_tokens_seen": 45801936, "step": 47950 }, { "epoch": 3.9118198874296435, "grad_norm": 0.4895935356616974, "learning_rate": 3.8160999881862976e-05, "loss": 0.336, "num_input_tokens_seen": 45806976, "step": 47955 }, { "epoch": 3.912227751040052, "grad_norm": 0.20527853071689606, "learning_rate": 3.8157973603670606e-05, "loss": 0.3548, "num_input_tokens_seen": 45811616, "step": 47960 }, { "epoch": 3.912635614650461, "grad_norm": 0.6936365365982056, "learning_rate": 3.8154947058771704e-05, "loss": 0.3487, "num_input_tokens_seen": 45816480, "step": 47965 }, { "epoch": 3.9130434782608696, "grad_norm": 0.3084689974784851, "learning_rate": 3.8151920247227624e-05, "loss": 0.3478, "num_input_tokens_seen": 45821024, "step": 47970 }, { "epoch": 3.913451341871278, "grad_norm": 0.16416029632091522, "learning_rate": 3.81488931690997e-05, "loss": 0.3611, "num_input_tokens_seen": 45825776, "step": 47975 }, { "epoch": 3.913859205481687, "grad_norm": 0.15850917994976044, "learning_rate": 3.81458658244493e-05, "loss": 0.3545, "num_input_tokens_seen": 45830192, "step": 47980 }, { "epoch": 3.9142670690920958, "grad_norm": 0.27201536297798157, "learning_rate": 3.814283821333779e-05, "loss": 0.3422, "num_input_tokens_seen": 45834192, "step": 47985 }, { "epoch": 3.9146749327025043, "grad_norm": 0.7379587888717651, "learning_rate": 3.813981033582653e-05, "loss": 0.365, "num_input_tokens_seen": 45839104, "step": 47990 }, { "epoch": 3.915082796312913, "grad_norm": 0.42995280027389526, "learning_rate": 3.81367821919769e-05, "loss": 0.3555, "num_input_tokens_seen": 45843968, "step": 47995 }, { "epoch": 3.9154906599233215, "grad_norm": 0.6563489437103271, "learning_rate": 3.813375378185028e-05, "loss": 0.3522, "num_input_tokens_seen": 45848672, "step": 48000 }, { "epoch": 3.9158985235337305, "grad_norm": 0.5431073904037476, "learning_rate": 3.813072510550805e-05, "loss": 0.3014, "num_input_tokens_seen": 45852896, "step": 48005 }, { "epoch": 3.916306387144139, "grad_norm": 0.4033452570438385, "learning_rate": 3.81276961630116e-05, "loss": 0.3668, "num_input_tokens_seen": 45857936, "step": 48010 }, { "epoch": 3.9167142507545476, "grad_norm": 0.24168872833251953, "learning_rate": 3.8124666954422323e-05, "loss": 0.3454, "num_input_tokens_seen": 45862784, "step": 48015 }, { "epoch": 3.9171221143649566, "grad_norm": 0.5938300490379333, "learning_rate": 3.812163747980163e-05, "loss": 0.3799, "num_input_tokens_seen": 45867856, "step": 48020 }, { "epoch": 3.917529977975365, "grad_norm": 0.6156619787216187, "learning_rate": 3.8118607739210915e-05, "loss": 0.3179, "num_input_tokens_seen": 45870800, "step": 48025 }, { "epoch": 3.9179378415857737, "grad_norm": 0.5067434310913086, "learning_rate": 3.81155777327116e-05, "loss": 0.358, "num_input_tokens_seen": 45875408, "step": 48030 }, { "epoch": 3.9183457051961823, "grad_norm": 0.321835994720459, "learning_rate": 3.81125474603651e-05, "loss": 0.3557, "num_input_tokens_seen": 45880160, "step": 48035 }, { "epoch": 3.918753568806591, "grad_norm": 0.557700514793396, "learning_rate": 3.810951692223283e-05, "loss": 0.3547, "num_input_tokens_seen": 45884416, "step": 48040 }, { "epoch": 3.919161432417, "grad_norm": 0.7414960265159607, "learning_rate": 3.810648611837622e-05, "loss": 0.3509, "num_input_tokens_seen": 45889888, "step": 48045 }, { "epoch": 3.9195692960274084, "grad_norm": 0.6702306270599365, "learning_rate": 3.810345504885672e-05, "loss": 0.3425, "num_input_tokens_seen": 45894848, "step": 48050 }, { "epoch": 3.919977159637817, "grad_norm": 0.33447888493537903, "learning_rate": 3.810042371373574e-05, "loss": 0.3019, "num_input_tokens_seen": 45899616, "step": 48055 }, { "epoch": 3.920385023248226, "grad_norm": 0.3608686029911041, "learning_rate": 3.8097392113074735e-05, "loss": 0.3835, "num_input_tokens_seen": 45904480, "step": 48060 }, { "epoch": 3.9207928868586346, "grad_norm": 0.6185730695724487, "learning_rate": 3.8094360246935166e-05, "loss": 0.3473, "num_input_tokens_seen": 45908656, "step": 48065 }, { "epoch": 3.921200750469043, "grad_norm": 0.2945176362991333, "learning_rate": 3.809132811537848e-05, "loss": 0.3591, "num_input_tokens_seen": 45912800, "step": 48070 }, { "epoch": 3.9216086140794517, "grad_norm": 0.6858893036842346, "learning_rate": 3.808829571846614e-05, "loss": 0.2787, "num_input_tokens_seen": 45917936, "step": 48075 }, { "epoch": 3.9220164776898603, "grad_norm": 0.3951883912086487, "learning_rate": 3.8085263056259594e-05, "loss": 0.3316, "num_input_tokens_seen": 45922640, "step": 48080 }, { "epoch": 3.9224243413002693, "grad_norm": 0.5194891691207886, "learning_rate": 3.808223012882034e-05, "loss": 0.2738, "num_input_tokens_seen": 45927568, "step": 48085 }, { "epoch": 3.922832204910678, "grad_norm": 0.5532619953155518, "learning_rate": 3.8079196936209825e-05, "loss": 0.349, "num_input_tokens_seen": 45931504, "step": 48090 }, { "epoch": 3.9232400685210864, "grad_norm": 0.5218294262886047, "learning_rate": 3.8076163478489554e-05, "loss": 0.3074, "num_input_tokens_seen": 45936720, "step": 48095 }, { "epoch": 3.9236479321314954, "grad_norm": 0.4477602243423462, "learning_rate": 3.8073129755721006e-05, "loss": 0.3402, "num_input_tokens_seen": 45941568, "step": 48100 }, { "epoch": 3.924055795741904, "grad_norm": 0.9037319421768188, "learning_rate": 3.8070095767965674e-05, "loss": 0.4285, "num_input_tokens_seen": 45946512, "step": 48105 }, { "epoch": 3.9244636593523126, "grad_norm": 0.5894010663032532, "learning_rate": 3.806706151528506e-05, "loss": 0.3049, "num_input_tokens_seen": 45952032, "step": 48110 }, { "epoch": 3.924871522962721, "grad_norm": 0.49130281805992126, "learning_rate": 3.8064026997740645e-05, "loss": 0.2986, "num_input_tokens_seen": 45956512, "step": 48115 }, { "epoch": 3.9252793865731297, "grad_norm": 0.49767428636550903, "learning_rate": 3.806099221539397e-05, "loss": 0.3478, "num_input_tokens_seen": 45960496, "step": 48120 }, { "epoch": 3.9256872501835387, "grad_norm": 0.4861530065536499, "learning_rate": 3.805795716830653e-05, "loss": 0.3591, "num_input_tokens_seen": 45965696, "step": 48125 }, { "epoch": 3.9260951137939473, "grad_norm": 0.5621034502983093, "learning_rate": 3.805492185653984e-05, "loss": 0.3254, "num_input_tokens_seen": 45971024, "step": 48130 }, { "epoch": 3.926502977404356, "grad_norm": 0.5961653590202332, "learning_rate": 3.805188628015544e-05, "loss": 0.3409, "num_input_tokens_seen": 45975600, "step": 48135 }, { "epoch": 3.926910841014765, "grad_norm": 1.033332347869873, "learning_rate": 3.8048850439214844e-05, "loss": 0.3556, "num_input_tokens_seen": 45979824, "step": 48140 }, { "epoch": 3.9273187046251734, "grad_norm": 0.8157311081886292, "learning_rate": 3.804581433377959e-05, "loss": 0.3542, "num_input_tokens_seen": 45984528, "step": 48145 }, { "epoch": 3.927726568235582, "grad_norm": 0.8187433481216431, "learning_rate": 3.8042777963911234e-05, "loss": 0.3303, "num_input_tokens_seen": 45990112, "step": 48150 }, { "epoch": 3.9281344318459905, "grad_norm": 0.8939297795295715, "learning_rate": 3.803974132967131e-05, "loss": 0.3302, "num_input_tokens_seen": 45995392, "step": 48155 }, { "epoch": 3.928542295456399, "grad_norm": 0.6957107782363892, "learning_rate": 3.8036704431121365e-05, "loss": 0.3246, "num_input_tokens_seen": 45999904, "step": 48160 }, { "epoch": 3.928950159066808, "grad_norm": 0.46000194549560547, "learning_rate": 3.803366726832296e-05, "loss": 0.3386, "num_input_tokens_seen": 46005184, "step": 48165 }, { "epoch": 3.9293580226772167, "grad_norm": 0.47275373339653015, "learning_rate": 3.8030629841337654e-05, "loss": 0.3092, "num_input_tokens_seen": 46009904, "step": 48170 }, { "epoch": 3.9297658862876252, "grad_norm": 0.4332065284252167, "learning_rate": 3.802759215022703e-05, "loss": 0.2887, "num_input_tokens_seen": 46014432, "step": 48175 }, { "epoch": 3.9301737498980343, "grad_norm": 0.6575731635093689, "learning_rate": 3.802455419505264e-05, "loss": 0.3003, "num_input_tokens_seen": 46019200, "step": 48180 }, { "epoch": 3.930581613508443, "grad_norm": 0.5413504242897034, "learning_rate": 3.8021515975876074e-05, "loss": 0.3961, "num_input_tokens_seen": 46024352, "step": 48185 }, { "epoch": 3.9309894771188514, "grad_norm": 0.5324171781539917, "learning_rate": 3.801847749275891e-05, "loss": 0.3554, "num_input_tokens_seen": 46029216, "step": 48190 }, { "epoch": 3.9313973407292604, "grad_norm": 0.4682972729206085, "learning_rate": 3.8015438745762744e-05, "loss": 0.425, "num_input_tokens_seen": 46034384, "step": 48195 }, { "epoch": 3.931805204339669, "grad_norm": 0.8738107681274414, "learning_rate": 3.801239973494917e-05, "loss": 0.3325, "num_input_tokens_seen": 46039952, "step": 48200 }, { "epoch": 3.9322130679500775, "grad_norm": 0.3433242738246918, "learning_rate": 3.800936046037977e-05, "loss": 0.3491, "num_input_tokens_seen": 46044816, "step": 48205 }, { "epoch": 3.932620931560486, "grad_norm": 0.375742107629776, "learning_rate": 3.800632092211617e-05, "loss": 0.3632, "num_input_tokens_seen": 46049696, "step": 48210 }, { "epoch": 3.9330287951708947, "grad_norm": 0.5982052087783813, "learning_rate": 3.8003281120219966e-05, "loss": 0.3187, "num_input_tokens_seen": 46055056, "step": 48215 }, { "epoch": 3.9334366587813037, "grad_norm": 0.3228549659252167, "learning_rate": 3.800024105475278e-05, "loss": 0.3434, "num_input_tokens_seen": 46059904, "step": 48220 }, { "epoch": 3.9338445223917122, "grad_norm": 0.7206620573997498, "learning_rate": 3.799720072577623e-05, "loss": 0.3755, "num_input_tokens_seen": 46065280, "step": 48225 }, { "epoch": 3.934252386002121, "grad_norm": 0.697226345539093, "learning_rate": 3.7994160133351955e-05, "loss": 0.345, "num_input_tokens_seen": 46070624, "step": 48230 }, { "epoch": 3.93466024961253, "grad_norm": 0.3663904666900635, "learning_rate": 3.799111927754156e-05, "loss": 0.3465, "num_input_tokens_seen": 46075216, "step": 48235 }, { "epoch": 3.9350681132229384, "grad_norm": 0.29819628596305847, "learning_rate": 3.798807815840671e-05, "loss": 0.3534, "num_input_tokens_seen": 46080512, "step": 48240 }, { "epoch": 3.935475976833347, "grad_norm": 0.766586422920227, "learning_rate": 3.798503677600903e-05, "loss": 0.3409, "num_input_tokens_seen": 46085072, "step": 48245 }, { "epoch": 3.9358838404437555, "grad_norm": 0.8342559933662415, "learning_rate": 3.798199513041018e-05, "loss": 0.3645, "num_input_tokens_seen": 46089760, "step": 48250 }, { "epoch": 3.936291704054164, "grad_norm": 0.3971165120601654, "learning_rate": 3.797895322167179e-05, "loss": 0.3393, "num_input_tokens_seen": 46095104, "step": 48255 }, { "epoch": 3.936699567664573, "grad_norm": 0.7216019630432129, "learning_rate": 3.797591104985554e-05, "loss": 0.3335, "num_input_tokens_seen": 46099792, "step": 48260 }, { "epoch": 3.9371074312749816, "grad_norm": 0.2359199970960617, "learning_rate": 3.797286861502309e-05, "loss": 0.3533, "num_input_tokens_seen": 46104880, "step": 48265 }, { "epoch": 3.93751529488539, "grad_norm": 0.5167378187179565, "learning_rate": 3.79698259172361e-05, "loss": 0.3146, "num_input_tokens_seen": 46109376, "step": 48270 }, { "epoch": 3.937923158495799, "grad_norm": 0.5612035989761353, "learning_rate": 3.796678295655626e-05, "loss": 0.3545, "num_input_tokens_seen": 46113520, "step": 48275 }, { "epoch": 3.938331022106208, "grad_norm": 0.4418809711933136, "learning_rate": 3.7963739733045234e-05, "loss": 0.2677, "num_input_tokens_seen": 46118032, "step": 48280 }, { "epoch": 3.9387388857166163, "grad_norm": 0.47074007987976074, "learning_rate": 3.796069624676471e-05, "loss": 0.4154, "num_input_tokens_seen": 46123232, "step": 48285 }, { "epoch": 3.939146749327025, "grad_norm": 0.5411179065704346, "learning_rate": 3.795765249777637e-05, "loss": 0.2778, "num_input_tokens_seen": 46127792, "step": 48290 }, { "epoch": 3.9395546129374335, "grad_norm": 0.3233059346675873, "learning_rate": 3.795460848614194e-05, "loss": 0.3603, "num_input_tokens_seen": 46132592, "step": 48295 }, { "epoch": 3.9399624765478425, "grad_norm": 0.4882296919822693, "learning_rate": 3.795156421192309e-05, "loss": 0.2694, "num_input_tokens_seen": 46137056, "step": 48300 }, { "epoch": 3.940370340158251, "grad_norm": 0.40988704562187195, "learning_rate": 3.794851967518154e-05, "loss": 0.3449, "num_input_tokens_seen": 46140896, "step": 48305 }, { "epoch": 3.9407782037686596, "grad_norm": 0.9497297406196594, "learning_rate": 3.7945474875979004e-05, "loss": 0.4504, "num_input_tokens_seen": 46145856, "step": 48310 }, { "epoch": 3.9411860673790686, "grad_norm": 0.8061249256134033, "learning_rate": 3.794242981437718e-05, "loss": 0.3118, "num_input_tokens_seen": 46150928, "step": 48315 }, { "epoch": 3.941593930989477, "grad_norm": 0.5014991164207458, "learning_rate": 3.793938449043781e-05, "loss": 0.3446, "num_input_tokens_seen": 46155408, "step": 48320 }, { "epoch": 3.9420017945998858, "grad_norm": 0.34752002358436584, "learning_rate": 3.7936338904222615e-05, "loss": 0.3521, "num_input_tokens_seen": 46160720, "step": 48325 }, { "epoch": 3.9424096582102943, "grad_norm": 0.7923003435134888, "learning_rate": 3.793329305579333e-05, "loss": 0.346, "num_input_tokens_seen": 46166384, "step": 48330 }, { "epoch": 3.942817521820703, "grad_norm": 0.6100225448608398, "learning_rate": 3.793024694521168e-05, "loss": 0.3482, "num_input_tokens_seen": 46170336, "step": 48335 }, { "epoch": 3.943225385431112, "grad_norm": 0.42944997549057007, "learning_rate": 3.792720057253943e-05, "loss": 0.3437, "num_input_tokens_seen": 46174720, "step": 48340 }, { "epoch": 3.9436332490415205, "grad_norm": 0.8989141583442688, "learning_rate": 3.792415393783831e-05, "loss": 0.3695, "num_input_tokens_seen": 46180112, "step": 48345 }, { "epoch": 3.944041112651929, "grad_norm": 0.5035862326622009, "learning_rate": 3.792110704117009e-05, "loss": 0.3626, "num_input_tokens_seen": 46185312, "step": 48350 }, { "epoch": 3.944448976262338, "grad_norm": 0.7107192873954773, "learning_rate": 3.791805988259652e-05, "loss": 0.3528, "num_input_tokens_seen": 46190272, "step": 48355 }, { "epoch": 3.9448568398727466, "grad_norm": 0.4550701975822449, "learning_rate": 3.791501246217936e-05, "loss": 0.3582, "num_input_tokens_seen": 46195008, "step": 48360 }, { "epoch": 3.945264703483155, "grad_norm": 0.7680263519287109, "learning_rate": 3.791196477998038e-05, "loss": 0.3695, "num_input_tokens_seen": 46199936, "step": 48365 }, { "epoch": 3.945672567093564, "grad_norm": 0.4506993889808655, "learning_rate": 3.790891683606137e-05, "loss": 0.3675, "num_input_tokens_seen": 46205344, "step": 48370 }, { "epoch": 3.9460804307039727, "grad_norm": 0.4044141471385956, "learning_rate": 3.79058686304841e-05, "loss": 0.3634, "num_input_tokens_seen": 46209856, "step": 48375 }, { "epoch": 3.9464882943143813, "grad_norm": 0.4928452670574188, "learning_rate": 3.790282016331035e-05, "loss": 0.346, "num_input_tokens_seen": 46214208, "step": 48380 }, { "epoch": 3.94689615792479, "grad_norm": 0.6934022307395935, "learning_rate": 3.7899771434601926e-05, "loss": 0.3257, "num_input_tokens_seen": 46218416, "step": 48385 }, { "epoch": 3.9473040215351984, "grad_norm": 0.846095860004425, "learning_rate": 3.789672244442061e-05, "loss": 0.3359, "num_input_tokens_seen": 46222720, "step": 48390 }, { "epoch": 3.9477118851456074, "grad_norm": 0.3546636402606964, "learning_rate": 3.7893673192828215e-05, "loss": 0.3287, "num_input_tokens_seen": 46226272, "step": 48395 }, { "epoch": 3.948119748756016, "grad_norm": 0.43355312943458557, "learning_rate": 3.789062367988654e-05, "loss": 0.3535, "num_input_tokens_seen": 46231296, "step": 48400 }, { "epoch": 3.9485276123664246, "grad_norm": 0.5683164596557617, "learning_rate": 3.7887573905657404e-05, "loss": 0.366, "num_input_tokens_seen": 46236512, "step": 48405 }, { "epoch": 3.9489354759768336, "grad_norm": 0.35013818740844727, "learning_rate": 3.788452387020262e-05, "loss": 0.2922, "num_input_tokens_seen": 46241840, "step": 48410 }, { "epoch": 3.949343339587242, "grad_norm": 0.37358444929122925, "learning_rate": 3.788147357358401e-05, "loss": 0.3836, "num_input_tokens_seen": 46246800, "step": 48415 }, { "epoch": 3.9497512031976507, "grad_norm": 0.22907185554504395, "learning_rate": 3.787842301586341e-05, "loss": 0.3119, "num_input_tokens_seen": 46251600, "step": 48420 }, { "epoch": 3.9501590668080593, "grad_norm": 0.39722344279289246, "learning_rate": 3.7875372197102646e-05, "loss": 0.3695, "num_input_tokens_seen": 46257040, "step": 48425 }, { "epoch": 3.950566930418468, "grad_norm": 0.41947951912879944, "learning_rate": 3.7872321117363554e-05, "loss": 0.3426, "num_input_tokens_seen": 46261776, "step": 48430 }, { "epoch": 3.950974794028877, "grad_norm": 0.4845018982887268, "learning_rate": 3.7869269776707985e-05, "loss": 0.3445, "num_input_tokens_seen": 46266912, "step": 48435 }, { "epoch": 3.9513826576392854, "grad_norm": 0.35518673062324524, "learning_rate": 3.786621817519779e-05, "loss": 0.3334, "num_input_tokens_seen": 46271552, "step": 48440 }, { "epoch": 3.951790521249694, "grad_norm": 0.6192256808280945, "learning_rate": 3.786316631289481e-05, "loss": 0.3356, "num_input_tokens_seen": 46276160, "step": 48445 }, { "epoch": 3.952198384860103, "grad_norm": 0.43584758043289185, "learning_rate": 3.786011418986092e-05, "loss": 0.3488, "num_input_tokens_seen": 46281552, "step": 48450 }, { "epoch": 3.9526062484705116, "grad_norm": 0.5830323696136475, "learning_rate": 3.785706180615798e-05, "loss": 0.3739, "num_input_tokens_seen": 46286240, "step": 48455 }, { "epoch": 3.95301411208092, "grad_norm": 1.0361206531524658, "learning_rate": 3.785400916184786e-05, "loss": 0.317, "num_input_tokens_seen": 46290976, "step": 48460 }, { "epoch": 3.9534219756913287, "grad_norm": 0.5228633880615234, "learning_rate": 3.785095625699244e-05, "loss": 0.3627, "num_input_tokens_seen": 46295360, "step": 48465 }, { "epoch": 3.9538298393017373, "grad_norm": 0.5907332897186279, "learning_rate": 3.7847903091653595e-05, "loss": 0.3396, "num_input_tokens_seen": 46300160, "step": 48470 }, { "epoch": 3.9542377029121463, "grad_norm": 0.5895113348960876, "learning_rate": 3.784484966589322e-05, "loss": 0.3379, "num_input_tokens_seen": 46304576, "step": 48475 }, { "epoch": 3.954645566522555, "grad_norm": 0.8781521916389465, "learning_rate": 3.7841795979773195e-05, "loss": 0.3714, "num_input_tokens_seen": 46310000, "step": 48480 }, { "epoch": 3.9550534301329634, "grad_norm": 0.7146967053413391, "learning_rate": 3.783874203335542e-05, "loss": 0.3322, "num_input_tokens_seen": 46315296, "step": 48485 }, { "epoch": 3.9554612937433724, "grad_norm": 0.8341674208641052, "learning_rate": 3.78356878267018e-05, "loss": 0.3452, "num_input_tokens_seen": 46319376, "step": 48490 }, { "epoch": 3.955869157353781, "grad_norm": 0.35620734095573425, "learning_rate": 3.7832633359874244e-05, "loss": 0.3411, "num_input_tokens_seen": 46324192, "step": 48495 }, { "epoch": 3.9562770209641895, "grad_norm": 0.8194053769111633, "learning_rate": 3.782957863293466e-05, "loss": 0.3297, "num_input_tokens_seen": 46329200, "step": 48500 }, { "epoch": 3.956684884574598, "grad_norm": 0.9044781923294067, "learning_rate": 3.782652364594497e-05, "loss": 0.3612, "num_input_tokens_seen": 46333920, "step": 48505 }, { "epoch": 3.9570927481850067, "grad_norm": 0.9862326979637146, "learning_rate": 3.78234683989671e-05, "loss": 0.3549, "num_input_tokens_seen": 46337568, "step": 48510 }, { "epoch": 3.9575006117954157, "grad_norm": 0.8698949813842773, "learning_rate": 3.782041289206297e-05, "loss": 0.3634, "num_input_tokens_seen": 46342112, "step": 48515 }, { "epoch": 3.9579084754058242, "grad_norm": 0.2419722080230713, "learning_rate": 3.781735712529452e-05, "loss": 0.3391, "num_input_tokens_seen": 46347376, "step": 48520 }, { "epoch": 3.958316339016233, "grad_norm": 0.6246768832206726, "learning_rate": 3.781430109872368e-05, "loss": 0.344, "num_input_tokens_seen": 46352384, "step": 48525 }, { "epoch": 3.958724202626642, "grad_norm": 0.3148897886276245, "learning_rate": 3.7811244812412415e-05, "loss": 0.3449, "num_input_tokens_seen": 46356960, "step": 48530 }, { "epoch": 3.9591320662370504, "grad_norm": 0.9804763793945312, "learning_rate": 3.780818826642265e-05, "loss": 0.3635, "num_input_tokens_seen": 46361664, "step": 48535 }, { "epoch": 3.959539929847459, "grad_norm": 0.9450398683547974, "learning_rate": 3.7805131460816355e-05, "loss": 0.3818, "num_input_tokens_seen": 46365984, "step": 48540 }, { "epoch": 3.959947793457868, "grad_norm": 0.8741272687911987, "learning_rate": 3.78020743956555e-05, "loss": 0.3501, "num_input_tokens_seen": 46370928, "step": 48545 }, { "epoch": 3.9603556570682765, "grad_norm": 0.9595874547958374, "learning_rate": 3.779901707100202e-05, "loss": 0.3617, "num_input_tokens_seen": 46375104, "step": 48550 }, { "epoch": 3.960763520678685, "grad_norm": 0.9138177633285522, "learning_rate": 3.779595948691791e-05, "loss": 0.3309, "num_input_tokens_seen": 46380064, "step": 48555 }, { "epoch": 3.9611713842890937, "grad_norm": 0.9743458032608032, "learning_rate": 3.779290164346513e-05, "loss": 0.3575, "num_input_tokens_seen": 46384352, "step": 48560 }, { "epoch": 3.961579247899502, "grad_norm": 0.6621360182762146, "learning_rate": 3.7789843540705684e-05, "loss": 0.3288, "num_input_tokens_seen": 46389088, "step": 48565 }, { "epoch": 3.9619871115099112, "grad_norm": 0.20273612439632416, "learning_rate": 3.778678517870153e-05, "loss": 0.3674, "num_input_tokens_seen": 46393744, "step": 48570 }, { "epoch": 3.96239497512032, "grad_norm": 0.5904329419136047, "learning_rate": 3.778372655751468e-05, "loss": 0.3507, "num_input_tokens_seen": 46398640, "step": 48575 }, { "epoch": 3.9628028387307284, "grad_norm": 0.9239822626113892, "learning_rate": 3.778066767720713e-05, "loss": 0.3296, "num_input_tokens_seen": 46402880, "step": 48580 }, { "epoch": 3.9632107023411374, "grad_norm": 0.7466748952865601, "learning_rate": 3.777760853784087e-05, "loss": 0.3255, "num_input_tokens_seen": 46407776, "step": 48585 }, { "epoch": 3.963618565951546, "grad_norm": 0.30254194140434265, "learning_rate": 3.777454913947791e-05, "loss": 0.3913, "num_input_tokens_seen": 46412256, "step": 48590 }, { "epoch": 3.9640264295619545, "grad_norm": 0.6221737861633301, "learning_rate": 3.777148948218028e-05, "loss": 0.3585, "num_input_tokens_seen": 46417280, "step": 48595 }, { "epoch": 3.964434293172363, "grad_norm": 0.4581591784954071, "learning_rate": 3.776842956600997e-05, "loss": 0.3336, "num_input_tokens_seen": 46421840, "step": 48600 }, { "epoch": 3.9648421567827716, "grad_norm": 0.5648638010025024, "learning_rate": 3.7765369391029024e-05, "loss": 0.2962, "num_input_tokens_seen": 46426432, "step": 48605 }, { "epoch": 3.9652500203931806, "grad_norm": 0.5358710289001465, "learning_rate": 3.776230895729948e-05, "loss": 0.325, "num_input_tokens_seen": 46431632, "step": 48610 }, { "epoch": 3.965657884003589, "grad_norm": 0.9453283548355103, "learning_rate": 3.775924826488334e-05, "loss": 0.3788, "num_input_tokens_seen": 46436528, "step": 48615 }, { "epoch": 3.9660657476139978, "grad_norm": 1.0621601343154907, "learning_rate": 3.775618731384266e-05, "loss": 0.4254, "num_input_tokens_seen": 46441616, "step": 48620 }, { "epoch": 3.966473611224407, "grad_norm": 0.49094581604003906, "learning_rate": 3.775312610423949e-05, "loss": 0.2845, "num_input_tokens_seen": 46447040, "step": 48625 }, { "epoch": 3.9668814748348153, "grad_norm": 0.4583461582660675, "learning_rate": 3.7750064636135876e-05, "loss": 0.3481, "num_input_tokens_seen": 46451552, "step": 48630 }, { "epoch": 3.967289338445224, "grad_norm": 0.6266968846321106, "learning_rate": 3.7747002909593854e-05, "loss": 0.3657, "num_input_tokens_seen": 46455856, "step": 48635 }, { "epoch": 3.9676972020556325, "grad_norm": 0.6894346475601196, "learning_rate": 3.774394092467552e-05, "loss": 0.3618, "num_input_tokens_seen": 46460736, "step": 48640 }, { "epoch": 3.968105065666041, "grad_norm": 0.7765456438064575, "learning_rate": 3.77408786814429e-05, "loss": 0.3558, "num_input_tokens_seen": 46464144, "step": 48645 }, { "epoch": 3.96851292927645, "grad_norm": 0.6966270208358765, "learning_rate": 3.77378161799581e-05, "loss": 0.3627, "num_input_tokens_seen": 46468592, "step": 48650 }, { "epoch": 3.9689207928868586, "grad_norm": 0.698940634727478, "learning_rate": 3.773475342028316e-05, "loss": 0.3505, "num_input_tokens_seen": 46474192, "step": 48655 }, { "epoch": 3.969328656497267, "grad_norm": 0.6158643364906311, "learning_rate": 3.773169040248019e-05, "loss": 0.3833, "num_input_tokens_seen": 46479504, "step": 48660 }, { "epoch": 3.969736520107676, "grad_norm": 0.43701452016830444, "learning_rate": 3.772862712661126e-05, "loss": 0.3606, "num_input_tokens_seen": 46484368, "step": 48665 }, { "epoch": 3.9701443837180848, "grad_norm": 0.3493594825267792, "learning_rate": 3.772556359273848e-05, "loss": 0.3343, "num_input_tokens_seen": 46489728, "step": 48670 }, { "epoch": 3.9705522473284933, "grad_norm": 0.5279993414878845, "learning_rate": 3.7722499800923923e-05, "loss": 0.3518, "num_input_tokens_seen": 46494112, "step": 48675 }, { "epoch": 3.970960110938902, "grad_norm": 0.6300938129425049, "learning_rate": 3.77194357512297e-05, "loss": 0.35, "num_input_tokens_seen": 46498960, "step": 48680 }, { "epoch": 3.9713679745493105, "grad_norm": 0.15522608160972595, "learning_rate": 3.771637144371792e-05, "loss": 0.3442, "num_input_tokens_seen": 46503312, "step": 48685 }, { "epoch": 3.9717758381597195, "grad_norm": 0.43965497612953186, "learning_rate": 3.771330687845071e-05, "loss": 0.3605, "num_input_tokens_seen": 46508512, "step": 48690 }, { "epoch": 3.972183701770128, "grad_norm": 0.6976544857025146, "learning_rate": 3.771024205549015e-05, "loss": 0.3623, "num_input_tokens_seen": 46513392, "step": 48695 }, { "epoch": 3.9725915653805366, "grad_norm": 0.8194757103919983, "learning_rate": 3.7707176974898396e-05, "loss": 0.3399, "num_input_tokens_seen": 46518448, "step": 48700 }, { "epoch": 3.9729994289909456, "grad_norm": 0.5778632164001465, "learning_rate": 3.770411163673756e-05, "loss": 0.3565, "num_input_tokens_seen": 46523520, "step": 48705 }, { "epoch": 3.973407292601354, "grad_norm": 0.9558034539222717, "learning_rate": 3.7701046041069785e-05, "loss": 0.3696, "num_input_tokens_seen": 46527968, "step": 48710 }, { "epoch": 3.9738151562117627, "grad_norm": 0.4782624840736389, "learning_rate": 3.7697980187957196e-05, "loss": 0.3817, "num_input_tokens_seen": 46533008, "step": 48715 }, { "epoch": 3.9742230198221717, "grad_norm": 0.7247973084449768, "learning_rate": 3.7694914077461945e-05, "loss": 0.3307, "num_input_tokens_seen": 46538432, "step": 48720 }, { "epoch": 3.9746308834325803, "grad_norm": 0.6549444794654846, "learning_rate": 3.7691847709646174e-05, "loss": 0.3363, "num_input_tokens_seen": 46542720, "step": 48725 }, { "epoch": 3.975038747042989, "grad_norm": 0.6032321453094482, "learning_rate": 3.7688781084572064e-05, "loss": 0.3488, "num_input_tokens_seen": 46547536, "step": 48730 }, { "epoch": 3.9754466106533974, "grad_norm": 0.7411758899688721, "learning_rate": 3.768571420230173e-05, "loss": 0.3548, "num_input_tokens_seen": 46551968, "step": 48735 }, { "epoch": 3.975854474263806, "grad_norm": 0.29324838519096375, "learning_rate": 3.768264706289738e-05, "loss": 0.3363, "num_input_tokens_seen": 46556816, "step": 48740 }, { "epoch": 3.976262337874215, "grad_norm": 0.6376280188560486, "learning_rate": 3.767957966642115e-05, "loss": 0.3806, "num_input_tokens_seen": 46562608, "step": 48745 }, { "epoch": 3.9766702014846236, "grad_norm": 0.23797857761383057, "learning_rate": 3.767651201293523e-05, "loss": 0.3438, "num_input_tokens_seen": 46568080, "step": 48750 }, { "epoch": 3.977078065095032, "grad_norm": 0.2787351906299591, "learning_rate": 3.7673444102501806e-05, "loss": 0.3784, "num_input_tokens_seen": 46572048, "step": 48755 }, { "epoch": 3.977485928705441, "grad_norm": 0.28420209884643555, "learning_rate": 3.7670375935183045e-05, "loss": 0.3396, "num_input_tokens_seen": 46576512, "step": 48760 }, { "epoch": 3.9778937923158497, "grad_norm": 0.42341887950897217, "learning_rate": 3.7667307511041164e-05, "loss": 0.2956, "num_input_tokens_seen": 46580704, "step": 48765 }, { "epoch": 3.9783016559262583, "grad_norm": 0.5512499213218689, "learning_rate": 3.766423883013832e-05, "loss": 0.32, "num_input_tokens_seen": 46585968, "step": 48770 }, { "epoch": 3.978709519536667, "grad_norm": 0.6097770929336548, "learning_rate": 3.766116989253675e-05, "loss": 0.389, "num_input_tokens_seen": 46591152, "step": 48775 }, { "epoch": 3.9791173831470754, "grad_norm": 0.5802054405212402, "learning_rate": 3.7658100698298635e-05, "loss": 0.3217, "num_input_tokens_seen": 46595968, "step": 48780 }, { "epoch": 3.9795252467574844, "grad_norm": 0.44298654794692993, "learning_rate": 3.765503124748621e-05, "loss": 0.3438, "num_input_tokens_seen": 46601040, "step": 48785 }, { "epoch": 3.979933110367893, "grad_norm": 0.4940708875656128, "learning_rate": 3.765196154016167e-05, "loss": 0.2878, "num_input_tokens_seen": 46606304, "step": 48790 }, { "epoch": 3.9803409739783016, "grad_norm": 0.195180281996727, "learning_rate": 3.764889157638726e-05, "loss": 0.3456, "num_input_tokens_seen": 46610576, "step": 48795 }, { "epoch": 3.9807488375887106, "grad_norm": 0.3672303557395935, "learning_rate": 3.7645821356225177e-05, "loss": 0.3252, "num_input_tokens_seen": 46615632, "step": 48800 }, { "epoch": 3.981156701199119, "grad_norm": 1.155644416809082, "learning_rate": 3.764275087973768e-05, "loss": 0.4059, "num_input_tokens_seen": 46621328, "step": 48805 }, { "epoch": 3.9815645648095277, "grad_norm": 0.4365742802619934, "learning_rate": 3.7639680146986986e-05, "loss": 0.3921, "num_input_tokens_seen": 46625824, "step": 48810 }, { "epoch": 3.9819724284199363, "grad_norm": 0.26726680994033813, "learning_rate": 3.7636609158035344e-05, "loss": 0.3763, "num_input_tokens_seen": 46629456, "step": 48815 }, { "epoch": 3.982380292030345, "grad_norm": 0.6878303289413452, "learning_rate": 3.763353791294502e-05, "loss": 0.3275, "num_input_tokens_seen": 46634640, "step": 48820 }, { "epoch": 3.982788155640754, "grad_norm": 0.4382306635379791, "learning_rate": 3.7630466411778235e-05, "loss": 0.3489, "num_input_tokens_seen": 46639056, "step": 48825 }, { "epoch": 3.9831960192511624, "grad_norm": 0.8747763633728027, "learning_rate": 3.762739465459727e-05, "loss": 0.3364, "num_input_tokens_seen": 46643520, "step": 48830 }, { "epoch": 3.983603882861571, "grad_norm": 0.6283436417579651, "learning_rate": 3.7624322641464385e-05, "loss": 0.3245, "num_input_tokens_seen": 46648416, "step": 48835 }, { "epoch": 3.98401174647198, "grad_norm": 0.45326539874076843, "learning_rate": 3.762125037244183e-05, "loss": 0.3329, "num_input_tokens_seen": 46653472, "step": 48840 }, { "epoch": 3.9844196100823885, "grad_norm": 0.760068416595459, "learning_rate": 3.761817784759191e-05, "loss": 0.3402, "num_input_tokens_seen": 46658064, "step": 48845 }, { "epoch": 3.984827473692797, "grad_norm": 0.5011380314826965, "learning_rate": 3.761510506697687e-05, "loss": 0.3862, "num_input_tokens_seen": 46663344, "step": 48850 }, { "epoch": 3.9852353373032057, "grad_norm": 0.28469255566596985, "learning_rate": 3.761203203065902e-05, "loss": 0.3219, "num_input_tokens_seen": 46667504, "step": 48855 }, { "epoch": 3.9856432009136142, "grad_norm": 0.941762387752533, "learning_rate": 3.760895873870064e-05, "loss": 0.3445, "num_input_tokens_seen": 46672144, "step": 48860 }, { "epoch": 3.9860510645240232, "grad_norm": 0.6438848376274109, "learning_rate": 3.7605885191164024e-05, "loss": 0.3503, "num_input_tokens_seen": 46676864, "step": 48865 }, { "epoch": 3.986458928134432, "grad_norm": 0.5452584624290466, "learning_rate": 3.760281138811147e-05, "loss": 0.3469, "num_input_tokens_seen": 46681872, "step": 48870 }, { "epoch": 3.9868667917448404, "grad_norm": 0.7904032468795776, "learning_rate": 3.759973732960529e-05, "loss": 0.4039, "num_input_tokens_seen": 46687040, "step": 48875 }, { "epoch": 3.9872746553552494, "grad_norm": 0.6290597915649414, "learning_rate": 3.7596663015707776e-05, "loss": 0.3619, "num_input_tokens_seen": 46691808, "step": 48880 }, { "epoch": 3.987682518965658, "grad_norm": 1.0261512994766235, "learning_rate": 3.759358844648127e-05, "loss": 0.3642, "num_input_tokens_seen": 46696272, "step": 48885 }, { "epoch": 3.9880903825760665, "grad_norm": 0.7651646137237549, "learning_rate": 3.759051362198808e-05, "loss": 0.3426, "num_input_tokens_seen": 46700688, "step": 48890 }, { "epoch": 3.988498246186475, "grad_norm": 0.8461153507232666, "learning_rate": 3.758743854229051e-05, "loss": 0.3676, "num_input_tokens_seen": 46705232, "step": 48895 }, { "epoch": 3.9889061097968836, "grad_norm": 0.7977856993675232, "learning_rate": 3.758436320745092e-05, "loss": 0.354, "num_input_tokens_seen": 46709584, "step": 48900 }, { "epoch": 3.9893139734072927, "grad_norm": 0.7682337760925293, "learning_rate": 3.758128761753164e-05, "loss": 0.3635, "num_input_tokens_seen": 46714496, "step": 48905 }, { "epoch": 3.9897218370177012, "grad_norm": 0.35044148564338684, "learning_rate": 3.7578211772595e-05, "loss": 0.3526, "num_input_tokens_seen": 46718784, "step": 48910 }, { "epoch": 3.99012970062811, "grad_norm": 0.6396409273147583, "learning_rate": 3.7575135672703356e-05, "loss": 0.3264, "num_input_tokens_seen": 46723376, "step": 48915 }, { "epoch": 3.990537564238519, "grad_norm": 0.7406123876571655, "learning_rate": 3.757205931791905e-05, "loss": 0.3642, "num_input_tokens_seen": 46728928, "step": 48920 }, { "epoch": 3.9909454278489274, "grad_norm": 0.5951330065727234, "learning_rate": 3.7568982708304444e-05, "loss": 0.3487, "num_input_tokens_seen": 46732576, "step": 48925 }, { "epoch": 3.991353291459336, "grad_norm": 0.5873461961746216, "learning_rate": 3.75659058439219e-05, "loss": 0.3486, "num_input_tokens_seen": 46738032, "step": 48930 }, { "epoch": 3.991761155069745, "grad_norm": 0.21433469653129578, "learning_rate": 3.756282872483379e-05, "loss": 0.369, "num_input_tokens_seen": 46742688, "step": 48935 }, { "epoch": 3.9921690186801535, "grad_norm": 0.5622087121009827, "learning_rate": 3.755975135110247e-05, "loss": 0.3395, "num_input_tokens_seen": 46747760, "step": 48940 }, { "epoch": 3.992576882290562, "grad_norm": 0.3682497441768646, "learning_rate": 3.7556673722790336e-05, "loss": 0.3243, "num_input_tokens_seen": 46752784, "step": 48945 }, { "epoch": 3.9929847459009706, "grad_norm": 0.4395735263824463, "learning_rate": 3.755359583995975e-05, "loss": 0.3352, "num_input_tokens_seen": 46757552, "step": 48950 }, { "epoch": 3.993392609511379, "grad_norm": 0.3904598355293274, "learning_rate": 3.755051770267313e-05, "loss": 0.3333, "num_input_tokens_seen": 46762048, "step": 48955 }, { "epoch": 3.993800473121788, "grad_norm": 0.5396977663040161, "learning_rate": 3.7547439310992835e-05, "loss": 0.3875, "num_input_tokens_seen": 46765920, "step": 48960 }, { "epoch": 3.9942083367321968, "grad_norm": 0.5333108305931091, "learning_rate": 3.7544360664981284e-05, "loss": 0.3712, "num_input_tokens_seen": 46770240, "step": 48965 }, { "epoch": 3.9946162003426053, "grad_norm": 0.8097891211509705, "learning_rate": 3.7541281764700864e-05, "loss": 0.3552, "num_input_tokens_seen": 46775328, "step": 48970 }, { "epoch": 3.9950240639530143, "grad_norm": 0.4751780927181244, "learning_rate": 3.7538202610214e-05, "loss": 0.3428, "num_input_tokens_seen": 46780240, "step": 48975 }, { "epoch": 3.995431927563423, "grad_norm": 0.8972764611244202, "learning_rate": 3.753512320158309e-05, "loss": 0.3622, "num_input_tokens_seen": 46784784, "step": 48980 }, { "epoch": 3.9958397911738315, "grad_norm": 0.797951340675354, "learning_rate": 3.753204353887057e-05, "loss": 0.3483, "num_input_tokens_seen": 46789888, "step": 48985 }, { "epoch": 3.99624765478424, "grad_norm": 0.7709622979164124, "learning_rate": 3.7528963622138833e-05, "loss": 0.3279, "num_input_tokens_seen": 46794256, "step": 48990 }, { "epoch": 3.9966555183946486, "grad_norm": 0.9653489589691162, "learning_rate": 3.7525883451450346e-05, "loss": 0.3648, "num_input_tokens_seen": 46799584, "step": 48995 }, { "epoch": 3.9970633820050576, "grad_norm": 0.45834803581237793, "learning_rate": 3.752280302686751e-05, "loss": 0.3186, "num_input_tokens_seen": 46804480, "step": 49000 }, { "epoch": 3.997471245615466, "grad_norm": 0.3449271321296692, "learning_rate": 3.751972234845279e-05, "loss": 0.3736, "num_input_tokens_seen": 46809264, "step": 49005 }, { "epoch": 3.9978791092258747, "grad_norm": 0.5321688055992126, "learning_rate": 3.7516641416268614e-05, "loss": 0.3736, "num_input_tokens_seen": 46813856, "step": 49010 }, { "epoch": 3.9982869728362838, "grad_norm": 0.3726487457752228, "learning_rate": 3.7513560230377434e-05, "loss": 0.3205, "num_input_tokens_seen": 46818336, "step": 49015 }, { "epoch": 3.9986948364466923, "grad_norm": 1.1265275478363037, "learning_rate": 3.751047879084171e-05, "loss": 0.3436, "num_input_tokens_seen": 46823584, "step": 49020 }, { "epoch": 3.999102700057101, "grad_norm": 0.5017306208610535, "learning_rate": 3.750739709772388e-05, "loss": 0.2854, "num_input_tokens_seen": 46828592, "step": 49025 }, { "epoch": 3.9995105636675095, "grad_norm": 1.010117530822754, "learning_rate": 3.7504315151086446e-05, "loss": 0.3602, "num_input_tokens_seen": 46833280, "step": 49030 }, { "epoch": 3.999918427277918, "grad_norm": 0.5444624423980713, "learning_rate": 3.750123295099185e-05, "loss": 0.341, "num_input_tokens_seen": 46837728, "step": 49035 }, { "epoch": 4.000326290888327, "grad_norm": 0.30300697684288025, "learning_rate": 3.7498150497502575e-05, "loss": 0.3363, "num_input_tokens_seen": 46843184, "step": 49040 }, { "epoch": 4.000326290888327, "eval_loss": 0.339950829744339, "eval_runtime": 570.9374, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 46843184, "step": 49040 }, { "epoch": 4.000734154498736, "grad_norm": 0.6365991234779358, "learning_rate": 3.74950677906811e-05, "loss": 0.3574, "num_input_tokens_seen": 46848816, "step": 49045 }, { "epoch": 4.001142018109144, "grad_norm": 0.532974898815155, "learning_rate": 3.749198483058991e-05, "loss": 0.3028, "num_input_tokens_seen": 46853488, "step": 49050 }, { "epoch": 4.001549881719553, "grad_norm": 0.8290227651596069, "learning_rate": 3.748890161729149e-05, "loss": 0.3916, "num_input_tokens_seen": 46858768, "step": 49055 }, { "epoch": 4.001957745329961, "grad_norm": 0.455314040184021, "learning_rate": 3.748581815084835e-05, "loss": 0.3653, "num_input_tokens_seen": 46863008, "step": 49060 }, { "epoch": 4.00236560894037, "grad_norm": 0.40552476048469543, "learning_rate": 3.748273443132298e-05, "loss": 0.3498, "num_input_tokens_seen": 46866544, "step": 49065 }, { "epoch": 4.002773472550779, "grad_norm": 0.7779103517532349, "learning_rate": 3.747965045877788e-05, "loss": 0.3462, "num_input_tokens_seen": 46870688, "step": 49070 }, { "epoch": 4.003181336161187, "grad_norm": 0.3322553038597107, "learning_rate": 3.747656623327558e-05, "loss": 0.3193, "num_input_tokens_seen": 46874592, "step": 49075 }, { "epoch": 4.003589199771596, "grad_norm": 0.3432053327560425, "learning_rate": 3.747348175487857e-05, "loss": 0.3568, "num_input_tokens_seen": 46879520, "step": 49080 }, { "epoch": 4.0039970633820055, "grad_norm": 0.719010591506958, "learning_rate": 3.7470397023649395e-05, "loss": 0.3585, "num_input_tokens_seen": 46884656, "step": 49085 }, { "epoch": 4.004404926992414, "grad_norm": 0.6662916541099548, "learning_rate": 3.746731203965057e-05, "loss": 0.3434, "num_input_tokens_seen": 46889552, "step": 49090 }, { "epoch": 4.004812790602823, "grad_norm": 0.37134337425231934, "learning_rate": 3.746422680294462e-05, "loss": 0.3891, "num_input_tokens_seen": 46894320, "step": 49095 }, { "epoch": 4.005220654213231, "grad_norm": 0.463046133518219, "learning_rate": 3.74611413135941e-05, "loss": 0.3621, "num_input_tokens_seen": 46899072, "step": 49100 }, { "epoch": 4.00562851782364, "grad_norm": 1.0021982192993164, "learning_rate": 3.7458055571661537e-05, "loss": 0.3436, "num_input_tokens_seen": 46903856, "step": 49105 }, { "epoch": 4.006036381434049, "grad_norm": 0.4054143726825714, "learning_rate": 3.745496957720948e-05, "loss": 0.3571, "num_input_tokens_seen": 46907728, "step": 49110 }, { "epoch": 4.006444245044457, "grad_norm": 0.9041697978973389, "learning_rate": 3.745188333030048e-05, "loss": 0.3752, "num_input_tokens_seen": 46912848, "step": 49115 }, { "epoch": 4.006852108654866, "grad_norm": 0.41479602456092834, "learning_rate": 3.7448796830997105e-05, "loss": 0.3373, "num_input_tokens_seen": 46916992, "step": 49120 }, { "epoch": 4.007259972265275, "grad_norm": 0.46037718653678894, "learning_rate": 3.74457100793619e-05, "loss": 0.3335, "num_input_tokens_seen": 46921648, "step": 49125 }, { "epoch": 4.007667835875683, "grad_norm": 0.6716669797897339, "learning_rate": 3.744262307545744e-05, "loss": 0.3346, "num_input_tokens_seen": 46927056, "step": 49130 }, { "epoch": 4.008075699486092, "grad_norm": 0.8528984785079956, "learning_rate": 3.7439535819346295e-05, "loss": 0.3496, "num_input_tokens_seen": 46932880, "step": 49135 }, { "epoch": 4.0084835630965, "grad_norm": 0.9481276273727417, "learning_rate": 3.743644831109105e-05, "loss": 0.3764, "num_input_tokens_seen": 46937776, "step": 49140 }, { "epoch": 4.008891426706909, "grad_norm": 0.2617725729942322, "learning_rate": 3.7433360550754284e-05, "loss": 0.3443, "num_input_tokens_seen": 46942656, "step": 49145 }, { "epoch": 4.009299290317318, "grad_norm": 0.6961878538131714, "learning_rate": 3.743027253839857e-05, "loss": 0.3521, "num_input_tokens_seen": 46948208, "step": 49150 }, { "epoch": 4.009707153927726, "grad_norm": 0.44893577694892883, "learning_rate": 3.742718427408653e-05, "loss": 0.3559, "num_input_tokens_seen": 46952704, "step": 49155 }, { "epoch": 4.010115017538135, "grad_norm": 0.39324894547462463, "learning_rate": 3.742409575788074e-05, "loss": 0.3397, "num_input_tokens_seen": 46957232, "step": 49160 }, { "epoch": 4.010522881148544, "grad_norm": 0.8538744449615479, "learning_rate": 3.742100698984381e-05, "loss": 0.3275, "num_input_tokens_seen": 46962368, "step": 49165 }, { "epoch": 4.010930744758952, "grad_norm": 0.5974347591400146, "learning_rate": 3.741791797003834e-05, "loss": 0.3432, "num_input_tokens_seen": 46966672, "step": 49170 }, { "epoch": 4.011338608369361, "grad_norm": 0.5638179183006287, "learning_rate": 3.741482869852696e-05, "loss": 0.3714, "num_input_tokens_seen": 46971616, "step": 49175 }, { "epoch": 4.0117464719797695, "grad_norm": 0.2877819836139679, "learning_rate": 3.741173917537227e-05, "loss": 0.349, "num_input_tokens_seen": 46976416, "step": 49180 }, { "epoch": 4.0121543355901785, "grad_norm": 0.5209717750549316, "learning_rate": 3.7408649400636894e-05, "loss": 0.3365, "num_input_tokens_seen": 46980944, "step": 49185 }, { "epoch": 4.0125621992005875, "grad_norm": 0.5425300598144531, "learning_rate": 3.740555937438348e-05, "loss": 0.2987, "num_input_tokens_seen": 46986224, "step": 49190 }, { "epoch": 4.012970062810996, "grad_norm": 0.765984058380127, "learning_rate": 3.7402469096674643e-05, "loss": 0.356, "num_input_tokens_seen": 46991584, "step": 49195 }, { "epoch": 4.013377926421405, "grad_norm": 0.5237196087837219, "learning_rate": 3.739937856757303e-05, "loss": 0.3606, "num_input_tokens_seen": 46996208, "step": 49200 }, { "epoch": 4.013785790031814, "grad_norm": 0.7117522954940796, "learning_rate": 3.7396287787141285e-05, "loss": 0.3219, "num_input_tokens_seen": 47001648, "step": 49205 }, { "epoch": 4.014193653642222, "grad_norm": 0.5975469350814819, "learning_rate": 3.739319675544204e-05, "loss": 0.3713, "num_input_tokens_seen": 47005872, "step": 49210 }, { "epoch": 4.014601517252631, "grad_norm": 0.40098413825035095, "learning_rate": 3.739010547253798e-05, "loss": 0.3291, "num_input_tokens_seen": 47011232, "step": 49215 }, { "epoch": 4.01500938086304, "grad_norm": 0.2694171667098999, "learning_rate": 3.738701393849174e-05, "loss": 0.3264, "num_input_tokens_seen": 47015600, "step": 49220 }, { "epoch": 4.015417244473448, "grad_norm": 0.4879594147205353, "learning_rate": 3.738392215336599e-05, "loss": 0.312, "num_input_tokens_seen": 47020096, "step": 49225 }, { "epoch": 4.015825108083857, "grad_norm": 0.5870131254196167, "learning_rate": 3.73808301172234e-05, "loss": 0.2952, "num_input_tokens_seen": 47024832, "step": 49230 }, { "epoch": 4.016232971694265, "grad_norm": 0.5297076106071472, "learning_rate": 3.7377737830126645e-05, "loss": 0.2917, "num_input_tokens_seen": 47029472, "step": 49235 }, { "epoch": 4.016640835304674, "grad_norm": 0.4479740560054779, "learning_rate": 3.7374645292138414e-05, "loss": 0.2835, "num_input_tokens_seen": 47034080, "step": 49240 }, { "epoch": 4.017048698915083, "grad_norm": 0.4803348183631897, "learning_rate": 3.737155250332136e-05, "loss": 0.4557, "num_input_tokens_seen": 47038704, "step": 49245 }, { "epoch": 4.017456562525491, "grad_norm": 0.41820207238197327, "learning_rate": 3.736845946373821e-05, "loss": 0.2866, "num_input_tokens_seen": 47044144, "step": 49250 }, { "epoch": 4.0178644261359, "grad_norm": 0.49653708934783936, "learning_rate": 3.736536617345163e-05, "loss": 0.3494, "num_input_tokens_seen": 47048912, "step": 49255 }, { "epoch": 4.018272289746309, "grad_norm": 0.6274445056915283, "learning_rate": 3.736227263252435e-05, "loss": 0.3682, "num_input_tokens_seen": 47053760, "step": 49260 }, { "epoch": 4.018680153356717, "grad_norm": 0.6680428981781006, "learning_rate": 3.735917884101905e-05, "loss": 0.2868, "num_input_tokens_seen": 47059344, "step": 49265 }, { "epoch": 4.019088016967126, "grad_norm": 0.5130148530006409, "learning_rate": 3.7356084798998436e-05, "loss": 0.3755, "num_input_tokens_seen": 47064640, "step": 49270 }, { "epoch": 4.0194958805775345, "grad_norm": 0.644635021686554, "learning_rate": 3.735299050652524e-05, "loss": 0.3644, "num_input_tokens_seen": 47069856, "step": 49275 }, { "epoch": 4.0199037441879435, "grad_norm": 0.5327189564704895, "learning_rate": 3.7349895963662173e-05, "loss": 0.2714, "num_input_tokens_seen": 47075040, "step": 49280 }, { "epoch": 4.0203116077983525, "grad_norm": 0.7472559213638306, "learning_rate": 3.734680117047197e-05, "loss": 0.3841, "num_input_tokens_seen": 47080016, "step": 49285 }, { "epoch": 4.020719471408761, "grad_norm": 1.358381748199463, "learning_rate": 3.734370612701735e-05, "loss": 0.3043, "num_input_tokens_seen": 47084080, "step": 49290 }, { "epoch": 4.02112733501917, "grad_norm": 0.6786341071128845, "learning_rate": 3.734061083336105e-05, "loss": 0.3156, "num_input_tokens_seen": 47088368, "step": 49295 }, { "epoch": 4.021535198629579, "grad_norm": 0.480074405670166, "learning_rate": 3.7337515289565817e-05, "loss": 0.3877, "num_input_tokens_seen": 47092768, "step": 49300 }, { "epoch": 4.021943062239987, "grad_norm": 0.5767781138420105, "learning_rate": 3.7334419495694385e-05, "loss": 0.3464, "num_input_tokens_seen": 47097728, "step": 49305 }, { "epoch": 4.022350925850396, "grad_norm": 0.6062692999839783, "learning_rate": 3.733132345180951e-05, "loss": 0.3373, "num_input_tokens_seen": 47102432, "step": 49310 }, { "epoch": 4.022758789460804, "grad_norm": 0.7500323057174683, "learning_rate": 3.732822715797396e-05, "loss": 0.3176, "num_input_tokens_seen": 47106960, "step": 49315 }, { "epoch": 4.023166653071213, "grad_norm": 0.293183833360672, "learning_rate": 3.732513061425048e-05, "loss": 0.2991, "num_input_tokens_seen": 47111792, "step": 49320 }, { "epoch": 4.023574516681622, "grad_norm": 0.6684041619300842, "learning_rate": 3.732203382070183e-05, "loss": 0.2905, "num_input_tokens_seen": 47117104, "step": 49325 }, { "epoch": 4.02398238029203, "grad_norm": 1.199236273765564, "learning_rate": 3.73189367773908e-05, "loss": 0.3744, "num_input_tokens_seen": 47122480, "step": 49330 }, { "epoch": 4.024390243902439, "grad_norm": 0.7385337948799133, "learning_rate": 3.731583948438015e-05, "loss": 0.3493, "num_input_tokens_seen": 47127472, "step": 49335 }, { "epoch": 4.024798107512848, "grad_norm": 0.5470011234283447, "learning_rate": 3.731274194173268e-05, "loss": 0.3175, "num_input_tokens_seen": 47131952, "step": 49340 }, { "epoch": 4.025205971123256, "grad_norm": 0.7200578451156616, "learning_rate": 3.730964414951115e-05, "loss": 0.3602, "num_input_tokens_seen": 47137056, "step": 49345 }, { "epoch": 4.025613834733665, "grad_norm": 0.47122666239738464, "learning_rate": 3.730654610777836e-05, "loss": 0.3441, "num_input_tokens_seen": 47142000, "step": 49350 }, { "epoch": 4.026021698344073, "grad_norm": 0.388907790184021, "learning_rate": 3.730344781659712e-05, "loss": 0.3053, "num_input_tokens_seen": 47146688, "step": 49355 }, { "epoch": 4.026429561954482, "grad_norm": 0.6525353789329529, "learning_rate": 3.7300349276030216e-05, "loss": 0.3162, "num_input_tokens_seen": 47152048, "step": 49360 }, { "epoch": 4.026837425564891, "grad_norm": 0.600632905960083, "learning_rate": 3.729725048614046e-05, "loss": 0.348, "num_input_tokens_seen": 47156848, "step": 49365 }, { "epoch": 4.0272452891752994, "grad_norm": 0.7039245367050171, "learning_rate": 3.729415144699066e-05, "loss": 0.3665, "num_input_tokens_seen": 47162320, "step": 49370 }, { "epoch": 4.0276531527857085, "grad_norm": 0.6803097128868103, "learning_rate": 3.729105215864364e-05, "loss": 0.3225, "num_input_tokens_seen": 47167952, "step": 49375 }, { "epoch": 4.0280610163961175, "grad_norm": 0.623519241809845, "learning_rate": 3.7287952621162215e-05, "loss": 0.2769, "num_input_tokens_seen": 47173392, "step": 49380 }, { "epoch": 4.028468880006526, "grad_norm": 0.9937446713447571, "learning_rate": 3.7284852834609205e-05, "loss": 0.3512, "num_input_tokens_seen": 47177840, "step": 49385 }, { "epoch": 4.028876743616935, "grad_norm": 0.693276047706604, "learning_rate": 3.7281752799047455e-05, "loss": 0.3625, "num_input_tokens_seen": 47183168, "step": 49390 }, { "epoch": 4.029284607227344, "grad_norm": 0.7692773938179016, "learning_rate": 3.7278652514539794e-05, "loss": 0.3038, "num_input_tokens_seen": 47188224, "step": 49395 }, { "epoch": 4.029692470837752, "grad_norm": 0.41781604290008545, "learning_rate": 3.727555198114906e-05, "loss": 0.3776, "num_input_tokens_seen": 47192320, "step": 49400 }, { "epoch": 4.030100334448161, "grad_norm": 1.082673192024231, "learning_rate": 3.727245119893811e-05, "loss": 0.3884, "num_input_tokens_seen": 47197680, "step": 49405 }, { "epoch": 4.030508198058569, "grad_norm": 0.41697198152542114, "learning_rate": 3.7269350167969773e-05, "loss": 0.3224, "num_input_tokens_seen": 47202576, "step": 49410 }, { "epoch": 4.030916061668978, "grad_norm": 0.4761315584182739, "learning_rate": 3.7266248888306935e-05, "loss": 0.3892, "num_input_tokens_seen": 47207248, "step": 49415 }, { "epoch": 4.031323925279387, "grad_norm": 0.6349747776985168, "learning_rate": 3.726314736001245e-05, "loss": 0.2664, "num_input_tokens_seen": 47212224, "step": 49420 }, { "epoch": 4.031731788889795, "grad_norm": 0.41521650552749634, "learning_rate": 3.7260045583149166e-05, "loss": 0.3115, "num_input_tokens_seen": 47216368, "step": 49425 }, { "epoch": 4.032139652500204, "grad_norm": 0.6814337968826294, "learning_rate": 3.7256943557779974e-05, "loss": 0.2997, "num_input_tokens_seen": 47219952, "step": 49430 }, { "epoch": 4.032547516110613, "grad_norm": 0.5813196301460266, "learning_rate": 3.725384128396774e-05, "loss": 0.4616, "num_input_tokens_seen": 47225200, "step": 49435 }, { "epoch": 4.032955379721021, "grad_norm": 1.1244817972183228, "learning_rate": 3.725073876177536e-05, "loss": 0.3375, "num_input_tokens_seen": 47229584, "step": 49440 }, { "epoch": 4.03336324333143, "grad_norm": 0.9813262224197388, "learning_rate": 3.7247635991265705e-05, "loss": 0.3049, "num_input_tokens_seen": 47234176, "step": 49445 }, { "epoch": 4.033771106941838, "grad_norm": 0.49443507194519043, "learning_rate": 3.7244532972501674e-05, "loss": 0.3375, "num_input_tokens_seen": 47238528, "step": 49450 }, { "epoch": 4.034178970552247, "grad_norm": 1.0995250940322876, "learning_rate": 3.7241429705546154e-05, "loss": 0.3723, "num_input_tokens_seen": 47243024, "step": 49455 }, { "epoch": 4.034586834162656, "grad_norm": 0.44777911901474, "learning_rate": 3.723832619046207e-05, "loss": 0.2873, "num_input_tokens_seen": 47248512, "step": 49460 }, { "epoch": 4.034994697773064, "grad_norm": 0.4369383454322815, "learning_rate": 3.7235222427312305e-05, "loss": 0.3647, "num_input_tokens_seen": 47253632, "step": 49465 }, { "epoch": 4.035402561383473, "grad_norm": 0.9498782157897949, "learning_rate": 3.723211841615978e-05, "loss": 0.3321, "num_input_tokens_seen": 47259120, "step": 49470 }, { "epoch": 4.035810424993882, "grad_norm": 0.5463038682937622, "learning_rate": 3.722901415706742e-05, "loss": 0.3463, "num_input_tokens_seen": 47264176, "step": 49475 }, { "epoch": 4.0362182886042905, "grad_norm": 0.528253972530365, "learning_rate": 3.722590965009813e-05, "loss": 0.3407, "num_input_tokens_seen": 47268160, "step": 49480 }, { "epoch": 4.0366261522147, "grad_norm": 0.4578450918197632, "learning_rate": 3.722280489531485e-05, "loss": 0.3554, "num_input_tokens_seen": 47272880, "step": 49485 }, { "epoch": 4.037034015825108, "grad_norm": 0.8109445571899414, "learning_rate": 3.721969989278051e-05, "loss": 0.3453, "num_input_tokens_seen": 47277776, "step": 49490 }, { "epoch": 4.037441879435517, "grad_norm": 0.8289670944213867, "learning_rate": 3.7216594642558044e-05, "loss": 0.3275, "num_input_tokens_seen": 47283456, "step": 49495 }, { "epoch": 4.037849743045926, "grad_norm": 0.7243626713752747, "learning_rate": 3.72134891447104e-05, "loss": 0.3414, "num_input_tokens_seen": 47286992, "step": 49500 }, { "epoch": 4.038257606656334, "grad_norm": 0.5713708996772766, "learning_rate": 3.721038339930051e-05, "loss": 0.3266, "num_input_tokens_seen": 47291632, "step": 49505 }, { "epoch": 4.038665470266743, "grad_norm": 0.8762950897216797, "learning_rate": 3.720727740639135e-05, "loss": 0.3546, "num_input_tokens_seen": 47297024, "step": 49510 }, { "epoch": 4.039073333877152, "grad_norm": 0.46548470854759216, "learning_rate": 3.720417116604587e-05, "loss": 0.3882, "num_input_tokens_seen": 47302576, "step": 49515 }, { "epoch": 4.03948119748756, "grad_norm": 0.5607163310050964, "learning_rate": 3.720106467832701e-05, "loss": 0.3263, "num_input_tokens_seen": 47307824, "step": 49520 }, { "epoch": 4.039889061097969, "grad_norm": 0.6565830111503601, "learning_rate": 3.7197957943297755e-05, "loss": 0.3352, "num_input_tokens_seen": 47312432, "step": 49525 }, { "epoch": 4.040296924708377, "grad_norm": 0.7461081147193909, "learning_rate": 3.719485096102108e-05, "loss": 0.2954, "num_input_tokens_seen": 47317904, "step": 49530 }, { "epoch": 4.040704788318786, "grad_norm": 0.4538411498069763, "learning_rate": 3.719174373155996e-05, "loss": 0.3459, "num_input_tokens_seen": 47323248, "step": 49535 }, { "epoch": 4.041112651929195, "grad_norm": 0.4827330410480499, "learning_rate": 3.718863625497737e-05, "loss": 0.375, "num_input_tokens_seen": 47327520, "step": 49540 }, { "epoch": 4.041520515539603, "grad_norm": 0.48798638582229614, "learning_rate": 3.71855285313363e-05, "loss": 0.3551, "num_input_tokens_seen": 47332624, "step": 49545 }, { "epoch": 4.041928379150012, "grad_norm": 0.3162762522697449, "learning_rate": 3.7182420560699754e-05, "loss": 0.3442, "num_input_tokens_seen": 47336944, "step": 49550 }, { "epoch": 4.042336242760421, "grad_norm": 0.6401306986808777, "learning_rate": 3.7179312343130714e-05, "loss": 0.3477, "num_input_tokens_seen": 47341632, "step": 49555 }, { "epoch": 4.042744106370829, "grad_norm": 0.581965446472168, "learning_rate": 3.717620387869219e-05, "loss": 0.3182, "num_input_tokens_seen": 47346432, "step": 49560 }, { "epoch": 4.043151969981238, "grad_norm": 0.517915666103363, "learning_rate": 3.717309516744718e-05, "loss": 0.3179, "num_input_tokens_seen": 47351248, "step": 49565 }, { "epoch": 4.043559833591647, "grad_norm": 0.6185925006866455, "learning_rate": 3.716998620945871e-05, "loss": 0.3335, "num_input_tokens_seen": 47356512, "step": 49570 }, { "epoch": 4.0439676972020555, "grad_norm": 0.9355196952819824, "learning_rate": 3.71668770047898e-05, "loss": 0.3556, "num_input_tokens_seen": 47361152, "step": 49575 }, { "epoch": 4.0443755608124645, "grad_norm": 1.0421322584152222, "learning_rate": 3.716376755350345e-05, "loss": 0.33, "num_input_tokens_seen": 47365888, "step": 49580 }, { "epoch": 4.044783424422873, "grad_norm": 0.6891284584999084, "learning_rate": 3.716065785566271e-05, "loss": 0.332, "num_input_tokens_seen": 47371792, "step": 49585 }, { "epoch": 4.045191288033282, "grad_norm": 0.6220057010650635, "learning_rate": 3.7157547911330595e-05, "loss": 0.3318, "num_input_tokens_seen": 47376000, "step": 49590 }, { "epoch": 4.045599151643691, "grad_norm": 0.7682995796203613, "learning_rate": 3.7154437720570155e-05, "loss": 0.3367, "num_input_tokens_seen": 47381728, "step": 49595 }, { "epoch": 4.046007015254099, "grad_norm": 0.6950051188468933, "learning_rate": 3.715132728344442e-05, "loss": 0.322, "num_input_tokens_seen": 47385888, "step": 49600 }, { "epoch": 4.046414878864508, "grad_norm": 0.633047878742218, "learning_rate": 3.714821660001645e-05, "loss": 0.3296, "num_input_tokens_seen": 47390592, "step": 49605 }, { "epoch": 4.046822742474917, "grad_norm": 0.6501606702804565, "learning_rate": 3.714510567034929e-05, "loss": 0.3202, "num_input_tokens_seen": 47395312, "step": 49610 }, { "epoch": 4.047230606085325, "grad_norm": 0.48443078994750977, "learning_rate": 3.7141994494506e-05, "loss": 0.2376, "num_input_tokens_seen": 47399856, "step": 49615 }, { "epoch": 4.047638469695734, "grad_norm": 0.5254979729652405, "learning_rate": 3.7138883072549644e-05, "loss": 0.3463, "num_input_tokens_seen": 47404928, "step": 49620 }, { "epoch": 4.048046333306142, "grad_norm": 0.6579881906509399, "learning_rate": 3.713577140454328e-05, "loss": 0.3166, "num_input_tokens_seen": 47409824, "step": 49625 }, { "epoch": 4.048454196916551, "grad_norm": 0.5055782794952393, "learning_rate": 3.713265949054999e-05, "loss": 0.387, "num_input_tokens_seen": 47415088, "step": 49630 }, { "epoch": 4.04886206052696, "grad_norm": 1.3547426462173462, "learning_rate": 3.712954733063284e-05, "loss": 0.3634, "num_input_tokens_seen": 47420432, "step": 49635 }, { "epoch": 4.049269924137368, "grad_norm": 0.5825775265693665, "learning_rate": 3.712643492485493e-05, "loss": 0.3286, "num_input_tokens_seen": 47424704, "step": 49640 }, { "epoch": 4.049677787747777, "grad_norm": 1.1947661638259888, "learning_rate": 3.7123322273279335e-05, "loss": 0.343, "num_input_tokens_seen": 47429232, "step": 49645 }, { "epoch": 4.050085651358186, "grad_norm": 0.7479623556137085, "learning_rate": 3.712020937596914e-05, "loss": 0.3884, "num_input_tokens_seen": 47434064, "step": 49650 }, { "epoch": 4.050493514968594, "grad_norm": 0.5662007927894592, "learning_rate": 3.711709623298747e-05, "loss": 0.3783, "num_input_tokens_seen": 47439456, "step": 49655 }, { "epoch": 4.050901378579003, "grad_norm": 0.8716639280319214, "learning_rate": 3.7113982844397396e-05, "loss": 0.3632, "num_input_tokens_seen": 47444336, "step": 49660 }, { "epoch": 4.0513092421894115, "grad_norm": 0.8355754613876343, "learning_rate": 3.711086921026204e-05, "loss": 0.3431, "num_input_tokens_seen": 47450448, "step": 49665 }, { "epoch": 4.0517171057998205, "grad_norm": 0.5128024220466614, "learning_rate": 3.71077553306445e-05, "loss": 0.3876, "num_input_tokens_seen": 47454880, "step": 49670 }, { "epoch": 4.0521249694102295, "grad_norm": 0.9469459652900696, "learning_rate": 3.710464120560792e-05, "loss": 0.3389, "num_input_tokens_seen": 47459280, "step": 49675 }, { "epoch": 4.052532833020638, "grad_norm": 0.4428473114967346, "learning_rate": 3.7101526835215396e-05, "loss": 0.3286, "num_input_tokens_seen": 47464256, "step": 49680 }, { "epoch": 4.052940696631047, "grad_norm": 0.5307199358940125, "learning_rate": 3.709841221953007e-05, "loss": 0.3692, "num_input_tokens_seen": 47469184, "step": 49685 }, { "epoch": 4.053348560241456, "grad_norm": 1.050347089767456, "learning_rate": 3.709529735861506e-05, "loss": 0.3495, "num_input_tokens_seen": 47473680, "step": 49690 }, { "epoch": 4.053756423851864, "grad_norm": 0.911480188369751, "learning_rate": 3.709218225253353e-05, "loss": 0.3275, "num_input_tokens_seen": 47478240, "step": 49695 }, { "epoch": 4.054164287462273, "grad_norm": 0.41634470224380493, "learning_rate": 3.7089066901348593e-05, "loss": 0.3154, "num_input_tokens_seen": 47483792, "step": 49700 }, { "epoch": 4.054572151072681, "grad_norm": 0.7200579643249512, "learning_rate": 3.708595130512341e-05, "loss": 0.371, "num_input_tokens_seen": 47487600, "step": 49705 }, { "epoch": 4.05498001468309, "grad_norm": 0.7756212949752808, "learning_rate": 3.708283546392113e-05, "loss": 0.3463, "num_input_tokens_seen": 47492432, "step": 49710 }, { "epoch": 4.055387878293499, "grad_norm": 0.9800088405609131, "learning_rate": 3.7079719377804906e-05, "loss": 0.3124, "num_input_tokens_seen": 47497952, "step": 49715 }, { "epoch": 4.055795741903907, "grad_norm": 0.7300891876220703, "learning_rate": 3.7076603046837905e-05, "loss": 0.3148, "num_input_tokens_seen": 47502720, "step": 49720 }, { "epoch": 4.056203605514316, "grad_norm": 0.5824271440505981, "learning_rate": 3.7073486471083294e-05, "loss": 0.4293, "num_input_tokens_seen": 47508336, "step": 49725 }, { "epoch": 4.056611469124725, "grad_norm": 1.0315934419631958, "learning_rate": 3.7070369650604243e-05, "loss": 0.3667, "num_input_tokens_seen": 47512352, "step": 49730 }, { "epoch": 4.057019332735133, "grad_norm": 0.5876928567886353, "learning_rate": 3.706725258546392e-05, "loss": 0.3819, "num_input_tokens_seen": 47517408, "step": 49735 }, { "epoch": 4.057427196345542, "grad_norm": 0.8368713855743408, "learning_rate": 3.706413527572553e-05, "loss": 0.3523, "num_input_tokens_seen": 47522736, "step": 49740 }, { "epoch": 4.05783505995595, "grad_norm": 0.5333404541015625, "learning_rate": 3.7061017721452225e-05, "loss": 0.3401, "num_input_tokens_seen": 47528096, "step": 49745 }, { "epoch": 4.058242923566359, "grad_norm": 0.293993204832077, "learning_rate": 3.705789992270723e-05, "loss": 0.3453, "num_input_tokens_seen": 47533328, "step": 49750 }, { "epoch": 4.058650787176768, "grad_norm": 0.5430951714515686, "learning_rate": 3.705478187955372e-05, "loss": 0.3025, "num_input_tokens_seen": 47537584, "step": 49755 }, { "epoch": 4.059058650787176, "grad_norm": 0.3993147611618042, "learning_rate": 3.705166359205491e-05, "loss": 0.3352, "num_input_tokens_seen": 47541488, "step": 49760 }, { "epoch": 4.059466514397585, "grad_norm": 0.8774204254150391, "learning_rate": 3.704854506027399e-05, "loss": 0.4086, "num_input_tokens_seen": 47546304, "step": 49765 }, { "epoch": 4.0598743780079944, "grad_norm": 1.026677131652832, "learning_rate": 3.7045426284274185e-05, "loss": 0.3927, "num_input_tokens_seen": 47550928, "step": 49770 }, { "epoch": 4.060282241618403, "grad_norm": 0.6102412343025208, "learning_rate": 3.704230726411871e-05, "loss": 0.3759, "num_input_tokens_seen": 47555664, "step": 49775 }, { "epoch": 4.060690105228812, "grad_norm": 0.4732538163661957, "learning_rate": 3.703918799987079e-05, "loss": 0.333, "num_input_tokens_seen": 47560800, "step": 49780 }, { "epoch": 4.061097968839221, "grad_norm": 0.3037645220756531, "learning_rate": 3.703606849159364e-05, "loss": 0.3766, "num_input_tokens_seen": 47565712, "step": 49785 }, { "epoch": 4.061505832449629, "grad_norm": 0.5988223552703857, "learning_rate": 3.7032948739350496e-05, "loss": 0.3498, "num_input_tokens_seen": 47571040, "step": 49790 }, { "epoch": 4.061913696060038, "grad_norm": 0.4503971338272095, "learning_rate": 3.7029828743204596e-05, "loss": 0.3474, "num_input_tokens_seen": 47576112, "step": 49795 }, { "epoch": 4.062321559670446, "grad_norm": 0.6884850263595581, "learning_rate": 3.7026708503219175e-05, "loss": 0.3502, "num_input_tokens_seen": 47580080, "step": 49800 }, { "epoch": 4.062729423280855, "grad_norm": 0.44905176758766174, "learning_rate": 3.702358801945748e-05, "loss": 0.3432, "num_input_tokens_seen": 47584880, "step": 49805 }, { "epoch": 4.063137286891264, "grad_norm": 0.645473062992096, "learning_rate": 3.702046729198277e-05, "loss": 0.3202, "num_input_tokens_seen": 47589664, "step": 49810 }, { "epoch": 4.063545150501672, "grad_norm": 0.39417585730552673, "learning_rate": 3.70173463208583e-05, "loss": 0.3603, "num_input_tokens_seen": 47594640, "step": 49815 }, { "epoch": 4.063953014112081, "grad_norm": 0.7529312372207642, "learning_rate": 3.701422510614733e-05, "loss": 0.3431, "num_input_tokens_seen": 47599008, "step": 49820 }, { "epoch": 4.06436087772249, "grad_norm": 0.49963584542274475, "learning_rate": 3.70111036479131e-05, "loss": 0.348, "num_input_tokens_seen": 47603536, "step": 49825 }, { "epoch": 4.064768741332898, "grad_norm": 88.33193969726562, "learning_rate": 3.700798194621892e-05, "loss": 0.3993, "num_input_tokens_seen": 47608608, "step": 49830 }, { "epoch": 4.065176604943307, "grad_norm": 0.7646104693412781, "learning_rate": 3.700486000112805e-05, "loss": 0.3434, "num_input_tokens_seen": 47612800, "step": 49835 }, { "epoch": 4.065584468553715, "grad_norm": 0.5094544291496277, "learning_rate": 3.700173781270376e-05, "loss": 0.3274, "num_input_tokens_seen": 47617136, "step": 49840 }, { "epoch": 4.065992332164124, "grad_norm": 0.48923230171203613, "learning_rate": 3.699861538100935e-05, "loss": 0.3569, "num_input_tokens_seen": 47621168, "step": 49845 }, { "epoch": 4.066400195774533, "grad_norm": 1.1844452619552612, "learning_rate": 3.6995492706108105e-05, "loss": 0.3976, "num_input_tokens_seen": 47626432, "step": 49850 }, { "epoch": 4.066808059384941, "grad_norm": 1.1163980960845947, "learning_rate": 3.699236978806332e-05, "loss": 0.3128, "num_input_tokens_seen": 47631680, "step": 49855 }, { "epoch": 4.06721592299535, "grad_norm": 0.6288506388664246, "learning_rate": 3.69892466269383e-05, "loss": 0.3435, "num_input_tokens_seen": 47637312, "step": 49860 }, { "epoch": 4.067623786605759, "grad_norm": 0.39072221517562866, "learning_rate": 3.6986123222796334e-05, "loss": 0.3587, "num_input_tokens_seen": 47641840, "step": 49865 }, { "epoch": 4.0680316502161675, "grad_norm": 0.39112797379493713, "learning_rate": 3.698299957570075e-05, "loss": 0.3169, "num_input_tokens_seen": 47646672, "step": 49870 }, { "epoch": 4.0684395138265765, "grad_norm": 0.5278676152229309, "learning_rate": 3.697987568571486e-05, "loss": 0.3672, "num_input_tokens_seen": 47651312, "step": 49875 }, { "epoch": 4.068847377436985, "grad_norm": 0.7535773515701294, "learning_rate": 3.697675155290197e-05, "loss": 0.3417, "num_input_tokens_seen": 47655456, "step": 49880 }, { "epoch": 4.069255241047394, "grad_norm": 0.8477356433868408, "learning_rate": 3.697362717732543e-05, "loss": 0.3365, "num_input_tokens_seen": 47660320, "step": 49885 }, { "epoch": 4.069663104657803, "grad_norm": 0.4957873821258545, "learning_rate": 3.6970502559048535e-05, "loss": 0.3596, "num_input_tokens_seen": 47665760, "step": 49890 }, { "epoch": 4.070070968268211, "grad_norm": 0.5579737424850464, "learning_rate": 3.696737769813465e-05, "loss": 0.3042, "num_input_tokens_seen": 47669968, "step": 49895 }, { "epoch": 4.07047883187862, "grad_norm": 0.9128501415252686, "learning_rate": 3.696425259464711e-05, "loss": 0.3744, "num_input_tokens_seen": 47675712, "step": 49900 }, { "epoch": 4.070886695489029, "grad_norm": 0.5895819664001465, "learning_rate": 3.6961127248649254e-05, "loss": 0.3916, "num_input_tokens_seen": 47680480, "step": 49905 }, { "epoch": 4.071294559099437, "grad_norm": 0.755294919013977, "learning_rate": 3.695800166020442e-05, "loss": 0.3155, "num_input_tokens_seen": 47685232, "step": 49910 }, { "epoch": 4.071702422709846, "grad_norm": 0.7016602754592896, "learning_rate": 3.695487582937598e-05, "loss": 0.3083, "num_input_tokens_seen": 47689968, "step": 49915 }, { "epoch": 4.072110286320255, "grad_norm": 0.24270837008953094, "learning_rate": 3.695174975622729e-05, "loss": 0.3453, "num_input_tokens_seen": 47694688, "step": 49920 }, { "epoch": 4.072518149930663, "grad_norm": 0.6060165762901306, "learning_rate": 3.6948623440821706e-05, "loss": 0.3717, "num_input_tokens_seen": 47699520, "step": 49925 }, { "epoch": 4.072926013541072, "grad_norm": 0.7848601937294006, "learning_rate": 3.694549688322261e-05, "loss": 0.3647, "num_input_tokens_seen": 47704240, "step": 49930 }, { "epoch": 4.07333387715148, "grad_norm": 1.1903375387191772, "learning_rate": 3.694237008349336e-05, "loss": 0.3231, "num_input_tokens_seen": 47709312, "step": 49935 }, { "epoch": 4.073741740761889, "grad_norm": 0.4936274588108063, "learning_rate": 3.693924304169735e-05, "loss": 0.3654, "num_input_tokens_seen": 47713664, "step": 49940 }, { "epoch": 4.074149604372298, "grad_norm": 0.9101905822753906, "learning_rate": 3.6936115757897944e-05, "loss": 0.3966, "num_input_tokens_seen": 47718736, "step": 49945 }, { "epoch": 4.074557467982706, "grad_norm": 0.8805730938911438, "learning_rate": 3.693298823215855e-05, "loss": 0.3391, "num_input_tokens_seen": 47724016, "step": 49950 }, { "epoch": 4.074965331593115, "grad_norm": 0.38071209192276, "learning_rate": 3.692986046454255e-05, "loss": 0.3721, "num_input_tokens_seen": 47728656, "step": 49955 }, { "epoch": 4.075373195203524, "grad_norm": 0.4893496334552765, "learning_rate": 3.692673245511336e-05, "loss": 0.3272, "num_input_tokens_seen": 47733232, "step": 49960 }, { "epoch": 4.0757810588139325, "grad_norm": 0.8389661312103271, "learning_rate": 3.692360420393437e-05, "loss": 0.355, "num_input_tokens_seen": 47739168, "step": 49965 }, { "epoch": 4.0761889224243415, "grad_norm": 0.5696483850479126, "learning_rate": 3.692047571106898e-05, "loss": 0.3602, "num_input_tokens_seen": 47744448, "step": 49970 }, { "epoch": 4.07659678603475, "grad_norm": 0.4996960163116455, "learning_rate": 3.6917346976580625e-05, "loss": 0.3586, "num_input_tokens_seen": 47749824, "step": 49975 }, { "epoch": 4.077004649645159, "grad_norm": 0.41280099749565125, "learning_rate": 3.69142180005327e-05, "loss": 0.3512, "num_input_tokens_seen": 47754816, "step": 49980 }, { "epoch": 4.077412513255568, "grad_norm": 0.6077501177787781, "learning_rate": 3.691108878298864e-05, "loss": 0.3305, "num_input_tokens_seen": 47759952, "step": 49985 }, { "epoch": 4.077820376865976, "grad_norm": 1.1224846839904785, "learning_rate": 3.6907959324011875e-05, "loss": 0.3757, "num_input_tokens_seen": 47765424, "step": 49990 }, { "epoch": 4.078228240476385, "grad_norm": 0.7196332812309265, "learning_rate": 3.690482962366584e-05, "loss": 0.3648, "num_input_tokens_seen": 47769040, "step": 49995 }, { "epoch": 4.078636104086794, "grad_norm": 0.3379572927951813, "learning_rate": 3.690169968201396e-05, "loss": 0.3399, "num_input_tokens_seen": 47773856, "step": 50000 }, { "epoch": 4.079043967697202, "grad_norm": 0.7991650700569153, "learning_rate": 3.689856949911968e-05, "loss": 0.3671, "num_input_tokens_seen": 47779072, "step": 50005 }, { "epoch": 4.079451831307611, "grad_norm": 0.5842694640159607, "learning_rate": 3.6895439075046454e-05, "loss": 0.3473, "num_input_tokens_seen": 47784096, "step": 50010 }, { "epoch": 4.079859694918019, "grad_norm": 0.48238709568977356, "learning_rate": 3.689230840985774e-05, "loss": 0.354, "num_input_tokens_seen": 47788160, "step": 50015 }, { "epoch": 4.080267558528428, "grad_norm": 0.582670271396637, "learning_rate": 3.688917750361699e-05, "loss": 0.3399, "num_input_tokens_seen": 47793712, "step": 50020 }, { "epoch": 4.080675422138837, "grad_norm": 0.6665324568748474, "learning_rate": 3.688604635638765e-05, "loss": 0.3561, "num_input_tokens_seen": 47798384, "step": 50025 }, { "epoch": 4.081083285749245, "grad_norm": 0.6999081373214722, "learning_rate": 3.6882914968233215e-05, "loss": 0.3613, "num_input_tokens_seen": 47803312, "step": 50030 }, { "epoch": 4.081491149359654, "grad_norm": 0.5843276381492615, "learning_rate": 3.687978333921713e-05, "loss": 0.3368, "num_input_tokens_seen": 47807648, "step": 50035 }, { "epoch": 4.081899012970063, "grad_norm": 0.1596791297197342, "learning_rate": 3.6876651469402896e-05, "loss": 0.3549, "num_input_tokens_seen": 47812320, "step": 50040 }, { "epoch": 4.082306876580471, "grad_norm": 0.6024774312973022, "learning_rate": 3.6873519358853976e-05, "loss": 0.3369, "num_input_tokens_seen": 47816432, "step": 50045 }, { "epoch": 4.08271474019088, "grad_norm": 0.9077357053756714, "learning_rate": 3.6870387007633864e-05, "loss": 0.3464, "num_input_tokens_seen": 47821312, "step": 50050 }, { "epoch": 4.083122603801288, "grad_norm": 0.5412137508392334, "learning_rate": 3.686725441580606e-05, "loss": 0.3163, "num_input_tokens_seen": 47825984, "step": 50055 }, { "epoch": 4.0835304674116975, "grad_norm": 0.6069198250770569, "learning_rate": 3.686412158343404e-05, "loss": 0.3192, "num_input_tokens_seen": 47830864, "step": 50060 }, { "epoch": 4.0839383310221065, "grad_norm": 0.5376177430152893, "learning_rate": 3.686098851058133e-05, "loss": 0.3052, "num_input_tokens_seen": 47836336, "step": 50065 }, { "epoch": 4.084346194632515, "grad_norm": 0.5312817096710205, "learning_rate": 3.685785519731142e-05, "loss": 0.3007, "num_input_tokens_seen": 47840944, "step": 50070 }, { "epoch": 4.084754058242924, "grad_norm": 0.6009119153022766, "learning_rate": 3.6854721643687817e-05, "loss": 0.3596, "num_input_tokens_seen": 47845840, "step": 50075 }, { "epoch": 4.085161921853333, "grad_norm": 1.036941647529602, "learning_rate": 3.685158784977404e-05, "loss": 0.46, "num_input_tokens_seen": 47849856, "step": 50080 }, { "epoch": 4.085569785463741, "grad_norm": 0.8683139681816101, "learning_rate": 3.684845381563363e-05, "loss": 0.2744, "num_input_tokens_seen": 47855504, "step": 50085 }, { "epoch": 4.08597764907415, "grad_norm": 0.6172689199447632, "learning_rate": 3.684531954133008e-05, "loss": 0.3709, "num_input_tokens_seen": 47859984, "step": 50090 }, { "epoch": 4.086385512684558, "grad_norm": 0.40464267134666443, "learning_rate": 3.6842185026926944e-05, "loss": 0.3558, "num_input_tokens_seen": 47864384, "step": 50095 }, { "epoch": 4.086793376294967, "grad_norm": 0.6833532452583313, "learning_rate": 3.683905027248775e-05, "loss": 0.3359, "num_input_tokens_seen": 47869120, "step": 50100 }, { "epoch": 4.087201239905376, "grad_norm": 0.323550820350647, "learning_rate": 3.6835915278076036e-05, "loss": 0.3201, "num_input_tokens_seen": 47873408, "step": 50105 }, { "epoch": 4.087609103515784, "grad_norm": 0.874667227268219, "learning_rate": 3.6832780043755346e-05, "loss": 0.3628, "num_input_tokens_seen": 47878752, "step": 50110 }, { "epoch": 4.088016967126193, "grad_norm": 0.3949429392814636, "learning_rate": 3.6829644569589234e-05, "loss": 0.3799, "num_input_tokens_seen": 47883312, "step": 50115 }, { "epoch": 4.088424830736602, "grad_norm": 0.8274440765380859, "learning_rate": 3.6826508855641254e-05, "loss": 0.3423, "num_input_tokens_seen": 47888624, "step": 50120 }, { "epoch": 4.08883269434701, "grad_norm": 0.49494442343711853, "learning_rate": 3.682337290197496e-05, "loss": 0.3465, "num_input_tokens_seen": 47893408, "step": 50125 }, { "epoch": 4.089240557957419, "grad_norm": 0.9384241700172424, "learning_rate": 3.6820236708653936e-05, "loss": 0.3566, "num_input_tokens_seen": 47898128, "step": 50130 }, { "epoch": 4.089648421567828, "grad_norm": 0.7540651559829712, "learning_rate": 3.681710027574172e-05, "loss": 0.3275, "num_input_tokens_seen": 47902128, "step": 50135 }, { "epoch": 4.090056285178236, "grad_norm": 0.45258742570877075, "learning_rate": 3.6813963603301916e-05, "loss": 0.3332, "num_input_tokens_seen": 47906160, "step": 50140 }, { "epoch": 4.090464148788645, "grad_norm": 0.5784087777137756, "learning_rate": 3.681082669139808e-05, "loss": 0.3444, "num_input_tokens_seen": 47911072, "step": 50145 }, { "epoch": 4.090872012399053, "grad_norm": 0.673029899597168, "learning_rate": 3.680768954009381e-05, "loss": 0.3313, "num_input_tokens_seen": 47916624, "step": 50150 }, { "epoch": 4.091279876009462, "grad_norm": 0.9212244749069214, "learning_rate": 3.680455214945269e-05, "loss": 0.337, "num_input_tokens_seen": 47921984, "step": 50155 }, { "epoch": 4.091687739619871, "grad_norm": 0.6831943392753601, "learning_rate": 3.680141451953831e-05, "loss": 0.3442, "num_input_tokens_seen": 47927344, "step": 50160 }, { "epoch": 4.0920956032302795, "grad_norm": 1.020607829093933, "learning_rate": 3.6798276650414277e-05, "loss": 0.3455, "num_input_tokens_seen": 47932192, "step": 50165 }, { "epoch": 4.0925034668406886, "grad_norm": 0.5625424385070801, "learning_rate": 3.679513854214418e-05, "loss": 0.3299, "num_input_tokens_seen": 47936816, "step": 50170 }, { "epoch": 4.092911330451098, "grad_norm": 0.5590012669563293, "learning_rate": 3.6792000194791655e-05, "loss": 0.3382, "num_input_tokens_seen": 47941648, "step": 50175 }, { "epoch": 4.093319194061506, "grad_norm": 0.6466450095176697, "learning_rate": 3.6788861608420286e-05, "loss": 0.3478, "num_input_tokens_seen": 47946784, "step": 50180 }, { "epoch": 4.093727057671915, "grad_norm": 0.8495106101036072, "learning_rate": 3.67857227830937e-05, "loss": 0.3658, "num_input_tokens_seen": 47952016, "step": 50185 }, { "epoch": 4.094134921282323, "grad_norm": 0.7730959057807922, "learning_rate": 3.678258371887552e-05, "loss": 0.2964, "num_input_tokens_seen": 47956400, "step": 50190 }, { "epoch": 4.094542784892732, "grad_norm": 0.6874887943267822, "learning_rate": 3.677944441582938e-05, "loss": 0.3357, "num_input_tokens_seen": 47961488, "step": 50195 }, { "epoch": 4.094950648503141, "grad_norm": 0.7225093841552734, "learning_rate": 3.677630487401891e-05, "loss": 0.3506, "num_input_tokens_seen": 47965568, "step": 50200 }, { "epoch": 4.095358512113549, "grad_norm": 1.7613049745559692, "learning_rate": 3.677316509350774e-05, "loss": 0.4475, "num_input_tokens_seen": 47970192, "step": 50205 }, { "epoch": 4.095766375723958, "grad_norm": 0.8420023918151855, "learning_rate": 3.6770025074359515e-05, "loss": 0.3779, "num_input_tokens_seen": 47974048, "step": 50210 }, { "epoch": 4.096174239334367, "grad_norm": 0.6434286832809448, "learning_rate": 3.6766884816637886e-05, "loss": 0.2927, "num_input_tokens_seen": 47978880, "step": 50215 }, { "epoch": 4.096582102944775, "grad_norm": 0.41130927205085754, "learning_rate": 3.6763744320406505e-05, "loss": 0.3259, "num_input_tokens_seen": 47982992, "step": 50220 }, { "epoch": 4.096989966555184, "grad_norm": 0.48960351943969727, "learning_rate": 3.676060358572901e-05, "loss": 0.3817, "num_input_tokens_seen": 47987616, "step": 50225 }, { "epoch": 4.097397830165592, "grad_norm": 0.35613128542900085, "learning_rate": 3.67574626126691e-05, "loss": 0.3249, "num_input_tokens_seen": 47993536, "step": 50230 }, { "epoch": 4.097805693776001, "grad_norm": 0.8065471649169922, "learning_rate": 3.67543214012904e-05, "loss": 0.325, "num_input_tokens_seen": 47998368, "step": 50235 }, { "epoch": 4.09821355738641, "grad_norm": 0.7253344655036926, "learning_rate": 3.675117995165661e-05, "loss": 0.3519, "num_input_tokens_seen": 48003040, "step": 50240 }, { "epoch": 4.098621420996818, "grad_norm": 0.5983721613883972, "learning_rate": 3.674803826383139e-05, "loss": 0.3962, "num_input_tokens_seen": 48006880, "step": 50245 }, { "epoch": 4.099029284607227, "grad_norm": 1.1233203411102295, "learning_rate": 3.674489633787843e-05, "loss": 0.3999, "num_input_tokens_seen": 48011984, "step": 50250 }, { "epoch": 4.099437148217636, "grad_norm": 0.5474566221237183, "learning_rate": 3.674175417386141e-05, "loss": 0.372, "num_input_tokens_seen": 48017520, "step": 50255 }, { "epoch": 4.0998450118280445, "grad_norm": 1.3392350673675537, "learning_rate": 3.6738611771844026e-05, "loss": 0.3599, "num_input_tokens_seen": 48022576, "step": 50260 }, { "epoch": 4.1002528754384535, "grad_norm": 0.6719818711280823, "learning_rate": 3.6735469131889965e-05, "loss": 0.3282, "num_input_tokens_seen": 48027040, "step": 50265 }, { "epoch": 4.100660739048862, "grad_norm": 1.0504157543182373, "learning_rate": 3.673232625406294e-05, "loss": 0.358, "num_input_tokens_seen": 48032288, "step": 50270 }, { "epoch": 4.101068602659271, "grad_norm": 0.4465067982673645, "learning_rate": 3.672918313842664e-05, "loss": 0.3224, "num_input_tokens_seen": 48037088, "step": 50275 }, { "epoch": 4.10147646626968, "grad_norm": 0.37981680035591125, "learning_rate": 3.672603978504478e-05, "loss": 0.3086, "num_input_tokens_seen": 48041488, "step": 50280 }, { "epoch": 4.101884329880088, "grad_norm": 0.5097121000289917, "learning_rate": 3.6722896193981085e-05, "loss": 0.2623, "num_input_tokens_seen": 48046512, "step": 50285 }, { "epoch": 4.102292193490497, "grad_norm": 0.5405899882316589, "learning_rate": 3.671975236529925e-05, "loss": 0.3392, "num_input_tokens_seen": 48051376, "step": 50290 }, { "epoch": 4.102700057100906, "grad_norm": 0.6784169673919678, "learning_rate": 3.671660829906303e-05, "loss": 0.3452, "num_input_tokens_seen": 48056112, "step": 50295 }, { "epoch": 4.103107920711314, "grad_norm": 0.4385922849178314, "learning_rate": 3.671346399533613e-05, "loss": 0.3793, "num_input_tokens_seen": 48060704, "step": 50300 }, { "epoch": 4.103515784321723, "grad_norm": 0.6930928230285645, "learning_rate": 3.671031945418229e-05, "loss": 0.3423, "num_input_tokens_seen": 48065040, "step": 50305 }, { "epoch": 4.103923647932131, "grad_norm": 0.640279233455658, "learning_rate": 3.6707174675665255e-05, "loss": 0.3009, "num_input_tokens_seen": 48069216, "step": 50310 }, { "epoch": 4.10433151154254, "grad_norm": 0.4143171012401581, "learning_rate": 3.670402965984877e-05, "loss": 0.3299, "num_input_tokens_seen": 48073920, "step": 50315 }, { "epoch": 4.104739375152949, "grad_norm": 0.4848897159099579, "learning_rate": 3.670088440679657e-05, "loss": 0.2694, "num_input_tokens_seen": 48078416, "step": 50320 }, { "epoch": 4.105147238763357, "grad_norm": 0.6867716312408447, "learning_rate": 3.669773891657241e-05, "loss": 0.375, "num_input_tokens_seen": 48083392, "step": 50325 }, { "epoch": 4.105555102373766, "grad_norm": 0.5711789131164551, "learning_rate": 3.669459318924006e-05, "loss": 0.3063, "num_input_tokens_seen": 48087824, "step": 50330 }, { "epoch": 4.105962965984175, "grad_norm": 0.6784409284591675, "learning_rate": 3.669144722486328e-05, "loss": 0.3269, "num_input_tokens_seen": 48091776, "step": 50335 }, { "epoch": 4.106370829594583, "grad_norm": 0.5591288805007935, "learning_rate": 3.668830102350583e-05, "loss": 0.3538, "num_input_tokens_seen": 48095952, "step": 50340 }, { "epoch": 4.106778693204992, "grad_norm": 1.119710922241211, "learning_rate": 3.6685154585231475e-05, "loss": 0.3755, "num_input_tokens_seen": 48100624, "step": 50345 }, { "epoch": 4.107186556815401, "grad_norm": 0.6740055084228516, "learning_rate": 3.668200791010401e-05, "loss": 0.334, "num_input_tokens_seen": 48105472, "step": 50350 }, { "epoch": 4.1075944204258095, "grad_norm": 0.5904948711395264, "learning_rate": 3.6678860998187206e-05, "loss": 0.3624, "num_input_tokens_seen": 48110144, "step": 50355 }, { "epoch": 4.1080022840362185, "grad_norm": 0.5350564122200012, "learning_rate": 3.6675713849544855e-05, "loss": 0.2976, "num_input_tokens_seen": 48115488, "step": 50360 }, { "epoch": 4.108410147646627, "grad_norm": 0.7606169581413269, "learning_rate": 3.6672566464240746e-05, "loss": 0.3353, "num_input_tokens_seen": 48120192, "step": 50365 }, { "epoch": 4.108818011257036, "grad_norm": 0.3125392198562622, "learning_rate": 3.666941884233867e-05, "loss": 0.3302, "num_input_tokens_seen": 48124720, "step": 50370 }, { "epoch": 4.109225874867445, "grad_norm": 0.6832118630409241, "learning_rate": 3.666627098390244e-05, "loss": 0.2743, "num_input_tokens_seen": 48129472, "step": 50375 }, { "epoch": 4.109633738477853, "grad_norm": 0.5220752954483032, "learning_rate": 3.666312288899585e-05, "loss": 0.3, "num_input_tokens_seen": 48134720, "step": 50380 }, { "epoch": 4.110041602088262, "grad_norm": 0.6889985203742981, "learning_rate": 3.665997455768271e-05, "loss": 0.3323, "num_input_tokens_seen": 48139536, "step": 50385 }, { "epoch": 4.110449465698671, "grad_norm": 0.43092238903045654, "learning_rate": 3.665682599002685e-05, "loss": 0.3329, "num_input_tokens_seen": 48144352, "step": 50390 }, { "epoch": 4.110857329309079, "grad_norm": 0.38937729597091675, "learning_rate": 3.665367718609207e-05, "loss": 0.3611, "num_input_tokens_seen": 48149568, "step": 50395 }, { "epoch": 4.111265192919488, "grad_norm": 0.6119049787521362, "learning_rate": 3.665052814594222e-05, "loss": 0.301, "num_input_tokens_seen": 48153600, "step": 50400 }, { "epoch": 4.111673056529896, "grad_norm": 1.0536328554153442, "learning_rate": 3.66473788696411e-05, "loss": 0.422, "num_input_tokens_seen": 48157712, "step": 50405 }, { "epoch": 4.112080920140305, "grad_norm": 0.3029228150844574, "learning_rate": 3.664422935725257e-05, "loss": 0.3697, "num_input_tokens_seen": 48161728, "step": 50410 }, { "epoch": 4.112488783750714, "grad_norm": 0.6574508547782898, "learning_rate": 3.664107960884046e-05, "loss": 0.3865, "num_input_tokens_seen": 48167568, "step": 50415 }, { "epoch": 4.112896647361122, "grad_norm": 1.3633360862731934, "learning_rate": 3.6637929624468606e-05, "loss": 0.4024, "num_input_tokens_seen": 48171632, "step": 50420 }, { "epoch": 4.113304510971531, "grad_norm": 0.7970201969146729, "learning_rate": 3.663477940420086e-05, "loss": 0.3388, "num_input_tokens_seen": 48175904, "step": 50425 }, { "epoch": 4.11371237458194, "grad_norm": 1.1235448122024536, "learning_rate": 3.6631628948101095e-05, "loss": 0.4116, "num_input_tokens_seen": 48180960, "step": 50430 }, { "epoch": 4.114120238192348, "grad_norm": 0.5060704946517944, "learning_rate": 3.6628478256233135e-05, "loss": 0.3621, "num_input_tokens_seen": 48186592, "step": 50435 }, { "epoch": 4.114528101802757, "grad_norm": 0.8227211833000183, "learning_rate": 3.6625327328660876e-05, "loss": 0.3504, "num_input_tokens_seen": 48190944, "step": 50440 }, { "epoch": 4.114935965413165, "grad_norm": 0.72793048620224, "learning_rate": 3.6622176165448165e-05, "loss": 0.3738, "num_input_tokens_seen": 48195120, "step": 50445 }, { "epoch": 4.115343829023574, "grad_norm": 0.4297904372215271, "learning_rate": 3.661902476665888e-05, "loss": 0.36, "num_input_tokens_seen": 48199760, "step": 50450 }, { "epoch": 4.115751692633983, "grad_norm": 0.7599776983261108, "learning_rate": 3.66158731323569e-05, "loss": 0.3428, "num_input_tokens_seen": 48204704, "step": 50455 }, { "epoch": 4.116159556244392, "grad_norm": 0.7810046672821045, "learning_rate": 3.661272126260612e-05, "loss": 0.3343, "num_input_tokens_seen": 48209776, "step": 50460 }, { "epoch": 4.116567419854801, "grad_norm": 0.2902066111564636, "learning_rate": 3.66095691574704e-05, "loss": 0.3521, "num_input_tokens_seen": 48214752, "step": 50465 }, { "epoch": 4.11697528346521, "grad_norm": 0.4391036927700043, "learning_rate": 3.660641681701366e-05, "loss": 0.3282, "num_input_tokens_seen": 48219728, "step": 50470 }, { "epoch": 4.117383147075618, "grad_norm": 0.36053407192230225, "learning_rate": 3.6603264241299775e-05, "loss": 0.3231, "num_input_tokens_seen": 48224848, "step": 50475 }, { "epoch": 4.117791010686027, "grad_norm": 0.6689821481704712, "learning_rate": 3.660011143039265e-05, "loss": 0.307, "num_input_tokens_seen": 48229888, "step": 50480 }, { "epoch": 4.118198874296436, "grad_norm": 0.4993739128112793, "learning_rate": 3.65969583843562e-05, "loss": 0.3752, "num_input_tokens_seen": 48234592, "step": 50485 }, { "epoch": 4.118606737906844, "grad_norm": 0.8488726615905762, "learning_rate": 3.659380510325433e-05, "loss": 0.3337, "num_input_tokens_seen": 48239632, "step": 50490 }, { "epoch": 4.119014601517253, "grad_norm": 0.662662148475647, "learning_rate": 3.659065158715096e-05, "loss": 0.3024, "num_input_tokens_seen": 48244992, "step": 50495 }, { "epoch": 4.119422465127661, "grad_norm": 0.4295426607131958, "learning_rate": 3.658749783611001e-05, "loss": 0.2996, "num_input_tokens_seen": 48250048, "step": 50500 }, { "epoch": 4.11983032873807, "grad_norm": 0.43083831667900085, "learning_rate": 3.6584343850195395e-05, "loss": 0.331, "num_input_tokens_seen": 48254640, "step": 50505 }, { "epoch": 4.120238192348479, "grad_norm": 0.6878760457038879, "learning_rate": 3.6581189629471055e-05, "loss": 0.3125, "num_input_tokens_seen": 48260480, "step": 50510 }, { "epoch": 4.120646055958887, "grad_norm": 0.5951579809188843, "learning_rate": 3.657803517400093e-05, "loss": 0.317, "num_input_tokens_seen": 48265152, "step": 50515 }, { "epoch": 4.121053919569296, "grad_norm": 0.9912387132644653, "learning_rate": 3.6574880483848936e-05, "loss": 0.3757, "num_input_tokens_seen": 48268912, "step": 50520 }, { "epoch": 4.121461783179705, "grad_norm": 0.44999828934669495, "learning_rate": 3.6571725559079044e-05, "loss": 0.3678, "num_input_tokens_seen": 48273600, "step": 50525 }, { "epoch": 4.121869646790113, "grad_norm": 0.7752031683921814, "learning_rate": 3.65685703997552e-05, "loss": 0.2961, "num_input_tokens_seen": 48278304, "step": 50530 }, { "epoch": 4.122277510400522, "grad_norm": 0.5528050065040588, "learning_rate": 3.6565415005941335e-05, "loss": 0.3225, "num_input_tokens_seen": 48283856, "step": 50535 }, { "epoch": 4.12268537401093, "grad_norm": 0.47656944394111633, "learning_rate": 3.6562259377701426e-05, "loss": 0.3871, "num_input_tokens_seen": 48287984, "step": 50540 }, { "epoch": 4.123093237621339, "grad_norm": 0.7417023777961731, "learning_rate": 3.655910351509943e-05, "loss": 0.3246, "num_input_tokens_seen": 48292768, "step": 50545 }, { "epoch": 4.123501101231748, "grad_norm": 0.6240774393081665, "learning_rate": 3.655594741819933e-05, "loss": 0.3188, "num_input_tokens_seen": 48297024, "step": 50550 }, { "epoch": 4.1239089648421565, "grad_norm": 1.1129217147827148, "learning_rate": 3.6552791087065075e-05, "loss": 0.3962, "num_input_tokens_seen": 48301904, "step": 50555 }, { "epoch": 4.1243168284525655, "grad_norm": 0.7401772141456604, "learning_rate": 3.6549634521760653e-05, "loss": 0.3807, "num_input_tokens_seen": 48306048, "step": 50560 }, { "epoch": 4.1247246920629745, "grad_norm": 0.4648565948009491, "learning_rate": 3.6546477722350053e-05, "loss": 0.3063, "num_input_tokens_seen": 48310464, "step": 50565 }, { "epoch": 4.125132555673383, "grad_norm": 0.5283147692680359, "learning_rate": 3.654332068889725e-05, "loss": 0.3416, "num_input_tokens_seen": 48315408, "step": 50570 }, { "epoch": 4.125540419283792, "grad_norm": 0.31224095821380615, "learning_rate": 3.654016342146626e-05, "loss": 0.3503, "num_input_tokens_seen": 48319904, "step": 50575 }, { "epoch": 4.1259482828942, "grad_norm": 0.6042836904525757, "learning_rate": 3.6537005920121035e-05, "loss": 0.3255, "num_input_tokens_seen": 48324960, "step": 50580 }, { "epoch": 4.126356146504609, "grad_norm": 0.4153140187263489, "learning_rate": 3.653384818492562e-05, "loss": 0.3082, "num_input_tokens_seen": 48330192, "step": 50585 }, { "epoch": 4.126764010115018, "grad_norm": 0.587447464466095, "learning_rate": 3.6530690215943994e-05, "loss": 0.346, "num_input_tokens_seen": 48335392, "step": 50590 }, { "epoch": 4.127171873725426, "grad_norm": 0.6573511362075806, "learning_rate": 3.652753201324018e-05, "loss": 0.3195, "num_input_tokens_seen": 48339968, "step": 50595 }, { "epoch": 4.127579737335835, "grad_norm": 0.9189155697822571, "learning_rate": 3.652437357687819e-05, "loss": 0.3517, "num_input_tokens_seen": 48345152, "step": 50600 }, { "epoch": 4.127987600946244, "grad_norm": 1.0625462532043457, "learning_rate": 3.652121490692204e-05, "loss": 0.3545, "num_input_tokens_seen": 48350528, "step": 50605 }, { "epoch": 4.128395464556652, "grad_norm": 0.5686745643615723, "learning_rate": 3.6518056003435774e-05, "loss": 0.3562, "num_input_tokens_seen": 48354544, "step": 50610 }, { "epoch": 4.128803328167061, "grad_norm": 0.30576783418655396, "learning_rate": 3.651489686648339e-05, "loss": 0.3715, "num_input_tokens_seen": 48358288, "step": 50615 }, { "epoch": 4.129211191777469, "grad_norm": 0.7482529282569885, "learning_rate": 3.651173749612895e-05, "loss": 0.353, "num_input_tokens_seen": 48363024, "step": 50620 }, { "epoch": 4.129619055387878, "grad_norm": 0.6819653511047363, "learning_rate": 3.6508577892436477e-05, "loss": 0.3718, "num_input_tokens_seen": 48367392, "step": 50625 }, { "epoch": 4.130026918998287, "grad_norm": 0.5707427859306335, "learning_rate": 3.650541805547003e-05, "loss": 0.3626, "num_input_tokens_seen": 48372496, "step": 50630 }, { "epoch": 4.130434782608695, "grad_norm": 0.5938565135002136, "learning_rate": 3.650225798529364e-05, "loss": 0.3331, "num_input_tokens_seen": 48376928, "step": 50635 }, { "epoch": 4.130842646219104, "grad_norm": 0.5524967908859253, "learning_rate": 3.6499097681971375e-05, "loss": 0.317, "num_input_tokens_seen": 48381456, "step": 50640 }, { "epoch": 4.131250509829513, "grad_norm": 0.5595542788505554, "learning_rate": 3.649593714556727e-05, "loss": 0.3224, "num_input_tokens_seen": 48385888, "step": 50645 }, { "epoch": 4.1316583734399215, "grad_norm": 0.6102370619773865, "learning_rate": 3.649277637614542e-05, "loss": 0.3495, "num_input_tokens_seen": 48390368, "step": 50650 }, { "epoch": 4.1320662370503305, "grad_norm": 0.564325213432312, "learning_rate": 3.648961537376987e-05, "loss": 0.3791, "num_input_tokens_seen": 48394800, "step": 50655 }, { "epoch": 4.132474100660739, "grad_norm": 0.7294813394546509, "learning_rate": 3.64864541385047e-05, "loss": 0.3453, "num_input_tokens_seen": 48400080, "step": 50660 }, { "epoch": 4.132881964271148, "grad_norm": 0.9260441660881042, "learning_rate": 3.648329267041398e-05, "loss": 0.3379, "num_input_tokens_seen": 48404560, "step": 50665 }, { "epoch": 4.133289827881557, "grad_norm": 0.9653596878051758, "learning_rate": 3.648013096956181e-05, "loss": 0.3868, "num_input_tokens_seen": 48409072, "step": 50670 }, { "epoch": 4.133697691491965, "grad_norm": 0.5256153345108032, "learning_rate": 3.6476969036012255e-05, "loss": 0.3217, "num_input_tokens_seen": 48414320, "step": 50675 }, { "epoch": 4.134105555102374, "grad_norm": 0.57648766040802, "learning_rate": 3.647380686982942e-05, "loss": 0.3388, "num_input_tokens_seen": 48418800, "step": 50680 }, { "epoch": 4.134513418712783, "grad_norm": 0.6433247327804565, "learning_rate": 3.6470644471077384e-05, "loss": 0.3268, "num_input_tokens_seen": 48423520, "step": 50685 }, { "epoch": 4.134921282323191, "grad_norm": 0.46935543417930603, "learning_rate": 3.646748183982027e-05, "loss": 0.3769, "num_input_tokens_seen": 48427952, "step": 50690 }, { "epoch": 4.1353291459336, "grad_norm": 0.440834105014801, "learning_rate": 3.646431897612217e-05, "loss": 0.3147, "num_input_tokens_seen": 48432400, "step": 50695 }, { "epoch": 4.135737009544009, "grad_norm": 0.6588870882987976, "learning_rate": 3.646115588004719e-05, "loss": 0.3454, "num_input_tokens_seen": 48436496, "step": 50700 }, { "epoch": 4.136144873154417, "grad_norm": 0.59394371509552, "learning_rate": 3.6457992551659454e-05, "loss": 0.377, "num_input_tokens_seen": 48441424, "step": 50705 }, { "epoch": 4.136552736764826, "grad_norm": 0.9066631197929382, "learning_rate": 3.645482899102307e-05, "loss": 0.3599, "num_input_tokens_seen": 48445584, "step": 50710 }, { "epoch": 4.136960600375234, "grad_norm": 0.7647847533226013, "learning_rate": 3.645166519820218e-05, "loss": 0.3425, "num_input_tokens_seen": 48450128, "step": 50715 }, { "epoch": 4.137368463985643, "grad_norm": 0.6022504568099976, "learning_rate": 3.64485011732609e-05, "loss": 0.3267, "num_input_tokens_seen": 48455424, "step": 50720 }, { "epoch": 4.137776327596052, "grad_norm": 1.096703052520752, "learning_rate": 3.644533691626337e-05, "loss": 0.4105, "num_input_tokens_seen": 48460352, "step": 50725 }, { "epoch": 4.13818419120646, "grad_norm": 0.8798929452896118, "learning_rate": 3.644217242727372e-05, "loss": 0.41, "num_input_tokens_seen": 48465904, "step": 50730 }, { "epoch": 4.138592054816869, "grad_norm": 0.729181170463562, "learning_rate": 3.6439007706356096e-05, "loss": 0.37, "num_input_tokens_seen": 48469936, "step": 50735 }, { "epoch": 4.138999918427278, "grad_norm": 0.7821664810180664, "learning_rate": 3.643584275357465e-05, "loss": 0.3267, "num_input_tokens_seen": 48474592, "step": 50740 }, { "epoch": 4.1394077820376864, "grad_norm": 0.5206878781318665, "learning_rate": 3.643267756899352e-05, "loss": 0.3276, "num_input_tokens_seen": 48479216, "step": 50745 }, { "epoch": 4.1398156456480955, "grad_norm": 0.5308146476745605, "learning_rate": 3.642951215267688e-05, "loss": 0.3682, "num_input_tokens_seen": 48484512, "step": 50750 }, { "epoch": 4.140223509258504, "grad_norm": 1.061721682548523, "learning_rate": 3.642634650468889e-05, "loss": 0.3539, "num_input_tokens_seen": 48488624, "step": 50755 }, { "epoch": 4.140631372868913, "grad_norm": 0.3314197063446045, "learning_rate": 3.642318062509371e-05, "loss": 0.3405, "num_input_tokens_seen": 48492912, "step": 50760 }, { "epoch": 4.141039236479322, "grad_norm": 0.6231922507286072, "learning_rate": 3.6420014513955515e-05, "loss": 0.3839, "num_input_tokens_seen": 48498512, "step": 50765 }, { "epoch": 4.14144710008973, "grad_norm": 0.40458944439888, "learning_rate": 3.641684817133847e-05, "loss": 0.3715, "num_input_tokens_seen": 48503536, "step": 50770 }, { "epoch": 4.141854963700139, "grad_norm": 1.0984119176864624, "learning_rate": 3.641368159730678e-05, "loss": 0.3408, "num_input_tokens_seen": 48507232, "step": 50775 }, { "epoch": 4.142262827310548, "grad_norm": 0.8645899295806885, "learning_rate": 3.6410514791924596e-05, "loss": 0.365, "num_input_tokens_seen": 48512368, "step": 50780 }, { "epoch": 4.142670690920956, "grad_norm": 0.8126968741416931, "learning_rate": 3.640734775525613e-05, "loss": 0.3532, "num_input_tokens_seen": 48517488, "step": 50785 }, { "epoch": 4.143078554531365, "grad_norm": 0.4490002989768982, "learning_rate": 3.640418048736557e-05, "loss": 0.3147, "num_input_tokens_seen": 48522240, "step": 50790 }, { "epoch": 4.143486418141773, "grad_norm": 0.8315829634666443, "learning_rate": 3.640101298831713e-05, "loss": 0.3559, "num_input_tokens_seen": 48527040, "step": 50795 }, { "epoch": 4.143894281752182, "grad_norm": 0.4910602271556854, "learning_rate": 3.639784525817499e-05, "loss": 0.3425, "num_input_tokens_seen": 48531392, "step": 50800 }, { "epoch": 4.144302145362591, "grad_norm": 0.18702803552150726, "learning_rate": 3.639467729700338e-05, "loss": 0.3507, "num_input_tokens_seen": 48535792, "step": 50805 }, { "epoch": 4.144710008972999, "grad_norm": 0.8807573914527893, "learning_rate": 3.639150910486649e-05, "loss": 0.3612, "num_input_tokens_seen": 48539808, "step": 50810 }, { "epoch": 4.145117872583408, "grad_norm": 0.32147538661956787, "learning_rate": 3.638834068182856e-05, "loss": 0.3433, "num_input_tokens_seen": 48544976, "step": 50815 }, { "epoch": 4.145525736193817, "grad_norm": 0.411088228225708, "learning_rate": 3.6385172027953804e-05, "loss": 0.3502, "num_input_tokens_seen": 48549664, "step": 50820 }, { "epoch": 4.145933599804225, "grad_norm": 0.8594717979431152, "learning_rate": 3.638200314330645e-05, "loss": 0.3526, "num_input_tokens_seen": 48554768, "step": 50825 }, { "epoch": 4.146341463414634, "grad_norm": 0.46708688139915466, "learning_rate": 3.637883402795072e-05, "loss": 0.3214, "num_input_tokens_seen": 48560080, "step": 50830 }, { "epoch": 4.146749327025042, "grad_norm": 0.6126132607460022, "learning_rate": 3.637566468195086e-05, "loss": 0.3814, "num_input_tokens_seen": 48564704, "step": 50835 }, { "epoch": 4.147157190635451, "grad_norm": 0.8272463083267212, "learning_rate": 3.637249510537112e-05, "loss": 0.3256, "num_input_tokens_seen": 48569424, "step": 50840 }, { "epoch": 4.14756505424586, "grad_norm": 0.6652929782867432, "learning_rate": 3.636932529827573e-05, "loss": 0.3536, "num_input_tokens_seen": 48574288, "step": 50845 }, { "epoch": 4.1479729178562685, "grad_norm": 0.33889472484588623, "learning_rate": 3.636615526072895e-05, "loss": 0.3386, "num_input_tokens_seen": 48579440, "step": 50850 }, { "epoch": 4.1483807814666775, "grad_norm": 0.46122631430625916, "learning_rate": 3.636298499279503e-05, "loss": 0.38, "num_input_tokens_seen": 48584656, "step": 50855 }, { "epoch": 4.148788645077087, "grad_norm": 0.6516178846359253, "learning_rate": 3.635981449453824e-05, "loss": 0.3126, "num_input_tokens_seen": 48589968, "step": 50860 }, { "epoch": 4.149196508687495, "grad_norm": 0.45195087790489197, "learning_rate": 3.6356643766022826e-05, "loss": 0.4199, "num_input_tokens_seen": 48594512, "step": 50865 }, { "epoch": 4.149604372297904, "grad_norm": 0.31104549765586853, "learning_rate": 3.635347280731307e-05, "loss": 0.3543, "num_input_tokens_seen": 48599104, "step": 50870 }, { "epoch": 4.150012235908312, "grad_norm": 0.7065014243125916, "learning_rate": 3.635030161847325e-05, "loss": 0.3443, "num_input_tokens_seen": 48603712, "step": 50875 }, { "epoch": 4.150420099518721, "grad_norm": 0.48187121748924255, "learning_rate": 3.6347130199567635e-05, "loss": 0.3276, "num_input_tokens_seen": 48608272, "step": 50880 }, { "epoch": 4.15082796312913, "grad_norm": 0.4236299693584442, "learning_rate": 3.6343958550660525e-05, "loss": 0.3023, "num_input_tokens_seen": 48612080, "step": 50885 }, { "epoch": 4.151235826739538, "grad_norm": 1.0624182224273682, "learning_rate": 3.6340786671816184e-05, "loss": 0.3625, "num_input_tokens_seen": 48616416, "step": 50890 }, { "epoch": 4.151643690349947, "grad_norm": 0.5552636981010437, "learning_rate": 3.633761456309892e-05, "loss": 0.2577, "num_input_tokens_seen": 48622064, "step": 50895 }, { "epoch": 4.152051553960356, "grad_norm": 1.2768107652664185, "learning_rate": 3.633444222457302e-05, "loss": 0.362, "num_input_tokens_seen": 48627408, "step": 50900 }, { "epoch": 4.152459417570764, "grad_norm": 0.8875943422317505, "learning_rate": 3.63312696563028e-05, "loss": 0.3542, "num_input_tokens_seen": 48632512, "step": 50905 }, { "epoch": 4.152867281181173, "grad_norm": 0.5411070585250854, "learning_rate": 3.632809685835256e-05, "loss": 0.3128, "num_input_tokens_seen": 48637728, "step": 50910 }, { "epoch": 4.153275144791582, "grad_norm": 0.3650280237197876, "learning_rate": 3.63249238307866e-05, "loss": 0.3258, "num_input_tokens_seen": 48641440, "step": 50915 }, { "epoch": 4.15368300840199, "grad_norm": 0.34364327788352966, "learning_rate": 3.632175057366925e-05, "loss": 0.32, "num_input_tokens_seen": 48646416, "step": 50920 }, { "epoch": 4.154090872012399, "grad_norm": 0.37677037715911865, "learning_rate": 3.631857708706483e-05, "loss": 0.3246, "num_input_tokens_seen": 48651008, "step": 50925 }, { "epoch": 4.154498735622807, "grad_norm": 0.9965157508850098, "learning_rate": 3.631540337103766e-05, "loss": 0.3456, "num_input_tokens_seen": 48655664, "step": 50930 }, { "epoch": 4.154906599233216, "grad_norm": 1.0210587978363037, "learning_rate": 3.631222942565208e-05, "loss": 0.3487, "num_input_tokens_seen": 48660976, "step": 50935 }, { "epoch": 4.155314462843625, "grad_norm": 0.9569721817970276, "learning_rate": 3.6309055250972404e-05, "loss": 0.3546, "num_input_tokens_seen": 48666144, "step": 50940 }, { "epoch": 4.1557223264540335, "grad_norm": 0.4672841727733612, "learning_rate": 3.630588084706299e-05, "loss": 0.3375, "num_input_tokens_seen": 48671216, "step": 50945 }, { "epoch": 4.1561301900644425, "grad_norm": 0.8392537832260132, "learning_rate": 3.630270621398817e-05, "loss": 0.3633, "num_input_tokens_seen": 48676880, "step": 50950 }, { "epoch": 4.1565380536748515, "grad_norm": 0.8078334331512451, "learning_rate": 3.629953135181231e-05, "loss": 0.3059, "num_input_tokens_seen": 48682048, "step": 50955 }, { "epoch": 4.15694591728526, "grad_norm": 0.5596442818641663, "learning_rate": 3.629635626059974e-05, "loss": 0.3364, "num_input_tokens_seen": 48686736, "step": 50960 }, { "epoch": 4.157353780895669, "grad_norm": 0.9527925848960876, "learning_rate": 3.629318094041483e-05, "loss": 0.334, "num_input_tokens_seen": 48691152, "step": 50965 }, { "epoch": 4.157761644506077, "grad_norm": 0.5969266295433044, "learning_rate": 3.629000539132195e-05, "loss": 0.3058, "num_input_tokens_seen": 48695968, "step": 50970 }, { "epoch": 4.158169508116486, "grad_norm": 0.46950411796569824, "learning_rate": 3.628682961338545e-05, "loss": 0.2226, "num_input_tokens_seen": 48700528, "step": 50975 }, { "epoch": 4.158577371726895, "grad_norm": 1.5651650428771973, "learning_rate": 3.628365360666971e-05, "loss": 0.4631, "num_input_tokens_seen": 48705328, "step": 50980 }, { "epoch": 4.158985235337303, "grad_norm": 0.6599746346473694, "learning_rate": 3.628047737123911e-05, "loss": 0.4492, "num_input_tokens_seen": 48708928, "step": 50985 }, { "epoch": 4.159393098947712, "grad_norm": 0.7714465260505676, "learning_rate": 3.627730090715802e-05, "loss": 0.3686, "num_input_tokens_seen": 48714048, "step": 50990 }, { "epoch": 4.159800962558121, "grad_norm": 0.43992310762405396, "learning_rate": 3.627412421449084e-05, "loss": 0.3797, "num_input_tokens_seen": 48718832, "step": 50995 }, { "epoch": 4.160208826168529, "grad_norm": 0.9260745644569397, "learning_rate": 3.627094729330195e-05, "loss": 0.3916, "num_input_tokens_seen": 48723744, "step": 51000 }, { "epoch": 4.160616689778938, "grad_norm": 0.7674744725227356, "learning_rate": 3.626777014365574e-05, "loss": 0.3225, "num_input_tokens_seen": 48728912, "step": 51005 }, { "epoch": 4.161024553389346, "grad_norm": 0.768804669380188, "learning_rate": 3.626459276561663e-05, "loss": 0.3322, "num_input_tokens_seen": 48733984, "step": 51010 }, { "epoch": 4.161432416999755, "grad_norm": 0.8488309979438782, "learning_rate": 3.626141515924901e-05, "loss": 0.3373, "num_input_tokens_seen": 48739024, "step": 51015 }, { "epoch": 4.161840280610164, "grad_norm": 0.7347851395606995, "learning_rate": 3.6258237324617285e-05, "loss": 0.3483, "num_input_tokens_seen": 48742752, "step": 51020 }, { "epoch": 4.162248144220572, "grad_norm": 0.7751556038856506, "learning_rate": 3.625505926178587e-05, "loss": 0.3152, "num_input_tokens_seen": 48748240, "step": 51025 }, { "epoch": 4.162656007830981, "grad_norm": 0.38762128353118896, "learning_rate": 3.62518809708192e-05, "loss": 0.3405, "num_input_tokens_seen": 48752672, "step": 51030 }, { "epoch": 4.16306387144139, "grad_norm": 14.386609077453613, "learning_rate": 3.624870245178167e-05, "loss": 0.5042, "num_input_tokens_seen": 48757872, "step": 51035 }, { "epoch": 4.1634717350517985, "grad_norm": 0.6088822484016418, "learning_rate": 3.624552370473773e-05, "loss": 0.3632, "num_input_tokens_seen": 48762512, "step": 51040 }, { "epoch": 4.1638795986622075, "grad_norm": 0.23340226709842682, "learning_rate": 3.6242344729751794e-05, "loss": 0.3356, "num_input_tokens_seen": 48767616, "step": 51045 }, { "epoch": 4.1642874622726165, "grad_norm": 0.7123821973800659, "learning_rate": 3.623916552688831e-05, "loss": 0.3464, "num_input_tokens_seen": 48772256, "step": 51050 }, { "epoch": 4.164695325883025, "grad_norm": 0.5456139445304871, "learning_rate": 3.623598609621171e-05, "loss": 0.3696, "num_input_tokens_seen": 48777200, "step": 51055 }, { "epoch": 4.165103189493434, "grad_norm": 0.9587734937667847, "learning_rate": 3.623280643778646e-05, "loss": 0.3201, "num_input_tokens_seen": 48781328, "step": 51060 }, { "epoch": 4.165511053103842, "grad_norm": 0.5535603761672974, "learning_rate": 3.622962655167699e-05, "loss": 0.3461, "num_input_tokens_seen": 48786448, "step": 51065 }, { "epoch": 4.165918916714251, "grad_norm": 0.9464525580406189, "learning_rate": 3.622644643794777e-05, "loss": 0.3677, "num_input_tokens_seen": 48791184, "step": 51070 }, { "epoch": 4.16632678032466, "grad_norm": 0.6951953172683716, "learning_rate": 3.622326609666324e-05, "loss": 0.3326, "num_input_tokens_seen": 48796064, "step": 51075 }, { "epoch": 4.166734643935068, "grad_norm": 0.7619959712028503, "learning_rate": 3.622008552788788e-05, "loss": 0.316, "num_input_tokens_seen": 48800320, "step": 51080 }, { "epoch": 4.167142507545477, "grad_norm": 0.4640292823314667, "learning_rate": 3.6216904731686155e-05, "loss": 0.3424, "num_input_tokens_seen": 48804736, "step": 51085 }, { "epoch": 4.167550371155886, "grad_norm": 0.5888627767562866, "learning_rate": 3.621372370812254e-05, "loss": 0.2915, "num_input_tokens_seen": 48808896, "step": 51090 }, { "epoch": 4.167958234766294, "grad_norm": 0.6855395436286926, "learning_rate": 3.6210542457261504e-05, "loss": 0.3648, "num_input_tokens_seen": 48813136, "step": 51095 }, { "epoch": 4.168366098376703, "grad_norm": 0.8029905557632446, "learning_rate": 3.620736097916754e-05, "loss": 0.4044, "num_input_tokens_seen": 48817696, "step": 51100 }, { "epoch": 4.168773961987111, "grad_norm": 1.062868356704712, "learning_rate": 3.620417927390512e-05, "loss": 0.3332, "num_input_tokens_seen": 48823216, "step": 51105 }, { "epoch": 4.16918182559752, "grad_norm": 0.5798431038856506, "learning_rate": 3.620099734153876e-05, "loss": 0.3794, "num_input_tokens_seen": 48828656, "step": 51110 }, { "epoch": 4.169589689207929, "grad_norm": 0.6557026505470276, "learning_rate": 3.619781518213294e-05, "loss": 0.3315, "num_input_tokens_seen": 48834080, "step": 51115 }, { "epoch": 4.169997552818337, "grad_norm": 0.6806731820106506, "learning_rate": 3.6194632795752165e-05, "loss": 0.3708, "num_input_tokens_seen": 48839184, "step": 51120 }, { "epoch": 4.170405416428746, "grad_norm": 1.2501918077468872, "learning_rate": 3.6191450182460936e-05, "loss": 0.3335, "num_input_tokens_seen": 48844704, "step": 51125 }, { "epoch": 4.170813280039155, "grad_norm": 1.0278475284576416, "learning_rate": 3.618826734232378e-05, "loss": 0.3694, "num_input_tokens_seen": 48848944, "step": 51130 }, { "epoch": 4.171221143649563, "grad_norm": 0.9588501453399658, "learning_rate": 3.618508427540519e-05, "loss": 0.3349, "num_input_tokens_seen": 48852288, "step": 51135 }, { "epoch": 4.171629007259972, "grad_norm": 0.7270616292953491, "learning_rate": 3.618190098176969e-05, "loss": 0.3023, "num_input_tokens_seen": 48857824, "step": 51140 }, { "epoch": 4.1720368708703806, "grad_norm": 0.6879152655601501, "learning_rate": 3.617871746148182e-05, "loss": 0.2796, "num_input_tokens_seen": 48862288, "step": 51145 }, { "epoch": 4.17244473448079, "grad_norm": 0.49646544456481934, "learning_rate": 3.6175533714606085e-05, "loss": 0.3455, "num_input_tokens_seen": 48867248, "step": 51150 }, { "epoch": 4.172852598091199, "grad_norm": 0.9185404777526855, "learning_rate": 3.617234974120704e-05, "loss": 0.2785, "num_input_tokens_seen": 48872064, "step": 51155 }, { "epoch": 4.173260461701607, "grad_norm": 0.5847406983375549, "learning_rate": 3.61691655413492e-05, "loss": 0.3322, "num_input_tokens_seen": 48876464, "step": 51160 }, { "epoch": 4.173668325312016, "grad_norm": 0.8356860280036926, "learning_rate": 3.6165981115097134e-05, "loss": 0.4828, "num_input_tokens_seen": 48881376, "step": 51165 }, { "epoch": 4.174076188922425, "grad_norm": 0.6233417391777039, "learning_rate": 3.616279646251538e-05, "loss": 0.2742, "num_input_tokens_seen": 48885712, "step": 51170 }, { "epoch": 4.174484052532833, "grad_norm": 0.30899515748023987, "learning_rate": 3.615961158366847e-05, "loss": 0.2795, "num_input_tokens_seen": 48890544, "step": 51175 }, { "epoch": 4.174891916143242, "grad_norm": 1.3796141147613525, "learning_rate": 3.6156426478620984e-05, "loss": 0.3676, "num_input_tokens_seen": 48895776, "step": 51180 }, { "epoch": 4.17529977975365, "grad_norm": 0.5777325630187988, "learning_rate": 3.615324114743747e-05, "loss": 0.325, "num_input_tokens_seen": 48901136, "step": 51185 }, { "epoch": 4.175707643364059, "grad_norm": 0.5812138915061951, "learning_rate": 3.61500555901825e-05, "loss": 0.4351, "num_input_tokens_seen": 48906208, "step": 51190 }, { "epoch": 4.176115506974468, "grad_norm": 0.621887743473053, "learning_rate": 3.6146869806920645e-05, "loss": 0.3869, "num_input_tokens_seen": 48910976, "step": 51195 }, { "epoch": 4.176523370584876, "grad_norm": 0.7479153871536255, "learning_rate": 3.6143683797716464e-05, "loss": 0.3414, "num_input_tokens_seen": 48915424, "step": 51200 }, { "epoch": 4.176931234195285, "grad_norm": 0.684205949306488, "learning_rate": 3.6140497562634557e-05, "loss": 0.3261, "num_input_tokens_seen": 48920048, "step": 51205 }, { "epoch": 4.177339097805694, "grad_norm": 0.6004598736763, "learning_rate": 3.613731110173949e-05, "loss": 0.3157, "num_input_tokens_seen": 48924800, "step": 51210 }, { "epoch": 4.177746961416102, "grad_norm": 0.7546734809875488, "learning_rate": 3.6134124415095864e-05, "loss": 0.381, "num_input_tokens_seen": 48929568, "step": 51215 }, { "epoch": 4.178154825026511, "grad_norm": 1.0028544664382935, "learning_rate": 3.613093750276827e-05, "loss": 0.3426, "num_input_tokens_seen": 48934992, "step": 51220 }, { "epoch": 4.178562688636919, "grad_norm": 0.42568302154541016, "learning_rate": 3.61277503648213e-05, "loss": 0.3657, "num_input_tokens_seen": 48939616, "step": 51225 }, { "epoch": 4.178970552247328, "grad_norm": 0.44293203949928284, "learning_rate": 3.6124563001319564e-05, "loss": 0.2955, "num_input_tokens_seen": 48944672, "step": 51230 }, { "epoch": 4.179378415857737, "grad_norm": 0.3679748475551605, "learning_rate": 3.612137541232766e-05, "loss": 0.2426, "num_input_tokens_seen": 48949024, "step": 51235 }, { "epoch": 4.1797862794681455, "grad_norm": 0.45081064105033875, "learning_rate": 3.6118187597910204e-05, "loss": 0.3486, "num_input_tokens_seen": 48953584, "step": 51240 }, { "epoch": 4.1801941430785545, "grad_norm": 0.46811991930007935, "learning_rate": 3.61149995581318e-05, "loss": 0.3732, "num_input_tokens_seen": 48958256, "step": 51245 }, { "epoch": 4.1806020066889635, "grad_norm": 0.41813981533050537, "learning_rate": 3.611181129305709e-05, "loss": 0.3933, "num_input_tokens_seen": 48964160, "step": 51250 }, { "epoch": 4.181009870299372, "grad_norm": 0.5500242114067078, "learning_rate": 3.610862280275068e-05, "loss": 0.382, "num_input_tokens_seen": 48969088, "step": 51255 }, { "epoch": 4.181417733909781, "grad_norm": 0.7051308155059814, "learning_rate": 3.61054340872772e-05, "loss": 0.318, "num_input_tokens_seen": 48974128, "step": 51260 }, { "epoch": 4.18182559752019, "grad_norm": 0.5549149513244629, "learning_rate": 3.61022451467013e-05, "loss": 0.3977, "num_input_tokens_seen": 48978272, "step": 51265 }, { "epoch": 4.182233461130598, "grad_norm": 0.47450876235961914, "learning_rate": 3.6099055981087605e-05, "loss": 0.3113, "num_input_tokens_seen": 48983136, "step": 51270 }, { "epoch": 4.182641324741007, "grad_norm": 0.2847900688648224, "learning_rate": 3.609586659050076e-05, "loss": 0.3374, "num_input_tokens_seen": 48987504, "step": 51275 }, { "epoch": 4.183049188351415, "grad_norm": 0.9098730683326721, "learning_rate": 3.609267697500542e-05, "loss": 0.3913, "num_input_tokens_seen": 48991968, "step": 51280 }, { "epoch": 4.183457051961824, "grad_norm": 0.7466570138931274, "learning_rate": 3.608948713466623e-05, "loss": 0.3387, "num_input_tokens_seen": 48996704, "step": 51285 }, { "epoch": 4.183864915572233, "grad_norm": 0.7063088417053223, "learning_rate": 3.608629706954784e-05, "loss": 0.3692, "num_input_tokens_seen": 49000432, "step": 51290 }, { "epoch": 4.184272779182641, "grad_norm": 0.7908996939659119, "learning_rate": 3.6083106779714926e-05, "loss": 0.3294, "num_input_tokens_seen": 49005424, "step": 51295 }, { "epoch": 4.18468064279305, "grad_norm": 0.2350808084011078, "learning_rate": 3.607991626523215e-05, "loss": 0.3496, "num_input_tokens_seen": 49010112, "step": 51300 }, { "epoch": 4.185088506403459, "grad_norm": 0.4323931932449341, "learning_rate": 3.6076725526164176e-05, "loss": 0.3242, "num_input_tokens_seen": 49014624, "step": 51305 }, { "epoch": 4.185496370013867, "grad_norm": 0.43137794733047485, "learning_rate": 3.607353456257568e-05, "loss": 0.3333, "num_input_tokens_seen": 49019216, "step": 51310 }, { "epoch": 4.185904233624276, "grad_norm": 0.7102677822113037, "learning_rate": 3.607034337453135e-05, "loss": 0.3463, "num_input_tokens_seen": 49024288, "step": 51315 }, { "epoch": 4.186312097234684, "grad_norm": 0.44007495045661926, "learning_rate": 3.606715196209586e-05, "loss": 0.3308, "num_input_tokens_seen": 49028784, "step": 51320 }, { "epoch": 4.186719960845093, "grad_norm": 0.39809074997901917, "learning_rate": 3.60639603253339e-05, "loss": 0.3438, "num_input_tokens_seen": 49033504, "step": 51325 }, { "epoch": 4.187127824455502, "grad_norm": 0.582013726234436, "learning_rate": 3.606076846431017e-05, "loss": 0.3083, "num_input_tokens_seen": 49037776, "step": 51330 }, { "epoch": 4.1875356880659105, "grad_norm": 0.5218321084976196, "learning_rate": 3.605757637908936e-05, "loss": 0.3127, "num_input_tokens_seen": 49041328, "step": 51335 }, { "epoch": 4.1879435516763195, "grad_norm": 0.5545248985290527, "learning_rate": 3.6054384069736176e-05, "loss": 0.2938, "num_input_tokens_seen": 49046976, "step": 51340 }, { "epoch": 4.1883514152867285, "grad_norm": 0.4467228949069977, "learning_rate": 3.605119153631532e-05, "loss": 0.4577, "num_input_tokens_seen": 49052352, "step": 51345 }, { "epoch": 4.188759278897137, "grad_norm": 0.8405219316482544, "learning_rate": 3.604799877889151e-05, "loss": 0.4157, "num_input_tokens_seen": 49056576, "step": 51350 }, { "epoch": 4.189167142507546, "grad_norm": 0.4333707094192505, "learning_rate": 3.604480579752946e-05, "loss": 0.3643, "num_input_tokens_seen": 49061472, "step": 51355 }, { "epoch": 4.189575006117954, "grad_norm": 0.5544817447662354, "learning_rate": 3.60416125922939e-05, "loss": 0.294, "num_input_tokens_seen": 49066368, "step": 51360 }, { "epoch": 4.189982869728363, "grad_norm": 0.6379683017730713, "learning_rate": 3.603841916324953e-05, "loss": 0.3278, "num_input_tokens_seen": 49072304, "step": 51365 }, { "epoch": 4.190390733338772, "grad_norm": 0.6019519567489624, "learning_rate": 3.6035225510461104e-05, "loss": 0.3756, "num_input_tokens_seen": 49077216, "step": 51370 }, { "epoch": 4.19079859694918, "grad_norm": 0.6298232674598694, "learning_rate": 3.603203163399334e-05, "loss": 0.2924, "num_input_tokens_seen": 49081712, "step": 51375 }, { "epoch": 4.191206460559589, "grad_norm": 0.4365382194519043, "learning_rate": 3.602883753391099e-05, "loss": 0.3146, "num_input_tokens_seen": 49086672, "step": 51380 }, { "epoch": 4.191614324169998, "grad_norm": 0.47934120893478394, "learning_rate": 3.602564321027878e-05, "loss": 0.315, "num_input_tokens_seen": 49091056, "step": 51385 }, { "epoch": 4.192022187780406, "grad_norm": 0.5608993172645569, "learning_rate": 3.602244866316147e-05, "loss": 0.3688, "num_input_tokens_seen": 49094944, "step": 51390 }, { "epoch": 4.192430051390815, "grad_norm": 1.0940755605697632, "learning_rate": 3.601925389262381e-05, "loss": 0.3399, "num_input_tokens_seen": 49099600, "step": 51395 }, { "epoch": 4.192837915001224, "grad_norm": 0.5053637027740479, "learning_rate": 3.6016058898730554e-05, "loss": 0.3089, "num_input_tokens_seen": 49104368, "step": 51400 }, { "epoch": 4.193245778611632, "grad_norm": 0.5167269110679626, "learning_rate": 3.6012863681546475e-05, "loss": 0.3412, "num_input_tokens_seen": 49109072, "step": 51405 }, { "epoch": 4.193653642222041, "grad_norm": 0.33902955055236816, "learning_rate": 3.6009668241136315e-05, "loss": 0.3401, "num_input_tokens_seen": 49114448, "step": 51410 }, { "epoch": 4.194061505832449, "grad_norm": 0.865397572517395, "learning_rate": 3.600647257756486e-05, "loss": 0.2798, "num_input_tokens_seen": 49120000, "step": 51415 }, { "epoch": 4.194469369442858, "grad_norm": 0.4822289049625397, "learning_rate": 3.600327669089689e-05, "loss": 0.3917, "num_input_tokens_seen": 49124688, "step": 51420 }, { "epoch": 4.194877233053267, "grad_norm": 0.5565299987792969, "learning_rate": 3.600008058119718e-05, "loss": 0.4249, "num_input_tokens_seen": 49130160, "step": 51425 }, { "epoch": 4.195285096663675, "grad_norm": 0.7915596961975098, "learning_rate": 3.59968842485305e-05, "loss": 0.356, "num_input_tokens_seen": 49134144, "step": 51430 }, { "epoch": 4.1956929602740844, "grad_norm": 0.36634552478790283, "learning_rate": 3.599368769296166e-05, "loss": 0.3036, "num_input_tokens_seen": 49139232, "step": 51435 }, { "epoch": 4.196100823884493, "grad_norm": 0.8374499678611755, "learning_rate": 3.599049091455544e-05, "loss": 0.2757, "num_input_tokens_seen": 49144400, "step": 51440 }, { "epoch": 4.196508687494902, "grad_norm": 0.523421049118042, "learning_rate": 3.598729391337663e-05, "loss": 0.3255, "num_input_tokens_seen": 49148384, "step": 51445 }, { "epoch": 4.196916551105311, "grad_norm": 0.41976866126060486, "learning_rate": 3.5984096689490046e-05, "loss": 0.3137, "num_input_tokens_seen": 49153824, "step": 51450 }, { "epoch": 4.197324414715719, "grad_norm": 0.5543695688247681, "learning_rate": 3.5980899242960495e-05, "loss": 0.295, "num_input_tokens_seen": 49158816, "step": 51455 }, { "epoch": 4.197732278326128, "grad_norm": 0.5268046259880066, "learning_rate": 3.5977701573852776e-05, "loss": 0.3488, "num_input_tokens_seen": 49163616, "step": 51460 }, { "epoch": 4.198140141936537, "grad_norm": 1.0273940563201904, "learning_rate": 3.597450368223171e-05, "loss": 0.3405, "num_input_tokens_seen": 49168560, "step": 51465 }, { "epoch": 4.198548005546945, "grad_norm": 0.5660605430603027, "learning_rate": 3.597130556816212e-05, "loss": 0.2896, "num_input_tokens_seen": 49173264, "step": 51470 }, { "epoch": 4.198955869157354, "grad_norm": 0.5135296583175659, "learning_rate": 3.596810723170883e-05, "loss": 0.2852, "num_input_tokens_seen": 49176912, "step": 51475 }, { "epoch": 4.199363732767763, "grad_norm": 1.183070182800293, "learning_rate": 3.596490867293666e-05, "loss": 0.3495, "num_input_tokens_seen": 49181088, "step": 51480 }, { "epoch": 4.199771596378171, "grad_norm": 0.5578466653823853, "learning_rate": 3.596170989191046e-05, "loss": 0.3076, "num_input_tokens_seen": 49186304, "step": 51485 }, { "epoch": 4.20017945998858, "grad_norm": 0.5260284543037415, "learning_rate": 3.5958510888695054e-05, "loss": 0.3262, "num_input_tokens_seen": 49191600, "step": 51490 }, { "epoch": 4.200587323598988, "grad_norm": 0.7397551536560059, "learning_rate": 3.595531166335529e-05, "loss": 0.3042, "num_input_tokens_seen": 49196080, "step": 51495 }, { "epoch": 4.200995187209397, "grad_norm": 0.8053205013275146, "learning_rate": 3.5952112215956014e-05, "loss": 0.3739, "num_input_tokens_seen": 49200944, "step": 51500 }, { "epoch": 4.201403050819806, "grad_norm": 0.967337429523468, "learning_rate": 3.5948912546562074e-05, "loss": 0.3778, "num_input_tokens_seen": 49205840, "step": 51505 }, { "epoch": 4.201810914430214, "grad_norm": 0.5370399951934814, "learning_rate": 3.594571265523833e-05, "loss": 0.3135, "num_input_tokens_seen": 49210592, "step": 51510 }, { "epoch": 4.202218778040623, "grad_norm": 0.6265926957130432, "learning_rate": 3.5942512542049644e-05, "loss": 0.3399, "num_input_tokens_seen": 49215712, "step": 51515 }, { "epoch": 4.202626641651032, "grad_norm": 0.7493483424186707, "learning_rate": 3.593931220706088e-05, "loss": 0.3121, "num_input_tokens_seen": 49219920, "step": 51520 }, { "epoch": 4.20303450526144, "grad_norm": 0.5208650231361389, "learning_rate": 3.59361116503369e-05, "loss": 0.3648, "num_input_tokens_seen": 49224032, "step": 51525 }, { "epoch": 4.203442368871849, "grad_norm": 0.5943984389305115, "learning_rate": 3.593291087194259e-05, "loss": 0.3586, "num_input_tokens_seen": 49228672, "step": 51530 }, { "epoch": 4.2038502324822575, "grad_norm": 0.7439974546432495, "learning_rate": 3.592970987194282e-05, "loss": 0.2901, "num_input_tokens_seen": 49233040, "step": 51535 }, { "epoch": 4.2042580960926665, "grad_norm": 0.45099568367004395, "learning_rate": 3.592650865040247e-05, "loss": 0.4398, "num_input_tokens_seen": 49238096, "step": 51540 }, { "epoch": 4.2046659597030756, "grad_norm": 2.9424004554748535, "learning_rate": 3.592330720738644e-05, "loss": 0.4487, "num_input_tokens_seen": 49243120, "step": 51545 }, { "epoch": 4.205073823313484, "grad_norm": 0.9804885983467102, "learning_rate": 3.5920105542959614e-05, "loss": 0.3837, "num_input_tokens_seen": 49247712, "step": 51550 }, { "epoch": 4.205481686923893, "grad_norm": 0.6401994824409485, "learning_rate": 3.591690365718689e-05, "loss": 0.3741, "num_input_tokens_seen": 49252784, "step": 51555 }, { "epoch": 4.205889550534302, "grad_norm": 0.8022270202636719, "learning_rate": 3.591370155013316e-05, "loss": 0.3727, "num_input_tokens_seen": 49257328, "step": 51560 }, { "epoch": 4.20629741414471, "grad_norm": 1.1105676889419556, "learning_rate": 3.591049922186334e-05, "loss": 0.3471, "num_input_tokens_seen": 49262144, "step": 51565 }, { "epoch": 4.206705277755119, "grad_norm": 0.3926997482776642, "learning_rate": 3.5907296672442337e-05, "loss": 0.3672, "num_input_tokens_seen": 49266112, "step": 51570 }, { "epoch": 4.207113141365527, "grad_norm": 0.3378160297870636, "learning_rate": 3.590409390193507e-05, "loss": 0.3468, "num_input_tokens_seen": 49271184, "step": 51575 }, { "epoch": 4.207521004975936, "grad_norm": 0.6363416910171509, "learning_rate": 3.590089091040645e-05, "loss": 0.3395, "num_input_tokens_seen": 49276480, "step": 51580 }, { "epoch": 4.207928868586345, "grad_norm": 0.4765464663505554, "learning_rate": 3.589768769792141e-05, "loss": 0.3736, "num_input_tokens_seen": 49281024, "step": 51585 }, { "epoch": 4.208336732196753, "grad_norm": 0.2559516131877899, "learning_rate": 3.589448426454486e-05, "loss": 0.3065, "num_input_tokens_seen": 49284832, "step": 51590 }, { "epoch": 4.208744595807162, "grad_norm": 0.36007922887802124, "learning_rate": 3.589128061034175e-05, "loss": 0.3389, "num_input_tokens_seen": 49289952, "step": 51595 }, { "epoch": 4.209152459417571, "grad_norm": 0.5774608850479126, "learning_rate": 3.5888076735377004e-05, "loss": 0.292, "num_input_tokens_seen": 49294384, "step": 51600 }, { "epoch": 4.209560323027979, "grad_norm": 0.28053149580955505, "learning_rate": 3.588487263971557e-05, "loss": 0.3738, "num_input_tokens_seen": 49298864, "step": 51605 }, { "epoch": 4.209968186638388, "grad_norm": 0.6027495861053467, "learning_rate": 3.58816683234224e-05, "loss": 0.3299, "num_input_tokens_seen": 49303712, "step": 51610 }, { "epoch": 4.210376050248797, "grad_norm": 0.6264950037002563, "learning_rate": 3.587846378656243e-05, "loss": 0.365, "num_input_tokens_seen": 49308640, "step": 51615 }, { "epoch": 4.210783913859205, "grad_norm": 0.31345149874687195, "learning_rate": 3.587525902920062e-05, "loss": 0.3356, "num_input_tokens_seen": 49314416, "step": 51620 }, { "epoch": 4.211191777469614, "grad_norm": 0.43845072388648987, "learning_rate": 3.5872054051401936e-05, "loss": 0.368, "num_input_tokens_seen": 49319024, "step": 51625 }, { "epoch": 4.2115996410800225, "grad_norm": 0.6601786017417908, "learning_rate": 3.586884885323133e-05, "loss": 0.3006, "num_input_tokens_seen": 49323712, "step": 51630 }, { "epoch": 4.2120075046904315, "grad_norm": 0.39954233169555664, "learning_rate": 3.5865643434753784e-05, "loss": 0.3445, "num_input_tokens_seen": 49328480, "step": 51635 }, { "epoch": 4.2124153683008405, "grad_norm": 0.5089400410652161, "learning_rate": 3.5862437796034264e-05, "loss": 0.3501, "num_input_tokens_seen": 49332784, "step": 51640 }, { "epoch": 4.212823231911249, "grad_norm": 0.6894615292549133, "learning_rate": 3.5859231937137746e-05, "loss": 0.3673, "num_input_tokens_seen": 49338576, "step": 51645 }, { "epoch": 4.213231095521658, "grad_norm": 0.3321956694126129, "learning_rate": 3.5856025858129206e-05, "loss": 0.2983, "num_input_tokens_seen": 49342736, "step": 51650 }, { "epoch": 4.213638959132067, "grad_norm": 0.6252691149711609, "learning_rate": 3.585281955907363e-05, "loss": 0.3112, "num_input_tokens_seen": 49347088, "step": 51655 }, { "epoch": 4.214046822742475, "grad_norm": 0.4945421516895294, "learning_rate": 3.584961304003602e-05, "loss": 0.3466, "num_input_tokens_seen": 49351264, "step": 51660 }, { "epoch": 4.214454686352884, "grad_norm": 1.1126712560653687, "learning_rate": 3.584640630108136e-05, "loss": 0.3703, "num_input_tokens_seen": 49355600, "step": 51665 }, { "epoch": 4.214862549963292, "grad_norm": 1.062822699546814, "learning_rate": 3.584319934227466e-05, "loss": 0.4159, "num_input_tokens_seen": 49360816, "step": 51670 }, { "epoch": 4.215270413573701, "grad_norm": 0.5279358625411987, "learning_rate": 3.5839992163680905e-05, "loss": 0.3268, "num_input_tokens_seen": 49366432, "step": 51675 }, { "epoch": 4.21567827718411, "grad_norm": 0.31878748536109924, "learning_rate": 3.5836784765365126e-05, "loss": 0.3652, "num_input_tokens_seen": 49370672, "step": 51680 }, { "epoch": 4.216086140794518, "grad_norm": 0.8691565990447998, "learning_rate": 3.5833577147392324e-05, "loss": 0.3266, "num_input_tokens_seen": 49375232, "step": 51685 }, { "epoch": 4.216494004404927, "grad_norm": 0.48357564210891724, "learning_rate": 3.583036930982751e-05, "loss": 0.3583, "num_input_tokens_seen": 49380000, "step": 51690 }, { "epoch": 4.216901868015336, "grad_norm": 0.5414863228797913, "learning_rate": 3.582716125273572e-05, "loss": 0.331, "num_input_tokens_seen": 49384480, "step": 51695 }, { "epoch": 4.217309731625744, "grad_norm": 0.40244394540786743, "learning_rate": 3.582395297618196e-05, "loss": 0.3684, "num_input_tokens_seen": 49389168, "step": 51700 }, { "epoch": 4.217717595236153, "grad_norm": 0.4541422128677368, "learning_rate": 3.582074448023128e-05, "loss": 0.3416, "num_input_tokens_seen": 49394384, "step": 51705 }, { "epoch": 4.218125458846561, "grad_norm": 0.39205536246299744, "learning_rate": 3.5817535764948715e-05, "loss": 0.346, "num_input_tokens_seen": 49398960, "step": 51710 }, { "epoch": 4.21853332245697, "grad_norm": 0.3379835784435272, "learning_rate": 3.581432683039929e-05, "loss": 0.3345, "num_input_tokens_seen": 49404400, "step": 51715 }, { "epoch": 4.218941186067379, "grad_norm": 0.6624436378479004, "learning_rate": 3.5811117676648055e-05, "loss": 0.3188, "num_input_tokens_seen": 49409952, "step": 51720 }, { "epoch": 4.2193490496777875, "grad_norm": 0.5235177278518677, "learning_rate": 3.5807908303760064e-05, "loss": 0.3415, "num_input_tokens_seen": 49415360, "step": 51725 }, { "epoch": 4.2197569132881965, "grad_norm": 0.6773957014083862, "learning_rate": 3.580469871180036e-05, "loss": 0.3676, "num_input_tokens_seen": 49420288, "step": 51730 }, { "epoch": 4.2201647768986055, "grad_norm": 0.8235424757003784, "learning_rate": 3.5801488900834e-05, "loss": 0.3285, "num_input_tokens_seen": 49425216, "step": 51735 }, { "epoch": 4.220572640509014, "grad_norm": 0.5153393745422363, "learning_rate": 3.579827887092606e-05, "loss": 0.3421, "num_input_tokens_seen": 49430080, "step": 51740 }, { "epoch": 4.220980504119423, "grad_norm": 0.6608501672744751, "learning_rate": 3.579506862214158e-05, "loss": 0.3348, "num_input_tokens_seen": 49435488, "step": 51745 }, { "epoch": 4.221388367729831, "grad_norm": 0.6547418832778931, "learning_rate": 3.579185815454566e-05, "loss": 0.3675, "num_input_tokens_seen": 49439856, "step": 51750 }, { "epoch": 4.22179623134024, "grad_norm": 0.8102059364318848, "learning_rate": 3.578864746820335e-05, "loss": 0.3393, "num_input_tokens_seen": 49444976, "step": 51755 }, { "epoch": 4.222204094950649, "grad_norm": 0.527814507484436, "learning_rate": 3.578543656317975e-05, "loss": 0.385, "num_input_tokens_seen": 49450224, "step": 51760 }, { "epoch": 4.222611958561057, "grad_norm": 0.45711830258369446, "learning_rate": 3.5782225439539924e-05, "loss": 0.381, "num_input_tokens_seen": 49455120, "step": 51765 }, { "epoch": 4.223019822171466, "grad_norm": 0.6603305339813232, "learning_rate": 3.577901409734898e-05, "loss": 0.3649, "num_input_tokens_seen": 49460576, "step": 51770 }, { "epoch": 4.223427685781875, "grad_norm": 0.4460791349411011, "learning_rate": 3.5775802536672e-05, "loss": 0.3598, "num_input_tokens_seen": 49464448, "step": 51775 }, { "epoch": 4.223835549392283, "grad_norm": 0.5332357883453369, "learning_rate": 3.5772590757574074e-05, "loss": 0.34, "num_input_tokens_seen": 49469120, "step": 51780 }, { "epoch": 4.224243413002692, "grad_norm": 0.5933045744895935, "learning_rate": 3.576937876012032e-05, "loss": 0.2939, "num_input_tokens_seen": 49474096, "step": 51785 }, { "epoch": 4.2246512766131, "grad_norm": 0.6328836679458618, "learning_rate": 3.576616654437583e-05, "loss": 0.3173, "num_input_tokens_seen": 49478624, "step": 51790 }, { "epoch": 4.225059140223509, "grad_norm": 0.461520791053772, "learning_rate": 3.5762954110405714e-05, "loss": 0.3762, "num_input_tokens_seen": 49484128, "step": 51795 }, { "epoch": 4.225467003833918, "grad_norm": 0.5324951410293579, "learning_rate": 3.5759741458275095e-05, "loss": 0.2878, "num_input_tokens_seen": 49489280, "step": 51800 }, { "epoch": 4.225874867444326, "grad_norm": 1.0925238132476807, "learning_rate": 3.575652858804909e-05, "loss": 0.3683, "num_input_tokens_seen": 49494416, "step": 51805 }, { "epoch": 4.226282731054735, "grad_norm": 0.5753475427627563, "learning_rate": 3.5753315499792824e-05, "loss": 0.3494, "num_input_tokens_seen": 49499008, "step": 51810 }, { "epoch": 4.226690594665144, "grad_norm": 0.4928767681121826, "learning_rate": 3.5750102193571424e-05, "loss": 0.3489, "num_input_tokens_seen": 49503328, "step": 51815 }, { "epoch": 4.227098458275552, "grad_norm": 0.9179812669754028, "learning_rate": 3.5746888669450005e-05, "loss": 0.4048, "num_input_tokens_seen": 49508320, "step": 51820 }, { "epoch": 4.227506321885961, "grad_norm": 0.3211711347103119, "learning_rate": 3.5743674927493735e-05, "loss": 0.3354, "num_input_tokens_seen": 49513120, "step": 51825 }, { "epoch": 4.22791418549637, "grad_norm": 1.0310372114181519, "learning_rate": 3.574046096776773e-05, "loss": 0.3444, "num_input_tokens_seen": 49517984, "step": 51830 }, { "epoch": 4.228322049106779, "grad_norm": 0.8327124118804932, "learning_rate": 3.5737246790337156e-05, "loss": 0.3743, "num_input_tokens_seen": 49522416, "step": 51835 }, { "epoch": 4.228729912717188, "grad_norm": 0.5982949733734131, "learning_rate": 3.573403239526715e-05, "loss": 0.3682, "num_input_tokens_seen": 49526864, "step": 51840 }, { "epoch": 4.229137776327596, "grad_norm": 0.36914828419685364, "learning_rate": 3.573081778262286e-05, "loss": 0.363, "num_input_tokens_seen": 49532368, "step": 51845 }, { "epoch": 4.229545639938005, "grad_norm": 0.7557299733161926, "learning_rate": 3.572760295246946e-05, "loss": 0.3432, "num_input_tokens_seen": 49537472, "step": 51850 }, { "epoch": 4.229953503548414, "grad_norm": 0.6378794312477112, "learning_rate": 3.57243879048721e-05, "loss": 0.3502, "num_input_tokens_seen": 49543088, "step": 51855 }, { "epoch": 4.230361367158822, "grad_norm": 0.36286431550979614, "learning_rate": 3.572117263989596e-05, "loss": 0.3122, "num_input_tokens_seen": 49548736, "step": 51860 }, { "epoch": 4.230769230769231, "grad_norm": 0.3973332345485687, "learning_rate": 3.571795715760621e-05, "loss": 0.3329, "num_input_tokens_seen": 49552912, "step": 51865 }, { "epoch": 4.23117709437964, "grad_norm": 0.43818560242652893, "learning_rate": 3.5714741458068016e-05, "loss": 0.3263, "num_input_tokens_seen": 49557904, "step": 51870 }, { "epoch": 4.231584957990048, "grad_norm": 0.7949991226196289, "learning_rate": 3.571152554134656e-05, "loss": 0.3931, "num_input_tokens_seen": 49562960, "step": 51875 }, { "epoch": 4.231992821600457, "grad_norm": 0.8752870559692383, "learning_rate": 3.5708309407507046e-05, "loss": 0.3365, "num_input_tokens_seen": 49566000, "step": 51880 }, { "epoch": 4.232400685210865, "grad_norm": 0.8797035813331604, "learning_rate": 3.5705093056614644e-05, "loss": 0.3352, "num_input_tokens_seen": 49571152, "step": 51885 }, { "epoch": 4.232808548821274, "grad_norm": 0.6785750389099121, "learning_rate": 3.5701876488734553e-05, "loss": 0.3432, "num_input_tokens_seen": 49575776, "step": 51890 }, { "epoch": 4.233216412431683, "grad_norm": 0.2528481185436249, "learning_rate": 3.569865970393198e-05, "loss": 0.3397, "num_input_tokens_seen": 49581136, "step": 51895 }, { "epoch": 4.233624276042091, "grad_norm": 0.3312116265296936, "learning_rate": 3.5695442702272115e-05, "loss": 0.3593, "num_input_tokens_seen": 49585728, "step": 51900 }, { "epoch": 4.2340321396525, "grad_norm": 0.5244143009185791, "learning_rate": 3.5692225483820177e-05, "loss": 0.3002, "num_input_tokens_seen": 49590384, "step": 51905 }, { "epoch": 4.234440003262909, "grad_norm": 0.49878019094467163, "learning_rate": 3.568900804864136e-05, "loss": 0.3419, "num_input_tokens_seen": 49594992, "step": 51910 }, { "epoch": 4.234847866873317, "grad_norm": 0.893398642539978, "learning_rate": 3.56857903968009e-05, "loss": 0.3591, "num_input_tokens_seen": 49600048, "step": 51915 }, { "epoch": 4.235255730483726, "grad_norm": 1.0921119451522827, "learning_rate": 3.5682572528364014e-05, "loss": 0.3646, "num_input_tokens_seen": 49605680, "step": 51920 }, { "epoch": 4.2356635940941345, "grad_norm": 0.5482010841369629, "learning_rate": 3.567935444339592e-05, "loss": 0.3111, "num_input_tokens_seen": 49609008, "step": 51925 }, { "epoch": 4.2360714577045435, "grad_norm": 0.516741931438446, "learning_rate": 3.5676136141961835e-05, "loss": 0.3012, "num_input_tokens_seen": 49613776, "step": 51930 }, { "epoch": 4.2364793213149525, "grad_norm": 0.4956199526786804, "learning_rate": 3.567291762412702e-05, "loss": 0.412, "num_input_tokens_seen": 49616864, "step": 51935 }, { "epoch": 4.236887184925361, "grad_norm": 0.5328033566474915, "learning_rate": 3.5669698889956704e-05, "loss": 0.3185, "num_input_tokens_seen": 49622016, "step": 51940 }, { "epoch": 4.23729504853577, "grad_norm": 0.5015774369239807, "learning_rate": 3.5666479939516115e-05, "loss": 0.3137, "num_input_tokens_seen": 49627376, "step": 51945 }, { "epoch": 4.237702912146179, "grad_norm": 0.39644280076026917, "learning_rate": 3.5663260772870515e-05, "loss": 0.4019, "num_input_tokens_seen": 49631936, "step": 51950 }, { "epoch": 4.238110775756587, "grad_norm": 0.44187232851982117, "learning_rate": 3.5660041390085153e-05, "loss": 0.3295, "num_input_tokens_seen": 49637360, "step": 51955 }, { "epoch": 4.238518639366996, "grad_norm": 0.6694073677062988, "learning_rate": 3.565682179122528e-05, "loss": 0.3523, "num_input_tokens_seen": 49641760, "step": 51960 }, { "epoch": 4.238926502977405, "grad_norm": 0.8268389105796814, "learning_rate": 3.565360197635615e-05, "loss": 0.3471, "num_input_tokens_seen": 49646144, "step": 51965 }, { "epoch": 4.239334366587813, "grad_norm": 0.7801890969276428, "learning_rate": 3.565038194554304e-05, "loss": 0.335, "num_input_tokens_seen": 49649936, "step": 51970 }, { "epoch": 4.239742230198222, "grad_norm": 0.981783926486969, "learning_rate": 3.5647161698851214e-05, "loss": 0.346, "num_input_tokens_seen": 49655072, "step": 51975 }, { "epoch": 4.24015009380863, "grad_norm": 0.9951088428497314, "learning_rate": 3.564394123634595e-05, "loss": 0.343, "num_input_tokens_seen": 49659120, "step": 51980 }, { "epoch": 4.240557957419039, "grad_norm": 0.5694193243980408, "learning_rate": 3.564072055809251e-05, "loss": 0.3293, "num_input_tokens_seen": 49663936, "step": 51985 }, { "epoch": 4.240965821029448, "grad_norm": 0.7360208034515381, "learning_rate": 3.56374996641562e-05, "loss": 0.3019, "num_input_tokens_seen": 49669008, "step": 51990 }, { "epoch": 4.241373684639856, "grad_norm": 0.6203144192695618, "learning_rate": 3.563427855460229e-05, "loss": 0.3316, "num_input_tokens_seen": 49673360, "step": 51995 }, { "epoch": 4.241781548250265, "grad_norm": 0.6488263607025146, "learning_rate": 3.5631057229496056e-05, "loss": 0.3487, "num_input_tokens_seen": 49677888, "step": 52000 }, { "epoch": 4.242189411860674, "grad_norm": 0.7211293578147888, "learning_rate": 3.562783568890282e-05, "loss": 0.336, "num_input_tokens_seen": 49682272, "step": 52005 }, { "epoch": 4.242597275471082, "grad_norm": 0.35858213901519775, "learning_rate": 3.562461393288787e-05, "loss": 0.3392, "num_input_tokens_seen": 49687168, "step": 52010 }, { "epoch": 4.243005139081491, "grad_norm": 0.6223121285438538, "learning_rate": 3.562139196151651e-05, "loss": 0.3333, "num_input_tokens_seen": 49692688, "step": 52015 }, { "epoch": 4.2434130026918995, "grad_norm": 0.7022059559822083, "learning_rate": 3.5618169774854043e-05, "loss": 0.3572, "num_input_tokens_seen": 49697696, "step": 52020 }, { "epoch": 4.2438208663023085, "grad_norm": 0.9568516612052917, "learning_rate": 3.56149473729658e-05, "loss": 0.3621, "num_input_tokens_seen": 49702800, "step": 52025 }, { "epoch": 4.2442287299127175, "grad_norm": 0.5545644760131836, "learning_rate": 3.5611724755917067e-05, "loss": 0.3341, "num_input_tokens_seen": 49707440, "step": 52030 }, { "epoch": 4.244636593523126, "grad_norm": 0.7229521870613098, "learning_rate": 3.560850192377319e-05, "loss": 0.314, "num_input_tokens_seen": 49712384, "step": 52035 }, { "epoch": 4.245044457133535, "grad_norm": 0.7234739661216736, "learning_rate": 3.560527887659949e-05, "loss": 0.3451, "num_input_tokens_seen": 49716736, "step": 52040 }, { "epoch": 4.245452320743944, "grad_norm": 0.8960627317428589, "learning_rate": 3.5602055614461294e-05, "loss": 0.3243, "num_input_tokens_seen": 49721856, "step": 52045 }, { "epoch": 4.245860184354352, "grad_norm": 0.6194531917572021, "learning_rate": 3.559883213742393e-05, "loss": 0.3219, "num_input_tokens_seen": 49726192, "step": 52050 }, { "epoch": 4.246268047964761, "grad_norm": 0.6373556852340698, "learning_rate": 3.559560844555275e-05, "loss": 0.3231, "num_input_tokens_seen": 49731120, "step": 52055 }, { "epoch": 4.246675911575169, "grad_norm": 1.051021933555603, "learning_rate": 3.559238453891309e-05, "loss": 0.4868, "num_input_tokens_seen": 49735232, "step": 52060 }, { "epoch": 4.247083775185578, "grad_norm": 0.44803768396377563, "learning_rate": 3.558916041757029e-05, "loss": 0.2772, "num_input_tokens_seen": 49739904, "step": 52065 }, { "epoch": 4.247491638795987, "grad_norm": 0.7921231389045715, "learning_rate": 3.558593608158971e-05, "loss": 0.3964, "num_input_tokens_seen": 49745232, "step": 52070 }, { "epoch": 4.247899502406395, "grad_norm": 1.0621033906936646, "learning_rate": 3.558271153103669e-05, "loss": 0.3301, "num_input_tokens_seen": 49750208, "step": 52075 }, { "epoch": 4.248307366016804, "grad_norm": 0.6174147129058838, "learning_rate": 3.5579486765976626e-05, "loss": 0.3081, "num_input_tokens_seen": 49754336, "step": 52080 }, { "epoch": 4.248715229627213, "grad_norm": 0.7429507970809937, "learning_rate": 3.557626178647485e-05, "loss": 0.3394, "num_input_tokens_seen": 49759504, "step": 52085 }, { "epoch": 4.249123093237621, "grad_norm": 0.6753788590431213, "learning_rate": 3.5573036592596744e-05, "loss": 0.3589, "num_input_tokens_seen": 49765088, "step": 52090 }, { "epoch": 4.24953095684803, "grad_norm": 1.0018625259399414, "learning_rate": 3.556981118440768e-05, "loss": 0.3542, "num_input_tokens_seen": 49769984, "step": 52095 }, { "epoch": 4.249938820458438, "grad_norm": 0.6141242384910583, "learning_rate": 3.556658556197303e-05, "loss": 0.2964, "num_input_tokens_seen": 49774800, "step": 52100 }, { "epoch": 4.250346684068847, "grad_norm": 0.5479617118835449, "learning_rate": 3.556335972535819e-05, "loss": 0.3644, "num_input_tokens_seen": 49780048, "step": 52105 }, { "epoch": 4.250754547679256, "grad_norm": 0.5607684850692749, "learning_rate": 3.556013367462853e-05, "loss": 0.4058, "num_input_tokens_seen": 49784592, "step": 52110 }, { "epoch": 4.251162411289664, "grad_norm": 0.7473827600479126, "learning_rate": 3.555690740984945e-05, "loss": 0.3459, "num_input_tokens_seen": 49789392, "step": 52115 }, { "epoch": 4.251570274900073, "grad_norm": 1.241411805152893, "learning_rate": 3.555368093108634e-05, "loss": 0.3335, "num_input_tokens_seen": 49794624, "step": 52120 }, { "epoch": 4.2519781385104825, "grad_norm": 0.7414066195487976, "learning_rate": 3.5550454238404605e-05, "loss": 0.3253, "num_input_tokens_seen": 49800320, "step": 52125 }, { "epoch": 4.252386002120891, "grad_norm": 1.1651782989501953, "learning_rate": 3.554722733186964e-05, "loss": 0.3555, "num_input_tokens_seen": 49805552, "step": 52130 }, { "epoch": 4.2527938657313, "grad_norm": 0.9512754678726196, "learning_rate": 3.554400021154687e-05, "loss": 0.4095, "num_input_tokens_seen": 49810288, "step": 52135 }, { "epoch": 4.253201729341708, "grad_norm": 0.9463072419166565, "learning_rate": 3.5540772877501685e-05, "loss": 0.3704, "num_input_tokens_seen": 49815328, "step": 52140 }, { "epoch": 4.253609592952117, "grad_norm": 1.0179094076156616, "learning_rate": 3.5537545329799515e-05, "loss": 0.3609, "num_input_tokens_seen": 49820160, "step": 52145 }, { "epoch": 4.254017456562526, "grad_norm": 0.5357614755630493, "learning_rate": 3.553431756850579e-05, "loss": 0.3201, "num_input_tokens_seen": 49824832, "step": 52150 }, { "epoch": 4.254425320172934, "grad_norm": 1.268898367881775, "learning_rate": 3.553108959368591e-05, "loss": 0.3841, "num_input_tokens_seen": 49829312, "step": 52155 }, { "epoch": 4.254833183783343, "grad_norm": 0.6695134043693542, "learning_rate": 3.552786140540532e-05, "loss": 0.3111, "num_input_tokens_seen": 49834080, "step": 52160 }, { "epoch": 4.255241047393752, "grad_norm": 0.6754332780838013, "learning_rate": 3.552463300372946e-05, "loss": 0.4764, "num_input_tokens_seen": 49839168, "step": 52165 }, { "epoch": 4.25564891100416, "grad_norm": 0.8511844873428345, "learning_rate": 3.5521404388723754e-05, "loss": 0.3134, "num_input_tokens_seen": 49843760, "step": 52170 }, { "epoch": 4.256056774614569, "grad_norm": 0.684181272983551, "learning_rate": 3.5518175560453654e-05, "loss": 0.3462, "num_input_tokens_seen": 49848784, "step": 52175 }, { "epoch": 4.256464638224978, "grad_norm": 0.5112839341163635, "learning_rate": 3.551494651898461e-05, "loss": 0.3445, "num_input_tokens_seen": 49853072, "step": 52180 }, { "epoch": 4.256872501835386, "grad_norm": 0.4385923445224762, "learning_rate": 3.5511717264382063e-05, "loss": 0.3375, "num_input_tokens_seen": 49856992, "step": 52185 }, { "epoch": 4.257280365445795, "grad_norm": 0.609933078289032, "learning_rate": 3.5508487796711484e-05, "loss": 0.3129, "num_input_tokens_seen": 49862336, "step": 52190 }, { "epoch": 4.257688229056203, "grad_norm": 0.7290842533111572, "learning_rate": 3.5505258116038315e-05, "loss": 0.3191, "num_input_tokens_seen": 49867392, "step": 52195 }, { "epoch": 4.258096092666612, "grad_norm": 0.47846511006355286, "learning_rate": 3.550202822242802e-05, "loss": 0.3341, "num_input_tokens_seen": 49872784, "step": 52200 }, { "epoch": 4.258503956277021, "grad_norm": 0.8100652098655701, "learning_rate": 3.5498798115946086e-05, "loss": 0.4007, "num_input_tokens_seen": 49877376, "step": 52205 }, { "epoch": 4.258911819887429, "grad_norm": 0.8465543389320374, "learning_rate": 3.549556779665797e-05, "loss": 0.334, "num_input_tokens_seen": 49881440, "step": 52210 }, { "epoch": 4.259319683497838, "grad_norm": 0.4892284870147705, "learning_rate": 3.549233726462916e-05, "loss": 0.3446, "num_input_tokens_seen": 49886464, "step": 52215 }, { "epoch": 4.259727547108247, "grad_norm": 0.8255361914634705, "learning_rate": 3.548910651992512e-05, "loss": 0.3316, "num_input_tokens_seen": 49891744, "step": 52220 }, { "epoch": 4.2601354107186555, "grad_norm": 0.7358638048171997, "learning_rate": 3.548587556261136e-05, "loss": 0.3131, "num_input_tokens_seen": 49895936, "step": 52225 }, { "epoch": 4.2605432743290645, "grad_norm": 0.8732330799102783, "learning_rate": 3.5482644392753353e-05, "loss": 0.3238, "num_input_tokens_seen": 49900144, "step": 52230 }, { "epoch": 4.260951137939473, "grad_norm": 0.7036721110343933, "learning_rate": 3.547941301041661e-05, "loss": 0.304, "num_input_tokens_seen": 49904896, "step": 52235 }, { "epoch": 4.261359001549882, "grad_norm": 0.9621337056159973, "learning_rate": 3.5476181415666605e-05, "loss": 0.3532, "num_input_tokens_seen": 49909264, "step": 52240 }, { "epoch": 4.261766865160291, "grad_norm": 1.0325148105621338, "learning_rate": 3.547294960856886e-05, "loss": 0.3341, "num_input_tokens_seen": 49913344, "step": 52245 }, { "epoch": 4.262174728770699, "grad_norm": 0.4916518032550812, "learning_rate": 3.546971758918887e-05, "loss": 0.3792, "num_input_tokens_seen": 49917408, "step": 52250 }, { "epoch": 4.262582592381108, "grad_norm": 0.4638759195804596, "learning_rate": 3.546648535759216e-05, "loss": 0.3469, "num_input_tokens_seen": 49921760, "step": 52255 }, { "epoch": 4.262990455991517, "grad_norm": 0.44038352370262146, "learning_rate": 3.5463252913844235e-05, "loss": 0.3324, "num_input_tokens_seen": 49925952, "step": 52260 }, { "epoch": 4.263398319601925, "grad_norm": 0.6173897981643677, "learning_rate": 3.5460020258010624e-05, "loss": 0.3388, "num_input_tokens_seen": 49930944, "step": 52265 }, { "epoch": 4.263806183212334, "grad_norm": 1.0224153995513916, "learning_rate": 3.545678739015685e-05, "loss": 0.338, "num_input_tokens_seen": 49935568, "step": 52270 }, { "epoch": 4.264214046822742, "grad_norm": 0.5467246174812317, "learning_rate": 3.545355431034842e-05, "loss": 0.3334, "num_input_tokens_seen": 49940096, "step": 52275 }, { "epoch": 4.264621910433151, "grad_norm": 0.9830416440963745, "learning_rate": 3.545032101865091e-05, "loss": 0.4173, "num_input_tokens_seen": 49945008, "step": 52280 }, { "epoch": 4.26502977404356, "grad_norm": 0.8841496109962463, "learning_rate": 3.5447087515129815e-05, "loss": 0.3793, "num_input_tokens_seen": 49948544, "step": 52285 }, { "epoch": 4.265437637653968, "grad_norm": 0.5541791319847107, "learning_rate": 3.544385379985071e-05, "loss": 0.3365, "num_input_tokens_seen": 49953520, "step": 52290 }, { "epoch": 4.265845501264377, "grad_norm": 0.22910356521606445, "learning_rate": 3.544061987287912e-05, "loss": 0.3005, "num_input_tokens_seen": 49958096, "step": 52295 }, { "epoch": 4.266253364874786, "grad_norm": 0.5104798078536987, "learning_rate": 3.54373857342806e-05, "loss": 0.3741, "num_input_tokens_seen": 49962336, "step": 52300 }, { "epoch": 4.266661228485194, "grad_norm": 0.37308505177497864, "learning_rate": 3.543415138412071e-05, "loss": 0.3622, "num_input_tokens_seen": 49968032, "step": 52305 }, { "epoch": 4.267069092095603, "grad_norm": 0.8385600447654724, "learning_rate": 3.5430916822465e-05, "loss": 0.358, "num_input_tokens_seen": 49972592, "step": 52310 }, { "epoch": 4.267476955706012, "grad_norm": 0.20277194678783417, "learning_rate": 3.542768204937904e-05, "loss": 0.3551, "num_input_tokens_seen": 49976576, "step": 52315 }, { "epoch": 4.2678848193164205, "grad_norm": 0.4512907862663269, "learning_rate": 3.5424447064928404e-05, "loss": 0.3526, "num_input_tokens_seen": 49982000, "step": 52320 }, { "epoch": 4.2682926829268295, "grad_norm": 0.6132741570472717, "learning_rate": 3.5421211869178646e-05, "loss": 0.3677, "num_input_tokens_seen": 49987120, "step": 52325 }, { "epoch": 4.268700546537238, "grad_norm": 0.5969265103340149, "learning_rate": 3.541797646219536e-05, "loss": 0.3628, "num_input_tokens_seen": 49992992, "step": 52330 }, { "epoch": 4.269108410147647, "grad_norm": 0.3527475893497467, "learning_rate": 3.5414740844044115e-05, "loss": 0.2972, "num_input_tokens_seen": 49997968, "step": 52335 }, { "epoch": 4.269516273758056, "grad_norm": 0.3952694833278656, "learning_rate": 3.5411505014790495e-05, "loss": 0.3693, "num_input_tokens_seen": 50002848, "step": 52340 }, { "epoch": 4.269924137368464, "grad_norm": 1.065412163734436, "learning_rate": 3.540826897450009e-05, "loss": 0.3557, "num_input_tokens_seen": 50007456, "step": 52345 }, { "epoch": 4.270332000978873, "grad_norm": 0.6848130822181702, "learning_rate": 3.5405032723238504e-05, "loss": 0.3167, "num_input_tokens_seen": 50011376, "step": 52350 }, { "epoch": 4.270739864589281, "grad_norm": 0.6368657946586609, "learning_rate": 3.5401796261071315e-05, "loss": 0.3563, "num_input_tokens_seen": 50015824, "step": 52355 }, { "epoch": 4.27114772819969, "grad_norm": 0.4338805079460144, "learning_rate": 3.539855958806415e-05, "loss": 0.3383, "num_input_tokens_seen": 50020416, "step": 52360 }, { "epoch": 4.271555591810099, "grad_norm": 0.40773943066596985, "learning_rate": 3.539532270428258e-05, "loss": 0.326, "num_input_tokens_seen": 50025552, "step": 52365 }, { "epoch": 4.271963455420507, "grad_norm": 0.6771367788314819, "learning_rate": 3.539208560979226e-05, "loss": 0.3477, "num_input_tokens_seen": 50030832, "step": 52370 }, { "epoch": 4.272371319030916, "grad_norm": 0.4179505407810211, "learning_rate": 3.5388848304658764e-05, "loss": 0.3353, "num_input_tokens_seen": 50035440, "step": 52375 }, { "epoch": 4.272779182641325, "grad_norm": 0.834835946559906, "learning_rate": 3.5385610788947734e-05, "loss": 0.3331, "num_input_tokens_seen": 50039824, "step": 52380 }, { "epoch": 4.273187046251733, "grad_norm": 0.3431541919708252, "learning_rate": 3.538237306272478e-05, "loss": 0.3733, "num_input_tokens_seen": 50044944, "step": 52385 }, { "epoch": 4.273594909862142, "grad_norm": 0.4978522062301636, "learning_rate": 3.537913512605555e-05, "loss": 0.3625, "num_input_tokens_seen": 50050352, "step": 52390 }, { "epoch": 4.274002773472551, "grad_norm": 0.7405841946601868, "learning_rate": 3.537589697900565e-05, "loss": 0.3565, "num_input_tokens_seen": 50054832, "step": 52395 }, { "epoch": 4.274410637082959, "grad_norm": 0.3685922622680664, "learning_rate": 3.537265862164073e-05, "loss": 0.3543, "num_input_tokens_seen": 50059456, "step": 52400 }, { "epoch": 4.274818500693368, "grad_norm": 0.5537686347961426, "learning_rate": 3.536942005402643e-05, "loss": 0.3695, "num_input_tokens_seen": 50063792, "step": 52405 }, { "epoch": 4.2752263643037764, "grad_norm": 0.6757731437683105, "learning_rate": 3.536618127622838e-05, "loss": 0.3371, "num_input_tokens_seen": 50067936, "step": 52410 }, { "epoch": 4.2756342279141855, "grad_norm": 0.5183483958244324, "learning_rate": 3.5362942288312256e-05, "loss": 0.3501, "num_input_tokens_seen": 50072336, "step": 52415 }, { "epoch": 4.2760420915245945, "grad_norm": 0.669437825679779, "learning_rate": 3.535970309034369e-05, "loss": 0.3341, "num_input_tokens_seen": 50077952, "step": 52420 }, { "epoch": 4.276449955135003, "grad_norm": 0.3636379837989807, "learning_rate": 3.535646368238834e-05, "loss": 0.3307, "num_input_tokens_seen": 50082432, "step": 52425 }, { "epoch": 4.276857818745412, "grad_norm": 2.1149215698242188, "learning_rate": 3.535322406451187e-05, "loss": 0.327, "num_input_tokens_seen": 50087248, "step": 52430 }, { "epoch": 4.277265682355821, "grad_norm": 0.5348941683769226, "learning_rate": 3.534998423677995e-05, "loss": 0.351, "num_input_tokens_seen": 50091728, "step": 52435 }, { "epoch": 4.277673545966229, "grad_norm": 0.22762654721736908, "learning_rate": 3.534674419925826e-05, "loss": 0.3494, "num_input_tokens_seen": 50097008, "step": 52440 }, { "epoch": 4.278081409576638, "grad_norm": 0.5544464588165283, "learning_rate": 3.534350395201245e-05, "loss": 0.3388, "num_input_tokens_seen": 50101200, "step": 52445 }, { "epoch": 4.278489273187046, "grad_norm": 0.47903358936309814, "learning_rate": 3.534026349510821e-05, "loss": 0.3355, "num_input_tokens_seen": 50105568, "step": 52450 }, { "epoch": 4.278897136797455, "grad_norm": 0.7014790773391724, "learning_rate": 3.533702282861122e-05, "loss": 0.3538, "num_input_tokens_seen": 50110000, "step": 52455 }, { "epoch": 4.279305000407864, "grad_norm": 0.5906722545623779, "learning_rate": 3.5333781952587176e-05, "loss": 0.2969, "num_input_tokens_seen": 50115200, "step": 52460 }, { "epoch": 4.279712864018272, "grad_norm": 0.8605539798736572, "learning_rate": 3.5330540867101766e-05, "loss": 0.4104, "num_input_tokens_seen": 50119600, "step": 52465 }, { "epoch": 4.280120727628681, "grad_norm": 0.6810795068740845, "learning_rate": 3.532729957222068e-05, "loss": 0.3377, "num_input_tokens_seen": 50124656, "step": 52470 }, { "epoch": 4.28052859123909, "grad_norm": 0.6564911603927612, "learning_rate": 3.532405806800961e-05, "loss": 0.3576, "num_input_tokens_seen": 50129776, "step": 52475 }, { "epoch": 4.280936454849498, "grad_norm": 0.6109479069709778, "learning_rate": 3.532081635453428e-05, "loss": 0.3219, "num_input_tokens_seen": 50134496, "step": 52480 }, { "epoch": 4.281344318459907, "grad_norm": 0.6268386244773865, "learning_rate": 3.531757443186038e-05, "loss": 0.3858, "num_input_tokens_seen": 50139728, "step": 52485 }, { "epoch": 4.281752182070315, "grad_norm": 0.40573418140411377, "learning_rate": 3.5314332300053645e-05, "loss": 0.3324, "num_input_tokens_seen": 50144768, "step": 52490 }, { "epoch": 4.282160045680724, "grad_norm": 1.0474356412887573, "learning_rate": 3.531108995917977e-05, "loss": 0.3501, "num_input_tokens_seen": 50149552, "step": 52495 }, { "epoch": 4.282567909291133, "grad_norm": 0.6348980665206909, "learning_rate": 3.530784740930447e-05, "loss": 0.3461, "num_input_tokens_seen": 50154272, "step": 52500 }, { "epoch": 4.282975772901541, "grad_norm": 1.0083720684051514, "learning_rate": 3.53046046504935e-05, "loss": 0.3494, "num_input_tokens_seen": 50158624, "step": 52505 }, { "epoch": 4.28338363651195, "grad_norm": 0.5803816318511963, "learning_rate": 3.5301361682812565e-05, "loss": 0.3405, "num_input_tokens_seen": 50164320, "step": 52510 }, { "epoch": 4.283791500122359, "grad_norm": 0.2692616879940033, "learning_rate": 3.5298118506327405e-05, "loss": 0.3301, "num_input_tokens_seen": 50169232, "step": 52515 }, { "epoch": 4.2841993637327676, "grad_norm": 0.6103282570838928, "learning_rate": 3.529487512110376e-05, "loss": 0.369, "num_input_tokens_seen": 50174064, "step": 52520 }, { "epoch": 4.284607227343177, "grad_norm": 1.1929514408111572, "learning_rate": 3.529163152720738e-05, "loss": 0.3663, "num_input_tokens_seen": 50178208, "step": 52525 }, { "epoch": 4.285015090953586, "grad_norm": 0.6020172238349915, "learning_rate": 3.528838772470399e-05, "loss": 0.3613, "num_input_tokens_seen": 50183232, "step": 52530 }, { "epoch": 4.285422954563994, "grad_norm": 0.5470007061958313, "learning_rate": 3.5285143713659366e-05, "loss": 0.3445, "num_input_tokens_seen": 50188320, "step": 52535 }, { "epoch": 4.285830818174403, "grad_norm": 0.8833596110343933, "learning_rate": 3.528189949413924e-05, "loss": 0.3509, "num_input_tokens_seen": 50193104, "step": 52540 }, { "epoch": 4.286238681784811, "grad_norm": 0.5758373141288757, "learning_rate": 3.5278655066209376e-05, "loss": 0.3592, "num_input_tokens_seen": 50197248, "step": 52545 }, { "epoch": 4.28664654539522, "grad_norm": 0.8617509603500366, "learning_rate": 3.5275410429935545e-05, "loss": 0.3592, "num_input_tokens_seen": 50202064, "step": 52550 }, { "epoch": 4.287054409005629, "grad_norm": 0.9720206260681152, "learning_rate": 3.527216558538351e-05, "loss": 0.3441, "num_input_tokens_seen": 50206528, "step": 52555 }, { "epoch": 4.287462272616037, "grad_norm": 0.6598264575004578, "learning_rate": 3.5268920532619045e-05, "loss": 0.3473, "num_input_tokens_seen": 50212128, "step": 52560 }, { "epoch": 4.287870136226446, "grad_norm": 0.8072025775909424, "learning_rate": 3.5265675271707924e-05, "loss": 0.3799, "num_input_tokens_seen": 50216672, "step": 52565 }, { "epoch": 4.288277999836854, "grad_norm": 0.9248318672180176, "learning_rate": 3.526242980271593e-05, "loss": 0.3369, "num_input_tokens_seen": 50221392, "step": 52570 }, { "epoch": 4.288685863447263, "grad_norm": 0.6487011909484863, "learning_rate": 3.5259184125708836e-05, "loss": 0.3611, "num_input_tokens_seen": 50226512, "step": 52575 }, { "epoch": 4.289093727057672, "grad_norm": 0.7712125182151794, "learning_rate": 3.525593824075245e-05, "loss": 0.3375, "num_input_tokens_seen": 50230752, "step": 52580 }, { "epoch": 4.28950159066808, "grad_norm": 1.05764639377594, "learning_rate": 3.5252692147912546e-05, "loss": 0.3644, "num_input_tokens_seen": 50235488, "step": 52585 }, { "epoch": 4.289909454278489, "grad_norm": 0.5448245406150818, "learning_rate": 3.5249445847254936e-05, "loss": 0.3369, "num_input_tokens_seen": 50239200, "step": 52590 }, { "epoch": 4.290317317888898, "grad_norm": 0.4380970597267151, "learning_rate": 3.524619933884541e-05, "loss": 0.3458, "num_input_tokens_seen": 50244192, "step": 52595 }, { "epoch": 4.290725181499306, "grad_norm": 0.432372123003006, "learning_rate": 3.524295262274978e-05, "loss": 0.3644, "num_input_tokens_seen": 50248720, "step": 52600 }, { "epoch": 4.291133045109715, "grad_norm": 0.5658811926841736, "learning_rate": 3.523970569903385e-05, "loss": 0.3425, "num_input_tokens_seen": 50253920, "step": 52605 }, { "epoch": 4.291540908720124, "grad_norm": 0.573373556137085, "learning_rate": 3.523645856776344e-05, "loss": 0.3335, "num_input_tokens_seen": 50259232, "step": 52610 }, { "epoch": 4.2919487723305325, "grad_norm": 0.4652363955974579, "learning_rate": 3.523321122900436e-05, "loss": 0.3377, "num_input_tokens_seen": 50263856, "step": 52615 }, { "epoch": 4.2923566359409415, "grad_norm": 1.0839329957962036, "learning_rate": 3.5229963682822436e-05, "loss": 0.3513, "num_input_tokens_seen": 50268752, "step": 52620 }, { "epoch": 4.29276449955135, "grad_norm": 0.8751831650733948, "learning_rate": 3.5226715929283506e-05, "loss": 0.3549, "num_input_tokens_seen": 50273312, "step": 52625 }, { "epoch": 4.293172363161759, "grad_norm": 0.8050699830055237, "learning_rate": 3.5223467968453374e-05, "loss": 0.3653, "num_input_tokens_seen": 50278368, "step": 52630 }, { "epoch": 4.293580226772168, "grad_norm": 1.0248725414276123, "learning_rate": 3.52202198003979e-05, "loss": 0.3323, "num_input_tokens_seen": 50283056, "step": 52635 }, { "epoch": 4.293988090382576, "grad_norm": 0.32466310262680054, "learning_rate": 3.521697142518291e-05, "loss": 0.3339, "num_input_tokens_seen": 50287552, "step": 52640 }, { "epoch": 4.294395953992985, "grad_norm": 0.7663176655769348, "learning_rate": 3.521372284287425e-05, "loss": 0.3349, "num_input_tokens_seen": 50292368, "step": 52645 }, { "epoch": 4.294803817603394, "grad_norm": 0.5970181226730347, "learning_rate": 3.5210474053537774e-05, "loss": 0.3696, "num_input_tokens_seen": 50296880, "step": 52650 }, { "epoch": 4.295211681213802, "grad_norm": 1.0586999654769897, "learning_rate": 3.520722505723932e-05, "loss": 0.3421, "num_input_tokens_seen": 50301488, "step": 52655 }, { "epoch": 4.295619544824211, "grad_norm": 0.6653204560279846, "learning_rate": 3.5203975854044756e-05, "loss": 0.4033, "num_input_tokens_seen": 50306144, "step": 52660 }, { "epoch": 4.296027408434619, "grad_norm": 0.6140251755714417, "learning_rate": 3.520072644401993e-05, "loss": 0.3193, "num_input_tokens_seen": 50310272, "step": 52665 }, { "epoch": 4.296435272045028, "grad_norm": 0.4355098009109497, "learning_rate": 3.5197476827230724e-05, "loss": 0.3483, "num_input_tokens_seen": 50315312, "step": 52670 }, { "epoch": 4.296843135655437, "grad_norm": 0.4385145604610443, "learning_rate": 3.5194227003742994e-05, "loss": 0.3574, "num_input_tokens_seen": 50319456, "step": 52675 }, { "epoch": 4.297250999265845, "grad_norm": 0.48819127678871155, "learning_rate": 3.5190976973622614e-05, "loss": 0.357, "num_input_tokens_seen": 50324672, "step": 52680 }, { "epoch": 4.297658862876254, "grad_norm": 0.6243384480476379, "learning_rate": 3.5187726736935455e-05, "loss": 0.3252, "num_input_tokens_seen": 50329792, "step": 52685 }, { "epoch": 4.298066726486663, "grad_norm": 0.7455590963363647, "learning_rate": 3.518447629374742e-05, "loss": 0.3611, "num_input_tokens_seen": 50334368, "step": 52690 }, { "epoch": 4.298474590097071, "grad_norm": 0.44903483986854553, "learning_rate": 3.518122564412436e-05, "loss": 0.3636, "num_input_tokens_seen": 50339424, "step": 52695 }, { "epoch": 4.29888245370748, "grad_norm": 0.4187447726726532, "learning_rate": 3.517797478813219e-05, "loss": 0.3272, "num_input_tokens_seen": 50343760, "step": 52700 }, { "epoch": 4.2992903173178885, "grad_norm": 0.5177248120307922, "learning_rate": 3.517472372583679e-05, "loss": 0.3587, "num_input_tokens_seen": 50347984, "step": 52705 }, { "epoch": 4.2996981809282975, "grad_norm": 0.5606897473335266, "learning_rate": 3.517147245730407e-05, "loss": 0.3832, "num_input_tokens_seen": 50352448, "step": 52710 }, { "epoch": 4.3001060445387065, "grad_norm": 0.42443540692329407, "learning_rate": 3.516822098259993e-05, "loss": 0.3131, "num_input_tokens_seen": 50357472, "step": 52715 }, { "epoch": 4.300513908149115, "grad_norm": 0.2586863040924072, "learning_rate": 3.5164969301790265e-05, "loss": 0.3602, "num_input_tokens_seen": 50363088, "step": 52720 }, { "epoch": 4.300921771759524, "grad_norm": 0.2748432457447052, "learning_rate": 3.516171741494099e-05, "loss": 0.3607, "num_input_tokens_seen": 50367472, "step": 52725 }, { "epoch": 4.301329635369933, "grad_norm": 0.702225387096405, "learning_rate": 3.515846532211802e-05, "loss": 0.3367, "num_input_tokens_seen": 50371824, "step": 52730 }, { "epoch": 4.301737498980341, "grad_norm": 0.7971734404563904, "learning_rate": 3.515521302338728e-05, "loss": 0.364, "num_input_tokens_seen": 50376544, "step": 52735 }, { "epoch": 4.30214536259075, "grad_norm": 0.4515892565250397, "learning_rate": 3.515196051881469e-05, "loss": 0.3783, "num_input_tokens_seen": 50381264, "step": 52740 }, { "epoch": 4.302553226201159, "grad_norm": 0.4002084732055664, "learning_rate": 3.5148707808466165e-05, "loss": 0.3395, "num_input_tokens_seen": 50385776, "step": 52745 }, { "epoch": 4.302961089811567, "grad_norm": 0.34216073155403137, "learning_rate": 3.5145454892407654e-05, "loss": 0.3518, "num_input_tokens_seen": 50391648, "step": 52750 }, { "epoch": 4.303368953421976, "grad_norm": 1.0469670295715332, "learning_rate": 3.5142201770705073e-05, "loss": 0.3362, "num_input_tokens_seen": 50397168, "step": 52755 }, { "epoch": 4.303776817032384, "grad_norm": 0.7392762899398804, "learning_rate": 3.513894844342438e-05, "loss": 0.3325, "num_input_tokens_seen": 50401536, "step": 52760 }, { "epoch": 4.304184680642793, "grad_norm": 0.6511965990066528, "learning_rate": 3.51356949106315e-05, "loss": 0.3583, "num_input_tokens_seen": 50406016, "step": 52765 }, { "epoch": 4.304592544253202, "grad_norm": 0.4420973062515259, "learning_rate": 3.51324411723924e-05, "loss": 0.3558, "num_input_tokens_seen": 50411424, "step": 52770 }, { "epoch": 4.30500040786361, "grad_norm": 0.7613050937652588, "learning_rate": 3.512918722877302e-05, "loss": 0.3737, "num_input_tokens_seen": 50415152, "step": 52775 }, { "epoch": 4.305408271474019, "grad_norm": 0.6056230664253235, "learning_rate": 3.5125933079839325e-05, "loss": 0.3437, "num_input_tokens_seen": 50419152, "step": 52780 }, { "epoch": 4.305816135084428, "grad_norm": 0.9131790399551392, "learning_rate": 3.512267872565727e-05, "loss": 0.3592, "num_input_tokens_seen": 50424160, "step": 52785 }, { "epoch": 4.306223998694836, "grad_norm": 0.3849387466907501, "learning_rate": 3.511942416629281e-05, "loss": 0.3351, "num_input_tokens_seen": 50428832, "step": 52790 }, { "epoch": 4.306631862305245, "grad_norm": 0.8923152089118958, "learning_rate": 3.511616940181193e-05, "loss": 0.4141, "num_input_tokens_seen": 50434192, "step": 52795 }, { "epoch": 4.307039725915653, "grad_norm": 0.7822198271751404, "learning_rate": 3.511291443228059e-05, "loss": 0.3748, "num_input_tokens_seen": 50438944, "step": 52800 }, { "epoch": 4.307447589526062, "grad_norm": 0.5053207874298096, "learning_rate": 3.5109659257764765e-05, "loss": 0.3019, "num_input_tokens_seen": 50443840, "step": 52805 }, { "epoch": 4.3078554531364714, "grad_norm": 0.2233998328447342, "learning_rate": 3.510640387833045e-05, "loss": 0.3259, "num_input_tokens_seen": 50449088, "step": 52810 }, { "epoch": 4.30826331674688, "grad_norm": 0.519612729549408, "learning_rate": 3.510314829404363e-05, "loss": 0.3049, "num_input_tokens_seen": 50453664, "step": 52815 }, { "epoch": 4.308671180357289, "grad_norm": 0.5677968263626099, "learning_rate": 3.5099892504970276e-05, "loss": 0.289, "num_input_tokens_seen": 50458256, "step": 52820 }, { "epoch": 4.309079043967698, "grad_norm": 1.3964178562164307, "learning_rate": 3.50966365111764e-05, "loss": 0.326, "num_input_tokens_seen": 50462576, "step": 52825 }, { "epoch": 4.309486907578106, "grad_norm": 0.5832451581954956, "learning_rate": 3.509338031272798e-05, "loss": 0.3228, "num_input_tokens_seen": 50467392, "step": 52830 }, { "epoch": 4.309894771188515, "grad_norm": 0.5508865118026733, "learning_rate": 3.5090123909691045e-05, "loss": 0.3856, "num_input_tokens_seen": 50471600, "step": 52835 }, { "epoch": 4.310302634798923, "grad_norm": 1.0509823560714722, "learning_rate": 3.508686730213158e-05, "loss": 0.4206, "num_input_tokens_seen": 50476368, "step": 52840 }, { "epoch": 4.310710498409332, "grad_norm": 1.085156798362732, "learning_rate": 3.5083610490115596e-05, "loss": 0.4578, "num_input_tokens_seen": 50480560, "step": 52845 }, { "epoch": 4.311118362019741, "grad_norm": 0.5805385112762451, "learning_rate": 3.508035347370912e-05, "loss": 0.2883, "num_input_tokens_seen": 50485952, "step": 52850 }, { "epoch": 4.311526225630149, "grad_norm": 0.9504157900810242, "learning_rate": 3.507709625297816e-05, "loss": 0.3502, "num_input_tokens_seen": 50490336, "step": 52855 }, { "epoch": 4.311934089240558, "grad_norm": 0.7155392169952393, "learning_rate": 3.507383882798874e-05, "loss": 0.3253, "num_input_tokens_seen": 50495232, "step": 52860 }, { "epoch": 4.312341952850967, "grad_norm": 0.48459959030151367, "learning_rate": 3.5070581198806883e-05, "loss": 0.3558, "num_input_tokens_seen": 50500128, "step": 52865 }, { "epoch": 4.312749816461375, "grad_norm": 0.9883306622505188, "learning_rate": 3.506732336549863e-05, "loss": 0.3654, "num_input_tokens_seen": 50504608, "step": 52870 }, { "epoch": 4.313157680071784, "grad_norm": 0.4944111704826355, "learning_rate": 3.506406532813001e-05, "loss": 0.3272, "num_input_tokens_seen": 50509648, "step": 52875 }, { "epoch": 4.313565543682193, "grad_norm": 0.7066797018051147, "learning_rate": 3.506080708676707e-05, "loss": 0.3033, "num_input_tokens_seen": 50513888, "step": 52880 }, { "epoch": 4.313973407292601, "grad_norm": 0.6042672991752625, "learning_rate": 3.505754864147584e-05, "loss": 0.4122, "num_input_tokens_seen": 50519440, "step": 52885 }, { "epoch": 4.31438127090301, "grad_norm": 0.8913856744766235, "learning_rate": 3.5054289992322365e-05, "loss": 0.356, "num_input_tokens_seen": 50524944, "step": 52890 }, { "epoch": 4.314789134513418, "grad_norm": 0.8953555822372437, "learning_rate": 3.505103113937271e-05, "loss": 0.3386, "num_input_tokens_seen": 50528992, "step": 52895 }, { "epoch": 4.315196998123827, "grad_norm": 0.48641854524612427, "learning_rate": 3.504777208269292e-05, "loss": 0.3332, "num_input_tokens_seen": 50534176, "step": 52900 }, { "epoch": 4.315604861734236, "grad_norm": 0.4764973819255829, "learning_rate": 3.504451282234907e-05, "loss": 0.337, "num_input_tokens_seen": 50538096, "step": 52905 }, { "epoch": 4.3160127253446445, "grad_norm": 0.8691137433052063, "learning_rate": 3.50412533584072e-05, "loss": 0.3543, "num_input_tokens_seen": 50543552, "step": 52910 }, { "epoch": 4.3164205889550535, "grad_norm": 0.4164488613605499, "learning_rate": 3.50379936909334e-05, "loss": 0.3569, "num_input_tokens_seen": 50547888, "step": 52915 }, { "epoch": 4.316828452565462, "grad_norm": 0.6405144333839417, "learning_rate": 3.5034733819993726e-05, "loss": 0.3454, "num_input_tokens_seen": 50552544, "step": 52920 }, { "epoch": 4.317236316175871, "grad_norm": 0.4432380795478821, "learning_rate": 3.503147374565427e-05, "loss": 0.3225, "num_input_tokens_seen": 50557424, "step": 52925 }, { "epoch": 4.31764417978628, "grad_norm": 0.6065000295639038, "learning_rate": 3.5028213467981096e-05, "loss": 0.3561, "num_input_tokens_seen": 50562368, "step": 52930 }, { "epoch": 4.318052043396688, "grad_norm": 0.3688698410987854, "learning_rate": 3.5024952987040303e-05, "loss": 0.3795, "num_input_tokens_seen": 50566816, "step": 52935 }, { "epoch": 4.318459907007097, "grad_norm": 0.8809942007064819, "learning_rate": 3.5021692302897974e-05, "loss": 0.348, "num_input_tokens_seen": 50572480, "step": 52940 }, { "epoch": 4.318867770617506, "grad_norm": 0.5599163174629211, "learning_rate": 3.5018431415620204e-05, "loss": 0.3414, "num_input_tokens_seen": 50577392, "step": 52945 }, { "epoch": 4.319275634227914, "grad_norm": 0.3009834587574005, "learning_rate": 3.5015170325273074e-05, "loss": 0.3444, "num_input_tokens_seen": 50582192, "step": 52950 }, { "epoch": 4.319683497838323, "grad_norm": 0.4557265043258667, "learning_rate": 3.50119090319227e-05, "loss": 0.3539, "num_input_tokens_seen": 50586832, "step": 52955 }, { "epoch": 4.320091361448732, "grad_norm": 0.639033317565918, "learning_rate": 3.5008647535635186e-05, "loss": 0.341, "num_input_tokens_seen": 50591232, "step": 52960 }, { "epoch": 4.32049922505914, "grad_norm": 0.8829482197761536, "learning_rate": 3.500538583647664e-05, "loss": 0.3927, "num_input_tokens_seen": 50595392, "step": 52965 }, { "epoch": 4.320907088669549, "grad_norm": 0.2320845127105713, "learning_rate": 3.500212393451317e-05, "loss": 0.3416, "num_input_tokens_seen": 50600288, "step": 52970 }, { "epoch": 4.321314952279957, "grad_norm": 0.6767736077308655, "learning_rate": 3.4998861829810905e-05, "loss": 0.3024, "num_input_tokens_seen": 50604752, "step": 52975 }, { "epoch": 4.321722815890366, "grad_norm": 0.5047569274902344, "learning_rate": 3.499559952243596e-05, "loss": 0.3644, "num_input_tokens_seen": 50609888, "step": 52980 }, { "epoch": 4.322130679500775, "grad_norm": 1.077222466468811, "learning_rate": 3.4992337012454456e-05, "loss": 0.3615, "num_input_tokens_seen": 50614896, "step": 52985 }, { "epoch": 4.322538543111183, "grad_norm": 1.0625206232070923, "learning_rate": 3.4989074299932526e-05, "loss": 0.3737, "num_input_tokens_seen": 50620576, "step": 52990 }, { "epoch": 4.322946406721592, "grad_norm": 0.36906203627586365, "learning_rate": 3.4985811384936315e-05, "loss": 0.3743, "num_input_tokens_seen": 50625712, "step": 52995 }, { "epoch": 4.323354270332001, "grad_norm": 1.3969676494598389, "learning_rate": 3.498254826753193e-05, "loss": 0.347, "num_input_tokens_seen": 50630624, "step": 53000 }, { "epoch": 4.3237621339424095, "grad_norm": 0.6413675546646118, "learning_rate": 3.497928494778555e-05, "loss": 0.3477, "num_input_tokens_seen": 50635584, "step": 53005 }, { "epoch": 4.3241699975528185, "grad_norm": 0.35252442955970764, "learning_rate": 3.49760214257633e-05, "loss": 0.3473, "num_input_tokens_seen": 50640848, "step": 53010 }, { "epoch": 4.324577861163227, "grad_norm": 0.5569402575492859, "learning_rate": 3.4972757701531336e-05, "loss": 0.3649, "num_input_tokens_seen": 50645424, "step": 53015 }, { "epoch": 4.324985724773636, "grad_norm": 0.503645658493042, "learning_rate": 3.496949377515582e-05, "loss": 0.3289, "num_input_tokens_seen": 50650384, "step": 53020 }, { "epoch": 4.325393588384045, "grad_norm": 0.6272241473197937, "learning_rate": 3.496622964670289e-05, "loss": 0.332, "num_input_tokens_seen": 50655776, "step": 53025 }, { "epoch": 4.325801451994453, "grad_norm": 0.5921856760978699, "learning_rate": 3.4962965316238716e-05, "loss": 0.3542, "num_input_tokens_seen": 50659904, "step": 53030 }, { "epoch": 4.326209315604862, "grad_norm": 0.5367630124092102, "learning_rate": 3.495970078382947e-05, "loss": 0.3759, "num_input_tokens_seen": 50664816, "step": 53035 }, { "epoch": 4.326617179215271, "grad_norm": 0.26664862036705017, "learning_rate": 3.4956436049541325e-05, "loss": 0.326, "num_input_tokens_seen": 50669152, "step": 53040 }, { "epoch": 4.327025042825679, "grad_norm": 0.7778674960136414, "learning_rate": 3.495317111344045e-05, "loss": 0.3186, "num_input_tokens_seen": 50674064, "step": 53045 }, { "epoch": 4.327432906436088, "grad_norm": 0.925369381904602, "learning_rate": 3.4949905975593036e-05, "loss": 0.3557, "num_input_tokens_seen": 50678464, "step": 53050 }, { "epoch": 4.327840770046496, "grad_norm": 1.084442138671875, "learning_rate": 3.494664063606524e-05, "loss": 0.3594, "num_input_tokens_seen": 50682416, "step": 53055 }, { "epoch": 4.328248633656905, "grad_norm": 0.5156557559967041, "learning_rate": 3.494337509492328e-05, "loss": 0.3309, "num_input_tokens_seen": 50687152, "step": 53060 }, { "epoch": 4.328656497267314, "grad_norm": 0.4695972204208374, "learning_rate": 3.494010935223332e-05, "loss": 0.3452, "num_input_tokens_seen": 50691616, "step": 53065 }, { "epoch": 4.329064360877722, "grad_norm": 1.0194003582000732, "learning_rate": 3.4936843408061576e-05, "loss": 0.3448, "num_input_tokens_seen": 50695856, "step": 53070 }, { "epoch": 4.329472224488131, "grad_norm": 0.6314237713813782, "learning_rate": 3.4933577262474224e-05, "loss": 0.3128, "num_input_tokens_seen": 50701504, "step": 53075 }, { "epoch": 4.32988008809854, "grad_norm": 0.5848932862281799, "learning_rate": 3.49303109155375e-05, "loss": 0.371, "num_input_tokens_seen": 50706416, "step": 53080 }, { "epoch": 4.330287951708948, "grad_norm": 0.3518643379211426, "learning_rate": 3.4927044367317586e-05, "loss": 0.3516, "num_input_tokens_seen": 50711936, "step": 53085 }, { "epoch": 4.330695815319357, "grad_norm": 1.063916563987732, "learning_rate": 3.49237776178807e-05, "loss": 0.3439, "num_input_tokens_seen": 50717072, "step": 53090 }, { "epoch": 4.331103678929766, "grad_norm": 0.7124416828155518, "learning_rate": 3.4920510667293064e-05, "loss": 0.3824, "num_input_tokens_seen": 50721280, "step": 53095 }, { "epoch": 4.3315115425401745, "grad_norm": 0.29008257389068604, "learning_rate": 3.491724351562089e-05, "loss": 0.3696, "num_input_tokens_seen": 50727088, "step": 53100 }, { "epoch": 4.3319194061505835, "grad_norm": 0.6606584191322327, "learning_rate": 3.49139761629304e-05, "loss": 0.3217, "num_input_tokens_seen": 50731328, "step": 53105 }, { "epoch": 4.332327269760992, "grad_norm": 1.0048807859420776, "learning_rate": 3.491070860928782e-05, "loss": 0.3309, "num_input_tokens_seen": 50735728, "step": 53110 }, { "epoch": 4.332735133371401, "grad_norm": 0.5062792301177979, "learning_rate": 3.49074408547594e-05, "loss": 0.3314, "num_input_tokens_seen": 50739968, "step": 53115 }, { "epoch": 4.33314299698181, "grad_norm": 0.8013326525688171, "learning_rate": 3.490417289941136e-05, "loss": 0.3535, "num_input_tokens_seen": 50744960, "step": 53120 }, { "epoch": 4.333550860592218, "grad_norm": 0.8136218786239624, "learning_rate": 3.490090474330994e-05, "loss": 0.3329, "num_input_tokens_seen": 50749744, "step": 53125 }, { "epoch": 4.333958724202627, "grad_norm": 0.9081004858016968, "learning_rate": 3.4897636386521396e-05, "loss": 0.3412, "num_input_tokens_seen": 50754256, "step": 53130 }, { "epoch": 4.334366587813035, "grad_norm": 0.7341827750205994, "learning_rate": 3.4894367829111965e-05, "loss": 0.3875, "num_input_tokens_seen": 50758928, "step": 53135 }, { "epoch": 4.334774451423444, "grad_norm": 1.1736183166503906, "learning_rate": 3.4891099071147894e-05, "loss": 0.3232, "num_input_tokens_seen": 50763840, "step": 53140 }, { "epoch": 4.335182315033853, "grad_norm": 0.5667001605033875, "learning_rate": 3.4887830112695454e-05, "loss": 0.3502, "num_input_tokens_seen": 50768464, "step": 53145 }, { "epoch": 4.335590178644261, "grad_norm": 0.5208514332771301, "learning_rate": 3.48845609538209e-05, "loss": 0.3571, "num_input_tokens_seen": 50773840, "step": 53150 }, { "epoch": 4.33599804225467, "grad_norm": 0.6866583228111267, "learning_rate": 3.48812915945905e-05, "loss": 0.3296, "num_input_tokens_seen": 50777968, "step": 53155 }, { "epoch": 4.336405905865079, "grad_norm": 0.7583377361297607, "learning_rate": 3.487802203507051e-05, "loss": 0.3467, "num_input_tokens_seen": 50783248, "step": 53160 }, { "epoch": 4.336813769475487, "grad_norm": 0.6644468903541565, "learning_rate": 3.4874752275327215e-05, "loss": 0.2819, "num_input_tokens_seen": 50788160, "step": 53165 }, { "epoch": 4.337221633085896, "grad_norm": 1.1527693271636963, "learning_rate": 3.487148231542689e-05, "loss": 0.5022, "num_input_tokens_seen": 50792800, "step": 53170 }, { "epoch": 4.337629496696305, "grad_norm": 0.37690383195877075, "learning_rate": 3.486821215543581e-05, "loss": 0.3259, "num_input_tokens_seen": 50796896, "step": 53175 }, { "epoch": 4.338037360306713, "grad_norm": 0.9208016991615295, "learning_rate": 3.4864941795420266e-05, "loss": 0.3596, "num_input_tokens_seen": 50802224, "step": 53180 }, { "epoch": 4.338445223917122, "grad_norm": 1.0047197341918945, "learning_rate": 3.4861671235446536e-05, "loss": 0.3786, "num_input_tokens_seen": 50807344, "step": 53185 }, { "epoch": 4.33885308752753, "grad_norm": 0.6267553567886353, "learning_rate": 3.4858400475580924e-05, "loss": 0.3587, "num_input_tokens_seen": 50811616, "step": 53190 }, { "epoch": 4.339260951137939, "grad_norm": 1.1762573719024658, "learning_rate": 3.485512951588973e-05, "loss": 0.3904, "num_input_tokens_seen": 50815440, "step": 53195 }, { "epoch": 4.339668814748348, "grad_norm": 1.1406041383743286, "learning_rate": 3.485185835643924e-05, "loss": 0.3963, "num_input_tokens_seen": 50821040, "step": 53200 }, { "epoch": 4.3400766783587565, "grad_norm": 0.3951166272163391, "learning_rate": 3.484858699729577e-05, "loss": 0.3226, "num_input_tokens_seen": 50826112, "step": 53205 }, { "epoch": 4.3404845419691656, "grad_norm": 0.32813602685928345, "learning_rate": 3.4845315438525624e-05, "loss": 0.314, "num_input_tokens_seen": 50830976, "step": 53210 }, { "epoch": 4.340892405579575, "grad_norm": 0.5402196645736694, "learning_rate": 3.484204368019512e-05, "loss": 0.3884, "num_input_tokens_seen": 50835456, "step": 53215 }, { "epoch": 4.341300269189983, "grad_norm": 0.8066011667251587, "learning_rate": 3.483877172237058e-05, "loss": 0.344, "num_input_tokens_seen": 50840512, "step": 53220 }, { "epoch": 4.341708132800392, "grad_norm": 0.5636814832687378, "learning_rate": 3.4835499565118305e-05, "loss": 0.3443, "num_input_tokens_seen": 50844176, "step": 53225 }, { "epoch": 4.3421159964108, "grad_norm": 0.46567755937576294, "learning_rate": 3.483222720850464e-05, "loss": 0.3477, "num_input_tokens_seen": 50850352, "step": 53230 }, { "epoch": 4.342523860021209, "grad_norm": 0.703442394733429, "learning_rate": 3.4828954652595906e-05, "loss": 0.3393, "num_input_tokens_seen": 50854528, "step": 53235 }, { "epoch": 4.342931723631618, "grad_norm": 0.8039109706878662, "learning_rate": 3.482568189745844e-05, "loss": 0.3152, "num_input_tokens_seen": 50859808, "step": 53240 }, { "epoch": 4.343339587242026, "grad_norm": 0.47060877084732056, "learning_rate": 3.482240894315857e-05, "loss": 0.3398, "num_input_tokens_seen": 50865360, "step": 53245 }, { "epoch": 4.343747450852435, "grad_norm": 1.0523067712783813, "learning_rate": 3.481913578976266e-05, "loss": 0.3466, "num_input_tokens_seen": 50870960, "step": 53250 }, { "epoch": 4.344155314462844, "grad_norm": 0.47636309266090393, "learning_rate": 3.481586243733702e-05, "loss": 0.3041, "num_input_tokens_seen": 50876048, "step": 53255 }, { "epoch": 4.344563178073252, "grad_norm": 0.7753326296806335, "learning_rate": 3.481258888594803e-05, "loss": 0.4073, "num_input_tokens_seen": 50880816, "step": 53260 }, { "epoch": 4.344971041683661, "grad_norm": 0.7225488424301147, "learning_rate": 3.480931513566204e-05, "loss": 0.3705, "num_input_tokens_seen": 50886256, "step": 53265 }, { "epoch": 4.345378905294069, "grad_norm": 1.0363413095474243, "learning_rate": 3.480604118654539e-05, "loss": 0.3496, "num_input_tokens_seen": 50891104, "step": 53270 }, { "epoch": 4.345786768904478, "grad_norm": 0.8978935480117798, "learning_rate": 3.480276703866445e-05, "loss": 0.367, "num_input_tokens_seen": 50895904, "step": 53275 }, { "epoch": 4.346194632514887, "grad_norm": 0.9487448334693909, "learning_rate": 3.479949269208559e-05, "loss": 0.3763, "num_input_tokens_seen": 50900288, "step": 53280 }, { "epoch": 4.346602496125295, "grad_norm": 0.5590143203735352, "learning_rate": 3.479621814687519e-05, "loss": 0.3377, "num_input_tokens_seen": 50905200, "step": 53285 }, { "epoch": 4.347010359735704, "grad_norm": 0.35137656331062317, "learning_rate": 3.47929434030996e-05, "loss": 0.3238, "num_input_tokens_seen": 50910048, "step": 53290 }, { "epoch": 4.347418223346113, "grad_norm": 0.5256724953651428, "learning_rate": 3.47896684608252e-05, "loss": 0.3167, "num_input_tokens_seen": 50914528, "step": 53295 }, { "epoch": 4.3478260869565215, "grad_norm": 0.8718928694725037, "learning_rate": 3.478639332011839e-05, "loss": 0.3661, "num_input_tokens_seen": 50920272, "step": 53300 }, { "epoch": 4.3482339505669305, "grad_norm": 0.5523901581764221, "learning_rate": 3.4783117981045556e-05, "loss": 0.3722, "num_input_tokens_seen": 50925168, "step": 53305 }, { "epoch": 4.3486418141773395, "grad_norm": 0.9474191665649414, "learning_rate": 3.477984244367306e-05, "loss": 0.3807, "num_input_tokens_seen": 50929296, "step": 53310 }, { "epoch": 4.349049677787748, "grad_norm": 0.7790464758872986, "learning_rate": 3.477656670806732e-05, "loss": 0.3281, "num_input_tokens_seen": 50934384, "step": 53315 }, { "epoch": 4.349457541398157, "grad_norm": 0.7506502270698547, "learning_rate": 3.4773290774294725e-05, "loss": 0.3351, "num_input_tokens_seen": 50939696, "step": 53320 }, { "epoch": 4.349865405008565, "grad_norm": 0.5586177706718445, "learning_rate": 3.477001464242168e-05, "loss": 0.3235, "num_input_tokens_seen": 50944208, "step": 53325 }, { "epoch": 4.350273268618974, "grad_norm": 0.7771506309509277, "learning_rate": 3.476673831251459e-05, "loss": 0.3221, "num_input_tokens_seen": 50948816, "step": 53330 }, { "epoch": 4.350681132229383, "grad_norm": 0.9691618084907532, "learning_rate": 3.476346178463987e-05, "loss": 0.3661, "num_input_tokens_seen": 50953760, "step": 53335 }, { "epoch": 4.351088995839791, "grad_norm": 0.6958453059196472, "learning_rate": 3.4760185058863916e-05, "loss": 0.3434, "num_input_tokens_seen": 50958624, "step": 53340 }, { "epoch": 4.3514968594502, "grad_norm": 1.0447578430175781, "learning_rate": 3.475690813525317e-05, "loss": 0.3306, "num_input_tokens_seen": 50963504, "step": 53345 }, { "epoch": 4.351904723060609, "grad_norm": 1.0703128576278687, "learning_rate": 3.475363101387404e-05, "loss": 0.353, "num_input_tokens_seen": 50967920, "step": 53350 }, { "epoch": 4.352312586671017, "grad_norm": 0.8762474656105042, "learning_rate": 3.4750353694792954e-05, "loss": 0.3409, "num_input_tokens_seen": 50972992, "step": 53355 }, { "epoch": 4.352720450281426, "grad_norm": 0.9758913516998291, "learning_rate": 3.474707617807633e-05, "loss": 0.3334, "num_input_tokens_seen": 50979008, "step": 53360 }, { "epoch": 4.353128313891834, "grad_norm": 0.9081549048423767, "learning_rate": 3.474379846379063e-05, "loss": 0.3375, "num_input_tokens_seen": 50983248, "step": 53365 }, { "epoch": 4.353536177502243, "grad_norm": 0.626703143119812, "learning_rate": 3.474052055200227e-05, "loss": 0.2896, "num_input_tokens_seen": 50988176, "step": 53370 }, { "epoch": 4.353944041112652, "grad_norm": 1.184571385383606, "learning_rate": 3.4737242442777696e-05, "loss": 0.4643, "num_input_tokens_seen": 50993008, "step": 53375 }, { "epoch": 4.35435190472306, "grad_norm": 1.0347614288330078, "learning_rate": 3.473396413618335e-05, "loss": 0.4172, "num_input_tokens_seen": 50997360, "step": 53380 }, { "epoch": 4.354759768333469, "grad_norm": 0.39537313580513, "learning_rate": 3.4730685632285695e-05, "loss": 0.3857, "num_input_tokens_seen": 51001856, "step": 53385 }, { "epoch": 4.355167631943878, "grad_norm": 0.7937029600143433, "learning_rate": 3.472740693115117e-05, "loss": 0.3241, "num_input_tokens_seen": 51007280, "step": 53390 }, { "epoch": 4.3555754955542865, "grad_norm": 0.7860531806945801, "learning_rate": 3.472412803284625e-05, "loss": 0.3605, "num_input_tokens_seen": 51011904, "step": 53395 }, { "epoch": 4.3559833591646955, "grad_norm": 1.1218479871749878, "learning_rate": 3.472084893743738e-05, "loss": 0.3589, "num_input_tokens_seen": 51016432, "step": 53400 }, { "epoch": 4.356391222775104, "grad_norm": 0.4363124966621399, "learning_rate": 3.4717569644991035e-05, "loss": 0.353, "num_input_tokens_seen": 51021376, "step": 53405 }, { "epoch": 4.356799086385513, "grad_norm": 0.7890521287918091, "learning_rate": 3.471429015557368e-05, "loss": 0.3357, "num_input_tokens_seen": 51026288, "step": 53410 }, { "epoch": 4.357206949995922, "grad_norm": 0.6179122924804688, "learning_rate": 3.4711010469251795e-05, "loss": 0.3158, "num_input_tokens_seen": 51030496, "step": 53415 }, { "epoch": 4.35761481360633, "grad_norm": 1.1045467853546143, "learning_rate": 3.4707730586091846e-05, "loss": 0.3765, "num_input_tokens_seen": 51035200, "step": 53420 }, { "epoch": 4.358022677216739, "grad_norm": 0.6163635849952698, "learning_rate": 3.4704450506160336e-05, "loss": 0.3751, "num_input_tokens_seen": 51040064, "step": 53425 }, { "epoch": 4.358430540827148, "grad_norm": 0.7728872895240784, "learning_rate": 3.4701170229523733e-05, "loss": 0.3338, "num_input_tokens_seen": 51045056, "step": 53430 }, { "epoch": 4.358838404437556, "grad_norm": 0.3712615370750427, "learning_rate": 3.4697889756248534e-05, "loss": 0.3576, "num_input_tokens_seen": 51049696, "step": 53435 }, { "epoch": 4.359246268047965, "grad_norm": 0.5076575875282288, "learning_rate": 3.469460908640122e-05, "loss": 0.3556, "num_input_tokens_seen": 51054592, "step": 53440 }, { "epoch": 4.359654131658374, "grad_norm": 0.3420315682888031, "learning_rate": 3.4691328220048305e-05, "loss": 0.3383, "num_input_tokens_seen": 51058464, "step": 53445 }, { "epoch": 4.360061995268782, "grad_norm": 0.3620012700557709, "learning_rate": 3.4688047157256294e-05, "loss": 0.3472, "num_input_tokens_seen": 51063040, "step": 53450 }, { "epoch": 4.360469858879191, "grad_norm": 0.48266083002090454, "learning_rate": 3.468476589809167e-05, "loss": 0.3312, "num_input_tokens_seen": 51068000, "step": 53455 }, { "epoch": 4.360877722489599, "grad_norm": 0.5243359208106995, "learning_rate": 3.468148444262096e-05, "loss": 0.3176, "num_input_tokens_seen": 51072576, "step": 53460 }, { "epoch": 4.361285586100008, "grad_norm": 0.9533901810646057, "learning_rate": 3.4678202790910674e-05, "loss": 0.3612, "num_input_tokens_seen": 51076544, "step": 53465 }, { "epoch": 4.361693449710417, "grad_norm": 0.7657737731933594, "learning_rate": 3.467492094302734e-05, "loss": 0.312, "num_input_tokens_seen": 51082576, "step": 53470 }, { "epoch": 4.362101313320825, "grad_norm": 0.9223800301551819, "learning_rate": 3.4671638899037456e-05, "loss": 0.3699, "num_input_tokens_seen": 51086896, "step": 53475 }, { "epoch": 4.362509176931234, "grad_norm": 0.6652072072029114, "learning_rate": 3.466835665900757e-05, "loss": 0.4024, "num_input_tokens_seen": 51091440, "step": 53480 }, { "epoch": 4.362917040541642, "grad_norm": 0.9531735181808472, "learning_rate": 3.4665074223004195e-05, "loss": 0.3738, "num_input_tokens_seen": 51097040, "step": 53485 }, { "epoch": 4.363324904152051, "grad_norm": 0.37377357482910156, "learning_rate": 3.466179159109388e-05, "loss": 0.3437, "num_input_tokens_seen": 51100128, "step": 53490 }, { "epoch": 4.36373276776246, "grad_norm": 0.5580371618270874, "learning_rate": 3.465850876334315e-05, "loss": 0.3166, "num_input_tokens_seen": 51105088, "step": 53495 }, { "epoch": 4.364140631372869, "grad_norm": 0.9057334661483765, "learning_rate": 3.465522573981856e-05, "loss": 0.3529, "num_input_tokens_seen": 51110480, "step": 53500 }, { "epoch": 4.364548494983278, "grad_norm": 0.9911891222000122, "learning_rate": 3.465194252058665e-05, "loss": 0.3682, "num_input_tokens_seen": 51115488, "step": 53505 }, { "epoch": 4.364956358593687, "grad_norm": 0.8291819095611572, "learning_rate": 3.464865910571395e-05, "loss": 0.3304, "num_input_tokens_seen": 51120032, "step": 53510 }, { "epoch": 4.365364222204095, "grad_norm": 0.5516725778579712, "learning_rate": 3.464537549526704e-05, "loss": 0.3431, "num_input_tokens_seen": 51123392, "step": 53515 }, { "epoch": 4.365772085814504, "grad_norm": 0.6777897477149963, "learning_rate": 3.464209168931246e-05, "loss": 0.3903, "num_input_tokens_seen": 51128912, "step": 53520 }, { "epoch": 4.366179949424913, "grad_norm": 0.5452449917793274, "learning_rate": 3.463880768791679e-05, "loss": 0.3045, "num_input_tokens_seen": 51133248, "step": 53525 }, { "epoch": 4.366587813035321, "grad_norm": 0.534940242767334, "learning_rate": 3.463552349114657e-05, "loss": 0.2766, "num_input_tokens_seen": 51138240, "step": 53530 }, { "epoch": 4.36699567664573, "grad_norm": 0.7040473222732544, "learning_rate": 3.4632239099068394e-05, "loss": 0.2802, "num_input_tokens_seen": 51143280, "step": 53535 }, { "epoch": 4.367403540256138, "grad_norm": 1.504191279411316, "learning_rate": 3.462895451174882e-05, "loss": 0.3091, "num_input_tokens_seen": 51148160, "step": 53540 }, { "epoch": 4.367811403866547, "grad_norm": 1.386160969734192, "learning_rate": 3.462566972925443e-05, "loss": 0.4217, "num_input_tokens_seen": 51152336, "step": 53545 }, { "epoch": 4.368219267476956, "grad_norm": 0.6958690881729126, "learning_rate": 3.4622384751651805e-05, "loss": 0.3419, "num_input_tokens_seen": 51157488, "step": 53550 }, { "epoch": 4.368627131087364, "grad_norm": 0.6716588139533997, "learning_rate": 3.461909957900753e-05, "loss": 0.257, "num_input_tokens_seen": 51163072, "step": 53555 }, { "epoch": 4.369034994697773, "grad_norm": 0.4277600347995758, "learning_rate": 3.46158142113882e-05, "loss": 0.3777, "num_input_tokens_seen": 51167856, "step": 53560 }, { "epoch": 4.369442858308182, "grad_norm": 0.6689061522483826, "learning_rate": 3.461252864886039e-05, "loss": 0.3656, "num_input_tokens_seen": 51172880, "step": 53565 }, { "epoch": 4.36985072191859, "grad_norm": 0.6747300624847412, "learning_rate": 3.460924289149072e-05, "loss": 0.3942, "num_input_tokens_seen": 51177296, "step": 53570 }, { "epoch": 4.370258585528999, "grad_norm": 0.515915036201477, "learning_rate": 3.4605956939345776e-05, "loss": 0.332, "num_input_tokens_seen": 51181808, "step": 53575 }, { "epoch": 4.370666449139407, "grad_norm": 0.603518545627594, "learning_rate": 3.460267079249216e-05, "loss": 0.3494, "num_input_tokens_seen": 51186144, "step": 53580 }, { "epoch": 4.371074312749816, "grad_norm": 0.7250170111656189, "learning_rate": 3.4599384450996495e-05, "loss": 0.3087, "num_input_tokens_seen": 51190512, "step": 53585 }, { "epoch": 4.371482176360225, "grad_norm": 0.5411790609359741, "learning_rate": 3.4596097914925387e-05, "loss": 0.3581, "num_input_tokens_seen": 51194736, "step": 53590 }, { "epoch": 4.3718900399706335, "grad_norm": 0.6244077682495117, "learning_rate": 3.459281118434544e-05, "loss": 0.3795, "num_input_tokens_seen": 51198944, "step": 53595 }, { "epoch": 4.3722979035810425, "grad_norm": 0.4542994499206543, "learning_rate": 3.45895242593233e-05, "loss": 0.3255, "num_input_tokens_seen": 51203200, "step": 53600 }, { "epoch": 4.3727057671914515, "grad_norm": 0.7580944895744324, "learning_rate": 3.458623713992557e-05, "loss": 0.3357, "num_input_tokens_seen": 51208000, "step": 53605 }, { "epoch": 4.37311363080186, "grad_norm": 0.6094252467155457, "learning_rate": 3.458294982621889e-05, "loss": 0.3417, "num_input_tokens_seen": 51213104, "step": 53610 }, { "epoch": 4.373521494412269, "grad_norm": 0.7291916012763977, "learning_rate": 3.457966231826989e-05, "loss": 0.3507, "num_input_tokens_seen": 51218480, "step": 53615 }, { "epoch": 4.373929358022677, "grad_norm": 0.71257084608078, "learning_rate": 3.457637461614521e-05, "loss": 0.3861, "num_input_tokens_seen": 51222752, "step": 53620 }, { "epoch": 4.374337221633086, "grad_norm": 0.9001014232635498, "learning_rate": 3.457308671991148e-05, "loss": 0.3212, "num_input_tokens_seen": 51226928, "step": 53625 }, { "epoch": 4.374745085243495, "grad_norm": 0.302115261554718, "learning_rate": 3.456979862963535e-05, "loss": 0.3578, "num_input_tokens_seen": 51231632, "step": 53630 }, { "epoch": 4.375152948853903, "grad_norm": 0.5804460048675537, "learning_rate": 3.4566510345383474e-05, "loss": 0.3332, "num_input_tokens_seen": 51237616, "step": 53635 }, { "epoch": 4.375560812464312, "grad_norm": 0.7564589977264404, "learning_rate": 3.456322186722249e-05, "loss": 0.3259, "num_input_tokens_seen": 51243072, "step": 53640 }, { "epoch": 4.375968676074721, "grad_norm": 0.7768298983573914, "learning_rate": 3.455993319521907e-05, "loss": 0.3513, "num_input_tokens_seen": 51246640, "step": 53645 }, { "epoch": 4.376376539685129, "grad_norm": 0.5298656225204468, "learning_rate": 3.455664432943986e-05, "loss": 0.3666, "num_input_tokens_seen": 51251312, "step": 53650 }, { "epoch": 4.376784403295538, "grad_norm": 0.6987878680229187, "learning_rate": 3.455335526995153e-05, "loss": 0.3664, "num_input_tokens_seen": 51255680, "step": 53655 }, { "epoch": 4.377192266905947, "grad_norm": 1.0569672584533691, "learning_rate": 3.455006601682075e-05, "loss": 0.3426, "num_input_tokens_seen": 51261392, "step": 53660 }, { "epoch": 4.377600130516355, "grad_norm": 0.6740078926086426, "learning_rate": 3.4546776570114194e-05, "loss": 0.3269, "num_input_tokens_seen": 51265952, "step": 53665 }, { "epoch": 4.378007994126764, "grad_norm": 0.6467174291610718, "learning_rate": 3.454348692989853e-05, "loss": 0.3572, "num_input_tokens_seen": 51271056, "step": 53670 }, { "epoch": 4.378415857737172, "grad_norm": 0.999322772026062, "learning_rate": 3.4540197096240436e-05, "loss": 0.3568, "num_input_tokens_seen": 51275216, "step": 53675 }, { "epoch": 4.378823721347581, "grad_norm": 0.9296665191650391, "learning_rate": 3.453690706920661e-05, "loss": 0.3489, "num_input_tokens_seen": 51279888, "step": 53680 }, { "epoch": 4.37923158495799, "grad_norm": 0.6462793350219727, "learning_rate": 3.453361684886372e-05, "loss": 0.3172, "num_input_tokens_seen": 51285760, "step": 53685 }, { "epoch": 4.3796394485683985, "grad_norm": 0.7880139946937561, "learning_rate": 3.4530326435278476e-05, "loss": 0.3456, "num_input_tokens_seen": 51290560, "step": 53690 }, { "epoch": 4.3800473121788075, "grad_norm": 0.8509432673454285, "learning_rate": 3.4527035828517565e-05, "loss": 0.3743, "num_input_tokens_seen": 51295648, "step": 53695 }, { "epoch": 4.380455175789216, "grad_norm": 0.6068891286849976, "learning_rate": 3.4523745028647684e-05, "loss": 0.3411, "num_input_tokens_seen": 51300224, "step": 53700 }, { "epoch": 4.380863039399625, "grad_norm": 0.5077837109565735, "learning_rate": 3.452045403573554e-05, "loss": 0.3253, "num_input_tokens_seen": 51304992, "step": 53705 }, { "epoch": 4.381270903010034, "grad_norm": 0.7817550301551819, "learning_rate": 3.451716284984783e-05, "loss": 0.3226, "num_input_tokens_seen": 51310000, "step": 53710 }, { "epoch": 4.381678766620442, "grad_norm": 0.6584733128547668, "learning_rate": 3.451387147105129e-05, "loss": 0.3132, "num_input_tokens_seen": 51314320, "step": 53715 }, { "epoch": 4.382086630230851, "grad_norm": 0.9103905558586121, "learning_rate": 3.45105798994126e-05, "loss": 0.3725, "num_input_tokens_seen": 51318704, "step": 53720 }, { "epoch": 4.38249449384126, "grad_norm": 0.2986425757408142, "learning_rate": 3.4507288134998506e-05, "loss": 0.3549, "num_input_tokens_seen": 51323744, "step": 53725 }, { "epoch": 4.382902357451668, "grad_norm": 0.9095970988273621, "learning_rate": 3.4503996177875716e-05, "loss": 0.3392, "num_input_tokens_seen": 51328912, "step": 53730 }, { "epoch": 4.383310221062077, "grad_norm": 1.0980010032653809, "learning_rate": 3.450070402811096e-05, "loss": 0.3862, "num_input_tokens_seen": 51333520, "step": 53735 }, { "epoch": 4.383718084672486, "grad_norm": 0.4974658787250519, "learning_rate": 3.4497411685770976e-05, "loss": 0.3161, "num_input_tokens_seen": 51337392, "step": 53740 }, { "epoch": 4.384125948282894, "grad_norm": 0.9298709630966187, "learning_rate": 3.44941191509225e-05, "loss": 0.3231, "num_input_tokens_seen": 51342256, "step": 53745 }, { "epoch": 4.384533811893303, "grad_norm": 1.096876621246338, "learning_rate": 3.4490826423632256e-05, "loss": 0.3918, "num_input_tokens_seen": 51347264, "step": 53750 }, { "epoch": 4.384941675503711, "grad_norm": 0.6482169032096863, "learning_rate": 3.448753350396699e-05, "loss": 0.3325, "num_input_tokens_seen": 51351648, "step": 53755 }, { "epoch": 4.38534953911412, "grad_norm": 0.5994202494621277, "learning_rate": 3.448424039199345e-05, "loss": 0.3537, "num_input_tokens_seen": 51356528, "step": 53760 }, { "epoch": 4.385757402724529, "grad_norm": 0.6406784057617188, "learning_rate": 3.448094708777839e-05, "loss": 0.3707, "num_input_tokens_seen": 51361824, "step": 53765 }, { "epoch": 4.386165266334937, "grad_norm": 0.6333896517753601, "learning_rate": 3.4477653591388566e-05, "loss": 0.363, "num_input_tokens_seen": 51366000, "step": 53770 }, { "epoch": 4.386573129945346, "grad_norm": 1.0307643413543701, "learning_rate": 3.447435990289073e-05, "loss": 0.3126, "num_input_tokens_seen": 51371360, "step": 53775 }, { "epoch": 4.386980993555755, "grad_norm": 0.7878912091255188, "learning_rate": 3.4471066022351644e-05, "loss": 0.3248, "num_input_tokens_seen": 51376480, "step": 53780 }, { "epoch": 4.3873888571661634, "grad_norm": 0.8710309863090515, "learning_rate": 3.446777194983807e-05, "loss": 0.3305, "num_input_tokens_seen": 51381744, "step": 53785 }, { "epoch": 4.3877967207765725, "grad_norm": 0.4773743748664856, "learning_rate": 3.446447768541678e-05, "loss": 0.3653, "num_input_tokens_seen": 51385856, "step": 53790 }, { "epoch": 4.388204584386981, "grad_norm": 0.7439945340156555, "learning_rate": 3.446118322915456e-05, "loss": 0.3235, "num_input_tokens_seen": 51390928, "step": 53795 }, { "epoch": 4.38861244799739, "grad_norm": 0.6647640466690063, "learning_rate": 3.445788858111817e-05, "loss": 0.3846, "num_input_tokens_seen": 51396592, "step": 53800 }, { "epoch": 4.389020311607799, "grad_norm": 0.9043461680412292, "learning_rate": 3.44545937413744e-05, "loss": 0.3583, "num_input_tokens_seen": 51401136, "step": 53805 }, { "epoch": 4.389428175218207, "grad_norm": 0.2774599492549896, "learning_rate": 3.4451298709990024e-05, "loss": 0.3455, "num_input_tokens_seen": 51406448, "step": 53810 }, { "epoch": 4.389836038828616, "grad_norm": 0.6202229857444763, "learning_rate": 3.444800348703184e-05, "loss": 0.266, "num_input_tokens_seen": 51410688, "step": 53815 }, { "epoch": 4.390243902439025, "grad_norm": 1.1950865983963013, "learning_rate": 3.444470807256664e-05, "loss": 0.3936, "num_input_tokens_seen": 51415056, "step": 53820 }, { "epoch": 4.390651766049433, "grad_norm": 0.6865426301956177, "learning_rate": 3.444141246666123e-05, "loss": 0.3201, "num_input_tokens_seen": 51420400, "step": 53825 }, { "epoch": 4.391059629659842, "grad_norm": 0.8489624857902527, "learning_rate": 3.44381166693824e-05, "loss": 0.4202, "num_input_tokens_seen": 51425056, "step": 53830 }, { "epoch": 4.39146749327025, "grad_norm": 1.2342582941055298, "learning_rate": 3.443482068079695e-05, "loss": 0.3545, "num_input_tokens_seen": 51429936, "step": 53835 }, { "epoch": 4.391875356880659, "grad_norm": 0.6361926198005676, "learning_rate": 3.4431524500971704e-05, "loss": 0.3345, "num_input_tokens_seen": 51434432, "step": 53840 }, { "epoch": 4.392283220491068, "grad_norm": 0.9982490539550781, "learning_rate": 3.442822812997345e-05, "loss": 0.3372, "num_input_tokens_seen": 51439024, "step": 53845 }, { "epoch": 4.392691084101476, "grad_norm": 0.7149542570114136, "learning_rate": 3.442493156786903e-05, "loss": 0.3422, "num_input_tokens_seen": 51442528, "step": 53850 }, { "epoch": 4.393098947711885, "grad_norm": 0.9817015528678894, "learning_rate": 3.442163481472524e-05, "loss": 0.3455, "num_input_tokens_seen": 51447200, "step": 53855 }, { "epoch": 4.393506811322294, "grad_norm": 0.7460170388221741, "learning_rate": 3.441833787060893e-05, "loss": 0.3275, "num_input_tokens_seen": 51451872, "step": 53860 }, { "epoch": 4.393914674932702, "grad_norm": 0.5399665832519531, "learning_rate": 3.4415040735586896e-05, "loss": 0.3706, "num_input_tokens_seen": 51456896, "step": 53865 }, { "epoch": 4.394322538543111, "grad_norm": 0.33545294404029846, "learning_rate": 3.4411743409726e-05, "loss": 0.3317, "num_input_tokens_seen": 51461680, "step": 53870 }, { "epoch": 4.39473040215352, "grad_norm": 1.1160627603530884, "learning_rate": 3.440844589309306e-05, "loss": 0.33, "num_input_tokens_seen": 51466448, "step": 53875 }, { "epoch": 4.395138265763928, "grad_norm": 1.0911810398101807, "learning_rate": 3.440514818575492e-05, "loss": 0.3651, "num_input_tokens_seen": 51471648, "step": 53880 }, { "epoch": 4.395546129374337, "grad_norm": 0.4637161195278168, "learning_rate": 3.440185028777842e-05, "loss": 0.3422, "num_input_tokens_seen": 51476672, "step": 53885 }, { "epoch": 4.3959539929847455, "grad_norm": 0.4311656653881073, "learning_rate": 3.439855219923042e-05, "loss": 0.2692, "num_input_tokens_seen": 51480896, "step": 53890 }, { "epoch": 4.3963618565951545, "grad_norm": 0.34328439831733704, "learning_rate": 3.439525392017774e-05, "loss": 0.3796, "num_input_tokens_seen": 51485568, "step": 53895 }, { "epoch": 4.396769720205564, "grad_norm": 0.4232001304626465, "learning_rate": 3.4391955450687274e-05, "loss": 0.3863, "num_input_tokens_seen": 51489440, "step": 53900 }, { "epoch": 4.397177583815972, "grad_norm": 0.6569623947143555, "learning_rate": 3.438865679082586e-05, "loss": 0.2864, "num_input_tokens_seen": 51494048, "step": 53905 }, { "epoch": 4.397585447426381, "grad_norm": 0.6972896456718445, "learning_rate": 3.4385357940660364e-05, "loss": 0.3977, "num_input_tokens_seen": 51499232, "step": 53910 }, { "epoch": 4.39799331103679, "grad_norm": 1.275183081626892, "learning_rate": 3.438205890025764e-05, "loss": 0.3185, "num_input_tokens_seen": 51504480, "step": 53915 }, { "epoch": 4.398401174647198, "grad_norm": 0.6423726677894592, "learning_rate": 3.4378759669684575e-05, "loss": 0.3881, "num_input_tokens_seen": 51509216, "step": 53920 }, { "epoch": 4.398809038257607, "grad_norm": 0.8684931993484497, "learning_rate": 3.437546024900804e-05, "loss": 0.3861, "num_input_tokens_seen": 51514688, "step": 53925 }, { "epoch": 4.399216901868015, "grad_norm": 0.9300991296768188, "learning_rate": 3.4372160638294905e-05, "loss": 0.3288, "num_input_tokens_seen": 51518352, "step": 53930 }, { "epoch": 4.399624765478424, "grad_norm": 0.8403412103652954, "learning_rate": 3.4368860837612057e-05, "loss": 0.3825, "num_input_tokens_seen": 51522896, "step": 53935 }, { "epoch": 4.400032629088833, "grad_norm": 0.822920024394989, "learning_rate": 3.436556084702638e-05, "loss": 0.3337, "num_input_tokens_seen": 51527520, "step": 53940 }, { "epoch": 4.400440492699241, "grad_norm": 0.6098729372024536, "learning_rate": 3.4362260666604764e-05, "loss": 0.3029, "num_input_tokens_seen": 51532368, "step": 53945 }, { "epoch": 4.40084835630965, "grad_norm": 0.4948224425315857, "learning_rate": 3.43589602964141e-05, "loss": 0.3199, "num_input_tokens_seen": 51536624, "step": 53950 }, { "epoch": 4.401256219920059, "grad_norm": 0.45262792706489563, "learning_rate": 3.435565973652129e-05, "loss": 0.2294, "num_input_tokens_seen": 51541216, "step": 53955 }, { "epoch": 4.401664083530467, "grad_norm": 0.8721117377281189, "learning_rate": 3.435235898699324e-05, "loss": 0.3737, "num_input_tokens_seen": 51545984, "step": 53960 }, { "epoch": 4.402071947140876, "grad_norm": 0.8847656846046448, "learning_rate": 3.434905804789684e-05, "loss": 0.3069, "num_input_tokens_seen": 51551280, "step": 53965 }, { "epoch": 4.402479810751284, "grad_norm": 0.5878553986549377, "learning_rate": 3.434575691929901e-05, "loss": 0.4232, "num_input_tokens_seen": 51556176, "step": 53970 }, { "epoch": 4.402887674361693, "grad_norm": 0.8373771905899048, "learning_rate": 3.434245560126665e-05, "loss": 0.2899, "num_input_tokens_seen": 51560832, "step": 53975 }, { "epoch": 4.403295537972102, "grad_norm": 0.525229811668396, "learning_rate": 3.43391540938667e-05, "loss": 0.3909, "num_input_tokens_seen": 51564992, "step": 53980 }, { "epoch": 4.4037034015825105, "grad_norm": 0.4186759889125824, "learning_rate": 3.433585239716605e-05, "loss": 0.3299, "num_input_tokens_seen": 51569952, "step": 53985 }, { "epoch": 4.4041112651929195, "grad_norm": 0.3891424834728241, "learning_rate": 3.433255051123165e-05, "loss": 0.4116, "num_input_tokens_seen": 51574880, "step": 53990 }, { "epoch": 4.4045191288033285, "grad_norm": 0.6161439418792725, "learning_rate": 3.432924843613041e-05, "loss": 0.3603, "num_input_tokens_seen": 51579744, "step": 53995 }, { "epoch": 4.404926992413737, "grad_norm": 0.9451726675033569, "learning_rate": 3.4325946171929266e-05, "loss": 0.3209, "num_input_tokens_seen": 51584544, "step": 54000 }, { "epoch": 4.405334856024146, "grad_norm": 0.707970917224884, "learning_rate": 3.4322643718695164e-05, "loss": 0.3152, "num_input_tokens_seen": 51589296, "step": 54005 }, { "epoch": 4.405742719634555, "grad_norm": 1.2431546449661255, "learning_rate": 3.4319341076495024e-05, "loss": 0.3727, "num_input_tokens_seen": 51594480, "step": 54010 }, { "epoch": 4.406150583244963, "grad_norm": 0.23373644053936005, "learning_rate": 3.431603824539582e-05, "loss": 0.3741, "num_input_tokens_seen": 51599184, "step": 54015 }, { "epoch": 4.406558446855372, "grad_norm": 0.8382506966590881, "learning_rate": 3.431273522546446e-05, "loss": 0.3479, "num_input_tokens_seen": 51604608, "step": 54020 }, { "epoch": 4.40696631046578, "grad_norm": 0.5002251863479614, "learning_rate": 3.430943201676792e-05, "loss": 0.3628, "num_input_tokens_seen": 51610080, "step": 54025 }, { "epoch": 4.407374174076189, "grad_norm": 0.5721073150634766, "learning_rate": 3.430612861937315e-05, "loss": 0.3211, "num_input_tokens_seen": 51616000, "step": 54030 }, { "epoch": 4.407782037686598, "grad_norm": 0.5667336583137512, "learning_rate": 3.4302825033347106e-05, "loss": 0.305, "num_input_tokens_seen": 51620592, "step": 54035 }, { "epoch": 4.408189901297006, "grad_norm": 0.8093854784965515, "learning_rate": 3.429952125875676e-05, "loss": 0.3085, "num_input_tokens_seen": 51625056, "step": 54040 }, { "epoch": 4.408597764907415, "grad_norm": 0.6738660931587219, "learning_rate": 3.429621729566906e-05, "loss": 0.3441, "num_input_tokens_seen": 51629616, "step": 54045 }, { "epoch": 4.409005628517823, "grad_norm": 0.6263274550437927, "learning_rate": 3.429291314415098e-05, "loss": 0.339, "num_input_tokens_seen": 51633904, "step": 54050 }, { "epoch": 4.409413492128232, "grad_norm": 0.7345707416534424, "learning_rate": 3.428960880426951e-05, "loss": 0.349, "num_input_tokens_seen": 51638336, "step": 54055 }, { "epoch": 4.409821355738641, "grad_norm": 0.7956143021583557, "learning_rate": 3.428630427609162e-05, "loss": 0.3063, "num_input_tokens_seen": 51643264, "step": 54060 }, { "epoch": 4.410229219349049, "grad_norm": 0.7697275876998901, "learning_rate": 3.428299955968428e-05, "loss": 0.3042, "num_input_tokens_seen": 51648400, "step": 54065 }, { "epoch": 4.410637082959458, "grad_norm": 0.7172573804855347, "learning_rate": 3.427969465511449e-05, "loss": 0.3206, "num_input_tokens_seen": 51652736, "step": 54070 }, { "epoch": 4.411044946569867, "grad_norm": 1.845266580581665, "learning_rate": 3.427638956244922e-05, "loss": 0.3042, "num_input_tokens_seen": 51657536, "step": 54075 }, { "epoch": 4.4114528101802755, "grad_norm": 0.8317855000495911, "learning_rate": 3.427308428175549e-05, "loss": 0.2809, "num_input_tokens_seen": 51662656, "step": 54080 }, { "epoch": 4.4118606737906845, "grad_norm": 0.6787340044975281, "learning_rate": 3.4269778813100274e-05, "loss": 0.2906, "num_input_tokens_seen": 51666896, "step": 54085 }, { "epoch": 4.4122685374010935, "grad_norm": 0.5054638385772705, "learning_rate": 3.426647315655058e-05, "loss": 0.2216, "num_input_tokens_seen": 51671200, "step": 54090 }, { "epoch": 4.412676401011502, "grad_norm": 1.6925323009490967, "learning_rate": 3.4263167312173414e-05, "loss": 0.41, "num_input_tokens_seen": 51676048, "step": 54095 }, { "epoch": 4.413084264621911, "grad_norm": 0.5438263416290283, "learning_rate": 3.425986128003579e-05, "loss": 0.347, "num_input_tokens_seen": 51680432, "step": 54100 }, { "epoch": 4.413492128232319, "grad_norm": 0.8133565187454224, "learning_rate": 3.425655506020471e-05, "loss": 0.3828, "num_input_tokens_seen": 51685664, "step": 54105 }, { "epoch": 4.413899991842728, "grad_norm": 1.7874233722686768, "learning_rate": 3.425324865274718e-05, "loss": 0.4347, "num_input_tokens_seen": 51690704, "step": 54110 }, { "epoch": 4.414307855453137, "grad_norm": 1.4176678657531738, "learning_rate": 3.424994205773024e-05, "loss": 0.3003, "num_input_tokens_seen": 51695200, "step": 54115 }, { "epoch": 4.414715719063545, "grad_norm": 1.257165551185608, "learning_rate": 3.4246635275220914e-05, "loss": 0.4764, "num_input_tokens_seen": 51700160, "step": 54120 }, { "epoch": 4.415123582673954, "grad_norm": 0.8516473174095154, "learning_rate": 3.424332830528621e-05, "loss": 0.3773, "num_input_tokens_seen": 51704032, "step": 54125 }, { "epoch": 4.415531446284363, "grad_norm": 0.8461865186691284, "learning_rate": 3.424002114799317e-05, "loss": 0.3536, "num_input_tokens_seen": 51708976, "step": 54130 }, { "epoch": 4.415939309894771, "grad_norm": 1.022743821144104, "learning_rate": 3.423671380340883e-05, "loss": 0.3408, "num_input_tokens_seen": 51713680, "step": 54135 }, { "epoch": 4.41634717350518, "grad_norm": 2.1966447830200195, "learning_rate": 3.423340627160022e-05, "loss": 0.3738, "num_input_tokens_seen": 51718768, "step": 54140 }, { "epoch": 4.416755037115588, "grad_norm": 1.1309720277786255, "learning_rate": 3.42300985526344e-05, "loss": 0.3443, "num_input_tokens_seen": 51724016, "step": 54145 }, { "epoch": 4.417162900725997, "grad_norm": 0.9293351173400879, "learning_rate": 3.4226790646578394e-05, "loss": 0.3887, "num_input_tokens_seen": 51729328, "step": 54150 }, { "epoch": 4.417570764336406, "grad_norm": 1.0758923292160034, "learning_rate": 3.422348255349926e-05, "loss": 0.3445, "num_input_tokens_seen": 51734400, "step": 54155 }, { "epoch": 4.417978627946814, "grad_norm": 1.1861790418624878, "learning_rate": 3.422017427346407e-05, "loss": 0.3456, "num_input_tokens_seen": 51739264, "step": 54160 }, { "epoch": 4.418386491557223, "grad_norm": 1.1357377767562866, "learning_rate": 3.421686580653985e-05, "loss": 0.3406, "num_input_tokens_seen": 51744128, "step": 54165 }, { "epoch": 4.418794355167632, "grad_norm": 0.7459312677383423, "learning_rate": 3.421355715279369e-05, "loss": 0.3339, "num_input_tokens_seen": 51748432, "step": 54170 }, { "epoch": 4.41920221877804, "grad_norm": 1.1034296751022339, "learning_rate": 3.421024831229262e-05, "loss": 0.3184, "num_input_tokens_seen": 51753648, "step": 54175 }, { "epoch": 4.419610082388449, "grad_norm": 0.9915022253990173, "learning_rate": 3.420693928510375e-05, "loss": 0.2759, "num_input_tokens_seen": 51758560, "step": 54180 }, { "epoch": 4.4200179459988576, "grad_norm": 1.0605764389038086, "learning_rate": 3.420363007129412e-05, "loss": 0.3741, "num_input_tokens_seen": 51762736, "step": 54185 }, { "epoch": 4.420425809609267, "grad_norm": 0.5159937739372253, "learning_rate": 3.420032067093083e-05, "loss": 0.4021, "num_input_tokens_seen": 51767968, "step": 54190 }, { "epoch": 4.420833673219676, "grad_norm": 0.541416585445404, "learning_rate": 3.4197011084080944e-05, "loss": 0.3672, "num_input_tokens_seen": 51773344, "step": 54195 }, { "epoch": 4.421241536830084, "grad_norm": 0.5221415758132935, "learning_rate": 3.419370131081156e-05, "loss": 0.2908, "num_input_tokens_seen": 51778048, "step": 54200 }, { "epoch": 4.421649400440493, "grad_norm": 1.05019211769104, "learning_rate": 3.4190391351189746e-05, "loss": 0.4551, "num_input_tokens_seen": 51783328, "step": 54205 }, { "epoch": 4.422057264050902, "grad_norm": 0.6336339712142944, "learning_rate": 3.418708120528261e-05, "loss": 0.3197, "num_input_tokens_seen": 51788480, "step": 54210 }, { "epoch": 4.42246512766131, "grad_norm": 0.5387460589408875, "learning_rate": 3.418377087315724e-05, "loss": 0.3372, "num_input_tokens_seen": 51793840, "step": 54215 }, { "epoch": 4.422872991271719, "grad_norm": 0.2727468013763428, "learning_rate": 3.4180460354880736e-05, "loss": 0.3315, "num_input_tokens_seen": 51798496, "step": 54220 }, { "epoch": 4.423280854882128, "grad_norm": 0.8968713283538818, "learning_rate": 3.41771496505202e-05, "loss": 0.3257, "num_input_tokens_seen": 51803440, "step": 54225 }, { "epoch": 4.423688718492536, "grad_norm": 1.01632821559906, "learning_rate": 3.417383876014274e-05, "loss": 0.3051, "num_input_tokens_seen": 51808000, "step": 54230 }, { "epoch": 4.424096582102945, "grad_norm": 0.5697697401046753, "learning_rate": 3.417052768381547e-05, "loss": 0.3435, "num_input_tokens_seen": 51813104, "step": 54235 }, { "epoch": 4.424504445713353, "grad_norm": 0.4779079258441925, "learning_rate": 3.41672164216055e-05, "loss": 0.2459, "num_input_tokens_seen": 51818080, "step": 54240 }, { "epoch": 4.424912309323762, "grad_norm": 0.4280150234699249, "learning_rate": 3.416390497357995e-05, "loss": 0.3505, "num_input_tokens_seen": 51823232, "step": 54245 }, { "epoch": 4.425320172934171, "grad_norm": 0.634125292301178, "learning_rate": 3.416059333980594e-05, "loss": 0.3745, "num_input_tokens_seen": 51827952, "step": 54250 }, { "epoch": 4.425728036544579, "grad_norm": 1.275444746017456, "learning_rate": 3.4157281520350595e-05, "loss": 0.4365, "num_input_tokens_seen": 51832736, "step": 54255 }, { "epoch": 4.426135900154988, "grad_norm": 0.6128066182136536, "learning_rate": 3.415396951528105e-05, "loss": 0.2761, "num_input_tokens_seen": 51837616, "step": 54260 }, { "epoch": 4.426543763765397, "grad_norm": 0.5041423439979553, "learning_rate": 3.415065732466442e-05, "loss": 0.373, "num_input_tokens_seen": 51842160, "step": 54265 }, { "epoch": 4.426951627375805, "grad_norm": 0.9010701179504395, "learning_rate": 3.414734494856786e-05, "loss": 0.3817, "num_input_tokens_seen": 51846464, "step": 54270 }, { "epoch": 4.427359490986214, "grad_norm": 0.7893185615539551, "learning_rate": 3.4144032387058506e-05, "loss": 0.3505, "num_input_tokens_seen": 51852032, "step": 54275 }, { "epoch": 4.4277673545966225, "grad_norm": 0.5615320205688477, "learning_rate": 3.41407196402035e-05, "loss": 0.3189, "num_input_tokens_seen": 51856720, "step": 54280 }, { "epoch": 4.4281752182070315, "grad_norm": 0.8451802730560303, "learning_rate": 3.413740670806999e-05, "loss": 0.3319, "num_input_tokens_seen": 51862208, "step": 54285 }, { "epoch": 4.4285830818174405, "grad_norm": 1.083720088005066, "learning_rate": 3.413409359072514e-05, "loss": 0.3717, "num_input_tokens_seen": 51866832, "step": 54290 }, { "epoch": 4.428990945427849, "grad_norm": 0.9945074319839478, "learning_rate": 3.4130780288236076e-05, "loss": 0.3473, "num_input_tokens_seen": 51870640, "step": 54295 }, { "epoch": 4.429398809038258, "grad_norm": 0.9995909929275513, "learning_rate": 3.4127466800669984e-05, "loss": 0.3411, "num_input_tokens_seen": 51875808, "step": 54300 }, { "epoch": 4.429806672648667, "grad_norm": 0.8713817596435547, "learning_rate": 3.412415312809402e-05, "loss": 0.3282, "num_input_tokens_seen": 51879744, "step": 54305 }, { "epoch": 4.430214536259075, "grad_norm": 0.8150754570960999, "learning_rate": 3.4120839270575344e-05, "loss": 0.3476, "num_input_tokens_seen": 51884448, "step": 54310 }, { "epoch": 4.430622399869484, "grad_norm": 1.0297385454177856, "learning_rate": 3.4117525228181143e-05, "loss": 0.3198, "num_input_tokens_seen": 51889344, "step": 54315 }, { "epoch": 4.431030263479892, "grad_norm": 0.660629153251648, "learning_rate": 3.411421100097857e-05, "loss": 0.3744, "num_input_tokens_seen": 51894144, "step": 54320 }, { "epoch": 4.431438127090301, "grad_norm": 0.47331106662750244, "learning_rate": 3.411089658903482e-05, "loss": 0.3053, "num_input_tokens_seen": 51898640, "step": 54325 }, { "epoch": 4.43184599070071, "grad_norm": 0.573848307132721, "learning_rate": 3.410758199241706e-05, "loss": 0.2786, "num_input_tokens_seen": 51903280, "step": 54330 }, { "epoch": 4.432253854311118, "grad_norm": 0.7429041862487793, "learning_rate": 3.410426721119249e-05, "loss": 0.3162, "num_input_tokens_seen": 51908080, "step": 54335 }, { "epoch": 4.432661717921527, "grad_norm": 0.5209428668022156, "learning_rate": 3.410095224542828e-05, "loss": 0.3913, "num_input_tokens_seen": 51913120, "step": 54340 }, { "epoch": 4.433069581531936, "grad_norm": 0.5345248579978943, "learning_rate": 3.409763709519165e-05, "loss": 0.4514, "num_input_tokens_seen": 51918704, "step": 54345 }, { "epoch": 4.433477445142344, "grad_norm": 0.9865813255310059, "learning_rate": 3.4094321760549766e-05, "loss": 0.3758, "num_input_tokens_seen": 51923792, "step": 54350 }, { "epoch": 4.433885308752753, "grad_norm": 0.6099672913551331, "learning_rate": 3.409100624156986e-05, "loss": 0.3226, "num_input_tokens_seen": 51929472, "step": 54355 }, { "epoch": 4.434293172363162, "grad_norm": 0.43225324153900146, "learning_rate": 3.408769053831912e-05, "loss": 0.3341, "num_input_tokens_seen": 51935280, "step": 54360 }, { "epoch": 4.43470103597357, "grad_norm": 0.6626643538475037, "learning_rate": 3.408437465086474e-05, "loss": 0.3513, "num_input_tokens_seen": 51940304, "step": 54365 }, { "epoch": 4.435108899583979, "grad_norm": 1.0319007635116577, "learning_rate": 3.4081058579273954e-05, "loss": 0.3626, "num_input_tokens_seen": 51945552, "step": 54370 }, { "epoch": 4.4355167631943875, "grad_norm": 0.7595497965812683, "learning_rate": 3.407774232361397e-05, "loss": 0.3437, "num_input_tokens_seen": 51950528, "step": 54375 }, { "epoch": 4.4359246268047965, "grad_norm": 0.3331030011177063, "learning_rate": 3.407442588395201e-05, "loss": 0.3657, "num_input_tokens_seen": 51955568, "step": 54380 }, { "epoch": 4.4363324904152055, "grad_norm": 0.8432842493057251, "learning_rate": 3.4071109260355286e-05, "loss": 0.3307, "num_input_tokens_seen": 51961120, "step": 54385 }, { "epoch": 4.436740354025614, "grad_norm": 0.9058042168617249, "learning_rate": 3.406779245289104e-05, "loss": 0.3576, "num_input_tokens_seen": 51966176, "step": 54390 }, { "epoch": 4.437148217636023, "grad_norm": 0.9037587642669678, "learning_rate": 3.406447546162649e-05, "loss": 0.352, "num_input_tokens_seen": 51971104, "step": 54395 }, { "epoch": 4.437556081246431, "grad_norm": 0.6034300923347473, "learning_rate": 3.406115828662887e-05, "loss": 0.3579, "num_input_tokens_seen": 51975184, "step": 54400 }, { "epoch": 4.43796394485684, "grad_norm": 1.0007612705230713, "learning_rate": 3.4057840927965435e-05, "loss": 0.3596, "num_input_tokens_seen": 51979904, "step": 54405 }, { "epoch": 4.438371808467249, "grad_norm": 0.6979630589485168, "learning_rate": 3.40545233857034e-05, "loss": 0.3165, "num_input_tokens_seen": 51984528, "step": 54410 }, { "epoch": 4.438779672077657, "grad_norm": 0.3487929403781891, "learning_rate": 3.4051205659910024e-05, "loss": 0.3561, "num_input_tokens_seen": 51988656, "step": 54415 }, { "epoch": 4.439187535688066, "grad_norm": 0.6169028878211975, "learning_rate": 3.404788775065256e-05, "loss": 0.33, "num_input_tokens_seen": 51992976, "step": 54420 }, { "epoch": 4.439595399298475, "grad_norm": 0.9186736345291138, "learning_rate": 3.404456965799826e-05, "loss": 0.3411, "num_input_tokens_seen": 51997840, "step": 54425 }, { "epoch": 4.440003262908883, "grad_norm": 0.7348781228065491, "learning_rate": 3.404125138201436e-05, "loss": 0.3409, "num_input_tokens_seen": 52002592, "step": 54430 }, { "epoch": 4.440411126519292, "grad_norm": 0.7699544429779053, "learning_rate": 3.403793292276815e-05, "loss": 0.3532, "num_input_tokens_seen": 52006832, "step": 54435 }, { "epoch": 4.440818990129701, "grad_norm": 0.7071421146392822, "learning_rate": 3.4034614280326875e-05, "loss": 0.3213, "num_input_tokens_seen": 52012064, "step": 54440 }, { "epoch": 4.441226853740109, "grad_norm": 0.9698227643966675, "learning_rate": 3.4031295454757814e-05, "loss": 0.4105, "num_input_tokens_seen": 52017216, "step": 54445 }, { "epoch": 4.441634717350518, "grad_norm": 0.3259536921977997, "learning_rate": 3.402797644612823e-05, "loss": 0.3649, "num_input_tokens_seen": 52021824, "step": 54450 }, { "epoch": 4.442042580960926, "grad_norm": 0.7480854392051697, "learning_rate": 3.4024657254505396e-05, "loss": 0.341, "num_input_tokens_seen": 52027184, "step": 54455 }, { "epoch": 4.442450444571335, "grad_norm": 0.5819778442382812, "learning_rate": 3.4021337879956596e-05, "loss": 0.3471, "num_input_tokens_seen": 52031632, "step": 54460 }, { "epoch": 4.442858308181744, "grad_norm": 0.9084956645965576, "learning_rate": 3.401801832254911e-05, "loss": 0.3332, "num_input_tokens_seen": 52036720, "step": 54465 }, { "epoch": 4.443266171792152, "grad_norm": 0.7817802429199219, "learning_rate": 3.401469858235023e-05, "loss": 0.3539, "num_input_tokens_seen": 52041504, "step": 54470 }, { "epoch": 4.4436740354025615, "grad_norm": 1.0039677619934082, "learning_rate": 3.401137865942723e-05, "loss": 0.3538, "num_input_tokens_seen": 52045824, "step": 54475 }, { "epoch": 4.4440818990129705, "grad_norm": 0.3237136900424957, "learning_rate": 3.400805855384742e-05, "loss": 0.3209, "num_input_tokens_seen": 52050160, "step": 54480 }, { "epoch": 4.444489762623379, "grad_norm": 0.6702341437339783, "learning_rate": 3.4004738265678085e-05, "loss": 0.3351, "num_input_tokens_seen": 52054624, "step": 54485 }, { "epoch": 4.444897626233788, "grad_norm": 0.4170234501361847, "learning_rate": 3.400141779498654e-05, "loss": 0.3389, "num_input_tokens_seen": 52059024, "step": 54490 }, { "epoch": 4.445305489844196, "grad_norm": 0.8629714846611023, "learning_rate": 3.399809714184007e-05, "loss": 0.3749, "num_input_tokens_seen": 52063520, "step": 54495 }, { "epoch": 4.445713353454605, "grad_norm": 0.4623579680919647, "learning_rate": 3.399477630630601e-05, "loss": 0.3537, "num_input_tokens_seen": 52068000, "step": 54500 }, { "epoch": 4.446121217065014, "grad_norm": 0.7960073947906494, "learning_rate": 3.3991455288451654e-05, "loss": 0.3374, "num_input_tokens_seen": 52073312, "step": 54505 }, { "epoch": 4.446529080675422, "grad_norm": 0.5431888699531555, "learning_rate": 3.398813408834431e-05, "loss": 0.3634, "num_input_tokens_seen": 52077520, "step": 54510 }, { "epoch": 4.446936944285831, "grad_norm": 0.9028751254081726, "learning_rate": 3.398481270605131e-05, "loss": 0.3621, "num_input_tokens_seen": 52082160, "step": 54515 }, { "epoch": 4.44734480789624, "grad_norm": 0.7608041763305664, "learning_rate": 3.398149114163998e-05, "loss": 0.3339, "num_input_tokens_seen": 52087392, "step": 54520 }, { "epoch": 4.447752671506648, "grad_norm": 1.004949688911438, "learning_rate": 3.397816939517763e-05, "loss": 0.3401, "num_input_tokens_seen": 52092032, "step": 54525 }, { "epoch": 4.448160535117057, "grad_norm": 0.181358203291893, "learning_rate": 3.3974847466731614e-05, "loss": 0.3246, "num_input_tokens_seen": 52096752, "step": 54530 }, { "epoch": 4.448568398727465, "grad_norm": 1.0590308904647827, "learning_rate": 3.397152535636925e-05, "loss": 0.391, "num_input_tokens_seen": 52101664, "step": 54535 }, { "epoch": 4.448976262337874, "grad_norm": 0.5816561579704285, "learning_rate": 3.396820306415788e-05, "loss": 0.3231, "num_input_tokens_seen": 52105664, "step": 54540 }, { "epoch": 4.449384125948283, "grad_norm": 0.4236285388469696, "learning_rate": 3.3964880590164835e-05, "loss": 0.3468, "num_input_tokens_seen": 52110096, "step": 54545 }, { "epoch": 4.449791989558691, "grad_norm": 0.5171403288841248, "learning_rate": 3.396155793445748e-05, "loss": 0.3595, "num_input_tokens_seen": 52115136, "step": 54550 }, { "epoch": 4.4501998531691, "grad_norm": 0.8479436039924622, "learning_rate": 3.395823509710315e-05, "loss": 0.302, "num_input_tokens_seen": 52120304, "step": 54555 }, { "epoch": 4.450607716779509, "grad_norm": 0.331620991230011, "learning_rate": 3.39549120781692e-05, "loss": 0.3904, "num_input_tokens_seen": 52125344, "step": 54560 }, { "epoch": 4.451015580389917, "grad_norm": 0.41969743371009827, "learning_rate": 3.3951588877722986e-05, "loss": 0.2952, "num_input_tokens_seen": 52130016, "step": 54565 }, { "epoch": 4.451423444000326, "grad_norm": 0.44603830575942993, "learning_rate": 3.394826549583188e-05, "loss": 0.3114, "num_input_tokens_seen": 52134960, "step": 54570 }, { "epoch": 4.451831307610735, "grad_norm": 0.41576912999153137, "learning_rate": 3.3944941932563224e-05, "loss": 0.3388, "num_input_tokens_seen": 52139248, "step": 54575 }, { "epoch": 4.4522391712211435, "grad_norm": 0.8845343589782715, "learning_rate": 3.3941618187984403e-05, "loss": 0.3406, "num_input_tokens_seen": 52144368, "step": 54580 }, { "epoch": 4.4526470348315526, "grad_norm": 0.25454896688461304, "learning_rate": 3.393829426216277e-05, "loss": 0.362, "num_input_tokens_seen": 52148976, "step": 54585 }, { "epoch": 4.453054898441961, "grad_norm": 0.7816449999809265, "learning_rate": 3.393497015516572e-05, "loss": 0.3262, "num_input_tokens_seen": 52153424, "step": 54590 }, { "epoch": 4.45346276205237, "grad_norm": 0.8656225800514221, "learning_rate": 3.393164586706062e-05, "loss": 0.368, "num_input_tokens_seen": 52158560, "step": 54595 }, { "epoch": 4.453870625662779, "grad_norm": 0.9917592406272888, "learning_rate": 3.392832139791485e-05, "loss": 0.3664, "num_input_tokens_seen": 52163008, "step": 54600 }, { "epoch": 4.454278489273187, "grad_norm": 1.0904546976089478, "learning_rate": 3.39249967477958e-05, "loss": 0.3136, "num_input_tokens_seen": 52167552, "step": 54605 }, { "epoch": 4.454686352883596, "grad_norm": 0.6967620253562927, "learning_rate": 3.392167191677087e-05, "loss": 0.3525, "num_input_tokens_seen": 52171488, "step": 54610 }, { "epoch": 4.455094216494004, "grad_norm": 1.1602691411972046, "learning_rate": 3.3918346904907425e-05, "loss": 0.3694, "num_input_tokens_seen": 52176672, "step": 54615 }, { "epoch": 4.455502080104413, "grad_norm": 0.6452620625495911, "learning_rate": 3.3915021712272886e-05, "loss": 0.3631, "num_input_tokens_seen": 52182064, "step": 54620 }, { "epoch": 4.455909943714822, "grad_norm": 0.6091495752334595, "learning_rate": 3.391169633893465e-05, "loss": 0.3185, "num_input_tokens_seen": 52186960, "step": 54625 }, { "epoch": 4.45631780732523, "grad_norm": 0.6254968047142029, "learning_rate": 3.39083707849601e-05, "loss": 0.3624, "num_input_tokens_seen": 52191136, "step": 54630 }, { "epoch": 4.456725670935639, "grad_norm": 0.4892941415309906, "learning_rate": 3.390504505041667e-05, "loss": 0.3211, "num_input_tokens_seen": 52195104, "step": 54635 }, { "epoch": 4.457133534546048, "grad_norm": 0.6560854911804199, "learning_rate": 3.3901719135371755e-05, "loss": 0.3432, "num_input_tokens_seen": 52199488, "step": 54640 }, { "epoch": 4.457541398156456, "grad_norm": 1.0931593179702759, "learning_rate": 3.389839303989279e-05, "loss": 0.3267, "num_input_tokens_seen": 52204512, "step": 54645 }, { "epoch": 4.457949261766865, "grad_norm": 0.7481072545051575, "learning_rate": 3.3895066764047165e-05, "loss": 0.3285, "num_input_tokens_seen": 52209776, "step": 54650 }, { "epoch": 4.458357125377274, "grad_norm": 1.2175934314727783, "learning_rate": 3.389174030790232e-05, "loss": 0.3166, "num_input_tokens_seen": 52215072, "step": 54655 }, { "epoch": 4.458764988987682, "grad_norm": 0.5926437377929688, "learning_rate": 3.388841367152568e-05, "loss": 0.3062, "num_input_tokens_seen": 52219440, "step": 54660 }, { "epoch": 4.459172852598091, "grad_norm": 0.7837279438972473, "learning_rate": 3.388508685498467e-05, "loss": 0.3062, "num_input_tokens_seen": 52223872, "step": 54665 }, { "epoch": 4.4595807162084995, "grad_norm": 0.6211020946502686, "learning_rate": 3.388175985834673e-05, "loss": 0.279, "num_input_tokens_seen": 52228960, "step": 54670 }, { "epoch": 4.4599885798189085, "grad_norm": 0.9296433925628662, "learning_rate": 3.387843268167929e-05, "loss": 0.2266, "num_input_tokens_seen": 52233584, "step": 54675 }, { "epoch": 4.4603964434293175, "grad_norm": 0.6266736388206482, "learning_rate": 3.3875105325049785e-05, "loss": 0.4489, "num_input_tokens_seen": 52237888, "step": 54680 }, { "epoch": 4.460804307039726, "grad_norm": 0.4576972424983978, "learning_rate": 3.3871777788525665e-05, "loss": 0.3904, "num_input_tokens_seen": 52241808, "step": 54685 }, { "epoch": 4.461212170650135, "grad_norm": 0.5441585779190063, "learning_rate": 3.386845007217439e-05, "loss": 0.4169, "num_input_tokens_seen": 52245904, "step": 54690 }, { "epoch": 4.461620034260544, "grad_norm": 0.37049147486686707, "learning_rate": 3.386512217606339e-05, "loss": 0.357, "num_input_tokens_seen": 52250368, "step": 54695 }, { "epoch": 4.462027897870952, "grad_norm": 0.686682403087616, "learning_rate": 3.386179410026014e-05, "loss": 0.3637, "num_input_tokens_seen": 52254528, "step": 54700 }, { "epoch": 4.462435761481361, "grad_norm": 0.4694463014602661, "learning_rate": 3.385846584483208e-05, "loss": 0.3708, "num_input_tokens_seen": 52259072, "step": 54705 }, { "epoch": 4.462843625091769, "grad_norm": 0.39309361577033997, "learning_rate": 3.3855137409846685e-05, "loss": 0.3267, "num_input_tokens_seen": 52264480, "step": 54710 }, { "epoch": 4.463251488702178, "grad_norm": 0.7126986384391785, "learning_rate": 3.385180879537143e-05, "loss": 0.3357, "num_input_tokens_seen": 52269248, "step": 54715 }, { "epoch": 4.463659352312587, "grad_norm": 0.6760274171829224, "learning_rate": 3.3848480001473754e-05, "loss": 0.3732, "num_input_tokens_seen": 52273968, "step": 54720 }, { "epoch": 4.464067215922995, "grad_norm": 1.0851144790649414, "learning_rate": 3.384515102822116e-05, "loss": 0.3364, "num_input_tokens_seen": 52278112, "step": 54725 }, { "epoch": 4.464475079533404, "grad_norm": 0.6865426898002625, "learning_rate": 3.384182187568111e-05, "loss": 0.3202, "num_input_tokens_seen": 52282784, "step": 54730 }, { "epoch": 4.464882943143813, "grad_norm": 0.37206023931503296, "learning_rate": 3.3838492543921085e-05, "loss": 0.3289, "num_input_tokens_seen": 52287408, "step": 54735 }, { "epoch": 4.465290806754221, "grad_norm": 0.5816906094551086, "learning_rate": 3.3835163033008565e-05, "loss": 0.34, "num_input_tokens_seen": 52292256, "step": 54740 }, { "epoch": 4.46569867036463, "grad_norm": 0.762450635433197, "learning_rate": 3.3831833343011054e-05, "loss": 0.3265, "num_input_tokens_seen": 52296288, "step": 54745 }, { "epoch": 4.466106533975038, "grad_norm": 0.6060999035835266, "learning_rate": 3.382850347399603e-05, "loss": 0.3222, "num_input_tokens_seen": 52301040, "step": 54750 }, { "epoch": 4.466514397585447, "grad_norm": 0.8262760043144226, "learning_rate": 3.3825173426031e-05, "loss": 0.3458, "num_input_tokens_seen": 52304480, "step": 54755 }, { "epoch": 4.466922261195856, "grad_norm": 0.2992265224456787, "learning_rate": 3.382184319918345e-05, "loss": 0.396, "num_input_tokens_seen": 52308944, "step": 54760 }, { "epoch": 4.4673301248062645, "grad_norm": 0.4163741171360016, "learning_rate": 3.381851279352088e-05, "loss": 0.3266, "num_input_tokens_seen": 52312752, "step": 54765 }, { "epoch": 4.4677379884166735, "grad_norm": 1.080371618270874, "learning_rate": 3.381518220911081e-05, "loss": 0.3559, "num_input_tokens_seen": 52318032, "step": 54770 }, { "epoch": 4.4681458520270825, "grad_norm": 0.749705970287323, "learning_rate": 3.381185144602074e-05, "loss": 0.366, "num_input_tokens_seen": 52322944, "step": 54775 }, { "epoch": 4.468553715637491, "grad_norm": 0.6875787377357483, "learning_rate": 3.3808520504318194e-05, "loss": 0.3457, "num_input_tokens_seen": 52328624, "step": 54780 }, { "epoch": 4.4689615792479, "grad_norm": 0.7594531178474426, "learning_rate": 3.380518938407067e-05, "loss": 0.3435, "num_input_tokens_seen": 52333216, "step": 54785 }, { "epoch": 4.469369442858309, "grad_norm": 0.5137636065483093, "learning_rate": 3.38018580853457e-05, "loss": 0.3009, "num_input_tokens_seen": 52338064, "step": 54790 }, { "epoch": 4.469777306468717, "grad_norm": 0.625868022441864, "learning_rate": 3.379852660821081e-05, "loss": 0.3782, "num_input_tokens_seen": 52343168, "step": 54795 }, { "epoch": 4.470185170079126, "grad_norm": 0.9127179384231567, "learning_rate": 3.3795194952733526e-05, "loss": 0.3721, "num_input_tokens_seen": 52347840, "step": 54800 }, { "epoch": 4.470593033689534, "grad_norm": 0.6121170520782471, "learning_rate": 3.379186311898138e-05, "loss": 0.33, "num_input_tokens_seen": 52353216, "step": 54805 }, { "epoch": 4.471000897299943, "grad_norm": 0.9571378231048584, "learning_rate": 3.378853110702189e-05, "loss": 0.3791, "num_input_tokens_seen": 52357536, "step": 54810 }, { "epoch": 4.471408760910352, "grad_norm": 0.8955357670783997, "learning_rate": 3.378519891692262e-05, "loss": 0.3181, "num_input_tokens_seen": 52362336, "step": 54815 }, { "epoch": 4.47181662452076, "grad_norm": 0.5832148790359497, "learning_rate": 3.3781866548751104e-05, "loss": 0.3322, "num_input_tokens_seen": 52367008, "step": 54820 }, { "epoch": 4.472224488131169, "grad_norm": 0.7694835066795349, "learning_rate": 3.377853400257488e-05, "loss": 0.3334, "num_input_tokens_seen": 52372272, "step": 54825 }, { "epoch": 4.472632351741578, "grad_norm": 0.5558803677558899, "learning_rate": 3.3775201278461496e-05, "loss": 0.3527, "num_input_tokens_seen": 52376496, "step": 54830 }, { "epoch": 4.473040215351986, "grad_norm": 0.9657886624336243, "learning_rate": 3.377186837647852e-05, "loss": 0.3522, "num_input_tokens_seen": 52381344, "step": 54835 }, { "epoch": 4.473448078962395, "grad_norm": 0.645993173122406, "learning_rate": 3.376853529669349e-05, "loss": 0.3433, "num_input_tokens_seen": 52386128, "step": 54840 }, { "epoch": 4.473855942572803, "grad_norm": 0.4221992790699005, "learning_rate": 3.3765202039173985e-05, "loss": 0.3384, "num_input_tokens_seen": 52390624, "step": 54845 }, { "epoch": 4.474263806183212, "grad_norm": 0.7956132292747498, "learning_rate": 3.3761868603987555e-05, "loss": 0.3812, "num_input_tokens_seen": 52394848, "step": 54850 }, { "epoch": 4.474671669793621, "grad_norm": 1.5395652055740356, "learning_rate": 3.3758534991201775e-05, "loss": 0.3923, "num_input_tokens_seen": 52398704, "step": 54855 }, { "epoch": 4.475079533404029, "grad_norm": 0.4225800037384033, "learning_rate": 3.375520120088421e-05, "loss": 0.3619, "num_input_tokens_seen": 52403824, "step": 54860 }, { "epoch": 4.475487397014438, "grad_norm": 0.5624234080314636, "learning_rate": 3.375186723310244e-05, "loss": 0.3829, "num_input_tokens_seen": 52408688, "step": 54865 }, { "epoch": 4.475895260624847, "grad_norm": 0.47849997878074646, "learning_rate": 3.374853308792403e-05, "loss": 0.3245, "num_input_tokens_seen": 52412992, "step": 54870 }, { "epoch": 4.476303124235256, "grad_norm": 1.2925124168395996, "learning_rate": 3.3745198765416584e-05, "loss": 0.4009, "num_input_tokens_seen": 52418080, "step": 54875 }, { "epoch": 4.476710987845665, "grad_norm": 0.7237902879714966, "learning_rate": 3.374186426564767e-05, "loss": 0.36, "num_input_tokens_seen": 52423312, "step": 54880 }, { "epoch": 4.477118851456073, "grad_norm": 0.28535357117652893, "learning_rate": 3.3738529588684885e-05, "loss": 0.3493, "num_input_tokens_seen": 52426736, "step": 54885 }, { "epoch": 4.477526715066482, "grad_norm": 0.24111820757389069, "learning_rate": 3.3735194734595814e-05, "loss": 0.3522, "num_input_tokens_seen": 52431728, "step": 54890 }, { "epoch": 4.477934578676891, "grad_norm": 0.6191524267196655, "learning_rate": 3.373185970344805e-05, "loss": 0.3652, "num_input_tokens_seen": 52436096, "step": 54895 }, { "epoch": 4.478342442287299, "grad_norm": 0.774926483631134, "learning_rate": 3.372852449530922e-05, "loss": 0.3311, "num_input_tokens_seen": 52440608, "step": 54900 }, { "epoch": 4.478750305897708, "grad_norm": 0.8211487531661987, "learning_rate": 3.372518911024689e-05, "loss": 0.3728, "num_input_tokens_seen": 52446016, "step": 54905 }, { "epoch": 4.479158169508117, "grad_norm": 0.5935454368591309, "learning_rate": 3.3721853548328694e-05, "loss": 0.3311, "num_input_tokens_seen": 52451168, "step": 54910 }, { "epoch": 4.479566033118525, "grad_norm": 0.5086121559143066, "learning_rate": 3.371851780962223e-05, "loss": 0.3171, "num_input_tokens_seen": 52456112, "step": 54915 }, { "epoch": 4.479973896728934, "grad_norm": 0.6669679880142212, "learning_rate": 3.371518189419511e-05, "loss": 0.2757, "num_input_tokens_seen": 52461088, "step": 54920 }, { "epoch": 4.480381760339343, "grad_norm": 0.38606488704681396, "learning_rate": 3.371184580211497e-05, "loss": 0.2965, "num_input_tokens_seen": 52465712, "step": 54925 }, { "epoch": 4.480789623949751, "grad_norm": 1.0855205059051514, "learning_rate": 3.37085095334494e-05, "loss": 0.404, "num_input_tokens_seen": 52470816, "step": 54930 }, { "epoch": 4.48119748756016, "grad_norm": 0.7675104141235352, "learning_rate": 3.3705173088266054e-05, "loss": 0.3857, "num_input_tokens_seen": 52475312, "step": 54935 }, { "epoch": 4.481605351170568, "grad_norm": 0.4245603382587433, "learning_rate": 3.3701836466632546e-05, "loss": 0.2761, "num_input_tokens_seen": 52480064, "step": 54940 }, { "epoch": 4.482013214780977, "grad_norm": 0.762493371963501, "learning_rate": 3.369849966861651e-05, "loss": 0.3022, "num_input_tokens_seen": 52484912, "step": 54945 }, { "epoch": 4.482421078391386, "grad_norm": 1.1096339225769043, "learning_rate": 3.3695162694285576e-05, "loss": 0.3484, "num_input_tokens_seen": 52489600, "step": 54950 }, { "epoch": 4.482828942001794, "grad_norm": 0.7515203356742859, "learning_rate": 3.36918255437074e-05, "loss": 0.3069, "num_input_tokens_seen": 52493536, "step": 54955 }, { "epoch": 4.483236805612203, "grad_norm": 0.6950993537902832, "learning_rate": 3.368848821694961e-05, "loss": 0.2858, "num_input_tokens_seen": 52497872, "step": 54960 }, { "epoch": 4.4836446692226115, "grad_norm": 1.0207360982894897, "learning_rate": 3.368515071407985e-05, "loss": 0.3658, "num_input_tokens_seen": 52502080, "step": 54965 }, { "epoch": 4.4840525328330205, "grad_norm": 0.9779051542282104, "learning_rate": 3.368181303516578e-05, "loss": 0.3245, "num_input_tokens_seen": 52505968, "step": 54970 }, { "epoch": 4.4844603964434295, "grad_norm": 0.676023006439209, "learning_rate": 3.367847518027505e-05, "loss": 0.4251, "num_input_tokens_seen": 52510496, "step": 54975 }, { "epoch": 4.484868260053838, "grad_norm": 1.153961420059204, "learning_rate": 3.367513714947531e-05, "loss": 0.3424, "num_input_tokens_seen": 52514816, "step": 54980 }, { "epoch": 4.485276123664247, "grad_norm": 1.097447395324707, "learning_rate": 3.367179894283423e-05, "loss": 0.3495, "num_input_tokens_seen": 52519152, "step": 54985 }, { "epoch": 4.485683987274656, "grad_norm": 0.9884364604949951, "learning_rate": 3.366846056041947e-05, "loss": 0.3348, "num_input_tokens_seen": 52524080, "step": 54990 }, { "epoch": 4.486091850885064, "grad_norm": 0.9770086407661438, "learning_rate": 3.3665122002298696e-05, "loss": 0.3423, "num_input_tokens_seen": 52528912, "step": 54995 }, { "epoch": 4.486499714495473, "grad_norm": 1.0185761451721191, "learning_rate": 3.366178326853959e-05, "loss": 0.3685, "num_input_tokens_seen": 52533168, "step": 55000 }, { "epoch": 4.486907578105882, "grad_norm": 0.7468409538269043, "learning_rate": 3.36584443592098e-05, "loss": 0.3551, "num_input_tokens_seen": 52537360, "step": 55005 }, { "epoch": 4.48731544171629, "grad_norm": 0.9050847887992859, "learning_rate": 3.365510527437704e-05, "loss": 0.297, "num_input_tokens_seen": 52541808, "step": 55010 }, { "epoch": 4.487723305326699, "grad_norm": 0.9279364943504333, "learning_rate": 3.3651766014108976e-05, "loss": 0.3601, "num_input_tokens_seen": 52546160, "step": 55015 }, { "epoch": 4.488131168937107, "grad_norm": 1.0386900901794434, "learning_rate": 3.3648426578473276e-05, "loss": 0.3967, "num_input_tokens_seen": 52549984, "step": 55020 }, { "epoch": 4.488539032547516, "grad_norm": 0.7982625365257263, "learning_rate": 3.364508696753765e-05, "loss": 0.3389, "num_input_tokens_seen": 52554528, "step": 55025 }, { "epoch": 4.488946896157925, "grad_norm": 0.4113534390926361, "learning_rate": 3.364174718136979e-05, "loss": 0.2998, "num_input_tokens_seen": 52558928, "step": 55030 }, { "epoch": 4.489354759768333, "grad_norm": 0.9058850407600403, "learning_rate": 3.363840722003738e-05, "loss": 0.2888, "num_input_tokens_seen": 52564240, "step": 55035 }, { "epoch": 4.489762623378742, "grad_norm": 0.9737362265586853, "learning_rate": 3.3635067083608124e-05, "loss": 0.3509, "num_input_tokens_seen": 52569216, "step": 55040 }, { "epoch": 4.490170486989151, "grad_norm": 0.9181092381477356, "learning_rate": 3.3631726772149735e-05, "loss": 0.345, "num_input_tokens_seen": 52574576, "step": 55045 }, { "epoch": 4.490578350599559, "grad_norm": 1.3170342445373535, "learning_rate": 3.362838628572991e-05, "loss": 0.329, "num_input_tokens_seen": 52578464, "step": 55050 }, { "epoch": 4.490986214209968, "grad_norm": 0.8065999746322632, "learning_rate": 3.362504562441636e-05, "loss": 0.3294, "num_input_tokens_seen": 52583920, "step": 55055 }, { "epoch": 4.4913940778203765, "grad_norm": 1.0793070793151855, "learning_rate": 3.36217047882768e-05, "loss": 0.3361, "num_input_tokens_seen": 52588112, "step": 55060 }, { "epoch": 4.4918019414307855, "grad_norm": 1.4868292808532715, "learning_rate": 3.361836377737895e-05, "loss": 0.3913, "num_input_tokens_seen": 52593472, "step": 55065 }, { "epoch": 4.4922098050411945, "grad_norm": 1.045453429222107, "learning_rate": 3.361502259179052e-05, "loss": 0.3696, "num_input_tokens_seen": 52598464, "step": 55070 }, { "epoch": 4.492617668651603, "grad_norm": 0.8565677404403687, "learning_rate": 3.361168123157925e-05, "loss": 0.3334, "num_input_tokens_seen": 52602832, "step": 55075 }, { "epoch": 4.493025532262012, "grad_norm": 0.9895622134208679, "learning_rate": 3.360833969681286e-05, "loss": 0.3603, "num_input_tokens_seen": 52607376, "step": 55080 }, { "epoch": 4.493433395872421, "grad_norm": 0.8213149905204773, "learning_rate": 3.360499798755908e-05, "loss": 0.3337, "num_input_tokens_seen": 52612096, "step": 55085 }, { "epoch": 4.493841259482829, "grad_norm": 1.4089423418045044, "learning_rate": 3.360165610388566e-05, "loss": 0.3585, "num_input_tokens_seen": 52616624, "step": 55090 }, { "epoch": 4.494249123093238, "grad_norm": 1.4739431142807007, "learning_rate": 3.359831404586031e-05, "loss": 0.351, "num_input_tokens_seen": 52620736, "step": 55095 }, { "epoch": 4.494656986703646, "grad_norm": 0.965246319770813, "learning_rate": 3.359497181355079e-05, "loss": 0.3307, "num_input_tokens_seen": 52624784, "step": 55100 }, { "epoch": 4.495064850314055, "grad_norm": 1.3416907787322998, "learning_rate": 3.359162940702485e-05, "loss": 0.3775, "num_input_tokens_seen": 52629760, "step": 55105 }, { "epoch": 4.495472713924464, "grad_norm": 1.5238404273986816, "learning_rate": 3.358828682635023e-05, "loss": 0.368, "num_input_tokens_seen": 52634432, "step": 55110 }, { "epoch": 4.495880577534872, "grad_norm": 0.6767469644546509, "learning_rate": 3.3584944071594685e-05, "loss": 0.3399, "num_input_tokens_seen": 52638992, "step": 55115 }, { "epoch": 4.496288441145281, "grad_norm": 1.1368097066879272, "learning_rate": 3.358160114282597e-05, "loss": 0.3408, "num_input_tokens_seen": 52644208, "step": 55120 }, { "epoch": 4.49669630475569, "grad_norm": 1.8947579860687256, "learning_rate": 3.357825804011185e-05, "loss": 0.3218, "num_input_tokens_seen": 52648384, "step": 55125 }, { "epoch": 4.497104168366098, "grad_norm": 1.382155179977417, "learning_rate": 3.357491476352008e-05, "loss": 0.3323, "num_input_tokens_seen": 52653008, "step": 55130 }, { "epoch": 4.497512031976507, "grad_norm": 1.2764129638671875, "learning_rate": 3.357157131311843e-05, "loss": 0.3299, "num_input_tokens_seen": 52658720, "step": 55135 }, { "epoch": 4.497919895586916, "grad_norm": 0.7518261075019836, "learning_rate": 3.3568227688974684e-05, "loss": 0.3298, "num_input_tokens_seen": 52663856, "step": 55140 }, { "epoch": 4.498327759197324, "grad_norm": 1.4460413455963135, "learning_rate": 3.3564883891156596e-05, "loss": 0.3116, "num_input_tokens_seen": 52668592, "step": 55145 }, { "epoch": 4.498735622807733, "grad_norm": 1.4187730550765991, "learning_rate": 3.356153991973195e-05, "loss": 0.4049, "num_input_tokens_seen": 52673536, "step": 55150 }, { "epoch": 4.499143486418141, "grad_norm": 1.3276289701461792, "learning_rate": 3.355819577476853e-05, "loss": 0.3438, "num_input_tokens_seen": 52678304, "step": 55155 }, { "epoch": 4.4995513500285504, "grad_norm": 0.868790864944458, "learning_rate": 3.3554851456334113e-05, "loss": 0.3379, "num_input_tokens_seen": 52683152, "step": 55160 }, { "epoch": 4.4999592136389595, "grad_norm": 1.0492762327194214, "learning_rate": 3.35515069644965e-05, "loss": 0.3567, "num_input_tokens_seen": 52687968, "step": 55165 }, { "epoch": 4.500367077249368, "grad_norm": 0.6049374938011169, "learning_rate": 3.3548162299323474e-05, "loss": 0.3059, "num_input_tokens_seen": 52692880, "step": 55170 }, { "epoch": 4.500367077249368, "eval_loss": 0.3385254144668579, "eval_runtime": 570.9614, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 52692880, "step": 55170 }, { "epoch": 4.500774940859777, "grad_norm": 0.7357668280601501, "learning_rate": 3.3544817460882815e-05, "loss": 0.3287, "num_input_tokens_seen": 52697632, "step": 55175 }, { "epoch": 4.501182804470185, "grad_norm": 0.7026228308677673, "learning_rate": 3.3541472449242356e-05, "loss": 0.3514, "num_input_tokens_seen": 52702848, "step": 55180 }, { "epoch": 4.501590668080594, "grad_norm": 2.421462297439575, "learning_rate": 3.353812726446987e-05, "loss": 0.3469, "num_input_tokens_seen": 52707216, "step": 55185 }, { "epoch": 4.501998531691003, "grad_norm": 1.5811700820922852, "learning_rate": 3.353478190663317e-05, "loss": 0.3112, "num_input_tokens_seen": 52711088, "step": 55190 }, { "epoch": 4.502406395301411, "grad_norm": 1.2790238857269287, "learning_rate": 3.353143637580007e-05, "loss": 0.3226, "num_input_tokens_seen": 52715568, "step": 55195 }, { "epoch": 4.50281425891182, "grad_norm": 1.0492209196090698, "learning_rate": 3.3528090672038394e-05, "loss": 0.3562, "num_input_tokens_seen": 52720496, "step": 55200 }, { "epoch": 4.503222122522229, "grad_norm": 1.4521139860153198, "learning_rate": 3.352474479541593e-05, "loss": 0.3441, "num_input_tokens_seen": 52725552, "step": 55205 }, { "epoch": 4.503629986132637, "grad_norm": 0.9261218905448914, "learning_rate": 3.352139874600051e-05, "loss": 0.2933, "num_input_tokens_seen": 52731152, "step": 55210 }, { "epoch": 4.504037849743046, "grad_norm": 2.122044801712036, "learning_rate": 3.351805252385997e-05, "loss": 0.3002, "num_input_tokens_seen": 52735632, "step": 55215 }, { "epoch": 4.504445713353455, "grad_norm": 0.6881261467933655, "learning_rate": 3.3514706129062115e-05, "loss": 0.2759, "num_input_tokens_seen": 52740608, "step": 55220 }, { "epoch": 4.504853576963863, "grad_norm": 1.2316811084747314, "learning_rate": 3.3511359561674785e-05, "loss": 0.3083, "num_input_tokens_seen": 52744976, "step": 55225 }, { "epoch": 4.505261440574272, "grad_norm": 2.117168664932251, "learning_rate": 3.350801282176582e-05, "loss": 0.3481, "num_input_tokens_seen": 52749920, "step": 55230 }, { "epoch": 4.50566930418468, "grad_norm": 1.4416970014572144, "learning_rate": 3.350466590940304e-05, "loss": 0.3599, "num_input_tokens_seen": 52754544, "step": 55235 }, { "epoch": 4.506077167795089, "grad_norm": 2.4265713691711426, "learning_rate": 3.3501318824654296e-05, "loss": 0.3281, "num_input_tokens_seen": 52758384, "step": 55240 }, { "epoch": 4.506485031405498, "grad_norm": 1.5796009302139282, "learning_rate": 3.3497971567587444e-05, "loss": 0.3838, "num_input_tokens_seen": 52763360, "step": 55245 }, { "epoch": 4.506892895015906, "grad_norm": 0.9808875322341919, "learning_rate": 3.349462413827031e-05, "loss": 0.3271, "num_input_tokens_seen": 52767760, "step": 55250 }, { "epoch": 4.507300758626315, "grad_norm": 1.9844448566436768, "learning_rate": 3.349127653677075e-05, "loss": 0.3668, "num_input_tokens_seen": 52772496, "step": 55255 }, { "epoch": 4.507708622236724, "grad_norm": 1.2914029359817505, "learning_rate": 3.348792876315663e-05, "loss": 0.3176, "num_input_tokens_seen": 52777200, "step": 55260 }, { "epoch": 4.5081164858471325, "grad_norm": 0.9660171866416931, "learning_rate": 3.348458081749581e-05, "loss": 0.34, "num_input_tokens_seen": 52781296, "step": 55265 }, { "epoch": 4.5085243494575415, "grad_norm": 2.1506175994873047, "learning_rate": 3.348123269985613e-05, "loss": 0.3609, "num_input_tokens_seen": 52785136, "step": 55270 }, { "epoch": 4.508932213067951, "grad_norm": 1.4333475828170776, "learning_rate": 3.347788441030547e-05, "loss": 0.2816, "num_input_tokens_seen": 52789376, "step": 55275 }, { "epoch": 4.509340076678359, "grad_norm": 1.0802805423736572, "learning_rate": 3.3474535948911694e-05, "loss": 0.4074, "num_input_tokens_seen": 52793584, "step": 55280 }, { "epoch": 4.509747940288768, "grad_norm": 1.3579639196395874, "learning_rate": 3.347118731574268e-05, "loss": 0.3467, "num_input_tokens_seen": 52797888, "step": 55285 }, { "epoch": 4.510155803899176, "grad_norm": 0.86188143491745, "learning_rate": 3.3467838510866295e-05, "loss": 0.3447, "num_input_tokens_seen": 52802816, "step": 55290 }, { "epoch": 4.510563667509585, "grad_norm": 0.777700662612915, "learning_rate": 3.346448953435042e-05, "loss": 0.3001, "num_input_tokens_seen": 52807472, "step": 55295 }, { "epoch": 4.510971531119994, "grad_norm": 1.0734405517578125, "learning_rate": 3.346114038626295e-05, "loss": 0.3959, "num_input_tokens_seen": 52811792, "step": 55300 }, { "epoch": 4.511379394730402, "grad_norm": 0.7926437854766846, "learning_rate": 3.345779106667174e-05, "loss": 0.3523, "num_input_tokens_seen": 52816240, "step": 55305 }, { "epoch": 4.511787258340811, "grad_norm": 0.8870998024940491, "learning_rate": 3.345444157564471e-05, "loss": 0.3429, "num_input_tokens_seen": 52820544, "step": 55310 }, { "epoch": 4.512195121951219, "grad_norm": 0.8830315470695496, "learning_rate": 3.3451091913249746e-05, "loss": 0.3381, "num_input_tokens_seen": 52824576, "step": 55315 }, { "epoch": 4.512602985561628, "grad_norm": 0.9784783124923706, "learning_rate": 3.3447742079554734e-05, "loss": 0.3162, "num_input_tokens_seen": 52829872, "step": 55320 }, { "epoch": 4.513010849172037, "grad_norm": 3.811460256576538, "learning_rate": 3.344439207462758e-05, "loss": 0.3417, "num_input_tokens_seen": 52834624, "step": 55325 }, { "epoch": 4.513418712782445, "grad_norm": 1.897883415222168, "learning_rate": 3.344104189853618e-05, "loss": 0.4013, "num_input_tokens_seen": 52839344, "step": 55330 }, { "epoch": 4.513826576392854, "grad_norm": 1.4603888988494873, "learning_rate": 3.343769155134846e-05, "loss": 0.3178, "num_input_tokens_seen": 52843952, "step": 55335 }, { "epoch": 4.514234440003263, "grad_norm": 2.0287179946899414, "learning_rate": 3.343434103313231e-05, "loss": 0.3493, "num_input_tokens_seen": 52848800, "step": 55340 }, { "epoch": 4.514642303613671, "grad_norm": 1.63801908493042, "learning_rate": 3.3430990343955656e-05, "loss": 0.3052, "num_input_tokens_seen": 52854656, "step": 55345 }, { "epoch": 4.51505016722408, "grad_norm": 0.7255733013153076, "learning_rate": 3.3427639483886414e-05, "loss": 0.3401, "num_input_tokens_seen": 52859744, "step": 55350 }, { "epoch": 4.515458030834489, "grad_norm": 1.812254548072815, "learning_rate": 3.34242884529925e-05, "loss": 0.3568, "num_input_tokens_seen": 52864208, "step": 55355 }, { "epoch": 4.5158658944448975, "grad_norm": 1.0661234855651855, "learning_rate": 3.3420937251341826e-05, "loss": 0.2978, "num_input_tokens_seen": 52868800, "step": 55360 }, { "epoch": 4.5162737580553065, "grad_norm": 1.3610448837280273, "learning_rate": 3.3417585879002345e-05, "loss": 0.3692, "num_input_tokens_seen": 52874208, "step": 55365 }, { "epoch": 4.516681621665715, "grad_norm": 0.8741020560264587, "learning_rate": 3.3414234336041974e-05, "loss": 0.4334, "num_input_tokens_seen": 52878272, "step": 55370 }, { "epoch": 4.517089485276124, "grad_norm": 0.4988544285297394, "learning_rate": 3.341088262252864e-05, "loss": 0.3814, "num_input_tokens_seen": 52882064, "step": 55375 }, { "epoch": 4.517497348886533, "grad_norm": 0.8001043200492859, "learning_rate": 3.3407530738530294e-05, "loss": 0.3762, "num_input_tokens_seen": 52886560, "step": 55380 }, { "epoch": 4.517905212496941, "grad_norm": 1.7941430807113647, "learning_rate": 3.3404178684114875e-05, "loss": 0.4265, "num_input_tokens_seen": 52891104, "step": 55385 }, { "epoch": 4.51831307610735, "grad_norm": 0.5606815814971924, "learning_rate": 3.340082645935032e-05, "loss": 0.2722, "num_input_tokens_seen": 52895520, "step": 55390 }, { "epoch": 4.518720939717758, "grad_norm": 1.6691220998764038, "learning_rate": 3.339747406430458e-05, "loss": 0.3653, "num_input_tokens_seen": 52900032, "step": 55395 }, { "epoch": 4.519128803328167, "grad_norm": 2.1661410331726074, "learning_rate": 3.339412149904562e-05, "loss": 0.3884, "num_input_tokens_seen": 52904576, "step": 55400 }, { "epoch": 4.519536666938576, "grad_norm": 1.1399941444396973, "learning_rate": 3.339076876364137e-05, "loss": 0.3955, "num_input_tokens_seen": 52909376, "step": 55405 }, { "epoch": 4.519944530548984, "grad_norm": 1.667252540588379, "learning_rate": 3.3387415858159815e-05, "loss": 0.4149, "num_input_tokens_seen": 52913712, "step": 55410 }, { "epoch": 4.520352394159393, "grad_norm": 0.7427809834480286, "learning_rate": 3.33840627826689e-05, "loss": 0.3537, "num_input_tokens_seen": 52918048, "step": 55415 }, { "epoch": 4.520760257769802, "grad_norm": 1.212311863899231, "learning_rate": 3.33807095372366e-05, "loss": 0.3183, "num_input_tokens_seen": 52922336, "step": 55420 }, { "epoch": 4.52116812138021, "grad_norm": 0.9878573417663574, "learning_rate": 3.3377356121930884e-05, "loss": 0.3597, "num_input_tokens_seen": 52927152, "step": 55425 }, { "epoch": 4.521575984990619, "grad_norm": 0.9370602965354919, "learning_rate": 3.33740025368197e-05, "loss": 0.35, "num_input_tokens_seen": 52931888, "step": 55430 }, { "epoch": 4.521983848601028, "grad_norm": 1.2962994575500488, "learning_rate": 3.337064878197105e-05, "loss": 0.3579, "num_input_tokens_seen": 52937376, "step": 55435 }, { "epoch": 4.522391712211436, "grad_norm": 1.4265797138214111, "learning_rate": 3.336729485745291e-05, "loss": 0.3049, "num_input_tokens_seen": 52942592, "step": 55440 }, { "epoch": 4.522799575821845, "grad_norm": 0.6904115080833435, "learning_rate": 3.3363940763333255e-05, "loss": 0.3069, "num_input_tokens_seen": 52946448, "step": 55445 }, { "epoch": 4.5232074394322535, "grad_norm": 1.4763026237487793, "learning_rate": 3.336058649968008e-05, "loss": 0.3424, "num_input_tokens_seen": 52950656, "step": 55450 }, { "epoch": 4.5236153030426625, "grad_norm": 0.8879836797714233, "learning_rate": 3.335723206656137e-05, "loss": 0.3455, "num_input_tokens_seen": 52955344, "step": 55455 }, { "epoch": 4.5240231666530715, "grad_norm": 1.0159991979599, "learning_rate": 3.33538774640451e-05, "loss": 0.2894, "num_input_tokens_seen": 52960528, "step": 55460 }, { "epoch": 4.52443103026348, "grad_norm": 1.5841238498687744, "learning_rate": 3.33505226921993e-05, "loss": 0.3642, "num_input_tokens_seen": 52965040, "step": 55465 }, { "epoch": 4.524838893873889, "grad_norm": 2.0186214447021484, "learning_rate": 3.334716775109196e-05, "loss": 0.3578, "num_input_tokens_seen": 52970144, "step": 55470 }, { "epoch": 4.525246757484298, "grad_norm": 1.400075078010559, "learning_rate": 3.334381264079106e-05, "loss": 0.3264, "num_input_tokens_seen": 52974928, "step": 55475 }, { "epoch": 4.525654621094706, "grad_norm": 1.3146507740020752, "learning_rate": 3.334045736136464e-05, "loss": 0.3624, "num_input_tokens_seen": 52980272, "step": 55480 }, { "epoch": 4.526062484705115, "grad_norm": 1.365828037261963, "learning_rate": 3.333710191288068e-05, "loss": 0.3338, "num_input_tokens_seen": 52985568, "step": 55485 }, { "epoch": 4.526470348315524, "grad_norm": 1.4064815044403076, "learning_rate": 3.333374629540721e-05, "loss": 0.3757, "num_input_tokens_seen": 52990224, "step": 55490 }, { "epoch": 4.526878211925932, "grad_norm": 1.2038880586624146, "learning_rate": 3.333039050901225e-05, "loss": 0.35, "num_input_tokens_seen": 52994800, "step": 55495 }, { "epoch": 4.527286075536341, "grad_norm": 1.7621992826461792, "learning_rate": 3.33270345537638e-05, "loss": 0.3304, "num_input_tokens_seen": 52999552, "step": 55500 }, { "epoch": 4.527693939146749, "grad_norm": 1.1465680599212646, "learning_rate": 3.332367842972991e-05, "loss": 0.3502, "num_input_tokens_seen": 53003792, "step": 55505 }, { "epoch": 4.528101802757158, "grad_norm": 0.9645105600357056, "learning_rate": 3.332032213697859e-05, "loss": 0.3258, "num_input_tokens_seen": 53009024, "step": 55510 }, { "epoch": 4.528509666367567, "grad_norm": 1.1852422952651978, "learning_rate": 3.3316965675577884e-05, "loss": 0.3101, "num_input_tokens_seen": 53013920, "step": 55515 }, { "epoch": 4.528917529977975, "grad_norm": 2.6632282733917236, "learning_rate": 3.331360904559581e-05, "loss": 0.3806, "num_input_tokens_seen": 53018944, "step": 55520 }, { "epoch": 4.529325393588384, "grad_norm": 1.187337875366211, "learning_rate": 3.331025224710041e-05, "loss": 0.3014, "num_input_tokens_seen": 53022784, "step": 55525 }, { "epoch": 4.529733257198792, "grad_norm": 2.067159414291382, "learning_rate": 3.3306895280159736e-05, "loss": 0.3527, "num_input_tokens_seen": 53027168, "step": 55530 }, { "epoch": 4.530141120809201, "grad_norm": 1.5759021043777466, "learning_rate": 3.3303538144841815e-05, "loss": 0.3689, "num_input_tokens_seen": 53031872, "step": 55535 }, { "epoch": 4.53054898441961, "grad_norm": 1.6741830110549927, "learning_rate": 3.3300180841214714e-05, "loss": 0.3863, "num_input_tokens_seen": 53037024, "step": 55540 }, { "epoch": 4.530956848030018, "grad_norm": 2.8680264949798584, "learning_rate": 3.3296823369346466e-05, "loss": 0.3379, "num_input_tokens_seen": 53042288, "step": 55545 }, { "epoch": 4.531364711640427, "grad_norm": 0.9558842778205872, "learning_rate": 3.3293465729305146e-05, "loss": 0.2988, "num_input_tokens_seen": 53046928, "step": 55550 }, { "epoch": 4.531772575250836, "grad_norm": 1.5498334169387817, "learning_rate": 3.3290107921158793e-05, "loss": 0.393, "num_input_tokens_seen": 53052192, "step": 55555 }, { "epoch": 4.5321804388612446, "grad_norm": 0.9027157425880432, "learning_rate": 3.328674994497547e-05, "loss": 0.3463, "num_input_tokens_seen": 53057040, "step": 55560 }, { "epoch": 4.532588302471654, "grad_norm": 1.0883853435516357, "learning_rate": 3.328339180082326e-05, "loss": 0.2534, "num_input_tokens_seen": 53061584, "step": 55565 }, { "epoch": 4.532996166082063, "grad_norm": 1.161118984222412, "learning_rate": 3.328003348877021e-05, "loss": 0.3025, "num_input_tokens_seen": 53066592, "step": 55570 }, { "epoch": 4.533404029692471, "grad_norm": 1.0708955526351929, "learning_rate": 3.327667500888439e-05, "loss": 0.4217, "num_input_tokens_seen": 53070768, "step": 55575 }, { "epoch": 4.53381189330288, "grad_norm": 1.5025219917297363, "learning_rate": 3.32733163612339e-05, "loss": 0.4162, "num_input_tokens_seen": 53075888, "step": 55580 }, { "epoch": 4.534219756913288, "grad_norm": 2.1438417434692383, "learning_rate": 3.326995754588679e-05, "loss": 0.3337, "num_input_tokens_seen": 53080736, "step": 55585 }, { "epoch": 4.534627620523697, "grad_norm": 1.4196995496749878, "learning_rate": 3.326659856291117e-05, "loss": 0.3849, "num_input_tokens_seen": 53085744, "step": 55590 }, { "epoch": 4.535035484134106, "grad_norm": 1.1364363431930542, "learning_rate": 3.3263239412375094e-05, "loss": 0.2735, "num_input_tokens_seen": 53089856, "step": 55595 }, { "epoch": 4.535443347744514, "grad_norm": 0.8471636176109314, "learning_rate": 3.325988009434667e-05, "loss": 0.309, "num_input_tokens_seen": 53094656, "step": 55600 }, { "epoch": 4.535851211354923, "grad_norm": 1.150162935256958, "learning_rate": 3.325652060889399e-05, "loss": 0.3451, "num_input_tokens_seen": 53099424, "step": 55605 }, { "epoch": 4.536259074965331, "grad_norm": 1.4246647357940674, "learning_rate": 3.3253160956085145e-05, "loss": 0.408, "num_input_tokens_seen": 53103248, "step": 55610 }, { "epoch": 4.53666693857574, "grad_norm": 0.4340183436870575, "learning_rate": 3.324980113598824e-05, "loss": 0.2681, "num_input_tokens_seen": 53107568, "step": 55615 }, { "epoch": 4.537074802186149, "grad_norm": 1.1355146169662476, "learning_rate": 3.324644114867136e-05, "loss": 0.3868, "num_input_tokens_seen": 53112976, "step": 55620 }, { "epoch": 4.537482665796558, "grad_norm": 0.838084876537323, "learning_rate": 3.324308099420263e-05, "loss": 0.2619, "num_input_tokens_seen": 53118656, "step": 55625 }, { "epoch": 4.537890529406966, "grad_norm": 1.1997706890106201, "learning_rate": 3.3239720672650155e-05, "loss": 0.3122, "num_input_tokens_seen": 53124368, "step": 55630 }, { "epoch": 4.538298393017375, "grad_norm": 1.0962038040161133, "learning_rate": 3.323636018408204e-05, "loss": 0.377, "num_input_tokens_seen": 53129488, "step": 55635 }, { "epoch": 4.538706256627783, "grad_norm": 1.1788668632507324, "learning_rate": 3.32329995285664e-05, "loss": 0.2759, "num_input_tokens_seen": 53133856, "step": 55640 }, { "epoch": 4.539114120238192, "grad_norm": 1.6472358703613281, "learning_rate": 3.3229638706171356e-05, "loss": 0.4597, "num_input_tokens_seen": 53138704, "step": 55645 }, { "epoch": 4.539521983848601, "grad_norm": 1.6399500370025635, "learning_rate": 3.322627771696503e-05, "loss": 0.4093, "num_input_tokens_seen": 53142944, "step": 55650 }, { "epoch": 4.5399298474590095, "grad_norm": 1.285383939743042, "learning_rate": 3.322291656101556e-05, "loss": 0.4621, "num_input_tokens_seen": 53147664, "step": 55655 }, { "epoch": 4.5403377110694185, "grad_norm": 0.9458208084106445, "learning_rate": 3.321955523839106e-05, "loss": 0.3342, "num_input_tokens_seen": 53152864, "step": 55660 }, { "epoch": 4.540745574679827, "grad_norm": 1.116800308227539, "learning_rate": 3.3216193749159676e-05, "loss": 0.3412, "num_input_tokens_seen": 53157872, "step": 55665 }, { "epoch": 4.541153438290236, "grad_norm": 0.5729138255119324, "learning_rate": 3.3212832093389525e-05, "loss": 0.4112, "num_input_tokens_seen": 53161952, "step": 55670 }, { "epoch": 4.541561301900645, "grad_norm": 0.6190139651298523, "learning_rate": 3.320947027114876e-05, "loss": 0.3198, "num_input_tokens_seen": 53165872, "step": 55675 }, { "epoch": 4.541969165511053, "grad_norm": 1.0731221437454224, "learning_rate": 3.320610828250552e-05, "loss": 0.3529, "num_input_tokens_seen": 53170080, "step": 55680 }, { "epoch": 4.542377029121462, "grad_norm": 0.7620832920074463, "learning_rate": 3.320274612752796e-05, "loss": 0.3865, "num_input_tokens_seen": 53174960, "step": 55685 }, { "epoch": 4.542784892731871, "grad_norm": 0.5674694776535034, "learning_rate": 3.319938380628421e-05, "loss": 0.395, "num_input_tokens_seen": 53179872, "step": 55690 }, { "epoch": 4.543192756342279, "grad_norm": 0.5141404867172241, "learning_rate": 3.319602131884245e-05, "loss": 0.3275, "num_input_tokens_seen": 53184096, "step": 55695 }, { "epoch": 4.543600619952688, "grad_norm": 0.8304487466812134, "learning_rate": 3.319265866527081e-05, "loss": 0.3723, "num_input_tokens_seen": 53189296, "step": 55700 }, { "epoch": 4.544008483563097, "grad_norm": 0.8483947515487671, "learning_rate": 3.318929584563746e-05, "loss": 0.3388, "num_input_tokens_seen": 53194000, "step": 55705 }, { "epoch": 4.544416347173505, "grad_norm": 0.2941771149635315, "learning_rate": 3.318593286001057e-05, "loss": 0.3608, "num_input_tokens_seen": 53199168, "step": 55710 }, { "epoch": 4.544824210783914, "grad_norm": 0.6673526167869568, "learning_rate": 3.318256970845829e-05, "loss": 0.3366, "num_input_tokens_seen": 53203568, "step": 55715 }, { "epoch": 4.545232074394322, "grad_norm": 0.5320954918861389, "learning_rate": 3.31792063910488e-05, "loss": 0.323, "num_input_tokens_seen": 53208928, "step": 55720 }, { "epoch": 4.545639938004731, "grad_norm": 0.5837615728378296, "learning_rate": 3.317584290785028e-05, "loss": 0.3466, "num_input_tokens_seen": 53212976, "step": 55725 }, { "epoch": 4.54604780161514, "grad_norm": 0.8821954727172852, "learning_rate": 3.317247925893089e-05, "loss": 0.3384, "num_input_tokens_seen": 53218128, "step": 55730 }, { "epoch": 4.546455665225548, "grad_norm": 0.6470192670822144, "learning_rate": 3.3169115444358825e-05, "loss": 0.3281, "num_input_tokens_seen": 53222416, "step": 55735 }, { "epoch": 4.546863528835957, "grad_norm": 0.9987761974334717, "learning_rate": 3.316575146420225e-05, "loss": 0.3295, "num_input_tokens_seen": 53228240, "step": 55740 }, { "epoch": 4.5472713924463655, "grad_norm": 1.2061963081359863, "learning_rate": 3.316238731852937e-05, "loss": 0.3303, "num_input_tokens_seen": 53233536, "step": 55745 }, { "epoch": 4.5476792560567745, "grad_norm": 1.2150273323059082, "learning_rate": 3.315902300740837e-05, "loss": 0.372, "num_input_tokens_seen": 53238176, "step": 55750 }, { "epoch": 4.5480871196671835, "grad_norm": 0.9755009412765503, "learning_rate": 3.315565853090744e-05, "loss": 0.3556, "num_input_tokens_seen": 53242032, "step": 55755 }, { "epoch": 4.548494983277592, "grad_norm": 0.8847497701644897, "learning_rate": 3.315229388909478e-05, "loss": 0.3508, "num_input_tokens_seen": 53247568, "step": 55760 }, { "epoch": 4.548902846888001, "grad_norm": 0.7381470799446106, "learning_rate": 3.314892908203858e-05, "loss": 0.3283, "num_input_tokens_seen": 53252368, "step": 55765 }, { "epoch": 4.54931071049841, "grad_norm": 0.7588692307472229, "learning_rate": 3.314556410980705e-05, "loss": 0.348, "num_input_tokens_seen": 53257296, "step": 55770 }, { "epoch": 4.549718574108818, "grad_norm": 0.8436700701713562, "learning_rate": 3.31421989724684e-05, "loss": 0.3661, "num_input_tokens_seen": 53262288, "step": 55775 }, { "epoch": 4.550126437719227, "grad_norm": 0.9777345657348633, "learning_rate": 3.313883367009084e-05, "loss": 0.3706, "num_input_tokens_seen": 53266000, "step": 55780 }, { "epoch": 4.550534301329636, "grad_norm": 0.8977844715118408, "learning_rate": 3.313546820274257e-05, "loss": 0.3759, "num_input_tokens_seen": 53271392, "step": 55785 }, { "epoch": 4.550942164940044, "grad_norm": 1.0941317081451416, "learning_rate": 3.313210257049183e-05, "loss": 0.3653, "num_input_tokens_seen": 53276176, "step": 55790 }, { "epoch": 4.551350028550453, "grad_norm": 0.6456494927406311, "learning_rate": 3.312873677340681e-05, "loss": 0.3451, "num_input_tokens_seen": 53280640, "step": 55795 }, { "epoch": 4.551757892160861, "grad_norm": 0.5324196219444275, "learning_rate": 3.312537081155576e-05, "loss": 0.357, "num_input_tokens_seen": 53285536, "step": 55800 }, { "epoch": 4.55216575577127, "grad_norm": 0.9718455672264099, "learning_rate": 3.312200468500689e-05, "loss": 0.3166, "num_input_tokens_seen": 53289904, "step": 55805 }, { "epoch": 4.552573619381679, "grad_norm": 0.7327224016189575, "learning_rate": 3.311863839382845e-05, "loss": 0.2833, "num_input_tokens_seen": 53294544, "step": 55810 }, { "epoch": 4.552981482992087, "grad_norm": 1.1103025674819946, "learning_rate": 3.311527193808864e-05, "loss": 0.3786, "num_input_tokens_seen": 53299568, "step": 55815 }, { "epoch": 4.553389346602496, "grad_norm": 0.5012401938438416, "learning_rate": 3.311190531785574e-05, "loss": 0.3448, "num_input_tokens_seen": 53305136, "step": 55820 }, { "epoch": 4.553797210212905, "grad_norm": 0.6454377770423889, "learning_rate": 3.310853853319795e-05, "loss": 0.3702, "num_input_tokens_seen": 53309552, "step": 55825 }, { "epoch": 4.554205073823313, "grad_norm": 0.5054283738136292, "learning_rate": 3.3105171584183536e-05, "loss": 0.3108, "num_input_tokens_seen": 53314208, "step": 55830 }, { "epoch": 4.554612937433722, "grad_norm": 0.7126699686050415, "learning_rate": 3.310180447088074e-05, "loss": 0.3619, "num_input_tokens_seen": 53318416, "step": 55835 }, { "epoch": 4.555020801044131, "grad_norm": 0.8977711200714111, "learning_rate": 3.30984371933578e-05, "loss": 0.4149, "num_input_tokens_seen": 53323376, "step": 55840 }, { "epoch": 4.555428664654539, "grad_norm": 0.9882485866546631, "learning_rate": 3.3095069751682987e-05, "loss": 0.324, "num_input_tokens_seen": 53327520, "step": 55845 }, { "epoch": 4.5558365282649484, "grad_norm": 0.9313353300094604, "learning_rate": 3.309170214592454e-05, "loss": 0.3298, "num_input_tokens_seen": 53332112, "step": 55850 }, { "epoch": 4.556244391875357, "grad_norm": 0.9103034734725952, "learning_rate": 3.3088334376150736e-05, "loss": 0.3333, "num_input_tokens_seen": 53337664, "step": 55855 }, { "epoch": 4.556652255485766, "grad_norm": 0.9892252087593079, "learning_rate": 3.308496644242983e-05, "loss": 0.3507, "num_input_tokens_seen": 53343168, "step": 55860 }, { "epoch": 4.557060119096175, "grad_norm": 0.37185245752334595, "learning_rate": 3.308159834483009e-05, "loss": 0.3302, "num_input_tokens_seen": 53346960, "step": 55865 }, { "epoch": 4.557467982706583, "grad_norm": 0.856277585029602, "learning_rate": 3.307823008341979e-05, "loss": 0.3825, "num_input_tokens_seen": 53351440, "step": 55870 }, { "epoch": 4.557875846316992, "grad_norm": 0.7739441394805908, "learning_rate": 3.3074861658267194e-05, "loss": 0.337, "num_input_tokens_seen": 53356288, "step": 55875 }, { "epoch": 4.5582837099274, "grad_norm": 0.7108875513076782, "learning_rate": 3.3071493069440585e-05, "loss": 0.3458, "num_input_tokens_seen": 53359984, "step": 55880 }, { "epoch": 4.558691573537809, "grad_norm": 1.0274584293365479, "learning_rate": 3.3068124317008235e-05, "loss": 0.3351, "num_input_tokens_seen": 53363808, "step": 55885 }, { "epoch": 4.559099437148218, "grad_norm": 1.1190599203109741, "learning_rate": 3.306475540103844e-05, "loss": 0.4116, "num_input_tokens_seen": 53368880, "step": 55890 }, { "epoch": 4.559507300758626, "grad_norm": 0.8790348172187805, "learning_rate": 3.306138632159949e-05, "loss": 0.3383, "num_input_tokens_seen": 53373872, "step": 55895 }, { "epoch": 4.559915164369035, "grad_norm": 0.7109144926071167, "learning_rate": 3.305801707875965e-05, "loss": 0.373, "num_input_tokens_seen": 53377888, "step": 55900 }, { "epoch": 4.560323027979444, "grad_norm": 0.7922900319099426, "learning_rate": 3.305464767258723e-05, "loss": 0.3625, "num_input_tokens_seen": 53382848, "step": 55905 }, { "epoch": 4.560730891589852, "grad_norm": 0.540047824382782, "learning_rate": 3.305127810315053e-05, "loss": 0.344, "num_input_tokens_seen": 53388448, "step": 55910 }, { "epoch": 4.561138755200261, "grad_norm": 1.0707433223724365, "learning_rate": 3.304790837051783e-05, "loss": 0.2939, "num_input_tokens_seen": 53393040, "step": 55915 }, { "epoch": 4.56154661881067, "grad_norm": 0.7560158967971802, "learning_rate": 3.304453847475746e-05, "loss": 0.336, "num_input_tokens_seen": 53397600, "step": 55920 }, { "epoch": 4.561954482421078, "grad_norm": 0.6389771103858948, "learning_rate": 3.3041168415937715e-05, "loss": 0.3567, "num_input_tokens_seen": 53402224, "step": 55925 }, { "epoch": 4.562362346031487, "grad_norm": 0.6360741853713989, "learning_rate": 3.30377981941269e-05, "loss": 0.3686, "num_input_tokens_seen": 53407152, "step": 55930 }, { "epoch": 4.562770209641895, "grad_norm": 0.5431450009346008, "learning_rate": 3.3034427809393335e-05, "loss": 0.3883, "num_input_tokens_seen": 53411584, "step": 55935 }, { "epoch": 4.563178073252304, "grad_norm": 0.6247318387031555, "learning_rate": 3.303105726180532e-05, "loss": 0.3703, "num_input_tokens_seen": 53416288, "step": 55940 }, { "epoch": 4.563585936862713, "grad_norm": 0.9180610179901123, "learning_rate": 3.30276865514312e-05, "loss": 0.358, "num_input_tokens_seen": 53421376, "step": 55945 }, { "epoch": 4.5639938004731215, "grad_norm": 0.4259958863258362, "learning_rate": 3.302431567833927e-05, "loss": 0.3528, "num_input_tokens_seen": 53425936, "step": 55950 }, { "epoch": 4.5644016640835305, "grad_norm": 0.7620508074760437, "learning_rate": 3.302094464259788e-05, "loss": 0.3727, "num_input_tokens_seen": 53431632, "step": 55955 }, { "epoch": 4.564809527693939, "grad_norm": 0.8274070024490356, "learning_rate": 3.3017573444275354e-05, "loss": 0.3552, "num_input_tokens_seen": 53435808, "step": 55960 }, { "epoch": 4.565217391304348, "grad_norm": 0.714449942111969, "learning_rate": 3.3014202083440015e-05, "loss": 0.3366, "num_input_tokens_seen": 53440576, "step": 55965 }, { "epoch": 4.565625254914757, "grad_norm": 1.0473870038986206, "learning_rate": 3.301083056016021e-05, "loss": 0.3492, "num_input_tokens_seen": 53444880, "step": 55970 }, { "epoch": 4.566033118525165, "grad_norm": 0.9732948541641235, "learning_rate": 3.300745887450427e-05, "loss": 0.35, "num_input_tokens_seen": 53449760, "step": 55975 }, { "epoch": 4.566440982135574, "grad_norm": 0.707758367061615, "learning_rate": 3.3004087026540547e-05, "loss": 0.3351, "num_input_tokens_seen": 53454592, "step": 55980 }, { "epoch": 4.566848845745983, "grad_norm": 0.8318315744400024, "learning_rate": 3.300071501633737e-05, "loss": 0.3623, "num_input_tokens_seen": 53459424, "step": 55985 }, { "epoch": 4.567256709356391, "grad_norm": 0.6507371664047241, "learning_rate": 3.2997342843963115e-05, "loss": 0.365, "num_input_tokens_seen": 53463712, "step": 55990 }, { "epoch": 4.5676645729668, "grad_norm": 0.9766066074371338, "learning_rate": 3.2993970509486104e-05, "loss": 0.3552, "num_input_tokens_seen": 53469296, "step": 55995 }, { "epoch": 4.568072436577209, "grad_norm": 0.5978583097457886, "learning_rate": 3.299059801297472e-05, "loss": 0.3439, "num_input_tokens_seen": 53473504, "step": 56000 }, { "epoch": 4.568480300187617, "grad_norm": 1.0519460439682007, "learning_rate": 3.29872253544973e-05, "loss": 0.3267, "num_input_tokens_seen": 53478224, "step": 56005 }, { "epoch": 4.568888163798026, "grad_norm": 0.6247302889823914, "learning_rate": 3.2983852534122215e-05, "loss": 0.3336, "num_input_tokens_seen": 53482576, "step": 56010 }, { "epoch": 4.569296027408434, "grad_norm": 0.6927273273468018, "learning_rate": 3.298047955191783e-05, "loss": 0.3219, "num_input_tokens_seen": 53487520, "step": 56015 }, { "epoch": 4.569703891018843, "grad_norm": 0.6532773375511169, "learning_rate": 3.297710640795253e-05, "loss": 0.3373, "num_input_tokens_seen": 53492784, "step": 56020 }, { "epoch": 4.570111754629252, "grad_norm": 0.7180150747299194, "learning_rate": 3.297373310229466e-05, "loss": 0.3197, "num_input_tokens_seen": 53497936, "step": 56025 }, { "epoch": 4.57051961823966, "grad_norm": 1.3765151500701904, "learning_rate": 3.297035963501262e-05, "loss": 0.2934, "num_input_tokens_seen": 53502528, "step": 56030 }, { "epoch": 4.570927481850069, "grad_norm": 0.81497722864151, "learning_rate": 3.296698600617477e-05, "loss": 0.3954, "num_input_tokens_seen": 53507152, "step": 56035 }, { "epoch": 4.571335345460478, "grad_norm": 0.7018285989761353, "learning_rate": 3.296361221584949e-05, "loss": 0.3045, "num_input_tokens_seen": 53512512, "step": 56040 }, { "epoch": 4.5717432090708865, "grad_norm": 1.7421391010284424, "learning_rate": 3.296023826410519e-05, "loss": 0.3171, "num_input_tokens_seen": 53517488, "step": 56045 }, { "epoch": 4.5721510726812955, "grad_norm": 1.0092523097991943, "learning_rate": 3.295686415101023e-05, "loss": 0.2732, "num_input_tokens_seen": 53522368, "step": 56050 }, { "epoch": 4.5725589362917045, "grad_norm": 0.6023897528648376, "learning_rate": 3.295348987663303e-05, "loss": 0.4044, "num_input_tokens_seen": 53527248, "step": 56055 }, { "epoch": 4.572966799902113, "grad_norm": 0.8010538220405579, "learning_rate": 3.2950115441041954e-05, "loss": 0.424, "num_input_tokens_seen": 53531936, "step": 56060 }, { "epoch": 4.573374663512522, "grad_norm": 0.8360061049461365, "learning_rate": 3.2946740844305424e-05, "loss": 0.3953, "num_input_tokens_seen": 53537664, "step": 56065 }, { "epoch": 4.57378252712293, "grad_norm": 1.2276699542999268, "learning_rate": 3.2943366086491836e-05, "loss": 0.3047, "num_input_tokens_seen": 53542736, "step": 56070 }, { "epoch": 4.574190390733339, "grad_norm": 1.2953788042068481, "learning_rate": 3.2939991167669584e-05, "loss": 0.4041, "num_input_tokens_seen": 53546896, "step": 56075 }, { "epoch": 4.574598254343748, "grad_norm": 1.5030155181884766, "learning_rate": 3.293661608790709e-05, "loss": 0.3476, "num_input_tokens_seen": 53551024, "step": 56080 }, { "epoch": 4.575006117954156, "grad_norm": 0.9254368543624878, "learning_rate": 3.2933240847272754e-05, "loss": 0.3434, "num_input_tokens_seen": 53555680, "step": 56085 }, { "epoch": 4.575413981564565, "grad_norm": 0.8433831930160522, "learning_rate": 3.292986544583501e-05, "loss": 0.3338, "num_input_tokens_seen": 53560416, "step": 56090 }, { "epoch": 4.575821845174973, "grad_norm": 1.3437362909317017, "learning_rate": 3.292648988366225e-05, "loss": 0.3469, "num_input_tokens_seen": 53565040, "step": 56095 }, { "epoch": 4.576229708785382, "grad_norm": 1.0321919918060303, "learning_rate": 3.292311416082292e-05, "loss": 0.3368, "num_input_tokens_seen": 53569856, "step": 56100 }, { "epoch": 4.576637572395791, "grad_norm": 0.7212608456611633, "learning_rate": 3.291973827738543e-05, "loss": 0.3419, "num_input_tokens_seen": 53573584, "step": 56105 }, { "epoch": 4.577045436006199, "grad_norm": 1.907889485359192, "learning_rate": 3.291636223341821e-05, "loss": 0.4053, "num_input_tokens_seen": 53578848, "step": 56110 }, { "epoch": 4.577453299616608, "grad_norm": 0.7770547866821289, "learning_rate": 3.2912986028989686e-05, "loss": 0.3225, "num_input_tokens_seen": 53584272, "step": 56115 }, { "epoch": 4.577861163227017, "grad_norm": 1.2784534692764282, "learning_rate": 3.29096096641683e-05, "loss": 0.3492, "num_input_tokens_seen": 53589088, "step": 56120 }, { "epoch": 4.578269026837425, "grad_norm": 1.186956524848938, "learning_rate": 3.290623313902249e-05, "loss": 0.3546, "num_input_tokens_seen": 53593824, "step": 56125 }, { "epoch": 4.578676890447834, "grad_norm": 1.290711522102356, "learning_rate": 3.290285645362069e-05, "loss": 0.3762, "num_input_tokens_seen": 53598336, "step": 56130 }, { "epoch": 4.579084754058243, "grad_norm": 1.026816487312317, "learning_rate": 3.2899479608031354e-05, "loss": 0.2956, "num_input_tokens_seen": 53602752, "step": 56135 }, { "epoch": 4.5794926176686515, "grad_norm": 0.6234369277954102, "learning_rate": 3.289610260232291e-05, "loss": 0.3189, "num_input_tokens_seen": 53607360, "step": 56140 }, { "epoch": 4.5799004812790605, "grad_norm": 0.6248686909675598, "learning_rate": 3.289272543656383e-05, "loss": 0.3452, "num_input_tokens_seen": 53612016, "step": 56145 }, { "epoch": 4.580308344889469, "grad_norm": 0.916317880153656, "learning_rate": 3.288934811082256e-05, "loss": 0.3634, "num_input_tokens_seen": 53616240, "step": 56150 }, { "epoch": 4.580716208499878, "grad_norm": 0.42213767766952515, "learning_rate": 3.288597062516756e-05, "loss": 0.3606, "num_input_tokens_seen": 53620496, "step": 56155 }, { "epoch": 4.581124072110287, "grad_norm": 0.7776411771774292, "learning_rate": 3.2882592979667276e-05, "loss": 0.3692, "num_input_tokens_seen": 53625248, "step": 56160 }, { "epoch": 4.581531935720695, "grad_norm": 1.0234054327011108, "learning_rate": 3.287921517439019e-05, "loss": 0.3171, "num_input_tokens_seen": 53629296, "step": 56165 }, { "epoch": 4.581939799331104, "grad_norm": 1.1578691005706787, "learning_rate": 3.287583720940476e-05, "loss": 0.3877, "num_input_tokens_seen": 53633648, "step": 56170 }, { "epoch": 4.582347662941513, "grad_norm": 0.8687900304794312, "learning_rate": 3.2872459084779455e-05, "loss": 0.3415, "num_input_tokens_seen": 53637856, "step": 56175 }, { "epoch": 4.582755526551921, "grad_norm": 0.5785396695137024, "learning_rate": 3.286908080058275e-05, "loss": 0.2643, "num_input_tokens_seen": 53643552, "step": 56180 }, { "epoch": 4.58316339016233, "grad_norm": 0.6826051473617554, "learning_rate": 3.286570235688312e-05, "loss": 0.3823, "num_input_tokens_seen": 53649168, "step": 56185 }, { "epoch": 4.583571253772739, "grad_norm": 0.7876459360122681, "learning_rate": 3.286232375374905e-05, "loss": 0.3015, "num_input_tokens_seen": 53653840, "step": 56190 }, { "epoch": 4.583979117383147, "grad_norm": 0.7735099792480469, "learning_rate": 3.2858944991249006e-05, "loss": 0.3968, "num_input_tokens_seen": 53658064, "step": 56195 }, { "epoch": 4.584386980993556, "grad_norm": 0.8349817991256714, "learning_rate": 3.28555660694515e-05, "loss": 0.3511, "num_input_tokens_seen": 53663120, "step": 56200 }, { "epoch": 4.584794844603964, "grad_norm": 0.8461610078811646, "learning_rate": 3.2852186988425e-05, "loss": 0.3501, "num_input_tokens_seen": 53667792, "step": 56205 }, { "epoch": 4.585202708214373, "grad_norm": 1.207523226737976, "learning_rate": 3.284880774823801e-05, "loss": 0.3697, "num_input_tokens_seen": 53672448, "step": 56210 }, { "epoch": 4.585610571824782, "grad_norm": 0.7165794968605042, "learning_rate": 3.284542834895902e-05, "loss": 0.3166, "num_input_tokens_seen": 53677568, "step": 56215 }, { "epoch": 4.58601843543519, "grad_norm": 1.3510209321975708, "learning_rate": 3.2842048790656535e-05, "loss": 0.3205, "num_input_tokens_seen": 53683168, "step": 56220 }, { "epoch": 4.586426299045599, "grad_norm": 1.2340031862258911, "learning_rate": 3.283866907339905e-05, "loss": 0.3696, "num_input_tokens_seen": 53689104, "step": 56225 }, { "epoch": 4.586834162656007, "grad_norm": 0.8979434370994568, "learning_rate": 3.283528919725507e-05, "loss": 0.3334, "num_input_tokens_seen": 53693568, "step": 56230 }, { "epoch": 4.587242026266416, "grad_norm": 0.8022439479827881, "learning_rate": 3.283190916229311e-05, "loss": 0.3627, "num_input_tokens_seen": 53698416, "step": 56235 }, { "epoch": 4.587649889876825, "grad_norm": 1.0010032653808594, "learning_rate": 3.282852896858168e-05, "loss": 0.3484, "num_input_tokens_seen": 53703760, "step": 56240 }, { "epoch": 4.5880577534872335, "grad_norm": 0.7426115274429321, "learning_rate": 3.2825148616189306e-05, "loss": 0.3822, "num_input_tokens_seen": 53708912, "step": 56245 }, { "epoch": 4.588465617097643, "grad_norm": 0.8096494078636169, "learning_rate": 3.282176810518448e-05, "loss": 0.3398, "num_input_tokens_seen": 53713648, "step": 56250 }, { "epoch": 4.588873480708052, "grad_norm": 1.2698630094528198, "learning_rate": 3.281838743563574e-05, "loss": 0.3164, "num_input_tokens_seen": 53718560, "step": 56255 }, { "epoch": 4.58928134431846, "grad_norm": 0.8774892091751099, "learning_rate": 3.281500660761161e-05, "loss": 0.3465, "num_input_tokens_seen": 53723488, "step": 56260 }, { "epoch": 4.589689207928869, "grad_norm": 0.7567839026451111, "learning_rate": 3.281162562118062e-05, "loss": 0.3323, "num_input_tokens_seen": 53728000, "step": 56265 }, { "epoch": 4.590097071539278, "grad_norm": 0.7975338101387024, "learning_rate": 3.28082444764113e-05, "loss": 0.3653, "num_input_tokens_seen": 53733520, "step": 56270 }, { "epoch": 4.590504935149686, "grad_norm": 1.042633295059204, "learning_rate": 3.280486317337217e-05, "loss": 0.314, "num_input_tokens_seen": 53738496, "step": 56275 }, { "epoch": 4.590912798760095, "grad_norm": 1.0650086402893066, "learning_rate": 3.2801481712131785e-05, "loss": 0.3686, "num_input_tokens_seen": 53743600, "step": 56280 }, { "epoch": 4.591320662370503, "grad_norm": 1.156016230583191, "learning_rate": 3.279810009275869e-05, "loss": 0.3548, "num_input_tokens_seen": 53747600, "step": 56285 }, { "epoch": 4.591728525980912, "grad_norm": 1.1062055826187134, "learning_rate": 3.279471831532141e-05, "loss": 0.3435, "num_input_tokens_seen": 53752192, "step": 56290 }, { "epoch": 4.592136389591321, "grad_norm": 1.329990029335022, "learning_rate": 3.279133637988851e-05, "loss": 0.3203, "num_input_tokens_seen": 53757488, "step": 56295 }, { "epoch": 4.592544253201729, "grad_norm": 1.298648476600647, "learning_rate": 3.2787954286528525e-05, "loss": 0.3203, "num_input_tokens_seen": 53762864, "step": 56300 }, { "epoch": 4.592952116812138, "grad_norm": 1.2418129444122314, "learning_rate": 3.278457203531002e-05, "loss": 0.36, "num_input_tokens_seen": 53767808, "step": 56305 }, { "epoch": 4.593359980422546, "grad_norm": 1.2137871980667114, "learning_rate": 3.278118962630154e-05, "loss": 0.3497, "num_input_tokens_seen": 53772592, "step": 56310 }, { "epoch": 4.593767844032955, "grad_norm": 0.7009333372116089, "learning_rate": 3.277780705957167e-05, "loss": 0.3338, "num_input_tokens_seen": 53778032, "step": 56315 }, { "epoch": 4.594175707643364, "grad_norm": 0.9189146757125854, "learning_rate": 3.277442433518894e-05, "loss": 0.3218, "num_input_tokens_seen": 53783552, "step": 56320 }, { "epoch": 4.594583571253772, "grad_norm": 1.18659508228302, "learning_rate": 3.277104145322194e-05, "loss": 0.3115, "num_input_tokens_seen": 53788608, "step": 56325 }, { "epoch": 4.594991434864181, "grad_norm": 0.9555169343948364, "learning_rate": 3.2767658413739226e-05, "loss": 0.3684, "num_input_tokens_seen": 53793344, "step": 56330 }, { "epoch": 4.59539929847459, "grad_norm": 0.8034799695014954, "learning_rate": 3.2764275216809385e-05, "loss": 0.3796, "num_input_tokens_seen": 53798624, "step": 56335 }, { "epoch": 4.5958071620849985, "grad_norm": 1.0315418243408203, "learning_rate": 3.276089186250098e-05, "loss": 0.3275, "num_input_tokens_seen": 53803712, "step": 56340 }, { "epoch": 4.5962150256954075, "grad_norm": 0.798966646194458, "learning_rate": 3.2757508350882587e-05, "loss": 0.2903, "num_input_tokens_seen": 53808912, "step": 56345 }, { "epoch": 4.5966228893058165, "grad_norm": 1.2785568237304688, "learning_rate": 3.27541246820228e-05, "loss": 0.3659, "num_input_tokens_seen": 53813520, "step": 56350 }, { "epoch": 4.597030752916225, "grad_norm": 1.2017422914505005, "learning_rate": 3.2750740855990206e-05, "loss": 0.3542, "num_input_tokens_seen": 53817552, "step": 56355 }, { "epoch": 4.597438616526634, "grad_norm": 0.9879053831100464, "learning_rate": 3.274735687285339e-05, "loss": 0.3534, "num_input_tokens_seen": 53821968, "step": 56360 }, { "epoch": 4.597846480137042, "grad_norm": 1.1526778936386108, "learning_rate": 3.274397273268093e-05, "loss": 0.3605, "num_input_tokens_seen": 53827312, "step": 56365 }, { "epoch": 4.598254343747451, "grad_norm": 0.8698686957359314, "learning_rate": 3.2740588435541445e-05, "loss": 0.3396, "num_input_tokens_seen": 53832176, "step": 56370 }, { "epoch": 4.59866220735786, "grad_norm": 0.6613410711288452, "learning_rate": 3.273720398150352e-05, "loss": 0.3292, "num_input_tokens_seen": 53836560, "step": 56375 }, { "epoch": 4.599070070968268, "grad_norm": 1.1088905334472656, "learning_rate": 3.273381937063575e-05, "loss": 0.3412, "num_input_tokens_seen": 53841728, "step": 56380 }, { "epoch": 4.599477934578677, "grad_norm": 0.8605710864067078, "learning_rate": 3.273043460300677e-05, "loss": 0.3472, "num_input_tokens_seen": 53846064, "step": 56385 }, { "epoch": 4.599885798189086, "grad_norm": 0.9219126105308533, "learning_rate": 3.272704967868514e-05, "loss": 0.322, "num_input_tokens_seen": 53850784, "step": 56390 }, { "epoch": 4.600293661799494, "grad_norm": 0.9154943227767944, "learning_rate": 3.2723664597739504e-05, "loss": 0.3359, "num_input_tokens_seen": 53856064, "step": 56395 }, { "epoch": 4.600701525409903, "grad_norm": 0.8977603912353516, "learning_rate": 3.272027936023848e-05, "loss": 0.3456, "num_input_tokens_seen": 53860544, "step": 56400 }, { "epoch": 4.601109389020312, "grad_norm": 1.2247892618179321, "learning_rate": 3.271689396625066e-05, "loss": 0.3371, "num_input_tokens_seen": 53865472, "step": 56405 }, { "epoch": 4.60151725263072, "grad_norm": 1.3660186529159546, "learning_rate": 3.271350841584468e-05, "loss": 0.3441, "num_input_tokens_seen": 53871424, "step": 56410 }, { "epoch": 4.601925116241129, "grad_norm": 2.090550661087036, "learning_rate": 3.2710122709089163e-05, "loss": 0.3316, "num_input_tokens_seen": 53875408, "step": 56415 }, { "epoch": 4.602332979851537, "grad_norm": 1.597687840461731, "learning_rate": 3.2706736846052735e-05, "loss": 0.3335, "num_input_tokens_seen": 53880272, "step": 56420 }, { "epoch": 4.602740843461946, "grad_norm": 1.0803958177566528, "learning_rate": 3.270335082680404e-05, "loss": 0.3248, "num_input_tokens_seen": 53884848, "step": 56425 }, { "epoch": 4.603148707072355, "grad_norm": 1.2278714179992676, "learning_rate": 3.2699964651411674e-05, "loss": 0.3685, "num_input_tokens_seen": 53889680, "step": 56430 }, { "epoch": 4.6035565706827635, "grad_norm": 0.9524399638175964, "learning_rate": 3.269657831994431e-05, "loss": 0.3213, "num_input_tokens_seen": 53894096, "step": 56435 }, { "epoch": 4.6039644342931725, "grad_norm": 1.4481399059295654, "learning_rate": 3.269319183247056e-05, "loss": 0.3424, "num_input_tokens_seen": 53899120, "step": 56440 }, { "epoch": 4.604372297903581, "grad_norm": 0.8433237671852112, "learning_rate": 3.2689805189059096e-05, "loss": 0.3665, "num_input_tokens_seen": 53904512, "step": 56445 }, { "epoch": 4.60478016151399, "grad_norm": 1.5642491579055786, "learning_rate": 3.268641838977855e-05, "loss": 0.3259, "num_input_tokens_seen": 53909616, "step": 56450 }, { "epoch": 4.605188025124399, "grad_norm": 0.7886659502983093, "learning_rate": 3.268303143469756e-05, "loss": 0.335, "num_input_tokens_seen": 53913616, "step": 56455 }, { "epoch": 4.605595888734807, "grad_norm": 1.3998266458511353, "learning_rate": 3.26796443238848e-05, "loss": 0.3432, "num_input_tokens_seen": 53918480, "step": 56460 }, { "epoch": 4.606003752345216, "grad_norm": 1.0700246095657349, "learning_rate": 3.26762570574089e-05, "loss": 0.3332, "num_input_tokens_seen": 53922144, "step": 56465 }, { "epoch": 4.606411615955625, "grad_norm": 0.7416752576828003, "learning_rate": 3.2672869635338535e-05, "loss": 0.2785, "num_input_tokens_seen": 53927104, "step": 56470 }, { "epoch": 4.606819479566033, "grad_norm": 0.5433118343353271, "learning_rate": 3.2669482057742365e-05, "loss": 0.3541, "num_input_tokens_seen": 53932192, "step": 56475 }, { "epoch": 4.607227343176442, "grad_norm": 1.0653316974639893, "learning_rate": 3.266609432468905e-05, "loss": 0.3245, "num_input_tokens_seen": 53937200, "step": 56480 }, { "epoch": 4.607635206786851, "grad_norm": 0.9097850918769836, "learning_rate": 3.266270643624726e-05, "loss": 0.3079, "num_input_tokens_seen": 53942528, "step": 56485 }, { "epoch": 4.608043070397259, "grad_norm": 0.7141517400741577, "learning_rate": 3.265931839248567e-05, "loss": 0.3555, "num_input_tokens_seen": 53947536, "step": 56490 }, { "epoch": 4.608450934007668, "grad_norm": 1.1842297315597534, "learning_rate": 3.2655930193472946e-05, "loss": 0.3787, "num_input_tokens_seen": 53952208, "step": 56495 }, { "epoch": 4.608858797618076, "grad_norm": 1.1157594919204712, "learning_rate": 3.265254183927777e-05, "loss": 0.3025, "num_input_tokens_seen": 53957584, "step": 56500 }, { "epoch": 4.609266661228485, "grad_norm": 1.0304399728775024, "learning_rate": 3.2649153329968824e-05, "loss": 0.3463, "num_input_tokens_seen": 53962208, "step": 56505 }, { "epoch": 4.609674524838894, "grad_norm": 0.960281252861023, "learning_rate": 3.26457646656148e-05, "loss": 0.2883, "num_input_tokens_seen": 53966192, "step": 56510 }, { "epoch": 4.610082388449302, "grad_norm": 1.011397123336792, "learning_rate": 3.264237584628436e-05, "loss": 0.2925, "num_input_tokens_seen": 53970544, "step": 56515 }, { "epoch": 4.610490252059711, "grad_norm": 1.8272285461425781, "learning_rate": 3.263898687204622e-05, "loss": 0.3957, "num_input_tokens_seen": 53974816, "step": 56520 }, { "epoch": 4.610898115670119, "grad_norm": 0.7231114506721497, "learning_rate": 3.263559774296906e-05, "loss": 0.3456, "num_input_tokens_seen": 53979600, "step": 56525 }, { "epoch": 4.611305979280528, "grad_norm": 1.4981590509414673, "learning_rate": 3.263220845912157e-05, "loss": 0.365, "num_input_tokens_seen": 53984032, "step": 56530 }, { "epoch": 4.611713842890937, "grad_norm": 1.3425096273422241, "learning_rate": 3.262881902057247e-05, "loss": 0.3187, "num_input_tokens_seen": 53988720, "step": 56535 }, { "epoch": 4.612121706501346, "grad_norm": 1.4756293296813965, "learning_rate": 3.2625429427390444e-05, "loss": 0.3963, "num_input_tokens_seen": 53992336, "step": 56540 }, { "epoch": 4.612529570111755, "grad_norm": 1.3675119876861572, "learning_rate": 3.26220396796442e-05, "loss": 0.2984, "num_input_tokens_seen": 53997616, "step": 56545 }, { "epoch": 4.612937433722164, "grad_norm": 0.7727825045585632, "learning_rate": 3.2618649777402455e-05, "loss": 0.3631, "num_input_tokens_seen": 54002688, "step": 56550 }, { "epoch": 4.613345297332572, "grad_norm": 1.0405486822128296, "learning_rate": 3.261525972073392e-05, "loss": 0.3282, "num_input_tokens_seen": 54007760, "step": 56555 }, { "epoch": 4.613753160942981, "grad_norm": 1.1211820840835571, "learning_rate": 3.2611869509707294e-05, "loss": 0.3683, "num_input_tokens_seen": 54013008, "step": 56560 }, { "epoch": 4.61416102455339, "grad_norm": 1.2427946329116821, "learning_rate": 3.260847914439132e-05, "loss": 0.3008, "num_input_tokens_seen": 54018432, "step": 56565 }, { "epoch": 4.614568888163798, "grad_norm": 1.3243550062179565, "learning_rate": 3.26050886248547e-05, "loss": 0.3359, "num_input_tokens_seen": 54022992, "step": 56570 }, { "epoch": 4.614976751774207, "grad_norm": 0.78606116771698, "learning_rate": 3.260169795116617e-05, "loss": 0.3376, "num_input_tokens_seen": 54027632, "step": 56575 }, { "epoch": 4.615384615384615, "grad_norm": 1.528627872467041, "learning_rate": 3.2598307123394454e-05, "loss": 0.3874, "num_input_tokens_seen": 54031728, "step": 56580 }, { "epoch": 4.615792478995024, "grad_norm": 1.4465500116348267, "learning_rate": 3.2594916141608275e-05, "loss": 0.365, "num_input_tokens_seen": 54036144, "step": 56585 }, { "epoch": 4.616200342605433, "grad_norm": 1.378608226776123, "learning_rate": 3.259152500587638e-05, "loss": 0.3146, "num_input_tokens_seen": 54040912, "step": 56590 }, { "epoch": 4.616608206215841, "grad_norm": 1.2652463912963867, "learning_rate": 3.25881337162675e-05, "loss": 0.3095, "num_input_tokens_seen": 54045680, "step": 56595 }, { "epoch": 4.61701606982625, "grad_norm": 1.6323720216751099, "learning_rate": 3.258474227285037e-05, "loss": 0.3787, "num_input_tokens_seen": 54050352, "step": 56600 }, { "epoch": 4.617423933436659, "grad_norm": 0.7278100848197937, "learning_rate": 3.258135067569372e-05, "loss": 0.2559, "num_input_tokens_seen": 54055376, "step": 56605 }, { "epoch": 4.617831797047067, "grad_norm": 0.9505592584609985, "learning_rate": 3.2577958924866334e-05, "loss": 0.3713, "num_input_tokens_seen": 54060624, "step": 56610 }, { "epoch": 4.618239660657476, "grad_norm": 1.547078251838684, "learning_rate": 3.2574567020436934e-05, "loss": 0.4031, "num_input_tokens_seen": 54065968, "step": 56615 }, { "epoch": 4.618647524267885, "grad_norm": 1.2973675727844238, "learning_rate": 3.257117496247428e-05, "loss": 0.4237, "num_input_tokens_seen": 54070320, "step": 56620 }, { "epoch": 4.619055387878293, "grad_norm": 0.6615657210350037, "learning_rate": 3.2567782751047134e-05, "loss": 0.3371, "num_input_tokens_seen": 54074720, "step": 56625 }, { "epoch": 4.619463251488702, "grad_norm": 1.315829873085022, "learning_rate": 3.256439038622424e-05, "loss": 0.38, "num_input_tokens_seen": 54079728, "step": 56630 }, { "epoch": 4.6198711150991105, "grad_norm": 0.8094953298568726, "learning_rate": 3.2560997868074364e-05, "loss": 0.3102, "num_input_tokens_seen": 54084944, "step": 56635 }, { "epoch": 4.6202789787095195, "grad_norm": 1.142440915107727, "learning_rate": 3.255760519666628e-05, "loss": 0.3297, "num_input_tokens_seen": 54090032, "step": 56640 }, { "epoch": 4.6206868423199285, "grad_norm": 0.4936630427837372, "learning_rate": 3.255421237206875e-05, "loss": 0.3637, "num_input_tokens_seen": 54094672, "step": 56645 }, { "epoch": 4.621094705930337, "grad_norm": 1.116054892539978, "learning_rate": 3.2550819394350546e-05, "loss": 0.3389, "num_input_tokens_seen": 54099936, "step": 56650 }, { "epoch": 4.621502569540746, "grad_norm": 1.502769947052002, "learning_rate": 3.254742626358044e-05, "loss": 0.3429, "num_input_tokens_seen": 54105888, "step": 56655 }, { "epoch": 4.621910433151154, "grad_norm": 1.205678939819336, "learning_rate": 3.254403297982721e-05, "loss": 0.3579, "num_input_tokens_seen": 54110416, "step": 56660 }, { "epoch": 4.622318296761563, "grad_norm": 0.6432334780693054, "learning_rate": 3.254063954315963e-05, "loss": 0.3854, "num_input_tokens_seen": 54115600, "step": 56665 }, { "epoch": 4.622726160371972, "grad_norm": 0.6673864126205444, "learning_rate": 3.25372459536465e-05, "loss": 0.3363, "num_input_tokens_seen": 54120480, "step": 56670 }, { "epoch": 4.62313402398238, "grad_norm": 0.8194302320480347, "learning_rate": 3.25338522113566e-05, "loss": 0.3418, "num_input_tokens_seen": 54124784, "step": 56675 }, { "epoch": 4.623541887592789, "grad_norm": 0.783855140209198, "learning_rate": 3.253045831635871e-05, "loss": 0.356, "num_input_tokens_seen": 54129408, "step": 56680 }, { "epoch": 4.623949751203198, "grad_norm": 0.7783285975456238, "learning_rate": 3.2527064268721627e-05, "loss": 0.348, "num_input_tokens_seen": 54134016, "step": 56685 }, { "epoch": 4.624357614813606, "grad_norm": 1.7769739627838135, "learning_rate": 3.252367006851416e-05, "loss": 0.3546, "num_input_tokens_seen": 54138704, "step": 56690 }, { "epoch": 4.624765478424015, "grad_norm": 0.42316922545433044, "learning_rate": 3.2520275715805086e-05, "loss": 0.3954, "num_input_tokens_seen": 54142720, "step": 56695 }, { "epoch": 4.625173342034424, "grad_norm": 0.7319958209991455, "learning_rate": 3.251688121066323e-05, "loss": 0.299, "num_input_tokens_seen": 54147808, "step": 56700 }, { "epoch": 4.625581205644832, "grad_norm": 0.6838682293891907, "learning_rate": 3.2513486553157375e-05, "loss": 0.2767, "num_input_tokens_seen": 54151888, "step": 56705 }, { "epoch": 4.625989069255241, "grad_norm": 1.2688446044921875, "learning_rate": 3.2510091743356346e-05, "loss": 0.4009, "num_input_tokens_seen": 54156688, "step": 56710 }, { "epoch": 4.626396932865649, "grad_norm": 0.8399841785430908, "learning_rate": 3.2506696781328955e-05, "loss": 0.2959, "num_input_tokens_seen": 54161552, "step": 56715 }, { "epoch": 4.626804796476058, "grad_norm": 0.714562177658081, "learning_rate": 3.2503301667144e-05, "loss": 0.246, "num_input_tokens_seen": 54166080, "step": 56720 }, { "epoch": 4.627212660086467, "grad_norm": 0.7660650610923767, "learning_rate": 3.249990640087031e-05, "loss": 0.3401, "num_input_tokens_seen": 54171328, "step": 56725 }, { "epoch": 4.6276205236968755, "grad_norm": 1.0673861503601074, "learning_rate": 3.249651098257671e-05, "loss": 0.3119, "num_input_tokens_seen": 54175760, "step": 56730 }, { "epoch": 4.6280283873072845, "grad_norm": 1.5961804389953613, "learning_rate": 3.2493115412332024e-05, "loss": 0.3531, "num_input_tokens_seen": 54181152, "step": 56735 }, { "epoch": 4.6284362509176935, "grad_norm": 1.2007536888122559, "learning_rate": 3.2489719690205055e-05, "loss": 0.3848, "num_input_tokens_seen": 54185520, "step": 56740 }, { "epoch": 4.628844114528102, "grad_norm": 1.06736421585083, "learning_rate": 3.2486323816264656e-05, "loss": 0.3666, "num_input_tokens_seen": 54190816, "step": 56745 }, { "epoch": 4.629251978138511, "grad_norm": 0.49432480335235596, "learning_rate": 3.248292779057965e-05, "loss": 0.3425, "num_input_tokens_seen": 54195904, "step": 56750 }, { "epoch": 4.62965984174892, "grad_norm": 0.6518234014511108, "learning_rate": 3.247953161321889e-05, "loss": 0.284, "num_input_tokens_seen": 54200704, "step": 56755 }, { "epoch": 4.630067705359328, "grad_norm": 0.9260135293006897, "learning_rate": 3.2476135284251194e-05, "loss": 0.3584, "num_input_tokens_seen": 54206048, "step": 56760 }, { "epoch": 4.630475568969737, "grad_norm": 0.6731841564178467, "learning_rate": 3.247273880374542e-05, "loss": 0.3783, "num_input_tokens_seen": 54210528, "step": 56765 }, { "epoch": 4.630883432580145, "grad_norm": 1.6815836429595947, "learning_rate": 3.246934217177039e-05, "loss": 0.3888, "num_input_tokens_seen": 54215888, "step": 56770 }, { "epoch": 4.631291296190554, "grad_norm": 0.7159838080406189, "learning_rate": 3.2465945388394984e-05, "loss": 0.3436, "num_input_tokens_seen": 54220880, "step": 56775 }, { "epoch": 4.631699159800963, "grad_norm": 0.3271683156490326, "learning_rate": 3.246254845368803e-05, "loss": 0.3557, "num_input_tokens_seen": 54225280, "step": 56780 }, { "epoch": 4.632107023411371, "grad_norm": 0.7406017780303955, "learning_rate": 3.24591513677184e-05, "loss": 0.3234, "num_input_tokens_seen": 54230592, "step": 56785 }, { "epoch": 4.63251488702178, "grad_norm": 0.27750200033187866, "learning_rate": 3.245575413055493e-05, "loss": 0.3632, "num_input_tokens_seen": 54235184, "step": 56790 }, { "epoch": 4.632922750632188, "grad_norm": 1.2065348625183105, "learning_rate": 3.24523567422665e-05, "loss": 0.3175, "num_input_tokens_seen": 54239888, "step": 56795 }, { "epoch": 4.633330614242597, "grad_norm": 1.2694162130355835, "learning_rate": 3.2448959202921964e-05, "loss": 0.3496, "num_input_tokens_seen": 54244672, "step": 56800 }, { "epoch": 4.633738477853006, "grad_norm": 0.24624666571617126, "learning_rate": 3.244556151259018e-05, "loss": 0.3469, "num_input_tokens_seen": 54248624, "step": 56805 }, { "epoch": 4.634146341463414, "grad_norm": 0.29730507731437683, "learning_rate": 3.244216367134004e-05, "loss": 0.3926, "num_input_tokens_seen": 54253296, "step": 56810 }, { "epoch": 4.634554205073823, "grad_norm": 1.0045228004455566, "learning_rate": 3.24387656792404e-05, "loss": 0.3919, "num_input_tokens_seen": 54258368, "step": 56815 }, { "epoch": 4.634962068684232, "grad_norm": 1.2742986679077148, "learning_rate": 3.243536753636014e-05, "loss": 0.332, "num_input_tokens_seen": 54263648, "step": 56820 }, { "epoch": 4.6353699322946404, "grad_norm": 1.1777961254119873, "learning_rate": 3.243196924276814e-05, "loss": 0.3438, "num_input_tokens_seen": 54267744, "step": 56825 }, { "epoch": 4.6357777959050495, "grad_norm": 1.3681849241256714, "learning_rate": 3.242857079853328e-05, "loss": 0.3623, "num_input_tokens_seen": 54272176, "step": 56830 }, { "epoch": 4.6361856595154585, "grad_norm": 0.6871484518051147, "learning_rate": 3.2425172203724454e-05, "loss": 0.3642, "num_input_tokens_seen": 54277184, "step": 56835 }, { "epoch": 4.636593523125867, "grad_norm": 0.7430439591407776, "learning_rate": 3.242177345841053e-05, "loss": 0.3467, "num_input_tokens_seen": 54282784, "step": 56840 }, { "epoch": 4.637001386736276, "grad_norm": 1.1792099475860596, "learning_rate": 3.241837456266042e-05, "loss": 0.3486, "num_input_tokens_seen": 54287920, "step": 56845 }, { "epoch": 4.637409250346684, "grad_norm": 0.8685057759284973, "learning_rate": 3.2414975516543004e-05, "loss": 0.3301, "num_input_tokens_seen": 54293536, "step": 56850 }, { "epoch": 4.637817113957093, "grad_norm": 1.4690086841583252, "learning_rate": 3.241157632012719e-05, "loss": 0.3047, "num_input_tokens_seen": 54297840, "step": 56855 }, { "epoch": 4.638224977567502, "grad_norm": 0.9494391679763794, "learning_rate": 3.2408176973481875e-05, "loss": 0.3457, "num_input_tokens_seen": 54302512, "step": 56860 }, { "epoch": 4.63863284117791, "grad_norm": 0.5958971381187439, "learning_rate": 3.2404777476675966e-05, "loss": 0.3149, "num_input_tokens_seen": 54307936, "step": 56865 }, { "epoch": 4.639040704788319, "grad_norm": 0.9979683756828308, "learning_rate": 3.240137782977835e-05, "loss": 0.3446, "num_input_tokens_seen": 54313360, "step": 56870 }, { "epoch": 4.639448568398727, "grad_norm": 0.6391253471374512, "learning_rate": 3.239797803285796e-05, "loss": 0.3691, "num_input_tokens_seen": 54317648, "step": 56875 }, { "epoch": 4.639856432009136, "grad_norm": 1.2822892665863037, "learning_rate": 3.239457808598369e-05, "loss": 0.3366, "num_input_tokens_seen": 54323184, "step": 56880 }, { "epoch": 4.640264295619545, "grad_norm": 0.7212537527084351, "learning_rate": 3.2391177989224476e-05, "loss": 0.3035, "num_input_tokens_seen": 54327312, "step": 56885 }, { "epoch": 4.640672159229953, "grad_norm": 1.4477064609527588, "learning_rate": 3.2387777742649214e-05, "loss": 0.3304, "num_input_tokens_seen": 54331184, "step": 56890 }, { "epoch": 4.641080022840362, "grad_norm": 0.7923237681388855, "learning_rate": 3.238437734632685e-05, "loss": 0.3113, "num_input_tokens_seen": 54336048, "step": 56895 }, { "epoch": 4.641487886450771, "grad_norm": 1.767474889755249, "learning_rate": 3.238097680032628e-05, "loss": 0.3548, "num_input_tokens_seen": 54341232, "step": 56900 }, { "epoch": 4.641895750061179, "grad_norm": 0.7984811663627625, "learning_rate": 3.237757610471645e-05, "loss": 0.319, "num_input_tokens_seen": 54345568, "step": 56905 }, { "epoch": 4.642303613671588, "grad_norm": 0.630645751953125, "learning_rate": 3.237417525956629e-05, "loss": 0.3497, "num_input_tokens_seen": 54350224, "step": 56910 }, { "epoch": 4.642711477281997, "grad_norm": 1.2515075206756592, "learning_rate": 3.237077426494473e-05, "loss": 0.3084, "num_input_tokens_seen": 54353840, "step": 56915 }, { "epoch": 4.643119340892405, "grad_norm": 0.8566297888755798, "learning_rate": 3.236737312092071e-05, "loss": 0.3097, "num_input_tokens_seen": 54358688, "step": 56920 }, { "epoch": 4.643527204502814, "grad_norm": 0.5773205161094666, "learning_rate": 3.2363971827563174e-05, "loss": 0.2875, "num_input_tokens_seen": 54363968, "step": 56925 }, { "epoch": 4.6439350681132225, "grad_norm": 2.0887653827667236, "learning_rate": 3.236057038494105e-05, "loss": 0.3239, "num_input_tokens_seen": 54368688, "step": 56930 }, { "epoch": 4.6443429317236316, "grad_norm": 1.3463813066482544, "learning_rate": 3.235716879312329e-05, "loss": 0.4295, "num_input_tokens_seen": 54373248, "step": 56935 }, { "epoch": 4.644750795334041, "grad_norm": 0.7908326983451843, "learning_rate": 3.235376705217885e-05, "loss": 0.2523, "num_input_tokens_seen": 54378672, "step": 56940 }, { "epoch": 4.645158658944449, "grad_norm": 0.9130420088768005, "learning_rate": 3.2350365162176674e-05, "loss": 0.3445, "num_input_tokens_seen": 54383328, "step": 56945 }, { "epoch": 4.645566522554858, "grad_norm": 0.7895049452781677, "learning_rate": 3.2346963123185715e-05, "loss": 0.2944, "num_input_tokens_seen": 54388416, "step": 56950 }, { "epoch": 4.645974386165267, "grad_norm": 1.4344837665557861, "learning_rate": 3.234356093527494e-05, "loss": 0.3972, "num_input_tokens_seen": 54393664, "step": 56955 }, { "epoch": 4.646382249775675, "grad_norm": 1.4342899322509766, "learning_rate": 3.23401585985133e-05, "loss": 0.3186, "num_input_tokens_seen": 54397984, "step": 56960 }, { "epoch": 4.646790113386084, "grad_norm": 2.047697067260742, "learning_rate": 3.233675611296977e-05, "loss": 0.4239, "num_input_tokens_seen": 54401984, "step": 56965 }, { "epoch": 4.647197976996493, "grad_norm": 1.3001457452774048, "learning_rate": 3.233335347871331e-05, "loss": 0.3634, "num_input_tokens_seen": 54406592, "step": 56970 }, { "epoch": 4.647605840606901, "grad_norm": 0.9227193593978882, "learning_rate": 3.2329950695812894e-05, "loss": 0.3149, "num_input_tokens_seen": 54411120, "step": 56975 }, { "epoch": 4.64801370421731, "grad_norm": 2.1351068019866943, "learning_rate": 3.232654776433749e-05, "loss": 0.3534, "num_input_tokens_seen": 54415616, "step": 56980 }, { "epoch": 4.648421567827718, "grad_norm": 0.9096919894218445, "learning_rate": 3.232314468435608e-05, "loss": 0.3259, "num_input_tokens_seen": 54420576, "step": 56985 }, { "epoch": 4.648829431438127, "grad_norm": 1.0265687704086304, "learning_rate": 3.2319741455937636e-05, "loss": 0.29, "num_input_tokens_seen": 54426176, "step": 56990 }, { "epoch": 4.649237295048536, "grad_norm": 1.2792109251022339, "learning_rate": 3.231633807915114e-05, "loss": 0.3334, "num_input_tokens_seen": 54431120, "step": 56995 }, { "epoch": 4.649645158658944, "grad_norm": 1.1794747114181519, "learning_rate": 3.231293455406559e-05, "loss": 0.3787, "num_input_tokens_seen": 54435840, "step": 57000 }, { "epoch": 4.650053022269353, "grad_norm": 1.0479850769042969, "learning_rate": 3.230953088074995e-05, "loss": 0.327, "num_input_tokens_seen": 54440800, "step": 57005 }, { "epoch": 4.650460885879761, "grad_norm": 2.1269850730895996, "learning_rate": 3.230612705927323e-05, "loss": 0.3358, "num_input_tokens_seen": 54445680, "step": 57010 }, { "epoch": 4.65086874949017, "grad_norm": 0.9500226974487305, "learning_rate": 3.2302723089704416e-05, "loss": 0.2749, "num_input_tokens_seen": 54450736, "step": 57015 }, { "epoch": 4.651276613100579, "grad_norm": 1.0462061166763306, "learning_rate": 3.229931897211252e-05, "loss": 0.3195, "num_input_tokens_seen": 54455744, "step": 57020 }, { "epoch": 4.6516844767109875, "grad_norm": 1.2062715291976929, "learning_rate": 3.2295914706566514e-05, "loss": 0.3356, "num_input_tokens_seen": 54461008, "step": 57025 }, { "epoch": 4.6520923403213965, "grad_norm": 1.7700477838516235, "learning_rate": 3.229251029313543e-05, "loss": 0.3347, "num_input_tokens_seen": 54464528, "step": 57030 }, { "epoch": 4.6525002039318055, "grad_norm": 2.965273380279541, "learning_rate": 3.228910573188826e-05, "loss": 0.3551, "num_input_tokens_seen": 54469744, "step": 57035 }, { "epoch": 4.652908067542214, "grad_norm": 1.818207025527954, "learning_rate": 3.2285701022894004e-05, "loss": 0.3656, "num_input_tokens_seen": 54474992, "step": 57040 }, { "epoch": 4.653315931152623, "grad_norm": 2.4140872955322266, "learning_rate": 3.2282296166221695e-05, "loss": 0.2764, "num_input_tokens_seen": 54478976, "step": 57045 }, { "epoch": 4.653723794763032, "grad_norm": 1.520195722579956, "learning_rate": 3.227889116194033e-05, "loss": 0.3515, "num_input_tokens_seen": 54483744, "step": 57050 }, { "epoch": 4.65413165837344, "grad_norm": 1.495981216430664, "learning_rate": 3.227548601011894e-05, "loss": 0.3433, "num_input_tokens_seen": 54488480, "step": 57055 }, { "epoch": 4.654539521983849, "grad_norm": 2.3799638748168945, "learning_rate": 3.227208071082654e-05, "loss": 0.3477, "num_input_tokens_seen": 54493968, "step": 57060 }, { "epoch": 4.654947385594257, "grad_norm": 2.1786727905273438, "learning_rate": 3.226867526413215e-05, "loss": 0.4532, "num_input_tokens_seen": 54498336, "step": 57065 }, { "epoch": 4.655355249204666, "grad_norm": 0.894523561000824, "learning_rate": 3.2265269670104806e-05, "loss": 0.3971, "num_input_tokens_seen": 54503408, "step": 57070 }, { "epoch": 4.655763112815075, "grad_norm": 0.5458727478981018, "learning_rate": 3.226186392881353e-05, "loss": 0.3286, "num_input_tokens_seen": 54508032, "step": 57075 }, { "epoch": 4.656170976425483, "grad_norm": 1.2341195344924927, "learning_rate": 3.2258458040327355e-05, "loss": 0.3156, "num_input_tokens_seen": 54512752, "step": 57080 }, { "epoch": 4.656578840035892, "grad_norm": 0.874871015548706, "learning_rate": 3.225505200471532e-05, "loss": 0.2891, "num_input_tokens_seen": 54517552, "step": 57085 }, { "epoch": 4.6569867036463, "grad_norm": 1.1119972467422485, "learning_rate": 3.225164582204647e-05, "loss": 0.3659, "num_input_tokens_seen": 54522448, "step": 57090 }, { "epoch": 4.657394567256709, "grad_norm": 0.6546391844749451, "learning_rate": 3.2248239492389835e-05, "loss": 0.2905, "num_input_tokens_seen": 54527808, "step": 57095 }, { "epoch": 4.657802430867118, "grad_norm": 1.07683265209198, "learning_rate": 3.224483301581446e-05, "loss": 0.3745, "num_input_tokens_seen": 54532768, "step": 57100 }, { "epoch": 4.658210294477527, "grad_norm": 2.0092413425445557, "learning_rate": 3.2241426392389406e-05, "loss": 0.4036, "num_input_tokens_seen": 54537664, "step": 57105 }, { "epoch": 4.658618158087935, "grad_norm": 1.167655348777771, "learning_rate": 3.223801962218372e-05, "loss": 0.3486, "num_input_tokens_seen": 54542592, "step": 57110 }, { "epoch": 4.659026021698344, "grad_norm": 1.867529273033142, "learning_rate": 3.223461270526644e-05, "loss": 0.3327, "num_input_tokens_seen": 54547120, "step": 57115 }, { "epoch": 4.6594338853087525, "grad_norm": 0.9387982487678528, "learning_rate": 3.223120564170665e-05, "loss": 0.4138, "num_input_tokens_seen": 54551680, "step": 57120 }, { "epoch": 4.6598417489191615, "grad_norm": 1.3102976083755493, "learning_rate": 3.222779843157338e-05, "loss": 0.3463, "num_input_tokens_seen": 54556544, "step": 57125 }, { "epoch": 4.6602496125295705, "grad_norm": 1.4390654563903809, "learning_rate": 3.222439107493571e-05, "loss": 0.4244, "num_input_tokens_seen": 54562288, "step": 57130 }, { "epoch": 4.660657476139979, "grad_norm": 0.9207435846328735, "learning_rate": 3.222098357186271e-05, "loss": 0.3226, "num_input_tokens_seen": 54567168, "step": 57135 }, { "epoch": 4.661065339750388, "grad_norm": 1.2417155504226685, "learning_rate": 3.2217575922423446e-05, "loss": 0.3245, "num_input_tokens_seen": 54571440, "step": 57140 }, { "epoch": 4.661473203360796, "grad_norm": 0.3734259605407715, "learning_rate": 3.221416812668697e-05, "loss": 0.3553, "num_input_tokens_seen": 54576176, "step": 57145 }, { "epoch": 4.661881066971205, "grad_norm": 0.8416047096252441, "learning_rate": 3.221076018472238e-05, "loss": 0.3522, "num_input_tokens_seen": 54579872, "step": 57150 }, { "epoch": 4.662288930581614, "grad_norm": 0.5150159001350403, "learning_rate": 3.220735209659874e-05, "loss": 0.3382, "num_input_tokens_seen": 54585088, "step": 57155 }, { "epoch": 4.662696794192022, "grad_norm": 0.36919453740119934, "learning_rate": 3.220394386238514e-05, "loss": 0.3685, "num_input_tokens_seen": 54590576, "step": 57160 }, { "epoch": 4.663104657802431, "grad_norm": 0.7801339626312256, "learning_rate": 3.220053548215066e-05, "loss": 0.3442, "num_input_tokens_seen": 54595312, "step": 57165 }, { "epoch": 4.66351252141284, "grad_norm": 1.3760794401168823, "learning_rate": 3.219712695596438e-05, "loss": 0.3264, "num_input_tokens_seen": 54600000, "step": 57170 }, { "epoch": 4.663920385023248, "grad_norm": 0.5458818078041077, "learning_rate": 3.21937182838954e-05, "loss": 0.3277, "num_input_tokens_seen": 54604416, "step": 57175 }, { "epoch": 4.664328248633657, "grad_norm": 1.6201810836791992, "learning_rate": 3.21903094660128e-05, "loss": 0.408, "num_input_tokens_seen": 54609376, "step": 57180 }, { "epoch": 4.664736112244066, "grad_norm": 1.3715988397598267, "learning_rate": 3.218690050238569e-05, "loss": 0.356, "num_input_tokens_seen": 54614848, "step": 57185 }, { "epoch": 4.665143975854474, "grad_norm": 0.49496710300445557, "learning_rate": 3.218349139308316e-05, "loss": 0.3348, "num_input_tokens_seen": 54618480, "step": 57190 }, { "epoch": 4.665551839464883, "grad_norm": 1.22744882106781, "learning_rate": 3.218008213817431e-05, "loss": 0.3399, "num_input_tokens_seen": 54624368, "step": 57195 }, { "epoch": 4.665959703075291, "grad_norm": 0.7636399865150452, "learning_rate": 3.217667273772825e-05, "loss": 0.3425, "num_input_tokens_seen": 54628880, "step": 57200 }, { "epoch": 4.6663675666857, "grad_norm": 1.0182697772979736, "learning_rate": 3.217326319181408e-05, "loss": 0.3686, "num_input_tokens_seen": 54634800, "step": 57205 }, { "epoch": 4.666775430296109, "grad_norm": 0.8518248200416565, "learning_rate": 3.216985350050091e-05, "loss": 0.3398, "num_input_tokens_seen": 54639680, "step": 57210 }, { "epoch": 4.667183293906517, "grad_norm": 1.042704463005066, "learning_rate": 3.216644366385786e-05, "loss": 0.3428, "num_input_tokens_seen": 54644816, "step": 57215 }, { "epoch": 4.667591157516926, "grad_norm": 0.8785368800163269, "learning_rate": 3.216303368195404e-05, "loss": 0.3623, "num_input_tokens_seen": 54649728, "step": 57220 }, { "epoch": 4.667999021127335, "grad_norm": 1.9638160467147827, "learning_rate": 3.215962355485857e-05, "loss": 0.3529, "num_input_tokens_seen": 54654928, "step": 57225 }, { "epoch": 4.668406884737744, "grad_norm": 0.9906433820724487, "learning_rate": 3.215621328264058e-05, "loss": 0.3529, "num_input_tokens_seen": 54658512, "step": 57230 }, { "epoch": 4.668814748348153, "grad_norm": 0.885450005531311, "learning_rate": 3.215280286536918e-05, "loss": 0.3339, "num_input_tokens_seen": 54663536, "step": 57235 }, { "epoch": 4.669222611958561, "grad_norm": 1.3642007112503052, "learning_rate": 3.2149392303113504e-05, "loss": 0.34, "num_input_tokens_seen": 54669440, "step": 57240 }, { "epoch": 4.66963047556897, "grad_norm": 0.8095383644104004, "learning_rate": 3.214598159594269e-05, "loss": 0.3281, "num_input_tokens_seen": 54673728, "step": 57245 }, { "epoch": 4.670038339179379, "grad_norm": 0.6754347085952759, "learning_rate": 3.214257074392586e-05, "loss": 0.3332, "num_input_tokens_seen": 54678480, "step": 57250 }, { "epoch": 4.670446202789787, "grad_norm": 1.1035187244415283, "learning_rate": 3.213915974713216e-05, "loss": 0.3816, "num_input_tokens_seen": 54683376, "step": 57255 }, { "epoch": 4.670854066400196, "grad_norm": 0.8036801815032959, "learning_rate": 3.213574860563072e-05, "loss": 0.3283, "num_input_tokens_seen": 54687712, "step": 57260 }, { "epoch": 4.671261930010605, "grad_norm": 1.5240981578826904, "learning_rate": 3.21323373194907e-05, "loss": 0.3169, "num_input_tokens_seen": 54693008, "step": 57265 }, { "epoch": 4.671669793621013, "grad_norm": 0.4916989505290985, "learning_rate": 3.212892588878123e-05, "loss": 0.355, "num_input_tokens_seen": 54698496, "step": 57270 }, { "epoch": 4.672077657231422, "grad_norm": 2.4772160053253174, "learning_rate": 3.2125514313571456e-05, "loss": 0.3755, "num_input_tokens_seen": 54703680, "step": 57275 }, { "epoch": 4.67248552084183, "grad_norm": 0.7333495020866394, "learning_rate": 3.2122102593930545e-05, "loss": 0.3115, "num_input_tokens_seen": 54707984, "step": 57280 }, { "epoch": 4.672893384452239, "grad_norm": 0.8902022838592529, "learning_rate": 3.211869072992763e-05, "loss": 0.3695, "num_input_tokens_seen": 54712960, "step": 57285 }, { "epoch": 4.673301248062648, "grad_norm": 0.6133390069007874, "learning_rate": 3.2115278721631875e-05, "loss": 0.2903, "num_input_tokens_seen": 54717648, "step": 57290 }, { "epoch": 4.673709111673056, "grad_norm": 0.8562213778495789, "learning_rate": 3.2111866569112445e-05, "loss": 0.4035, "num_input_tokens_seen": 54721568, "step": 57295 }, { "epoch": 4.674116975283465, "grad_norm": 4.425440788269043, "learning_rate": 3.210845427243851e-05, "loss": 0.3306, "num_input_tokens_seen": 54726128, "step": 57300 }, { "epoch": 4.674524838893874, "grad_norm": 0.8584269285202026, "learning_rate": 3.2105041831679225e-05, "loss": 0.3539, "num_input_tokens_seen": 54731120, "step": 57305 }, { "epoch": 4.674932702504282, "grad_norm": 0.9481752514839172, "learning_rate": 3.210162924690376e-05, "loss": 0.2729, "num_input_tokens_seen": 54735040, "step": 57310 }, { "epoch": 4.675340566114691, "grad_norm": 1.703324556350708, "learning_rate": 3.209821651818128e-05, "loss": 0.3413, "num_input_tokens_seen": 54740272, "step": 57315 }, { "epoch": 4.6757484297251, "grad_norm": 0.4286893308162689, "learning_rate": 3.2094803645580985e-05, "loss": 0.2699, "num_input_tokens_seen": 54745984, "step": 57320 }, { "epoch": 4.6761562933355085, "grad_norm": 1.049254059791565, "learning_rate": 3.2091390629172016e-05, "loss": 0.3409, "num_input_tokens_seen": 54750336, "step": 57325 }, { "epoch": 4.6765641569459175, "grad_norm": 0.6401040554046631, "learning_rate": 3.208797746902359e-05, "loss": 0.2728, "num_input_tokens_seen": 54755072, "step": 57330 }, { "epoch": 4.676972020556326, "grad_norm": 0.8569510579109192, "learning_rate": 3.208456416520486e-05, "loss": 0.4147, "num_input_tokens_seen": 54759696, "step": 57335 }, { "epoch": 4.677379884166735, "grad_norm": 0.5584927797317505, "learning_rate": 3.2081150717785034e-05, "loss": 0.3907, "num_input_tokens_seen": 54764992, "step": 57340 }, { "epoch": 4.677787747777144, "grad_norm": 1.7969985008239746, "learning_rate": 3.2077737126833284e-05, "loss": 0.2615, "num_input_tokens_seen": 54769072, "step": 57345 }, { "epoch": 4.678195611387552, "grad_norm": 0.5796071290969849, "learning_rate": 3.207432339241881e-05, "loss": 0.3698, "num_input_tokens_seen": 54773552, "step": 57350 }, { "epoch": 4.678603474997961, "grad_norm": 0.8752330541610718, "learning_rate": 3.207090951461081e-05, "loss": 0.3375, "num_input_tokens_seen": 54777984, "step": 57355 }, { "epoch": 4.679011338608369, "grad_norm": 1.1014368534088135, "learning_rate": 3.206749549347848e-05, "loss": 0.3483, "num_input_tokens_seen": 54782816, "step": 57360 }, { "epoch": 4.679419202218778, "grad_norm": 0.5796964764595032, "learning_rate": 3.206408132909102e-05, "loss": 0.3985, "num_input_tokens_seen": 54786976, "step": 57365 }, { "epoch": 4.679827065829187, "grad_norm": 1.1276881694793701, "learning_rate": 3.2060667021517624e-05, "loss": 0.3834, "num_input_tokens_seen": 54792096, "step": 57370 }, { "epoch": 4.680234929439595, "grad_norm": 0.6566528081893921, "learning_rate": 3.205725257082752e-05, "loss": 0.3431, "num_input_tokens_seen": 54796816, "step": 57375 }, { "epoch": 4.680642793050004, "grad_norm": 0.9607199430465698, "learning_rate": 3.20538379770899e-05, "loss": 0.3021, "num_input_tokens_seen": 54801664, "step": 57380 }, { "epoch": 4.681050656660413, "grad_norm": 0.8916268944740295, "learning_rate": 3.2050423240373975e-05, "loss": 0.3452, "num_input_tokens_seen": 54806016, "step": 57385 }, { "epoch": 4.681458520270821, "grad_norm": 1.057699203491211, "learning_rate": 3.204700836074898e-05, "loss": 0.3773, "num_input_tokens_seen": 54810624, "step": 57390 }, { "epoch": 4.68186638388123, "grad_norm": 0.46133100986480713, "learning_rate": 3.204359333828411e-05, "loss": 0.352, "num_input_tokens_seen": 54815072, "step": 57395 }, { "epoch": 4.682274247491639, "grad_norm": 0.6948308944702148, "learning_rate": 3.20401781730486e-05, "loss": 0.3887, "num_input_tokens_seen": 54818544, "step": 57400 }, { "epoch": 4.682682111102047, "grad_norm": 0.9091702699661255, "learning_rate": 3.203676286511167e-05, "loss": 0.3436, "num_input_tokens_seen": 54823008, "step": 57405 }, { "epoch": 4.683089974712456, "grad_norm": 0.6791921257972717, "learning_rate": 3.2033347414542545e-05, "loss": 0.3436, "num_input_tokens_seen": 54827632, "step": 57410 }, { "epoch": 4.6834978383228645, "grad_norm": 0.6576984524726868, "learning_rate": 3.202993182141046e-05, "loss": 0.4293, "num_input_tokens_seen": 54832752, "step": 57415 }, { "epoch": 4.6839057019332735, "grad_norm": 0.774573028087616, "learning_rate": 3.2026516085784644e-05, "loss": 0.3273, "num_input_tokens_seen": 54837424, "step": 57420 }, { "epoch": 4.6843135655436825, "grad_norm": 1.0238370895385742, "learning_rate": 3.2023100207734324e-05, "loss": 0.3447, "num_input_tokens_seen": 54842976, "step": 57425 }, { "epoch": 4.684721429154091, "grad_norm": 0.7162394523620605, "learning_rate": 3.201968418732876e-05, "loss": 0.3303, "num_input_tokens_seen": 54847328, "step": 57430 }, { "epoch": 4.6851292927645, "grad_norm": 1.3666269779205322, "learning_rate": 3.201626802463717e-05, "loss": 0.3365, "num_input_tokens_seen": 54852400, "step": 57435 }, { "epoch": 4.685537156374908, "grad_norm": 0.6191187500953674, "learning_rate": 3.2012851719728814e-05, "loss": 0.2817, "num_input_tokens_seen": 54857488, "step": 57440 }, { "epoch": 4.685945019985317, "grad_norm": 0.6899523138999939, "learning_rate": 3.200943527267293e-05, "loss": 0.2572, "num_input_tokens_seen": 54861760, "step": 57445 }, { "epoch": 4.686352883595726, "grad_norm": 0.5551542639732361, "learning_rate": 3.200601868353878e-05, "loss": 0.3888, "num_input_tokens_seen": 54866384, "step": 57450 }, { "epoch": 4.686760747206134, "grad_norm": 1.141513705253601, "learning_rate": 3.20026019523956e-05, "loss": 0.3416, "num_input_tokens_seen": 54872080, "step": 57455 }, { "epoch": 4.687168610816543, "grad_norm": 0.6993191838264465, "learning_rate": 3.199918507931265e-05, "loss": 0.3153, "num_input_tokens_seen": 54877280, "step": 57460 }, { "epoch": 4.687576474426952, "grad_norm": 1.5617742538452148, "learning_rate": 3.19957680643592e-05, "loss": 0.3095, "num_input_tokens_seen": 54882064, "step": 57465 }, { "epoch": 4.68798433803736, "grad_norm": 1.5492331981658936, "learning_rate": 3.199235090760451e-05, "loss": 0.4931, "num_input_tokens_seen": 54886144, "step": 57470 }, { "epoch": 4.688392201647769, "grad_norm": 2.3348422050476074, "learning_rate": 3.198893360911782e-05, "loss": 0.3005, "num_input_tokens_seen": 54890912, "step": 57475 }, { "epoch": 4.688800065258178, "grad_norm": 1.9809080362319946, "learning_rate": 3.1985516168968434e-05, "loss": 0.3334, "num_input_tokens_seen": 54894928, "step": 57480 }, { "epoch": 4.689207928868586, "grad_norm": 1.2438327074050903, "learning_rate": 3.1982098587225595e-05, "loss": 0.3199, "num_input_tokens_seen": 54899840, "step": 57485 }, { "epoch": 4.689615792478995, "grad_norm": 0.7851750254631042, "learning_rate": 3.197868086395859e-05, "loss": 0.3182, "num_input_tokens_seen": 54904768, "step": 57490 }, { "epoch": 4.690023656089403, "grad_norm": 1.191465139389038, "learning_rate": 3.197526299923668e-05, "loss": 0.305, "num_input_tokens_seen": 54909968, "step": 57495 }, { "epoch": 4.690431519699812, "grad_norm": 1.7053569555282593, "learning_rate": 3.197184499312916e-05, "loss": 0.3152, "num_input_tokens_seen": 54914912, "step": 57500 }, { "epoch": 4.690839383310221, "grad_norm": 1.7723087072372437, "learning_rate": 3.19684268457053e-05, "loss": 0.3652, "num_input_tokens_seen": 54920080, "step": 57505 }, { "epoch": 4.691247246920629, "grad_norm": 1.010802984237671, "learning_rate": 3.196500855703439e-05, "loss": 0.3448, "num_input_tokens_seen": 54924464, "step": 57510 }, { "epoch": 4.6916551105310385, "grad_norm": 1.5478951930999756, "learning_rate": 3.196159012718571e-05, "loss": 0.3041, "num_input_tokens_seen": 54929376, "step": 57515 }, { "epoch": 4.6920629741414475, "grad_norm": 1.2769640684127808, "learning_rate": 3.1958171556228575e-05, "loss": 0.3366, "num_input_tokens_seen": 54934544, "step": 57520 }, { "epoch": 4.692470837751856, "grad_norm": 1.3029859066009521, "learning_rate": 3.195475284423224e-05, "loss": 0.4088, "num_input_tokens_seen": 54938960, "step": 57525 }, { "epoch": 4.692878701362265, "grad_norm": 0.7019935250282288, "learning_rate": 3.195133399126603e-05, "loss": 0.2901, "num_input_tokens_seen": 54944064, "step": 57530 }, { "epoch": 4.693286564972674, "grad_norm": 1.7350046634674072, "learning_rate": 3.194791499739924e-05, "loss": 0.3109, "num_input_tokens_seen": 54948896, "step": 57535 }, { "epoch": 4.693694428583082, "grad_norm": 0.8188924789428711, "learning_rate": 3.194449586270115e-05, "loss": 0.3303, "num_input_tokens_seen": 54953680, "step": 57540 }, { "epoch": 4.694102292193491, "grad_norm": 1.6109315156936646, "learning_rate": 3.1941076587241084e-05, "loss": 0.3196, "num_input_tokens_seen": 54959168, "step": 57545 }, { "epoch": 4.694510155803899, "grad_norm": 0.7513240575790405, "learning_rate": 3.1937657171088356e-05, "loss": 0.3202, "num_input_tokens_seen": 54963744, "step": 57550 }, { "epoch": 4.694918019414308, "grad_norm": 1.0182733535766602, "learning_rate": 3.1934237614312256e-05, "loss": 0.3235, "num_input_tokens_seen": 54968480, "step": 57555 }, { "epoch": 4.695325883024717, "grad_norm": 1.4021188020706177, "learning_rate": 3.19308179169821e-05, "loss": 0.3397, "num_input_tokens_seen": 54974000, "step": 57560 }, { "epoch": 4.695733746635125, "grad_norm": 1.3575531244277954, "learning_rate": 3.1927398079167226e-05, "loss": 0.3409, "num_input_tokens_seen": 54978608, "step": 57565 }, { "epoch": 4.696141610245534, "grad_norm": 1.978784203529358, "learning_rate": 3.1923978100936926e-05, "loss": 0.3048, "num_input_tokens_seen": 54984400, "step": 57570 }, { "epoch": 4.696549473855942, "grad_norm": 0.9943259358406067, "learning_rate": 3.192055798236053e-05, "loss": 0.3831, "num_input_tokens_seen": 54988928, "step": 57575 }, { "epoch": 4.696957337466351, "grad_norm": 1.7177327871322632, "learning_rate": 3.191713772350737e-05, "loss": 0.3141, "num_input_tokens_seen": 54993216, "step": 57580 }, { "epoch": 4.69736520107676, "grad_norm": 0.7316614985466003, "learning_rate": 3.191371732444676e-05, "loss": 0.2978, "num_input_tokens_seen": 54998160, "step": 57585 }, { "epoch": 4.697773064687168, "grad_norm": 2.257089853286743, "learning_rate": 3.191029678524805e-05, "loss": 0.4541, "num_input_tokens_seen": 55002816, "step": 57590 }, { "epoch": 4.698180928297577, "grad_norm": 2.6770780086517334, "learning_rate": 3.190687610598055e-05, "loss": 0.3219, "num_input_tokens_seen": 55007488, "step": 57595 }, { "epoch": 4.698588791907986, "grad_norm": 1.4788726568222046, "learning_rate": 3.190345528671361e-05, "loss": 0.3673, "num_input_tokens_seen": 55012096, "step": 57600 }, { "epoch": 4.698996655518394, "grad_norm": 1.9279391765594482, "learning_rate": 3.190003432751657e-05, "loss": 0.3442, "num_input_tokens_seen": 55016864, "step": 57605 }, { "epoch": 4.699404519128803, "grad_norm": 1.6880650520324707, "learning_rate": 3.189661322845876e-05, "loss": 0.3134, "num_input_tokens_seen": 55021856, "step": 57610 }, { "epoch": 4.699812382739212, "grad_norm": 2.7593085765838623, "learning_rate": 3.189319198960953e-05, "loss": 0.38, "num_input_tokens_seen": 55026784, "step": 57615 }, { "epoch": 4.7002202463496205, "grad_norm": 2.9282190799713135, "learning_rate": 3.188977061103823e-05, "loss": 0.3737, "num_input_tokens_seen": 55031664, "step": 57620 }, { "epoch": 4.7006281099600296, "grad_norm": 0.6425009369850159, "learning_rate": 3.1886349092814205e-05, "loss": 0.337, "num_input_tokens_seen": 55036688, "step": 57625 }, { "epoch": 4.701035973570438, "grad_norm": 0.45352044701576233, "learning_rate": 3.1882927435006814e-05, "loss": 0.3233, "num_input_tokens_seen": 55041504, "step": 57630 }, { "epoch": 4.701443837180847, "grad_norm": 2.348262310028076, "learning_rate": 3.1879505637685406e-05, "loss": 0.313, "num_input_tokens_seen": 55046208, "step": 57635 }, { "epoch": 4.701851700791256, "grad_norm": 1.167865514755249, "learning_rate": 3.187608370091934e-05, "loss": 0.3792, "num_input_tokens_seen": 55050992, "step": 57640 }, { "epoch": 4.702259564401664, "grad_norm": 0.8680973649024963, "learning_rate": 3.1872661624777984e-05, "loss": 0.3409, "num_input_tokens_seen": 55055536, "step": 57645 }, { "epoch": 4.702667428012073, "grad_norm": 1.7228182554244995, "learning_rate": 3.1869239409330694e-05, "loss": 0.3381, "num_input_tokens_seen": 55061040, "step": 57650 }, { "epoch": 4.703075291622481, "grad_norm": 2.4004018306732178, "learning_rate": 3.186581705464684e-05, "loss": 0.3431, "num_input_tokens_seen": 55065600, "step": 57655 }, { "epoch": 4.70348315523289, "grad_norm": 1.6359771490097046, "learning_rate": 3.1862394560795786e-05, "loss": 0.2591, "num_input_tokens_seen": 55070640, "step": 57660 }, { "epoch": 4.703891018843299, "grad_norm": 1.6691408157348633, "learning_rate": 3.185897192784692e-05, "loss": 0.3398, "num_input_tokens_seen": 55075360, "step": 57665 }, { "epoch": 4.704298882453708, "grad_norm": 0.767576277256012, "learning_rate": 3.18555491558696e-05, "loss": 0.3365, "num_input_tokens_seen": 55080512, "step": 57670 }, { "epoch": 4.704706746064116, "grad_norm": 2.342421531677246, "learning_rate": 3.185212624493322e-05, "loss": 0.2923, "num_input_tokens_seen": 55084704, "step": 57675 }, { "epoch": 4.705114609674525, "grad_norm": 1.433719515800476, "learning_rate": 3.184870319510715e-05, "loss": 0.3147, "num_input_tokens_seen": 55089520, "step": 57680 }, { "epoch": 4.705522473284933, "grad_norm": 0.5933435559272766, "learning_rate": 3.184528000646078e-05, "loss": 0.3164, "num_input_tokens_seen": 55094480, "step": 57685 }, { "epoch": 4.705930336895342, "grad_norm": 1.0217680931091309, "learning_rate": 3.1841856679063485e-05, "loss": 0.3354, "num_input_tokens_seen": 55100032, "step": 57690 }, { "epoch": 4.706338200505751, "grad_norm": 1.3015588521957397, "learning_rate": 3.1838433212984666e-05, "loss": 0.4873, "num_input_tokens_seen": 55104032, "step": 57695 }, { "epoch": 4.706746064116159, "grad_norm": 0.8605812191963196, "learning_rate": 3.183500960829371e-05, "loss": 0.3673, "num_input_tokens_seen": 55109248, "step": 57700 }, { "epoch": 4.707153927726568, "grad_norm": 1.1972486972808838, "learning_rate": 3.183158586506001e-05, "loss": 0.3893, "num_input_tokens_seen": 55114240, "step": 57705 }, { "epoch": 4.7075617913369765, "grad_norm": 0.8340182304382324, "learning_rate": 3.182816198335298e-05, "loss": 0.3939, "num_input_tokens_seen": 55119088, "step": 57710 }, { "epoch": 4.7079696549473855, "grad_norm": 1.6854496002197266, "learning_rate": 3.182473796324199e-05, "loss": 0.3396, "num_input_tokens_seen": 55123712, "step": 57715 }, { "epoch": 4.7083775185577945, "grad_norm": 0.35851025581359863, "learning_rate": 3.182131380479648e-05, "loss": 0.3428, "num_input_tokens_seen": 55128576, "step": 57720 }, { "epoch": 4.708785382168203, "grad_norm": 1.0374237298965454, "learning_rate": 3.1817889508085824e-05, "loss": 0.3173, "num_input_tokens_seen": 55132976, "step": 57725 }, { "epoch": 4.709193245778612, "grad_norm": 0.9448410868644714, "learning_rate": 3.181446507317945e-05, "loss": 0.3479, "num_input_tokens_seen": 55138448, "step": 57730 }, { "epoch": 4.709601109389021, "grad_norm": 0.8472577929496765, "learning_rate": 3.1811040500146765e-05, "loss": 0.3034, "num_input_tokens_seen": 55144144, "step": 57735 }, { "epoch": 4.710008972999429, "grad_norm": 0.5798988938331604, "learning_rate": 3.1807615789057185e-05, "loss": 0.3667, "num_input_tokens_seen": 55148992, "step": 57740 }, { "epoch": 4.710416836609838, "grad_norm": 0.7191986441612244, "learning_rate": 3.1804190939980124e-05, "loss": 0.2794, "num_input_tokens_seen": 55153808, "step": 57745 }, { "epoch": 4.710824700220247, "grad_norm": 2.2115283012390137, "learning_rate": 3.1800765952985e-05, "loss": 0.363, "num_input_tokens_seen": 55159040, "step": 57750 }, { "epoch": 4.711232563830655, "grad_norm": 0.8795392513275146, "learning_rate": 3.179734082814125e-05, "loss": 0.3876, "num_input_tokens_seen": 55164240, "step": 57755 }, { "epoch": 4.711640427441064, "grad_norm": 0.6340319514274597, "learning_rate": 3.1793915565518276e-05, "loss": 0.3409, "num_input_tokens_seen": 55168528, "step": 57760 }, { "epoch": 4.712048291051472, "grad_norm": 1.4088263511657715, "learning_rate": 3.179049016518553e-05, "loss": 0.363, "num_input_tokens_seen": 55172544, "step": 57765 }, { "epoch": 4.712456154661881, "grad_norm": 0.8148741722106934, "learning_rate": 3.1787064627212435e-05, "loss": 0.3222, "num_input_tokens_seen": 55176624, "step": 57770 }, { "epoch": 4.71286401827229, "grad_norm": 0.8592115044593811, "learning_rate": 3.178363895166842e-05, "loss": 0.3525, "num_input_tokens_seen": 55181040, "step": 57775 }, { "epoch": 4.713271881882698, "grad_norm": 1.887759804725647, "learning_rate": 3.178021313862292e-05, "loss": 0.2849, "num_input_tokens_seen": 55186176, "step": 57780 }, { "epoch": 4.713679745493107, "grad_norm": 1.1647247076034546, "learning_rate": 3.177678718814539e-05, "loss": 0.331, "num_input_tokens_seen": 55190112, "step": 57785 }, { "epoch": 4.714087609103515, "grad_norm": 0.5784027576446533, "learning_rate": 3.1773361100305265e-05, "loss": 0.4229, "num_input_tokens_seen": 55195152, "step": 57790 }, { "epoch": 4.714495472713924, "grad_norm": 1.6398544311523438, "learning_rate": 3.176993487517198e-05, "loss": 0.3263, "num_input_tokens_seen": 55199296, "step": 57795 }, { "epoch": 4.714903336324333, "grad_norm": 0.2508352994918823, "learning_rate": 3.176650851281499e-05, "loss": 0.322, "num_input_tokens_seen": 55203632, "step": 57800 }, { "epoch": 4.7153111999347415, "grad_norm": 1.4025518894195557, "learning_rate": 3.1763082013303755e-05, "loss": 0.3798, "num_input_tokens_seen": 55209120, "step": 57805 }, { "epoch": 4.7157190635451505, "grad_norm": 2.1449739933013916, "learning_rate": 3.175965537670772e-05, "loss": 0.4079, "num_input_tokens_seen": 55213728, "step": 57810 }, { "epoch": 4.7161269271555595, "grad_norm": 1.2712619304656982, "learning_rate": 3.175622860309635e-05, "loss": 0.3085, "num_input_tokens_seen": 55218288, "step": 57815 }, { "epoch": 4.716534790765968, "grad_norm": 1.8186302185058594, "learning_rate": 3.1752801692539086e-05, "loss": 0.4602, "num_input_tokens_seen": 55223456, "step": 57820 }, { "epoch": 4.716942654376377, "grad_norm": 1.0719172954559326, "learning_rate": 3.174937464510541e-05, "loss": 0.3298, "num_input_tokens_seen": 55228992, "step": 57825 }, { "epoch": 4.717350517986786, "grad_norm": 1.0729657411575317, "learning_rate": 3.174594746086477e-05, "loss": 0.3447, "num_input_tokens_seen": 55234272, "step": 57830 }, { "epoch": 4.717758381597194, "grad_norm": 1.67677640914917, "learning_rate": 3.174252013988665e-05, "loss": 0.32, "num_input_tokens_seen": 55238112, "step": 57835 }, { "epoch": 4.718166245207603, "grad_norm": 1.3179043531417847, "learning_rate": 3.17390926822405e-05, "loss": 0.3102, "num_input_tokens_seen": 55242656, "step": 57840 }, { "epoch": 4.718574108818011, "grad_norm": 2.0668046474456787, "learning_rate": 3.173566508799581e-05, "loss": 0.3616, "num_input_tokens_seen": 55247232, "step": 57845 }, { "epoch": 4.71898197242842, "grad_norm": 1.1103211641311646, "learning_rate": 3.173223735722205e-05, "loss": 0.3686, "num_input_tokens_seen": 55252176, "step": 57850 }, { "epoch": 4.719389836038829, "grad_norm": 0.9185727834701538, "learning_rate": 3.172880948998871e-05, "loss": 0.3498, "num_input_tokens_seen": 55257056, "step": 57855 }, { "epoch": 4.719797699649237, "grad_norm": 0.7436705827713013, "learning_rate": 3.172538148636525e-05, "loss": 0.3149, "num_input_tokens_seen": 55260544, "step": 57860 }, { "epoch": 4.720205563259646, "grad_norm": 1.62086021900177, "learning_rate": 3.172195334642117e-05, "loss": 0.4056, "num_input_tokens_seen": 55264800, "step": 57865 }, { "epoch": 4.720613426870055, "grad_norm": 0.48020559549331665, "learning_rate": 3.171852507022595e-05, "loss": 0.3797, "num_input_tokens_seen": 55269616, "step": 57870 }, { "epoch": 4.721021290480463, "grad_norm": 1.4152042865753174, "learning_rate": 3.171509665784908e-05, "loss": 0.312, "num_input_tokens_seen": 55274512, "step": 57875 }, { "epoch": 4.721429154090872, "grad_norm": 1.5744916200637817, "learning_rate": 3.171166810936005e-05, "loss": 0.3648, "num_input_tokens_seen": 55279184, "step": 57880 }, { "epoch": 4.721837017701281, "grad_norm": 1.9561412334442139, "learning_rate": 3.170823942482837e-05, "loss": 0.3198, "num_input_tokens_seen": 55283088, "step": 57885 }, { "epoch": 4.722244881311689, "grad_norm": 0.962140679359436, "learning_rate": 3.170481060432352e-05, "loss": 0.3233, "num_input_tokens_seen": 55287680, "step": 57890 }, { "epoch": 4.722652744922098, "grad_norm": 1.1664507389068604, "learning_rate": 3.170138164791501e-05, "loss": 0.3543, "num_input_tokens_seen": 55292560, "step": 57895 }, { "epoch": 4.723060608532506, "grad_norm": 2.153081178665161, "learning_rate": 3.169795255567235e-05, "loss": 0.3356, "num_input_tokens_seen": 55298288, "step": 57900 }, { "epoch": 4.723468472142915, "grad_norm": 0.968012273311615, "learning_rate": 3.169452332766503e-05, "loss": 0.3526, "num_input_tokens_seen": 55303584, "step": 57905 }, { "epoch": 4.723876335753324, "grad_norm": 0.8920627236366272, "learning_rate": 3.1691093963962566e-05, "loss": 0.3787, "num_input_tokens_seen": 55308400, "step": 57910 }, { "epoch": 4.724284199363733, "grad_norm": 1.8944696187973022, "learning_rate": 3.168766446463446e-05, "loss": 0.3406, "num_input_tokens_seen": 55313184, "step": 57915 }, { "epoch": 4.724692062974142, "grad_norm": 0.7803494334220886, "learning_rate": 3.168423482975026e-05, "loss": 0.3255, "num_input_tokens_seen": 55318288, "step": 57920 }, { "epoch": 4.72509992658455, "grad_norm": 0.563213586807251, "learning_rate": 3.168080505937944e-05, "loss": 0.3961, "num_input_tokens_seen": 55322704, "step": 57925 }, { "epoch": 4.725507790194959, "grad_norm": 1.6777615547180176, "learning_rate": 3.167737515359155e-05, "loss": 0.4093, "num_input_tokens_seen": 55328272, "step": 57930 }, { "epoch": 4.725915653805368, "grad_norm": 1.4943047761917114, "learning_rate": 3.1673945112456104e-05, "loss": 0.2852, "num_input_tokens_seen": 55333232, "step": 57935 }, { "epoch": 4.726323517415776, "grad_norm": 1.1172761917114258, "learning_rate": 3.1670514936042625e-05, "loss": 0.2906, "num_input_tokens_seen": 55338032, "step": 57940 }, { "epoch": 4.726731381026185, "grad_norm": 1.0766733884811401, "learning_rate": 3.166708462442064e-05, "loss": 0.3608, "num_input_tokens_seen": 55342928, "step": 57945 }, { "epoch": 4.727139244636594, "grad_norm": 0.7211112380027771, "learning_rate": 3.1663654177659686e-05, "loss": 0.372, "num_input_tokens_seen": 55347056, "step": 57950 }, { "epoch": 4.727547108247002, "grad_norm": 1.095024585723877, "learning_rate": 3.166022359582929e-05, "loss": 0.4155, "num_input_tokens_seen": 55351792, "step": 57955 }, { "epoch": 4.727954971857411, "grad_norm": 0.42385032773017883, "learning_rate": 3.1656792878999e-05, "loss": 0.325, "num_input_tokens_seen": 55356624, "step": 57960 }, { "epoch": 4.72836283546782, "grad_norm": 1.2992486953735352, "learning_rate": 3.1653362027238344e-05, "loss": 0.3646, "num_input_tokens_seen": 55361856, "step": 57965 }, { "epoch": 4.728770699078228, "grad_norm": 1.5243359804153442, "learning_rate": 3.164993104061685e-05, "loss": 0.3565, "num_input_tokens_seen": 55366320, "step": 57970 }, { "epoch": 4.729178562688637, "grad_norm": 0.7808519005775452, "learning_rate": 3.1646499919204105e-05, "loss": 0.3304, "num_input_tokens_seen": 55371552, "step": 57975 }, { "epoch": 4.729586426299045, "grad_norm": 0.9508414268493652, "learning_rate": 3.164306866306962e-05, "loss": 0.3419, "num_input_tokens_seen": 55376608, "step": 57980 }, { "epoch": 4.729994289909454, "grad_norm": 1.4005686044692993, "learning_rate": 3.163963727228295e-05, "loss": 0.3605, "num_input_tokens_seen": 55381024, "step": 57985 }, { "epoch": 4.730402153519863, "grad_norm": 1.2053945064544678, "learning_rate": 3.163620574691366e-05, "loss": 0.3665, "num_input_tokens_seen": 55385968, "step": 57990 }, { "epoch": 4.730810017130271, "grad_norm": 0.8735960721969604, "learning_rate": 3.1632774087031296e-05, "loss": 0.3246, "num_input_tokens_seen": 55391008, "step": 57995 }, { "epoch": 4.73121788074068, "grad_norm": 1.7075695991516113, "learning_rate": 3.162934229270542e-05, "loss": 0.3265, "num_input_tokens_seen": 55396736, "step": 58000 }, { "epoch": 4.7316257443510885, "grad_norm": 1.2422531843185425, "learning_rate": 3.16259103640056e-05, "loss": 0.3812, "num_input_tokens_seen": 55401536, "step": 58005 }, { "epoch": 4.7320336079614975, "grad_norm": 0.9597416520118713, "learning_rate": 3.162247830100138e-05, "loss": 0.2485, "num_input_tokens_seen": 55406288, "step": 58010 }, { "epoch": 4.7324414715719065, "grad_norm": 0.535798966884613, "learning_rate": 3.161904610376235e-05, "loss": 0.3709, "num_input_tokens_seen": 55411488, "step": 58015 }, { "epoch": 4.732849335182315, "grad_norm": 1.4428770542144775, "learning_rate": 3.161561377235807e-05, "loss": 0.3414, "num_input_tokens_seen": 55415392, "step": 58020 }, { "epoch": 4.733257198792724, "grad_norm": 0.5461612939834595, "learning_rate": 3.16121813068581e-05, "loss": 0.4194, "num_input_tokens_seen": 55419072, "step": 58025 }, { "epoch": 4.733665062403133, "grad_norm": 0.7448759078979492, "learning_rate": 3.160874870733203e-05, "loss": 0.37, "num_input_tokens_seen": 55424528, "step": 58030 }, { "epoch": 4.734072926013541, "grad_norm": 0.8998239636421204, "learning_rate": 3.1605315973849434e-05, "loss": 0.3318, "num_input_tokens_seen": 55429136, "step": 58035 }, { "epoch": 4.73448078962395, "grad_norm": 1.0296416282653809, "learning_rate": 3.160188310647988e-05, "loss": 0.3294, "num_input_tokens_seen": 55433616, "step": 58040 }, { "epoch": 4.734888653234359, "grad_norm": 1.6343293190002441, "learning_rate": 3.1598450105292974e-05, "loss": 0.3183, "num_input_tokens_seen": 55438480, "step": 58045 }, { "epoch": 4.735296516844767, "grad_norm": 0.7071617245674133, "learning_rate": 3.159501697035828e-05, "loss": 0.34, "num_input_tokens_seen": 55442960, "step": 58050 }, { "epoch": 4.735704380455176, "grad_norm": 0.7194316983222961, "learning_rate": 3.1591583701745395e-05, "loss": 0.3535, "num_input_tokens_seen": 55447536, "step": 58055 }, { "epoch": 4.736112244065584, "grad_norm": 0.9897907972335815, "learning_rate": 3.158815029952391e-05, "loss": 0.3542, "num_input_tokens_seen": 55451008, "step": 58060 }, { "epoch": 4.736520107675993, "grad_norm": 0.8360373377799988, "learning_rate": 3.158471676376342e-05, "loss": 0.3428, "num_input_tokens_seen": 55455568, "step": 58065 }, { "epoch": 4.736927971286402, "grad_norm": 1.0469731092453003, "learning_rate": 3.1581283094533514e-05, "loss": 0.3436, "num_input_tokens_seen": 55460624, "step": 58070 }, { "epoch": 4.73733583489681, "grad_norm": 1.1732217073440552, "learning_rate": 3.1577849291903805e-05, "loss": 0.3294, "num_input_tokens_seen": 55465600, "step": 58075 }, { "epoch": 4.737743698507219, "grad_norm": 1.2324422597885132, "learning_rate": 3.157441535594387e-05, "loss": 0.3394, "num_input_tokens_seen": 55470576, "step": 58080 }, { "epoch": 4.738151562117628, "grad_norm": 0.7382604479789734, "learning_rate": 3.1570981286723344e-05, "loss": 0.3288, "num_input_tokens_seen": 55475568, "step": 58085 }, { "epoch": 4.738559425728036, "grad_norm": 0.8858738541603088, "learning_rate": 3.1567547084311824e-05, "loss": 0.3437, "num_input_tokens_seen": 55479520, "step": 58090 }, { "epoch": 4.738967289338445, "grad_norm": 0.9743711352348328, "learning_rate": 3.1564112748778905e-05, "loss": 0.3529, "num_input_tokens_seen": 55483488, "step": 58095 }, { "epoch": 4.739375152948854, "grad_norm": 1.3648756742477417, "learning_rate": 3.1560678280194214e-05, "loss": 0.3527, "num_input_tokens_seen": 55488016, "step": 58100 }, { "epoch": 4.7397830165592625, "grad_norm": 1.3084172010421753, "learning_rate": 3.1557243678627355e-05, "loss": 0.3136, "num_input_tokens_seen": 55492352, "step": 58105 }, { "epoch": 4.7401908801696715, "grad_norm": 0.8244088292121887, "learning_rate": 3.155380894414796e-05, "loss": 0.3851, "num_input_tokens_seen": 55498240, "step": 58110 }, { "epoch": 4.74059874378008, "grad_norm": 1.066679835319519, "learning_rate": 3.155037407682564e-05, "loss": 0.3475, "num_input_tokens_seen": 55503312, "step": 58115 }, { "epoch": 4.741006607390489, "grad_norm": 0.8520274758338928, "learning_rate": 3.154693907673002e-05, "loss": 0.2903, "num_input_tokens_seen": 55508176, "step": 58120 }, { "epoch": 4.741414471000898, "grad_norm": 1.4332149028778076, "learning_rate": 3.1543503943930733e-05, "loss": 0.3561, "num_input_tokens_seen": 55512432, "step": 58125 }, { "epoch": 4.741822334611306, "grad_norm": 1.8020148277282715, "learning_rate": 3.154006867849739e-05, "loss": 0.2795, "num_input_tokens_seen": 55517264, "step": 58130 }, { "epoch": 4.742230198221715, "grad_norm": 0.9122903347015381, "learning_rate": 3.1536633280499645e-05, "loss": 0.4443, "num_input_tokens_seen": 55521744, "step": 58135 }, { "epoch": 4.742638061832123, "grad_norm": 1.280516266822815, "learning_rate": 3.1533197750007116e-05, "loss": 0.3965, "num_input_tokens_seen": 55525856, "step": 58140 }, { "epoch": 4.743045925442532, "grad_norm": 1.0070849657058716, "learning_rate": 3.152976208708944e-05, "loss": 0.2909, "num_input_tokens_seen": 55530064, "step": 58145 }, { "epoch": 4.743453789052941, "grad_norm": 0.6954509019851685, "learning_rate": 3.152632629181627e-05, "loss": 0.2857, "num_input_tokens_seen": 55535184, "step": 58150 }, { "epoch": 4.743861652663349, "grad_norm": 0.8972771167755127, "learning_rate": 3.152289036425724e-05, "loss": 0.3063, "num_input_tokens_seen": 55539488, "step": 58155 }, { "epoch": 4.744269516273758, "grad_norm": 1.419899344444275, "learning_rate": 3.151945430448199e-05, "loss": 0.4011, "num_input_tokens_seen": 55544400, "step": 58160 }, { "epoch": 4.744677379884167, "grad_norm": 0.8993918895721436, "learning_rate": 3.1516018112560166e-05, "loss": 0.3553, "num_input_tokens_seen": 55549184, "step": 58165 }, { "epoch": 4.745085243494575, "grad_norm": 1.4222160577774048, "learning_rate": 3.151258178856144e-05, "loss": 0.3557, "num_input_tokens_seen": 55555008, "step": 58170 }, { "epoch": 4.745493107104984, "grad_norm": 1.2153472900390625, "learning_rate": 3.150914533255543e-05, "loss": 0.3582, "num_input_tokens_seen": 55559600, "step": 58175 }, { "epoch": 4.745900970715393, "grad_norm": 1.3913705348968506, "learning_rate": 3.150570874461182e-05, "loss": 0.2937, "num_input_tokens_seen": 55564672, "step": 58180 }, { "epoch": 4.746308834325801, "grad_norm": 1.3619886636734009, "learning_rate": 3.150227202480026e-05, "loss": 0.3727, "num_input_tokens_seen": 55568480, "step": 58185 }, { "epoch": 4.74671669793621, "grad_norm": 1.0681984424591064, "learning_rate": 3.14988351731904e-05, "loss": 0.3509, "num_input_tokens_seen": 55573472, "step": 58190 }, { "epoch": 4.747124561546618, "grad_norm": 1.0403817892074585, "learning_rate": 3.1495398189851916e-05, "loss": 0.3904, "num_input_tokens_seen": 55577488, "step": 58195 }, { "epoch": 4.7475324251570274, "grad_norm": 1.2561087608337402, "learning_rate": 3.149196107485447e-05, "loss": 0.3174, "num_input_tokens_seen": 55582240, "step": 58200 }, { "epoch": 4.7479402887674365, "grad_norm": 1.243373155593872, "learning_rate": 3.148852382826774e-05, "loss": 0.3521, "num_input_tokens_seen": 55587696, "step": 58205 }, { "epoch": 4.748348152377845, "grad_norm": 0.9267206192016602, "learning_rate": 3.1485086450161375e-05, "loss": 0.3192, "num_input_tokens_seen": 55592224, "step": 58210 }, { "epoch": 4.748756015988254, "grad_norm": 1.1529079675674438, "learning_rate": 3.148164894060507e-05, "loss": 0.3718, "num_input_tokens_seen": 55597360, "step": 58215 }, { "epoch": 4.749163879598662, "grad_norm": 1.0168876647949219, "learning_rate": 3.14782112996685e-05, "loss": 0.3366, "num_input_tokens_seen": 55601712, "step": 58220 }, { "epoch": 4.749571743209071, "grad_norm": 1.4962024688720703, "learning_rate": 3.147477352742133e-05, "loss": 0.3235, "num_input_tokens_seen": 55606592, "step": 58225 }, { "epoch": 4.74997960681948, "grad_norm": 1.3558754920959473, "learning_rate": 3.1471335623933265e-05, "loss": 0.3298, "num_input_tokens_seen": 55611424, "step": 58230 }, { "epoch": 4.750387470429889, "grad_norm": 1.7058907747268677, "learning_rate": 3.146789758927397e-05, "loss": 0.2941, "num_input_tokens_seen": 55616784, "step": 58235 }, { "epoch": 4.750795334040297, "grad_norm": 0.7360165119171143, "learning_rate": 3.1464459423513144e-05, "loss": 0.3502, "num_input_tokens_seen": 55621600, "step": 58240 }, { "epoch": 4.751203197650706, "grad_norm": 1.1113263368606567, "learning_rate": 3.146102112672047e-05, "loss": 0.3082, "num_input_tokens_seen": 55626480, "step": 58245 }, { "epoch": 4.751611061261114, "grad_norm": 1.0919227600097656, "learning_rate": 3.145758269896564e-05, "loss": 0.2815, "num_input_tokens_seen": 55631184, "step": 58250 }, { "epoch": 4.752018924871523, "grad_norm": 1.8221772909164429, "learning_rate": 3.145414414031835e-05, "loss": 0.349, "num_input_tokens_seen": 55635776, "step": 58255 }, { "epoch": 4.752426788481932, "grad_norm": 1.7091307640075684, "learning_rate": 3.14507054508483e-05, "loss": 0.3804, "num_input_tokens_seen": 55640512, "step": 58260 }, { "epoch": 4.75283465209234, "grad_norm": 3.057131290435791, "learning_rate": 3.14472666306252e-05, "loss": 0.3933, "num_input_tokens_seen": 55645088, "step": 58265 }, { "epoch": 4.753242515702749, "grad_norm": 2.4775657653808594, "learning_rate": 3.144382767971873e-05, "loss": 0.332, "num_input_tokens_seen": 55649920, "step": 58270 }, { "epoch": 4.753650379313157, "grad_norm": 1.988329529762268, "learning_rate": 3.144038859819863e-05, "loss": 0.3747, "num_input_tokens_seen": 55654624, "step": 58275 }, { "epoch": 4.754058242923566, "grad_norm": 1.6044678688049316, "learning_rate": 3.1436949386134576e-05, "loss": 0.3382, "num_input_tokens_seen": 55659520, "step": 58280 }, { "epoch": 4.754466106533975, "grad_norm": 1.4853229522705078, "learning_rate": 3.14335100435963e-05, "loss": 0.3208, "num_input_tokens_seen": 55665248, "step": 58285 }, { "epoch": 4.754873970144383, "grad_norm": 1.8495960235595703, "learning_rate": 3.143007057065352e-05, "loss": 0.3426, "num_input_tokens_seen": 55670880, "step": 58290 }, { "epoch": 4.755281833754792, "grad_norm": 2.1738576889038086, "learning_rate": 3.142663096737592e-05, "loss": 0.3552, "num_input_tokens_seen": 55676144, "step": 58295 }, { "epoch": 4.755689697365201, "grad_norm": 2.277174234390259, "learning_rate": 3.1423191233833247e-05, "loss": 0.3002, "num_input_tokens_seen": 55681504, "step": 58300 }, { "epoch": 4.7560975609756095, "grad_norm": 1.9344929456710815, "learning_rate": 3.141975137009523e-05, "loss": 0.3274, "num_input_tokens_seen": 55685952, "step": 58305 }, { "epoch": 4.7565054245860185, "grad_norm": 1.640160083770752, "learning_rate": 3.1416311376231574e-05, "loss": 0.2579, "num_input_tokens_seen": 55690400, "step": 58310 }, { "epoch": 4.756913288196428, "grad_norm": 2.2379767894744873, "learning_rate": 3.1412871252312005e-05, "loss": 0.4245, "num_input_tokens_seen": 55694912, "step": 58315 }, { "epoch": 4.757321151806836, "grad_norm": 1.0273022651672363, "learning_rate": 3.140943099840627e-05, "loss": 0.2656, "num_input_tokens_seen": 55699904, "step": 58320 }, { "epoch": 4.757729015417245, "grad_norm": 2.53517746925354, "learning_rate": 3.1405990614584084e-05, "loss": 0.3761, "num_input_tokens_seen": 55703600, "step": 58325 }, { "epoch": 4.758136879027653, "grad_norm": 1.6883413791656494, "learning_rate": 3.140255010091519e-05, "loss": 0.3255, "num_input_tokens_seen": 55708192, "step": 58330 }, { "epoch": 4.758544742638062, "grad_norm": 1.860629677772522, "learning_rate": 3.1399109457469336e-05, "loss": 0.387, "num_input_tokens_seen": 55713232, "step": 58335 }, { "epoch": 4.758952606248471, "grad_norm": 0.9582507610321045, "learning_rate": 3.1395668684316245e-05, "loss": 0.3418, "num_input_tokens_seen": 55717936, "step": 58340 }, { "epoch": 4.759360469858879, "grad_norm": 2.203183650970459, "learning_rate": 3.139222778152568e-05, "loss": 0.3382, "num_input_tokens_seen": 55723232, "step": 58345 }, { "epoch": 4.759768333469288, "grad_norm": 2.14768123626709, "learning_rate": 3.138878674916735e-05, "loss": 0.3973, "num_input_tokens_seen": 55728064, "step": 58350 }, { "epoch": 4.760176197079696, "grad_norm": 2.0347232818603516, "learning_rate": 3.138534558731105e-05, "loss": 0.3529, "num_input_tokens_seen": 55732336, "step": 58355 }, { "epoch": 4.760584060690105, "grad_norm": 1.9413586854934692, "learning_rate": 3.138190429602649e-05, "loss": 0.306, "num_input_tokens_seen": 55737504, "step": 58360 }, { "epoch": 4.760991924300514, "grad_norm": 1.17967689037323, "learning_rate": 3.137846287538346e-05, "loss": 0.3461, "num_input_tokens_seen": 55742336, "step": 58365 }, { "epoch": 4.761399787910922, "grad_norm": 2.4495742321014404, "learning_rate": 3.137502132545169e-05, "loss": 0.3039, "num_input_tokens_seen": 55746624, "step": 58370 }, { "epoch": 4.761807651521331, "grad_norm": 2.3398637771606445, "learning_rate": 3.1371579646300944e-05, "loss": 0.2869, "num_input_tokens_seen": 55751024, "step": 58375 }, { "epoch": 4.76221551513174, "grad_norm": 0.8467229604721069, "learning_rate": 3.136813783800099e-05, "loss": 0.2019, "num_input_tokens_seen": 55755648, "step": 58380 }, { "epoch": 4.762623378742148, "grad_norm": 1.2651420831680298, "learning_rate": 3.1364695900621586e-05, "loss": 0.472, "num_input_tokens_seen": 55760480, "step": 58385 }, { "epoch": 4.763031242352557, "grad_norm": 0.7684861421585083, "learning_rate": 3.13612538342325e-05, "loss": 0.3235, "num_input_tokens_seen": 55764800, "step": 58390 }, { "epoch": 4.763439105962966, "grad_norm": 2.249016046524048, "learning_rate": 3.13578116389035e-05, "loss": 0.2978, "num_input_tokens_seen": 55769680, "step": 58395 }, { "epoch": 4.7638469695733745, "grad_norm": 1.890016794204712, "learning_rate": 3.1354369314704366e-05, "loss": 0.2515, "num_input_tokens_seen": 55774528, "step": 58400 }, { "epoch": 4.7642548331837835, "grad_norm": 3.6817917823791504, "learning_rate": 3.1350926861704855e-05, "loss": 0.3553, "num_input_tokens_seen": 55779424, "step": 58405 }, { "epoch": 4.764662696794192, "grad_norm": 1.659185528755188, "learning_rate": 3.134748427997477e-05, "loss": 0.4603, "num_input_tokens_seen": 55783904, "step": 58410 }, { "epoch": 4.765070560404601, "grad_norm": 1.1423906087875366, "learning_rate": 3.134404156958386e-05, "loss": 0.4313, "num_input_tokens_seen": 55788384, "step": 58415 }, { "epoch": 4.76547842401501, "grad_norm": 1.2791922092437744, "learning_rate": 3.134059873060193e-05, "loss": 0.3695, "num_input_tokens_seen": 55792640, "step": 58420 }, { "epoch": 4.765886287625418, "grad_norm": 2.7624704837799072, "learning_rate": 3.133715576309876e-05, "loss": 0.4016, "num_input_tokens_seen": 55796928, "step": 58425 }, { "epoch": 4.766294151235827, "grad_norm": 5.223348617553711, "learning_rate": 3.1333712667144136e-05, "loss": 0.3821, "num_input_tokens_seen": 55802208, "step": 58430 }, { "epoch": 4.766702014846236, "grad_norm": 0.9661152362823486, "learning_rate": 3.133026944280785e-05, "loss": 0.3318, "num_input_tokens_seen": 55807344, "step": 58435 }, { "epoch": 4.767109878456644, "grad_norm": 1.5117518901824951, "learning_rate": 3.132682609015969e-05, "loss": 0.3592, "num_input_tokens_seen": 55811584, "step": 58440 }, { "epoch": 4.767517742067053, "grad_norm": 3.1841185092926025, "learning_rate": 3.1323382609269454e-05, "loss": 0.348, "num_input_tokens_seen": 55816176, "step": 58445 }, { "epoch": 4.767925605677462, "grad_norm": 1.0012985467910767, "learning_rate": 3.131993900020693e-05, "loss": 0.392, "num_input_tokens_seen": 55820448, "step": 58450 }, { "epoch": 4.76833346928787, "grad_norm": 1.1360337734222412, "learning_rate": 3.131649526304194e-05, "loss": 0.3079, "num_input_tokens_seen": 55825552, "step": 58455 }, { "epoch": 4.768741332898279, "grad_norm": 3.0662996768951416, "learning_rate": 3.131305139784427e-05, "loss": 0.3612, "num_input_tokens_seen": 55830624, "step": 58460 }, { "epoch": 4.769149196508687, "grad_norm": 1.865427017211914, "learning_rate": 3.1309607404683735e-05, "loss": 0.2993, "num_input_tokens_seen": 55836032, "step": 58465 }, { "epoch": 4.769557060119096, "grad_norm": 1.096786618232727, "learning_rate": 3.130616328363013e-05, "loss": 0.3589, "num_input_tokens_seen": 55840928, "step": 58470 }, { "epoch": 4.769964923729505, "grad_norm": 1.6049864292144775, "learning_rate": 3.130271903475328e-05, "loss": 0.3715, "num_input_tokens_seen": 55845184, "step": 58475 }, { "epoch": 4.770372787339913, "grad_norm": 1.871968388557434, "learning_rate": 3.129927465812299e-05, "loss": 0.3505, "num_input_tokens_seen": 55849424, "step": 58480 }, { "epoch": 4.770780650950322, "grad_norm": 1.2844815254211426, "learning_rate": 3.129583015380909e-05, "loss": 0.2986, "num_input_tokens_seen": 55854144, "step": 58485 }, { "epoch": 4.7711885145607305, "grad_norm": 3.2220468521118164, "learning_rate": 3.129238552188138e-05, "loss": 0.3866, "num_input_tokens_seen": 55858208, "step": 58490 }, { "epoch": 4.7715963781711395, "grad_norm": 2.5382726192474365, "learning_rate": 3.128894076240969e-05, "loss": 0.3434, "num_input_tokens_seen": 55862816, "step": 58495 }, { "epoch": 4.7720042417815485, "grad_norm": 2.052267074584961, "learning_rate": 3.128549587546384e-05, "loss": 0.3098, "num_input_tokens_seen": 55867760, "step": 58500 }, { "epoch": 4.772412105391957, "grad_norm": 1.7174558639526367, "learning_rate": 3.1282050861113664e-05, "loss": 0.3706, "num_input_tokens_seen": 55872960, "step": 58505 }, { "epoch": 4.772819969002366, "grad_norm": 2.8978042602539062, "learning_rate": 3.127860571942899e-05, "loss": 0.3289, "num_input_tokens_seen": 55877120, "step": 58510 }, { "epoch": 4.773227832612775, "grad_norm": 1.1705236434936523, "learning_rate": 3.127516045047963e-05, "loss": 0.3314, "num_input_tokens_seen": 55881504, "step": 58515 }, { "epoch": 4.773635696223183, "grad_norm": 0.8545922040939331, "learning_rate": 3.127171505433545e-05, "loss": 0.3236, "num_input_tokens_seen": 55886160, "step": 58520 }, { "epoch": 4.774043559833592, "grad_norm": 1.7717164754867554, "learning_rate": 3.126826953106627e-05, "loss": 0.4254, "num_input_tokens_seen": 55890496, "step": 58525 }, { "epoch": 4.774451423444001, "grad_norm": 1.3100134134292603, "learning_rate": 3.126482388074193e-05, "loss": 0.4274, "num_input_tokens_seen": 55894832, "step": 58530 }, { "epoch": 4.774859287054409, "grad_norm": 1.2768350839614868, "learning_rate": 3.126137810343226e-05, "loss": 0.3793, "num_input_tokens_seen": 55901024, "step": 58535 }, { "epoch": 4.775267150664818, "grad_norm": 1.4510612487792969, "learning_rate": 3.125793219920713e-05, "loss": 0.3112, "num_input_tokens_seen": 55905664, "step": 58540 }, { "epoch": 4.775675014275226, "grad_norm": 1.5679666996002197, "learning_rate": 3.125448616813637e-05, "loss": 0.3255, "num_input_tokens_seen": 55910768, "step": 58545 }, { "epoch": 4.776082877885635, "grad_norm": 1.7711960077285767, "learning_rate": 3.125104001028983e-05, "loss": 0.2971, "num_input_tokens_seen": 55915744, "step": 58550 }, { "epoch": 4.776490741496044, "grad_norm": 1.718988060951233, "learning_rate": 3.1247593725737365e-05, "loss": 0.3274, "num_input_tokens_seen": 55921136, "step": 58555 }, { "epoch": 4.776898605106452, "grad_norm": 3.2577505111694336, "learning_rate": 3.124414731454883e-05, "loss": 0.3821, "num_input_tokens_seen": 55925792, "step": 58560 }, { "epoch": 4.777306468716861, "grad_norm": 2.4058728218078613, "learning_rate": 3.124070077679408e-05, "loss": 0.2992, "num_input_tokens_seen": 55930528, "step": 58565 }, { "epoch": 4.777714332327269, "grad_norm": 1.9585862159729004, "learning_rate": 3.123725411254299e-05, "loss": 0.3277, "num_input_tokens_seen": 55935824, "step": 58570 }, { "epoch": 4.778122195937678, "grad_norm": 2.7677648067474365, "learning_rate": 3.1233807321865396e-05, "loss": 0.2993, "num_input_tokens_seen": 55939824, "step": 58575 }, { "epoch": 4.778530059548087, "grad_norm": 1.922257661819458, "learning_rate": 3.123036040483118e-05, "loss": 0.3657, "num_input_tokens_seen": 55944432, "step": 58580 }, { "epoch": 4.778937923158495, "grad_norm": 2.9033961296081543, "learning_rate": 3.122691336151021e-05, "loss": 0.3307, "num_input_tokens_seen": 55949152, "step": 58585 }, { "epoch": 4.779345786768904, "grad_norm": 0.7366693615913391, "learning_rate": 3.122346619197236e-05, "loss": 0.3326, "num_input_tokens_seen": 55953568, "step": 58590 }, { "epoch": 4.779753650379313, "grad_norm": 1.5424317121505737, "learning_rate": 3.122001889628747e-05, "loss": 0.3097, "num_input_tokens_seen": 55958736, "step": 58595 }, { "epoch": 4.7801615139897216, "grad_norm": 0.8108511567115784, "learning_rate": 3.121657147452546e-05, "loss": 0.2474, "num_input_tokens_seen": 55962240, "step": 58600 }, { "epoch": 4.780569377600131, "grad_norm": 3.3999435901641846, "learning_rate": 3.121312392675618e-05, "loss": 0.2851, "num_input_tokens_seen": 55967328, "step": 58605 }, { "epoch": 4.78097724121054, "grad_norm": 2.3748226165771484, "learning_rate": 3.120967625304952e-05, "loss": 0.222, "num_input_tokens_seen": 55973008, "step": 58610 }, { "epoch": 4.781385104820948, "grad_norm": 1.3513896465301514, "learning_rate": 3.120622845347536e-05, "loss": 0.3645, "num_input_tokens_seen": 55977632, "step": 58615 }, { "epoch": 4.781792968431357, "grad_norm": 0.8273859024047852, "learning_rate": 3.1202780528103586e-05, "loss": 0.4091, "num_input_tokens_seen": 55982432, "step": 58620 }, { "epoch": 4.782200832041765, "grad_norm": 1.0440219640731812, "learning_rate": 3.119933247700408e-05, "loss": 0.4023, "num_input_tokens_seen": 55987424, "step": 58625 }, { "epoch": 4.782608695652174, "grad_norm": 4.164237976074219, "learning_rate": 3.119588430024676e-05, "loss": 0.2436, "num_input_tokens_seen": 55992160, "step": 58630 }, { "epoch": 4.783016559262583, "grad_norm": 1.1818019151687622, "learning_rate": 3.1192435997901474e-05, "loss": 0.4171, "num_input_tokens_seen": 55996640, "step": 58635 }, { "epoch": 4.783424422872991, "grad_norm": 2.8002536296844482, "learning_rate": 3.118898757003816e-05, "loss": 0.2916, "num_input_tokens_seen": 56000832, "step": 58640 }, { "epoch": 4.7838322864834, "grad_norm": 2.5900614261627197, "learning_rate": 3.1185539016726686e-05, "loss": 0.4291, "num_input_tokens_seen": 56004912, "step": 58645 }, { "epoch": 4.784240150093809, "grad_norm": 2.7209036350250244, "learning_rate": 3.118209033803697e-05, "loss": 0.3703, "num_input_tokens_seen": 56009856, "step": 58650 }, { "epoch": 4.784648013704217, "grad_norm": 1.1583017110824585, "learning_rate": 3.117864153403892e-05, "loss": 0.3297, "num_input_tokens_seen": 56014832, "step": 58655 }, { "epoch": 4.785055877314626, "grad_norm": 3.05743408203125, "learning_rate": 3.117519260480241e-05, "loss": 0.4232, "num_input_tokens_seen": 56020448, "step": 58660 }, { "epoch": 4.785463740925035, "grad_norm": 4.530893325805664, "learning_rate": 3.1171743550397384e-05, "loss": 0.3296, "num_input_tokens_seen": 56024624, "step": 58665 }, { "epoch": 4.785871604535443, "grad_norm": 1.3632538318634033, "learning_rate": 3.116829437089373e-05, "loss": 0.3343, "num_input_tokens_seen": 56029616, "step": 58670 }, { "epoch": 4.786279468145852, "grad_norm": 0.9131392240524292, "learning_rate": 3.116484506636138e-05, "loss": 0.3122, "num_input_tokens_seen": 56034800, "step": 58675 }, { "epoch": 4.78668733175626, "grad_norm": 1.070810317993164, "learning_rate": 3.1161395636870225e-05, "loss": 0.2876, "num_input_tokens_seen": 56040032, "step": 58680 }, { "epoch": 4.787095195366669, "grad_norm": 3.8694097995758057, "learning_rate": 3.1157946082490216e-05, "loss": 0.3636, "num_input_tokens_seen": 56045664, "step": 58685 }, { "epoch": 4.787503058977078, "grad_norm": 1.9190700054168701, "learning_rate": 3.115449640329125e-05, "loss": 0.3724, "num_input_tokens_seen": 56051248, "step": 58690 }, { "epoch": 4.7879109225874865, "grad_norm": 1.0514363050460815, "learning_rate": 3.1151046599343254e-05, "loss": 0.2981, "num_input_tokens_seen": 56056816, "step": 58695 }, { "epoch": 4.7883187861978955, "grad_norm": 1.321929931640625, "learning_rate": 3.1147596670716164e-05, "loss": 0.3712, "num_input_tokens_seen": 56061312, "step": 58700 }, { "epoch": 4.788726649808304, "grad_norm": 3.585597515106201, "learning_rate": 3.114414661747989e-05, "loss": 0.303, "num_input_tokens_seen": 56066736, "step": 58705 }, { "epoch": 4.789134513418713, "grad_norm": 5.846653461456299, "learning_rate": 3.1140696439704385e-05, "loss": 0.3098, "num_input_tokens_seen": 56071456, "step": 58710 }, { "epoch": 4.789542377029122, "grad_norm": 4.29238748550415, "learning_rate": 3.113724613745957e-05, "loss": 0.3622, "num_input_tokens_seen": 56076224, "step": 58715 }, { "epoch": 4.78995024063953, "grad_norm": 1.9974757432937622, "learning_rate": 3.113379571081539e-05, "loss": 0.3987, "num_input_tokens_seen": 56081056, "step": 58720 }, { "epoch": 4.790358104249939, "grad_norm": 2.748443365097046, "learning_rate": 3.113034515984177e-05, "loss": 0.2833, "num_input_tokens_seen": 56085632, "step": 58725 }, { "epoch": 4.790765967860348, "grad_norm": 1.1145099401474, "learning_rate": 3.1126894484608656e-05, "loss": 0.3338, "num_input_tokens_seen": 56090400, "step": 58730 }, { "epoch": 4.791173831470756, "grad_norm": 3.1684958934783936, "learning_rate": 3.1123443685185997e-05, "loss": 0.3635, "num_input_tokens_seen": 56095968, "step": 58735 }, { "epoch": 4.791581695081165, "grad_norm": 1.0789997577667236, "learning_rate": 3.111999276164374e-05, "loss": 0.4231, "num_input_tokens_seen": 56100512, "step": 58740 }, { "epoch": 4.791989558691574, "grad_norm": 2.6784420013427734, "learning_rate": 3.111654171405183e-05, "loss": 0.3027, "num_input_tokens_seen": 56105312, "step": 58745 }, { "epoch": 4.792397422301982, "grad_norm": 3.0056941509246826, "learning_rate": 3.11130905424802e-05, "loss": 0.3906, "num_input_tokens_seen": 56109600, "step": 58750 }, { "epoch": 4.792805285912391, "grad_norm": 1.1223533153533936, "learning_rate": 3.1109639246998844e-05, "loss": 0.3138, "num_input_tokens_seen": 56114432, "step": 58755 }, { "epoch": 4.793213149522799, "grad_norm": 0.7900122404098511, "learning_rate": 3.110618782767769e-05, "loss": 0.3847, "num_input_tokens_seen": 56119152, "step": 58760 }, { "epoch": 4.793621013133208, "grad_norm": 1.6544525623321533, "learning_rate": 3.110273628458671e-05, "loss": 0.4158, "num_input_tokens_seen": 56124416, "step": 58765 }, { "epoch": 4.794028876743617, "grad_norm": 0.9306432604789734, "learning_rate": 3.1099284617795845e-05, "loss": 0.2855, "num_input_tokens_seen": 56129888, "step": 58770 }, { "epoch": 4.794436740354025, "grad_norm": 0.7932244539260864, "learning_rate": 3.109583282737508e-05, "loss": 0.3496, "num_input_tokens_seen": 56134416, "step": 58775 }, { "epoch": 4.794844603964434, "grad_norm": 1.378125548362732, "learning_rate": 3.109238091339438e-05, "loss": 0.3199, "num_input_tokens_seen": 56139696, "step": 58780 }, { "epoch": 4.795252467574843, "grad_norm": 0.8306365609169006, "learning_rate": 3.1088928875923706e-05, "loss": 0.3271, "num_input_tokens_seen": 56145520, "step": 58785 }, { "epoch": 4.7956603311852515, "grad_norm": 0.8452956676483154, "learning_rate": 3.108547671503303e-05, "loss": 0.3796, "num_input_tokens_seen": 56150000, "step": 58790 }, { "epoch": 4.7960681947956605, "grad_norm": 2.3276212215423584, "learning_rate": 3.1082024430792314e-05, "loss": 0.3121, "num_input_tokens_seen": 56154096, "step": 58795 }, { "epoch": 4.7964760584060695, "grad_norm": 1.2813640832901, "learning_rate": 3.107857202327156e-05, "loss": 0.312, "num_input_tokens_seen": 56158304, "step": 58800 }, { "epoch": 4.796883922016478, "grad_norm": 0.8965333700180054, "learning_rate": 3.107511949254073e-05, "loss": 0.2677, "num_input_tokens_seen": 56162880, "step": 58805 }, { "epoch": 4.797291785626887, "grad_norm": 0.6375470757484436, "learning_rate": 3.107166683866981e-05, "loss": 0.3266, "num_input_tokens_seen": 56167456, "step": 58810 }, { "epoch": 4.797699649237295, "grad_norm": 0.9694734811782837, "learning_rate": 3.1068214061728787e-05, "loss": 0.2718, "num_input_tokens_seen": 56172080, "step": 58815 }, { "epoch": 4.798107512847704, "grad_norm": 0.8383160829544067, "learning_rate": 3.106476116178764e-05, "loss": 0.3461, "num_input_tokens_seen": 56176144, "step": 58820 }, { "epoch": 4.798515376458113, "grad_norm": 1.2279045581817627, "learning_rate": 3.106130813891635e-05, "loss": 0.4606, "num_input_tokens_seen": 56180784, "step": 58825 }, { "epoch": 4.798923240068521, "grad_norm": 0.6365127563476562, "learning_rate": 3.1057854993184946e-05, "loss": 0.2771, "num_input_tokens_seen": 56185280, "step": 58830 }, { "epoch": 4.79933110367893, "grad_norm": 0.47183361649513245, "learning_rate": 3.105440172466337e-05, "loss": 0.2066, "num_input_tokens_seen": 56190448, "step": 58835 }, { "epoch": 4.799738967289338, "grad_norm": 1.4081577062606812, "learning_rate": 3.105094833342166e-05, "loss": 0.3621, "num_input_tokens_seen": 56195216, "step": 58840 }, { "epoch": 4.800146830899747, "grad_norm": 0.7330175638198853, "learning_rate": 3.1047494819529795e-05, "loss": 0.3288, "num_input_tokens_seen": 56200176, "step": 58845 }, { "epoch": 4.800554694510156, "grad_norm": 0.9330236911773682, "learning_rate": 3.1044041183057774e-05, "loss": 0.3897, "num_input_tokens_seen": 56204656, "step": 58850 }, { "epoch": 4.800962558120564, "grad_norm": 0.7484092712402344, "learning_rate": 3.104058742407561e-05, "loss": 0.3885, "num_input_tokens_seen": 56209552, "step": 58855 }, { "epoch": 4.801370421730973, "grad_norm": 0.7459593415260315, "learning_rate": 3.103713354265331e-05, "loss": 0.3158, "num_input_tokens_seen": 56213776, "step": 58860 }, { "epoch": 4.801778285341382, "grad_norm": 3.7929370403289795, "learning_rate": 3.103367953886087e-05, "loss": 0.3205, "num_input_tokens_seen": 56217648, "step": 58865 }, { "epoch": 4.80218614895179, "grad_norm": 1.136256456375122, "learning_rate": 3.103022541276831e-05, "loss": 0.3046, "num_input_tokens_seen": 56222176, "step": 58870 }, { "epoch": 4.802594012562199, "grad_norm": 1.6465699672698975, "learning_rate": 3.102677116444565e-05, "loss": 0.3885, "num_input_tokens_seen": 56227088, "step": 58875 }, { "epoch": 4.803001876172608, "grad_norm": 2.1821227073669434, "learning_rate": 3.10233167939629e-05, "loss": 0.3748, "num_input_tokens_seen": 56231936, "step": 58880 }, { "epoch": 4.803409739783016, "grad_norm": 1.1384996175765991, "learning_rate": 3.101986230139007e-05, "loss": 0.3635, "num_input_tokens_seen": 56236080, "step": 58885 }, { "epoch": 4.8038176033934255, "grad_norm": 0.9909389615058899, "learning_rate": 3.101640768679719e-05, "loss": 0.2975, "num_input_tokens_seen": 56240800, "step": 58890 }, { "epoch": 4.804225467003834, "grad_norm": 2.8628740310668945, "learning_rate": 3.101295295025429e-05, "loss": 0.2605, "num_input_tokens_seen": 56245856, "step": 58895 }, { "epoch": 4.804633330614243, "grad_norm": 2.4483423233032227, "learning_rate": 3.1009498091831385e-05, "loss": 0.3283, "num_input_tokens_seen": 56251024, "step": 58900 }, { "epoch": 4.805041194224652, "grad_norm": 2.934812545776367, "learning_rate": 3.100604311159851e-05, "loss": 0.3779, "num_input_tokens_seen": 56256256, "step": 58905 }, { "epoch": 4.80544905783506, "grad_norm": 0.453911155462265, "learning_rate": 3.100258800962569e-05, "loss": 0.2494, "num_input_tokens_seen": 56261344, "step": 58910 }, { "epoch": 4.805856921445469, "grad_norm": 2.108771562576294, "learning_rate": 3.0999132785982967e-05, "loss": 0.4003, "num_input_tokens_seen": 56265680, "step": 58915 }, { "epoch": 4.806264785055877, "grad_norm": 0.6610960364341736, "learning_rate": 3.099567744074036e-05, "loss": 0.2561, "num_input_tokens_seen": 56270128, "step": 58920 }, { "epoch": 4.806672648666286, "grad_norm": 1.8039604425430298, "learning_rate": 3.0992221973967924e-05, "loss": 0.3866, "num_input_tokens_seen": 56275312, "step": 58925 }, { "epoch": 4.807080512276695, "grad_norm": 0.6508393883705139, "learning_rate": 3.09887663857357e-05, "loss": 0.1819, "num_input_tokens_seen": 56280608, "step": 58930 }, { "epoch": 4.807488375887103, "grad_norm": 0.569796085357666, "learning_rate": 3.098531067611372e-05, "loss": 0.3519, "num_input_tokens_seen": 56286096, "step": 58935 }, { "epoch": 4.807896239497512, "grad_norm": 0.6043797135353088, "learning_rate": 3.098185484517204e-05, "loss": 0.2769, "num_input_tokens_seen": 56291664, "step": 58940 }, { "epoch": 4.808304103107921, "grad_norm": 0.8954386711120605, "learning_rate": 3.0978398892980706e-05, "loss": 0.3352, "num_input_tokens_seen": 56296256, "step": 58945 }, { "epoch": 4.808711966718329, "grad_norm": 1.3703550100326538, "learning_rate": 3.097494281960976e-05, "loss": 0.344, "num_input_tokens_seen": 56300768, "step": 58950 }, { "epoch": 4.809119830328738, "grad_norm": 1.0517371892929077, "learning_rate": 3.097148662512927e-05, "loss": 0.377, "num_input_tokens_seen": 56305440, "step": 58955 }, { "epoch": 4.809527693939147, "grad_norm": 1.650774598121643, "learning_rate": 3.096803030960927e-05, "loss": 0.3335, "num_input_tokens_seen": 56310672, "step": 58960 }, { "epoch": 4.809935557549555, "grad_norm": 1.6584680080413818, "learning_rate": 3.096457387311985e-05, "loss": 0.3628, "num_input_tokens_seen": 56316096, "step": 58965 }, { "epoch": 4.810343421159964, "grad_norm": 1.096480369567871, "learning_rate": 3.0961117315731034e-05, "loss": 0.3322, "num_input_tokens_seen": 56321168, "step": 58970 }, { "epoch": 4.810751284770372, "grad_norm": 0.9894477725028992, "learning_rate": 3.095766063751292e-05, "loss": 0.3659, "num_input_tokens_seen": 56325024, "step": 58975 }, { "epoch": 4.811159148380781, "grad_norm": 1.9316558837890625, "learning_rate": 3.0954203838535544e-05, "loss": 0.3516, "num_input_tokens_seen": 56330016, "step": 58980 }, { "epoch": 4.81156701199119, "grad_norm": 1.1797046661376953, "learning_rate": 3.095074691886899e-05, "loss": 0.3584, "num_input_tokens_seen": 56334752, "step": 58985 }, { "epoch": 4.8119748756015985, "grad_norm": 2.1157350540161133, "learning_rate": 3.094728987858333e-05, "loss": 0.3472, "num_input_tokens_seen": 56340016, "step": 58990 }, { "epoch": 4.8123827392120075, "grad_norm": 1.79200279712677, "learning_rate": 3.094383271774863e-05, "loss": 0.3612, "num_input_tokens_seen": 56344528, "step": 58995 }, { "epoch": 4.8127906028224166, "grad_norm": 0.60414719581604, "learning_rate": 3.0940375436434964e-05, "loss": 0.3793, "num_input_tokens_seen": 56348960, "step": 59000 }, { "epoch": 4.813198466432825, "grad_norm": 1.0828086137771606, "learning_rate": 3.09369180347124e-05, "loss": 0.3535, "num_input_tokens_seen": 56354096, "step": 59005 }, { "epoch": 4.813606330043234, "grad_norm": 0.9207062721252441, "learning_rate": 3.093346051265105e-05, "loss": 0.3251, "num_input_tokens_seen": 56358528, "step": 59010 }, { "epoch": 4.814014193653643, "grad_norm": 0.7795013785362244, "learning_rate": 3.093000287032096e-05, "loss": 0.3569, "num_input_tokens_seen": 56363360, "step": 59015 }, { "epoch": 4.814422057264051, "grad_norm": 0.6933233141899109, "learning_rate": 3.0926545107792247e-05, "loss": 0.3587, "num_input_tokens_seen": 56368544, "step": 59020 }, { "epoch": 4.81482992087446, "grad_norm": 0.8178785443305969, "learning_rate": 3.0923087225134974e-05, "loss": 0.3405, "num_input_tokens_seen": 56373488, "step": 59025 }, { "epoch": 4.815237784484868, "grad_norm": 2.5665884017944336, "learning_rate": 3.091962922241924e-05, "loss": 0.3396, "num_input_tokens_seen": 56377968, "step": 59030 }, { "epoch": 4.815645648095277, "grad_norm": 0.6551275253295898, "learning_rate": 3.091617109971513e-05, "loss": 0.3392, "num_input_tokens_seen": 56382240, "step": 59035 }, { "epoch": 4.816053511705686, "grad_norm": 0.7984985709190369, "learning_rate": 3.0912712857092764e-05, "loss": 0.318, "num_input_tokens_seen": 56387920, "step": 59040 }, { "epoch": 4.816461375316094, "grad_norm": 1.9960368871688843, "learning_rate": 3.0909254494622206e-05, "loss": 0.3728, "num_input_tokens_seen": 56391920, "step": 59045 }, { "epoch": 4.816869238926503, "grad_norm": 0.5139066576957703, "learning_rate": 3.090579601237358e-05, "loss": 0.3594, "num_input_tokens_seen": 56396416, "step": 59050 }, { "epoch": 4.817277102536911, "grad_norm": 0.8604274392127991, "learning_rate": 3.090233741041697e-05, "loss": 0.3452, "num_input_tokens_seen": 56401152, "step": 59055 }, { "epoch": 4.81768496614732, "grad_norm": 0.9742377996444702, "learning_rate": 3.0898878688822497e-05, "loss": 0.3313, "num_input_tokens_seen": 56406624, "step": 59060 }, { "epoch": 4.818092829757729, "grad_norm": 0.5329753756523132, "learning_rate": 3.0895419847660254e-05, "loss": 0.3491, "num_input_tokens_seen": 56411552, "step": 59065 }, { "epoch": 4.818500693368137, "grad_norm": 1.357187271118164, "learning_rate": 3.089196088700036e-05, "loss": 0.3188, "num_input_tokens_seen": 56416992, "step": 59070 }, { "epoch": 4.818908556978546, "grad_norm": 1.528242588043213, "learning_rate": 3.088850180691292e-05, "loss": 0.3379, "num_input_tokens_seen": 56421296, "step": 59075 }, { "epoch": 4.819316420588955, "grad_norm": 1.3542190790176392, "learning_rate": 3.088504260746804e-05, "loss": 0.3508, "num_input_tokens_seen": 56426688, "step": 59080 }, { "epoch": 4.8197242841993635, "grad_norm": 0.6111747622489929, "learning_rate": 3.0881583288735865e-05, "loss": 0.3409, "num_input_tokens_seen": 56430016, "step": 59085 }, { "epoch": 4.8201321478097725, "grad_norm": 1.665010929107666, "learning_rate": 3.0878123850786484e-05, "loss": 0.3364, "num_input_tokens_seen": 56435072, "step": 59090 }, { "epoch": 4.8205400114201815, "grad_norm": 0.2987368106842041, "learning_rate": 3.087466429369004e-05, "loss": 0.3338, "num_input_tokens_seen": 56439456, "step": 59095 }, { "epoch": 4.82094787503059, "grad_norm": 0.9802566170692444, "learning_rate": 3.087120461751664e-05, "loss": 0.3441, "num_input_tokens_seen": 56444160, "step": 59100 }, { "epoch": 4.821355738640999, "grad_norm": 1.122026801109314, "learning_rate": 3.086774482233642e-05, "loss": 0.2876, "num_input_tokens_seen": 56448464, "step": 59105 }, { "epoch": 4.821763602251407, "grad_norm": 0.6965798139572144, "learning_rate": 3.08642849082195e-05, "loss": 0.2732, "num_input_tokens_seen": 56453472, "step": 59110 }, { "epoch": 4.822171465861816, "grad_norm": 0.850677490234375, "learning_rate": 3.086082487523602e-05, "loss": 0.3333, "num_input_tokens_seen": 56458144, "step": 59115 }, { "epoch": 4.822579329472225, "grad_norm": 1.1629749536514282, "learning_rate": 3.0857364723456114e-05, "loss": 0.3792, "num_input_tokens_seen": 56463472, "step": 59120 }, { "epoch": 4.822987193082633, "grad_norm": 0.7642342448234558, "learning_rate": 3.085390445294991e-05, "loss": 0.3597, "num_input_tokens_seen": 56467696, "step": 59125 }, { "epoch": 4.823395056693042, "grad_norm": 1.2836040258407593, "learning_rate": 3.0850444063787557e-05, "loss": 0.382, "num_input_tokens_seen": 56471744, "step": 59130 }, { "epoch": 4.82380292030345, "grad_norm": 1.3066426515579224, "learning_rate": 3.084698355603918e-05, "loss": 0.3578, "num_input_tokens_seen": 56476128, "step": 59135 }, { "epoch": 4.824210783913859, "grad_norm": 2.3076541423797607, "learning_rate": 3.084352292977494e-05, "loss": 0.3893, "num_input_tokens_seen": 56480704, "step": 59140 }, { "epoch": 4.824618647524268, "grad_norm": 0.747957170009613, "learning_rate": 3.084006218506497e-05, "loss": 0.3289, "num_input_tokens_seen": 56485888, "step": 59145 }, { "epoch": 4.825026511134676, "grad_norm": 1.5517281293869019, "learning_rate": 3.0836601321979414e-05, "loss": 0.3542, "num_input_tokens_seen": 56490816, "step": 59150 }, { "epoch": 4.825434374745085, "grad_norm": 1.1010149717330933, "learning_rate": 3.0833140340588435e-05, "loss": 0.4085, "num_input_tokens_seen": 56495712, "step": 59155 }, { "epoch": 4.825842238355494, "grad_norm": 0.745872974395752, "learning_rate": 3.0829679240962174e-05, "loss": 0.3837, "num_input_tokens_seen": 56500128, "step": 59160 }, { "epoch": 4.826250101965902, "grad_norm": 1.039894461631775, "learning_rate": 3.08262180231708e-05, "loss": 0.3085, "num_input_tokens_seen": 56504496, "step": 59165 }, { "epoch": 4.826657965576311, "grad_norm": 1.491465449333191, "learning_rate": 3.0822756687284457e-05, "loss": 0.3606, "num_input_tokens_seen": 56509072, "step": 59170 }, { "epoch": 4.82706582918672, "grad_norm": 1.8693324327468872, "learning_rate": 3.0819295233373316e-05, "loss": 0.3768, "num_input_tokens_seen": 56514352, "step": 59175 }, { "epoch": 4.8274736927971285, "grad_norm": 1.3354098796844482, "learning_rate": 3.0815833661507525e-05, "loss": 0.3398, "num_input_tokens_seen": 56519344, "step": 59180 }, { "epoch": 4.8278815564075375, "grad_norm": 0.868251383304596, "learning_rate": 3.0812371971757265e-05, "loss": 0.3349, "num_input_tokens_seen": 56523424, "step": 59185 }, { "epoch": 4.828289420017946, "grad_norm": 0.7163839936256409, "learning_rate": 3.080891016419269e-05, "loss": 0.2712, "num_input_tokens_seen": 56528400, "step": 59190 }, { "epoch": 4.828697283628355, "grad_norm": 0.43430525064468384, "learning_rate": 3.0805448238883985e-05, "loss": 0.3287, "num_input_tokens_seen": 56533808, "step": 59195 }, { "epoch": 4.829105147238764, "grad_norm": 0.6030938625335693, "learning_rate": 3.08019861959013e-05, "loss": 0.3973, "num_input_tokens_seen": 56538704, "step": 59200 }, { "epoch": 4.829513010849172, "grad_norm": 0.941712498664856, "learning_rate": 3.079852403531482e-05, "loss": 0.3638, "num_input_tokens_seen": 56542864, "step": 59205 }, { "epoch": 4.829920874459581, "grad_norm": 0.8232848048210144, "learning_rate": 3.079506175719473e-05, "loss": 0.4171, "num_input_tokens_seen": 56546944, "step": 59210 }, { "epoch": 4.83032873806999, "grad_norm": 0.5423765778541565, "learning_rate": 3.079159936161118e-05, "loss": 0.3617, "num_input_tokens_seen": 56552816, "step": 59215 }, { "epoch": 4.830736601680398, "grad_norm": 0.5843826532363892, "learning_rate": 3.07881368486344e-05, "loss": 0.4022, "num_input_tokens_seen": 56557344, "step": 59220 }, { "epoch": 4.831144465290807, "grad_norm": 0.3222082257270813, "learning_rate": 3.0784674218334525e-05, "loss": 0.3201, "num_input_tokens_seen": 56561344, "step": 59225 }, { "epoch": 4.831552328901216, "grad_norm": 0.8195662498474121, "learning_rate": 3.0781211470781765e-05, "loss": 0.3489, "num_input_tokens_seen": 56566112, "step": 59230 }, { "epoch": 4.831960192511624, "grad_norm": 0.9787318110466003, "learning_rate": 3.077774860604631e-05, "loss": 0.3464, "num_input_tokens_seen": 56570960, "step": 59235 }, { "epoch": 4.832368056122033, "grad_norm": 1.0282245874404907, "learning_rate": 3.077428562419835e-05, "loss": 0.3397, "num_input_tokens_seen": 56576720, "step": 59240 }, { "epoch": 4.832775919732441, "grad_norm": 0.868716835975647, "learning_rate": 3.077082252530807e-05, "loss": 0.3457, "num_input_tokens_seen": 56581872, "step": 59245 }, { "epoch": 4.83318378334285, "grad_norm": 0.6464992165565491, "learning_rate": 3.0767359309445666e-05, "loss": 0.3998, "num_input_tokens_seen": 56586352, "step": 59250 }, { "epoch": 4.833591646953259, "grad_norm": 0.8032761812210083, "learning_rate": 3.076389597668135e-05, "loss": 0.3313, "num_input_tokens_seen": 56591232, "step": 59255 }, { "epoch": 4.833999510563667, "grad_norm": 0.3749648630619049, "learning_rate": 3.07604325270853e-05, "loss": 0.3567, "num_input_tokens_seen": 56595808, "step": 59260 }, { "epoch": 4.834407374174076, "grad_norm": 0.7673578858375549, "learning_rate": 3.075696896072773e-05, "loss": 0.3346, "num_input_tokens_seen": 56600960, "step": 59265 }, { "epoch": 4.834815237784484, "grad_norm": 0.8532865643501282, "learning_rate": 3.075350527767885e-05, "loss": 0.3534, "num_input_tokens_seen": 56605264, "step": 59270 }, { "epoch": 4.835223101394893, "grad_norm": 1.1281040906906128, "learning_rate": 3.075004147800887e-05, "loss": 0.3395, "num_input_tokens_seen": 56609632, "step": 59275 }, { "epoch": 4.835630965005302, "grad_norm": 0.5294163227081299, "learning_rate": 3.0746577561787973e-05, "loss": 0.3427, "num_input_tokens_seen": 56614336, "step": 59280 }, { "epoch": 4.8360388286157105, "grad_norm": 1.1490075588226318, "learning_rate": 3.074311352908641e-05, "loss": 0.3756, "num_input_tokens_seen": 56619936, "step": 59285 }, { "epoch": 4.83644669222612, "grad_norm": 0.6965187191963196, "learning_rate": 3.0739649379974366e-05, "loss": 0.329, "num_input_tokens_seen": 56624752, "step": 59290 }, { "epoch": 4.836854555836529, "grad_norm": 1.363871693611145, "learning_rate": 3.073618511452208e-05, "loss": 0.3256, "num_input_tokens_seen": 56630416, "step": 59295 }, { "epoch": 4.837262419446937, "grad_norm": 1.2098747491836548, "learning_rate": 3.0732720732799746e-05, "loss": 0.3577, "num_input_tokens_seen": 56635120, "step": 59300 }, { "epoch": 4.837670283057346, "grad_norm": 1.0120512247085571, "learning_rate": 3.07292562348776e-05, "loss": 0.3413, "num_input_tokens_seen": 56640464, "step": 59305 }, { "epoch": 4.838078146667755, "grad_norm": 1.043701171875, "learning_rate": 3.0725791620825865e-05, "loss": 0.3372, "num_input_tokens_seen": 56643824, "step": 59310 }, { "epoch": 4.838486010278163, "grad_norm": 1.030015468597412, "learning_rate": 3.072232689071477e-05, "loss": 0.3363, "num_input_tokens_seen": 56648592, "step": 59315 }, { "epoch": 4.838893873888572, "grad_norm": 0.8204217553138733, "learning_rate": 3.0718862044614544e-05, "loss": 0.316, "num_input_tokens_seen": 56653712, "step": 59320 }, { "epoch": 4.83930173749898, "grad_norm": 0.5774322748184204, "learning_rate": 3.07153970825954e-05, "loss": 0.3563, "num_input_tokens_seen": 56659008, "step": 59325 }, { "epoch": 4.839709601109389, "grad_norm": 1.5571867227554321, "learning_rate": 3.07119320047276e-05, "loss": 0.3498, "num_input_tokens_seen": 56664176, "step": 59330 }, { "epoch": 4.840117464719798, "grad_norm": 0.5408161878585815, "learning_rate": 3.0708466811081365e-05, "loss": 0.3437, "num_input_tokens_seen": 56668720, "step": 59335 }, { "epoch": 4.840525328330206, "grad_norm": 0.9667578339576721, "learning_rate": 3.070500150172693e-05, "loss": 0.3283, "num_input_tokens_seen": 56674096, "step": 59340 }, { "epoch": 4.840933191940615, "grad_norm": 1.205683708190918, "learning_rate": 3.070153607673455e-05, "loss": 0.3718, "num_input_tokens_seen": 56678480, "step": 59345 }, { "epoch": 4.841341055551024, "grad_norm": 1.9299160242080688, "learning_rate": 3.069807053617444e-05, "loss": 0.367, "num_input_tokens_seen": 56684208, "step": 59350 }, { "epoch": 4.841748919161432, "grad_norm": 1.7996807098388672, "learning_rate": 3.069460488011687e-05, "loss": 0.3429, "num_input_tokens_seen": 56689920, "step": 59355 }, { "epoch": 4.842156782771841, "grad_norm": 0.9659497737884521, "learning_rate": 3.0691139108632065e-05, "loss": 0.3215, "num_input_tokens_seen": 56694224, "step": 59360 }, { "epoch": 4.84256464638225, "grad_norm": 0.8126118183135986, "learning_rate": 3.0687673221790306e-05, "loss": 0.3562, "num_input_tokens_seen": 56699952, "step": 59365 }, { "epoch": 4.842972509992658, "grad_norm": 0.8679882287979126, "learning_rate": 3.068420721966181e-05, "loss": 0.3437, "num_input_tokens_seen": 56704640, "step": 59370 }, { "epoch": 4.843380373603067, "grad_norm": 1.1645456552505493, "learning_rate": 3.068074110231687e-05, "loss": 0.358, "num_input_tokens_seen": 56709904, "step": 59375 }, { "epoch": 4.8437882372134755, "grad_norm": 2.190169095993042, "learning_rate": 3.06772748698257e-05, "loss": 0.364, "num_input_tokens_seen": 56715200, "step": 59380 }, { "epoch": 4.8441961008238845, "grad_norm": 1.5069447755813599, "learning_rate": 3.06738085222586e-05, "loss": 0.3358, "num_input_tokens_seen": 56720768, "step": 59385 }, { "epoch": 4.8446039644342935, "grad_norm": 1.7956573963165283, "learning_rate": 3.06703420596858e-05, "loss": 0.3536, "num_input_tokens_seen": 56725792, "step": 59390 }, { "epoch": 4.845011828044702, "grad_norm": 2.8954570293426514, "learning_rate": 3.066687548217758e-05, "loss": 0.3482, "num_input_tokens_seen": 56730048, "step": 59395 }, { "epoch": 4.845419691655111, "grad_norm": 0.8390276432037354, "learning_rate": 3.066340878980421e-05, "loss": 0.396, "num_input_tokens_seen": 56735392, "step": 59400 }, { "epoch": 4.845827555265519, "grad_norm": 1.0026113986968994, "learning_rate": 3.065994198263594e-05, "loss": 0.3334, "num_input_tokens_seen": 56740352, "step": 59405 }, { "epoch": 4.846235418875928, "grad_norm": 1.3229988813400269, "learning_rate": 3.065647506074306e-05, "loss": 0.3538, "num_input_tokens_seen": 56744848, "step": 59410 }, { "epoch": 4.846643282486337, "grad_norm": 1.036577820777893, "learning_rate": 3.065300802419582e-05, "loss": 0.3859, "num_input_tokens_seen": 56749312, "step": 59415 }, { "epoch": 4.847051146096745, "grad_norm": 0.7168089151382446, "learning_rate": 3.064954087306453e-05, "loss": 0.415, "num_input_tokens_seen": 56754480, "step": 59420 }, { "epoch": 4.847459009707154, "grad_norm": 1.0746828317642212, "learning_rate": 3.0646073607419426e-05, "loss": 0.3562, "num_input_tokens_seen": 56759072, "step": 59425 }, { "epoch": 4.847866873317563, "grad_norm": 0.5178879499435425, "learning_rate": 3.0642606227330816e-05, "loss": 0.3397, "num_input_tokens_seen": 56763424, "step": 59430 }, { "epoch": 4.848274736927971, "grad_norm": 1.4093598127365112, "learning_rate": 3.063913873286898e-05, "loss": 0.3426, "num_input_tokens_seen": 56768208, "step": 59435 }, { "epoch": 4.84868260053838, "grad_norm": 1.5718530416488647, "learning_rate": 3.0635671124104194e-05, "loss": 0.3527, "num_input_tokens_seen": 56773136, "step": 59440 }, { "epoch": 4.849090464148789, "grad_norm": 0.8101872205734253, "learning_rate": 3.063220340110675e-05, "loss": 0.3173, "num_input_tokens_seen": 56777488, "step": 59445 }, { "epoch": 4.849498327759197, "grad_norm": 2.0468385219573975, "learning_rate": 3.0628735563946936e-05, "loss": 0.3358, "num_input_tokens_seen": 56781792, "step": 59450 }, { "epoch": 4.849906191369606, "grad_norm": 1.8172228336334229, "learning_rate": 3.0625267612695044e-05, "loss": 0.3265, "num_input_tokens_seen": 56787584, "step": 59455 }, { "epoch": 4.850314054980014, "grad_norm": 2.303978443145752, "learning_rate": 3.0621799547421364e-05, "loss": 0.2882, "num_input_tokens_seen": 56792320, "step": 59460 }, { "epoch": 4.850721918590423, "grad_norm": 2.428986072540283, "learning_rate": 3.0618331368196206e-05, "loss": 0.3555, "num_input_tokens_seen": 56796256, "step": 59465 }, { "epoch": 4.851129782200832, "grad_norm": 1.747355580329895, "learning_rate": 3.061486307508985e-05, "loss": 0.3373, "num_input_tokens_seen": 56800768, "step": 59470 }, { "epoch": 4.8515376458112405, "grad_norm": 0.47378629446029663, "learning_rate": 3.0611394668172616e-05, "loss": 0.3046, "num_input_tokens_seen": 56805200, "step": 59475 }, { "epoch": 4.8519455094216495, "grad_norm": 3.3966493606567383, "learning_rate": 3.060792614751478e-05, "loss": 0.4184, "num_input_tokens_seen": 56809920, "step": 59480 }, { "epoch": 4.852353373032058, "grad_norm": 2.515481948852539, "learning_rate": 3.060445751318668e-05, "loss": 0.3329, "num_input_tokens_seen": 56815008, "step": 59485 }, { "epoch": 4.852761236642467, "grad_norm": 2.395796298980713, "learning_rate": 3.06009887652586e-05, "loss": 0.3062, "num_input_tokens_seen": 56819456, "step": 59490 }, { "epoch": 4.853169100252876, "grad_norm": 2.1419718265533447, "learning_rate": 3.059751990380085e-05, "loss": 0.4149, "num_input_tokens_seen": 56824928, "step": 59495 }, { "epoch": 4.853576963863284, "grad_norm": 2.3303020000457764, "learning_rate": 3.059405092888377e-05, "loss": 0.321, "num_input_tokens_seen": 56830080, "step": 59500 }, { "epoch": 4.853984827473693, "grad_norm": 2.1161084175109863, "learning_rate": 3.0590581840577634e-05, "loss": 0.3696, "num_input_tokens_seen": 56834768, "step": 59505 }, { "epoch": 4.854392691084102, "grad_norm": 1.5867114067077637, "learning_rate": 3.058711263895279e-05, "loss": 0.3797, "num_input_tokens_seen": 56839760, "step": 59510 }, { "epoch": 4.85480055469451, "grad_norm": 1.488893985748291, "learning_rate": 3.058364332407955e-05, "loss": 0.308, "num_input_tokens_seen": 56845040, "step": 59515 }, { "epoch": 4.855208418304919, "grad_norm": 1.3598999977111816, "learning_rate": 3.058017389602822e-05, "loss": 0.3239, "num_input_tokens_seen": 56849600, "step": 59520 }, { "epoch": 4.855616281915328, "grad_norm": 1.4556405544281006, "learning_rate": 3.057670435486915e-05, "loss": 0.3664, "num_input_tokens_seen": 56854576, "step": 59525 }, { "epoch": 4.856024145525736, "grad_norm": 3.2900919914245605, "learning_rate": 3.057323470067264e-05, "loss": 0.3594, "num_input_tokens_seen": 56858560, "step": 59530 }, { "epoch": 4.856432009136145, "grad_norm": 1.8844231367111206, "learning_rate": 3.056976493350904e-05, "loss": 0.2564, "num_input_tokens_seen": 56862976, "step": 59535 }, { "epoch": 4.856839872746553, "grad_norm": 2.4706499576568604, "learning_rate": 3.056629505344867e-05, "loss": 0.3316, "num_input_tokens_seen": 56867808, "step": 59540 }, { "epoch": 4.857247736356962, "grad_norm": 1.1927050352096558, "learning_rate": 3.0562825060561866e-05, "loss": 0.305, "num_input_tokens_seen": 56871680, "step": 59545 }, { "epoch": 4.857655599967371, "grad_norm": 2.8998208045959473, "learning_rate": 3.055935495491896e-05, "loss": 0.3804, "num_input_tokens_seen": 56876208, "step": 59550 }, { "epoch": 4.858063463577779, "grad_norm": 2.753602981567383, "learning_rate": 3.055588473659029e-05, "loss": 0.4139, "num_input_tokens_seen": 56880816, "step": 59555 }, { "epoch": 4.858471327188188, "grad_norm": 2.2868590354919434, "learning_rate": 3.05524144056462e-05, "loss": 0.3316, "num_input_tokens_seen": 56885200, "step": 59560 }, { "epoch": 4.858879190798597, "grad_norm": 3.3980774879455566, "learning_rate": 3.054894396215703e-05, "loss": 0.3583, "num_input_tokens_seen": 56890128, "step": 59565 }, { "epoch": 4.859287054409005, "grad_norm": 1.8660075664520264, "learning_rate": 3.054547340619311e-05, "loss": 0.3055, "num_input_tokens_seen": 56894928, "step": 59570 }, { "epoch": 4.8596949180194144, "grad_norm": 1.9928405284881592, "learning_rate": 3.054200273782482e-05, "loss": 0.3428, "num_input_tokens_seen": 56900176, "step": 59575 }, { "epoch": 4.8601027816298235, "grad_norm": 0.8567392826080322, "learning_rate": 3.053853195712248e-05, "loss": 0.2682, "num_input_tokens_seen": 56904352, "step": 59580 }, { "epoch": 4.860510645240232, "grad_norm": 4.129620552062988, "learning_rate": 3.053506106415645e-05, "loss": 0.3622, "num_input_tokens_seen": 56909040, "step": 59585 }, { "epoch": 4.860918508850641, "grad_norm": 2.0158231258392334, "learning_rate": 3.053159005899708e-05, "loss": 0.3455, "num_input_tokens_seen": 56913440, "step": 59590 }, { "epoch": 4.861326372461049, "grad_norm": 1.57508385181427, "learning_rate": 3.052811894171474e-05, "loss": 0.3251, "num_input_tokens_seen": 56918912, "step": 59595 }, { "epoch": 4.861734236071458, "grad_norm": 5.526340484619141, "learning_rate": 3.052464771237978e-05, "loss": 0.3852, "num_input_tokens_seen": 56922880, "step": 59600 }, { "epoch": 4.862142099681867, "grad_norm": 2.4690957069396973, "learning_rate": 3.052117637106255e-05, "loss": 0.2791, "num_input_tokens_seen": 56928656, "step": 59605 }, { "epoch": 4.862549963292275, "grad_norm": 0.9168344140052795, "learning_rate": 3.051770491783343e-05, "loss": 0.3197, "num_input_tokens_seen": 56933424, "step": 59610 }, { "epoch": 4.862957826902684, "grad_norm": 1.166183352470398, "learning_rate": 3.0514233352762768e-05, "loss": 0.3961, "num_input_tokens_seen": 56937712, "step": 59615 }, { "epoch": 4.863365690513092, "grad_norm": 2.314774513244629, "learning_rate": 3.051076167592094e-05, "loss": 0.3498, "num_input_tokens_seen": 56942400, "step": 59620 }, { "epoch": 4.863773554123501, "grad_norm": 0.9089598655700684, "learning_rate": 3.050728988737831e-05, "loss": 0.3909, "num_input_tokens_seen": 56947808, "step": 59625 }, { "epoch": 4.86418141773391, "grad_norm": 1.458411693572998, "learning_rate": 3.0503817987205263e-05, "loss": 0.3445, "num_input_tokens_seen": 56952176, "step": 59630 }, { "epoch": 4.864589281344318, "grad_norm": 4.4404296875, "learning_rate": 3.0500345975472154e-05, "loss": 0.3983, "num_input_tokens_seen": 56957568, "step": 59635 }, { "epoch": 4.864997144954727, "grad_norm": 3.229661464691162, "learning_rate": 3.049687385224938e-05, "loss": 0.3843, "num_input_tokens_seen": 56961872, "step": 59640 }, { "epoch": 4.865405008565136, "grad_norm": 1.9180428981781006, "learning_rate": 3.0493401617607302e-05, "loss": 0.3342, "num_input_tokens_seen": 56967008, "step": 59645 }, { "epoch": 4.865812872175544, "grad_norm": 3.376102924346924, "learning_rate": 3.0489929271616314e-05, "loss": 0.332, "num_input_tokens_seen": 56972016, "step": 59650 }, { "epoch": 4.866220735785953, "grad_norm": 3.3169209957122803, "learning_rate": 3.0486456814346786e-05, "loss": 0.3561, "num_input_tokens_seen": 56977616, "step": 59655 }, { "epoch": 4.866628599396362, "grad_norm": 2.1988847255706787, "learning_rate": 3.0482984245869113e-05, "loss": 0.3461, "num_input_tokens_seen": 56982064, "step": 59660 }, { "epoch": 4.86703646300677, "grad_norm": 2.4018473625183105, "learning_rate": 3.0479511566253677e-05, "loss": 0.3502, "num_input_tokens_seen": 56987136, "step": 59665 }, { "epoch": 4.867444326617179, "grad_norm": 2.4042749404907227, "learning_rate": 3.0476038775570876e-05, "loss": 0.4271, "num_input_tokens_seen": 56992192, "step": 59670 }, { "epoch": 4.8678521902275875, "grad_norm": 0.6182563900947571, "learning_rate": 3.0472565873891086e-05, "loss": 0.2887, "num_input_tokens_seen": 56996480, "step": 59675 }, { "epoch": 4.8682600538379965, "grad_norm": 0.7299507856369019, "learning_rate": 3.046909286128472e-05, "loss": 0.3141, "num_input_tokens_seen": 57001680, "step": 59680 }, { "epoch": 4.8686679174484055, "grad_norm": 0.7844198346138, "learning_rate": 3.0465619737822164e-05, "loss": 0.3579, "num_input_tokens_seen": 57006960, "step": 59685 }, { "epoch": 4.869075781058814, "grad_norm": 0.5771644115447998, "learning_rate": 3.0462146503573813e-05, "loss": 0.4234, "num_input_tokens_seen": 57010800, "step": 59690 }, { "epoch": 4.869483644669223, "grad_norm": 1.0598363876342773, "learning_rate": 3.045867315861008e-05, "loss": 0.3878, "num_input_tokens_seen": 57015168, "step": 59695 }, { "epoch": 4.869891508279631, "grad_norm": 1.2635607719421387, "learning_rate": 3.0455199703001357e-05, "loss": 0.3679, "num_input_tokens_seen": 57019808, "step": 59700 }, { "epoch": 4.87029937189004, "grad_norm": 1.5241904258728027, "learning_rate": 3.0451726136818054e-05, "loss": 0.3098, "num_input_tokens_seen": 57024976, "step": 59705 }, { "epoch": 4.870707235500449, "grad_norm": 1.0632061958312988, "learning_rate": 3.0448252460130584e-05, "loss": 0.3564, "num_input_tokens_seen": 57030080, "step": 59710 }, { "epoch": 4.871115099110858, "grad_norm": 0.9130203723907471, "learning_rate": 3.0444778673009344e-05, "loss": 0.3275, "num_input_tokens_seen": 57034832, "step": 59715 }, { "epoch": 4.871522962721266, "grad_norm": 1.1373002529144287, "learning_rate": 3.044130477552476e-05, "loss": 0.3315, "num_input_tokens_seen": 57039952, "step": 59720 }, { "epoch": 4.871930826331675, "grad_norm": 0.9732263684272766, "learning_rate": 3.0437830767747232e-05, "loss": 0.3192, "num_input_tokens_seen": 57044704, "step": 59725 }, { "epoch": 4.872338689942083, "grad_norm": 0.6351660490036011, "learning_rate": 3.0434356649747198e-05, "loss": 0.3271, "num_input_tokens_seen": 57049520, "step": 59730 }, { "epoch": 4.872746553552492, "grad_norm": 0.931296706199646, "learning_rate": 3.043088242159505e-05, "loss": 0.3362, "num_input_tokens_seen": 57053840, "step": 59735 }, { "epoch": 4.873154417162901, "grad_norm": 1.310989499092102, "learning_rate": 3.042740808336123e-05, "loss": 0.4044, "num_input_tokens_seen": 57057552, "step": 59740 }, { "epoch": 4.873562280773309, "grad_norm": 1.2107608318328857, "learning_rate": 3.0423933635116158e-05, "loss": 0.2921, "num_input_tokens_seen": 57062816, "step": 59745 }, { "epoch": 4.873970144383718, "grad_norm": 1.0143377780914307, "learning_rate": 3.042045907693026e-05, "loss": 0.3504, "num_input_tokens_seen": 57067792, "step": 59750 }, { "epoch": 4.874378007994126, "grad_norm": 0.7317653894424438, "learning_rate": 3.041698440887395e-05, "loss": 0.3205, "num_input_tokens_seen": 57072400, "step": 59755 }, { "epoch": 4.874785871604535, "grad_norm": 0.8660644888877869, "learning_rate": 3.0413509631017666e-05, "loss": 0.256, "num_input_tokens_seen": 57077552, "step": 59760 }, { "epoch": 4.875193735214944, "grad_norm": 2.4812872409820557, "learning_rate": 3.0410034743431848e-05, "loss": 0.3695, "num_input_tokens_seen": 57082000, "step": 59765 }, { "epoch": 4.8756015988253525, "grad_norm": 1.7927401065826416, "learning_rate": 3.0406559746186925e-05, "loss": 0.4217, "num_input_tokens_seen": 57086016, "step": 59770 }, { "epoch": 4.8760094624357615, "grad_norm": 0.5895251035690308, "learning_rate": 3.040308463935333e-05, "loss": 0.3355, "num_input_tokens_seen": 57090528, "step": 59775 }, { "epoch": 4.8764173260461705, "grad_norm": 1.7454414367675781, "learning_rate": 3.039960942300151e-05, "loss": 0.3752, "num_input_tokens_seen": 57094640, "step": 59780 }, { "epoch": 4.876825189656579, "grad_norm": 1.3019813299179077, "learning_rate": 3.0396134097201894e-05, "loss": 0.3882, "num_input_tokens_seen": 57098864, "step": 59785 }, { "epoch": 4.877233053266988, "grad_norm": 1.213850736618042, "learning_rate": 3.0392658662024932e-05, "loss": 0.3749, "num_input_tokens_seen": 57103584, "step": 59790 }, { "epoch": 4.877640916877397, "grad_norm": 0.8842619061470032, "learning_rate": 3.0389183117541082e-05, "loss": 0.3539, "num_input_tokens_seen": 57108272, "step": 59795 }, { "epoch": 4.878048780487805, "grad_norm": 1.0612481832504272, "learning_rate": 3.0385707463820773e-05, "loss": 0.3491, "num_input_tokens_seen": 57112336, "step": 59800 }, { "epoch": 4.878456644098214, "grad_norm": 0.7887476682662964, "learning_rate": 3.038223170093446e-05, "loss": 0.3168, "num_input_tokens_seen": 57116976, "step": 59805 }, { "epoch": 4.878864507708622, "grad_norm": 1.1890801191329956, "learning_rate": 3.03787558289526e-05, "loss": 0.3728, "num_input_tokens_seen": 57121664, "step": 59810 }, { "epoch": 4.879272371319031, "grad_norm": 0.9119429588317871, "learning_rate": 3.037527984794565e-05, "loss": 0.3387, "num_input_tokens_seen": 57125840, "step": 59815 }, { "epoch": 4.87968023492944, "grad_norm": 1.4633152484893799, "learning_rate": 3.0371803757984056e-05, "loss": 0.3385, "num_input_tokens_seen": 57131072, "step": 59820 }, { "epoch": 4.880088098539848, "grad_norm": 0.9597924947738647, "learning_rate": 3.0368327559138282e-05, "loss": 0.3487, "num_input_tokens_seen": 57135824, "step": 59825 }, { "epoch": 4.880495962150257, "grad_norm": 2.1852591037750244, "learning_rate": 3.0364851251478794e-05, "loss": 0.3602, "num_input_tokens_seen": 57141152, "step": 59830 }, { "epoch": 4.880903825760665, "grad_norm": 1.0630637407302856, "learning_rate": 3.036137483507604e-05, "loss": 0.293, "num_input_tokens_seen": 57146000, "step": 59835 }, { "epoch": 4.881311689371074, "grad_norm": 1.1248732805252075, "learning_rate": 3.035789831000051e-05, "loss": 0.2825, "num_input_tokens_seen": 57150304, "step": 59840 }, { "epoch": 4.881719552981483, "grad_norm": 1.5220614671707153, "learning_rate": 3.0354421676322648e-05, "loss": 0.3318, "num_input_tokens_seen": 57154448, "step": 59845 }, { "epoch": 4.882127416591891, "grad_norm": 0.9872775077819824, "learning_rate": 3.0350944934112933e-05, "loss": 0.4224, "num_input_tokens_seen": 57158960, "step": 59850 }, { "epoch": 4.8825352802023, "grad_norm": 0.7935311198234558, "learning_rate": 3.034746808344185e-05, "loss": 0.3413, "num_input_tokens_seen": 57164032, "step": 59855 }, { "epoch": 4.882943143812709, "grad_norm": 1.69014310836792, "learning_rate": 3.034399112437985e-05, "loss": 0.3118, "num_input_tokens_seen": 57168624, "step": 59860 }, { "epoch": 4.8833510074231175, "grad_norm": 0.7094828486442566, "learning_rate": 3.0340514056997427e-05, "loss": 0.3896, "num_input_tokens_seen": 57173280, "step": 59865 }, { "epoch": 4.8837588710335265, "grad_norm": 2.3141002655029297, "learning_rate": 3.0337036881365045e-05, "loss": 0.3323, "num_input_tokens_seen": 57177392, "step": 59870 }, { "epoch": 4.8841667346439355, "grad_norm": 0.9165979623794556, "learning_rate": 3.03335595975532e-05, "loss": 0.3663, "num_input_tokens_seen": 57182032, "step": 59875 }, { "epoch": 4.884574598254344, "grad_norm": 2.229776620864868, "learning_rate": 3.033008220563237e-05, "loss": 0.3229, "num_input_tokens_seen": 57187296, "step": 59880 }, { "epoch": 4.884982461864753, "grad_norm": 1.6727709770202637, "learning_rate": 3.0326604705673038e-05, "loss": 0.3481, "num_input_tokens_seen": 57192352, "step": 59885 }, { "epoch": 4.885390325475161, "grad_norm": 0.9412485957145691, "learning_rate": 3.032312709774569e-05, "loss": 0.3723, "num_input_tokens_seen": 57197584, "step": 59890 }, { "epoch": 4.88579818908557, "grad_norm": 1.1007442474365234, "learning_rate": 3.0319649381920818e-05, "loss": 0.2969, "num_input_tokens_seen": 57201920, "step": 59895 }, { "epoch": 4.886206052695979, "grad_norm": 0.8130103945732117, "learning_rate": 3.0316171558268913e-05, "loss": 0.3427, "num_input_tokens_seen": 57206384, "step": 59900 }, { "epoch": 4.886613916306387, "grad_norm": 1.391955852508545, "learning_rate": 3.0312693626860468e-05, "loss": 0.3194, "num_input_tokens_seen": 57210672, "step": 59905 }, { "epoch": 4.887021779916796, "grad_norm": 2.0338876247406006, "learning_rate": 3.0309215587765983e-05, "loss": 0.3114, "num_input_tokens_seen": 57215664, "step": 59910 }, { "epoch": 4.887429643527205, "grad_norm": 0.6462189555168152, "learning_rate": 3.0305737441055952e-05, "loss": 0.3063, "num_input_tokens_seen": 57220608, "step": 59915 }, { "epoch": 4.887837507137613, "grad_norm": 0.669327437877655, "learning_rate": 3.030225918680088e-05, "loss": 0.382, "num_input_tokens_seen": 57225248, "step": 59920 }, { "epoch": 4.888245370748022, "grad_norm": 1.1505167484283447, "learning_rate": 3.0298780825071267e-05, "loss": 0.3693, "num_input_tokens_seen": 57229568, "step": 59925 }, { "epoch": 4.888653234358431, "grad_norm": 1.0134730339050293, "learning_rate": 3.0295302355937617e-05, "loss": 0.2977, "num_input_tokens_seen": 57234032, "step": 59930 }, { "epoch": 4.889061097968839, "grad_norm": 1.864202857017517, "learning_rate": 3.0291823779470435e-05, "loss": 0.2336, "num_input_tokens_seen": 57238480, "step": 59935 }, { "epoch": 4.889468961579248, "grad_norm": 0.9130851030349731, "learning_rate": 3.0288345095740238e-05, "loss": 0.3105, "num_input_tokens_seen": 57242656, "step": 59940 }, { "epoch": 4.889876825189656, "grad_norm": 1.2716691493988037, "learning_rate": 3.0284866304817538e-05, "loss": 0.3555, "num_input_tokens_seen": 57247264, "step": 59945 }, { "epoch": 4.890284688800065, "grad_norm": 3.365987539291382, "learning_rate": 3.0281387406772837e-05, "loss": 0.3928, "num_input_tokens_seen": 57252704, "step": 59950 }, { "epoch": 4.890692552410474, "grad_norm": 4.32464075088501, "learning_rate": 3.0277908401676663e-05, "loss": 0.3893, "num_input_tokens_seen": 57257312, "step": 59955 }, { "epoch": 4.891100416020882, "grad_norm": 1.5086921453475952, "learning_rate": 3.027442928959952e-05, "loss": 0.3193, "num_input_tokens_seen": 57262592, "step": 59960 }, { "epoch": 4.891508279631291, "grad_norm": 1.6880470514297485, "learning_rate": 3.027095007061194e-05, "loss": 0.3279, "num_input_tokens_seen": 57267728, "step": 59965 }, { "epoch": 4.8919161432416995, "grad_norm": 2.1106204986572266, "learning_rate": 3.026747074478444e-05, "loss": 0.2982, "num_input_tokens_seen": 57272336, "step": 59970 }, { "epoch": 4.8923240068521086, "grad_norm": 5.256466865539551, "learning_rate": 3.0263991312187558e-05, "loss": 0.3734, "num_input_tokens_seen": 57277808, "step": 59975 }, { "epoch": 4.892731870462518, "grad_norm": 4.03952169418335, "learning_rate": 3.026051177289179e-05, "loss": 0.3438, "num_input_tokens_seen": 57282592, "step": 59980 }, { "epoch": 4.893139734072926, "grad_norm": 3.116431713104248, "learning_rate": 3.02570321269677e-05, "loss": 0.346, "num_input_tokens_seen": 57286832, "step": 59985 }, { "epoch": 4.893547597683335, "grad_norm": 6.333736419677734, "learning_rate": 3.025355237448579e-05, "loss": 0.379, "num_input_tokens_seen": 57291456, "step": 59990 }, { "epoch": 4.893955461293744, "grad_norm": 2.7681021690368652, "learning_rate": 3.0250072515516615e-05, "loss": 0.3512, "num_input_tokens_seen": 57295824, "step": 59995 }, { "epoch": 4.894363324904152, "grad_norm": 2.5240225791931152, "learning_rate": 3.02465925501307e-05, "loss": 0.3616, "num_input_tokens_seen": 57299936, "step": 60000 }, { "epoch": 4.894771188514561, "grad_norm": 2.105648994445801, "learning_rate": 3.024311247839858e-05, "loss": 0.3301, "num_input_tokens_seen": 57303808, "step": 60005 }, { "epoch": 4.89517905212497, "grad_norm": 2.617496967315674, "learning_rate": 3.0239632300390797e-05, "loss": 0.3935, "num_input_tokens_seen": 57308288, "step": 60010 }, { "epoch": 4.895586915735378, "grad_norm": 2.580693483352661, "learning_rate": 3.0236152016177898e-05, "loss": 0.2839, "num_input_tokens_seen": 57313904, "step": 60015 }, { "epoch": 4.895994779345787, "grad_norm": 2.9107420444488525, "learning_rate": 3.0232671625830415e-05, "loss": 0.3267, "num_input_tokens_seen": 57318592, "step": 60020 }, { "epoch": 4.896402642956195, "grad_norm": 2.58357834815979, "learning_rate": 3.022919112941891e-05, "loss": 0.3231, "num_input_tokens_seen": 57324336, "step": 60025 }, { "epoch": 4.896810506566604, "grad_norm": 0.9273069500923157, "learning_rate": 3.0225710527013923e-05, "loss": 0.4486, "num_input_tokens_seen": 57329760, "step": 60030 }, { "epoch": 4.897218370177013, "grad_norm": 1.3458038568496704, "learning_rate": 3.0222229818686e-05, "loss": 0.3439, "num_input_tokens_seen": 57335392, "step": 60035 }, { "epoch": 4.897626233787421, "grad_norm": 1.5117862224578857, "learning_rate": 3.0218749004505697e-05, "loss": 0.3326, "num_input_tokens_seen": 57340688, "step": 60040 }, { "epoch": 4.89803409739783, "grad_norm": 2.162858247756958, "learning_rate": 3.021526808454357e-05, "loss": 0.3383, "num_input_tokens_seen": 57346208, "step": 60045 }, { "epoch": 4.898441961008238, "grad_norm": 1.3477498292922974, "learning_rate": 3.0211787058870178e-05, "loss": 0.3322, "num_input_tokens_seen": 57350576, "step": 60050 }, { "epoch": 4.898849824618647, "grad_norm": 2.7422749996185303, "learning_rate": 3.020830592755608e-05, "loss": 0.3646, "num_input_tokens_seen": 57356352, "step": 60055 }, { "epoch": 4.899257688229056, "grad_norm": 1.3865931034088135, "learning_rate": 3.0204824690671822e-05, "loss": 0.332, "num_input_tokens_seen": 57361248, "step": 60060 }, { "epoch": 4.8996655518394645, "grad_norm": 4.939668655395508, "learning_rate": 3.020134334828799e-05, "loss": 0.3166, "num_input_tokens_seen": 57366032, "step": 60065 }, { "epoch": 4.9000734154498735, "grad_norm": 1.0620768070220947, "learning_rate": 3.0197861900475133e-05, "loss": 0.2456, "num_input_tokens_seen": 57371232, "step": 60070 }, { "epoch": 4.9004812790602825, "grad_norm": 6.726021766662598, "learning_rate": 3.0194380347303826e-05, "loss": 0.3208, "num_input_tokens_seen": 57376496, "step": 60075 }, { "epoch": 4.900889142670691, "grad_norm": 6.714399814605713, "learning_rate": 3.0190898688844632e-05, "loss": 0.2692, "num_input_tokens_seen": 57380560, "step": 60080 }, { "epoch": 4.9012970062811, "grad_norm": 1.8890671730041504, "learning_rate": 3.0187416925168134e-05, "loss": 0.3608, "num_input_tokens_seen": 57386144, "step": 60085 }, { "epoch": 4.901704869891509, "grad_norm": 2.4457430839538574, "learning_rate": 3.0183935056344903e-05, "loss": 0.3616, "num_input_tokens_seen": 57390928, "step": 60090 }, { "epoch": 4.902112733501917, "grad_norm": 1.1912763118743896, "learning_rate": 3.0180453082445503e-05, "loss": 0.3188, "num_input_tokens_seen": 57395776, "step": 60095 }, { "epoch": 4.902520597112326, "grad_norm": 0.6570920348167419, "learning_rate": 3.0176971003540523e-05, "loss": 0.2131, "num_input_tokens_seen": 57400224, "step": 60100 }, { "epoch": 4.902928460722734, "grad_norm": 1.8793267011642456, "learning_rate": 3.017348881970054e-05, "loss": 0.2989, "num_input_tokens_seen": 57404560, "step": 60105 }, { "epoch": 4.903336324333143, "grad_norm": 2.6031787395477295, "learning_rate": 3.0170006530996136e-05, "loss": 0.4057, "num_input_tokens_seen": 57410544, "step": 60110 }, { "epoch": 4.903744187943552, "grad_norm": 0.4754827618598938, "learning_rate": 3.01665241374979e-05, "loss": 0.2614, "num_input_tokens_seen": 57413728, "step": 60115 }, { "epoch": 4.90415205155396, "grad_norm": 0.6165111064910889, "learning_rate": 3.0163041639276414e-05, "loss": 0.3061, "num_input_tokens_seen": 57418672, "step": 60120 }, { "epoch": 4.904559915164369, "grad_norm": 2.4229416847229004, "learning_rate": 3.0159559036402264e-05, "loss": 0.4011, "num_input_tokens_seen": 57423360, "step": 60125 }, { "epoch": 4.904967778774778, "grad_norm": 2.0265817642211914, "learning_rate": 3.0156076328946047e-05, "loss": 0.3989, "num_input_tokens_seen": 57428432, "step": 60130 }, { "epoch": 4.905375642385186, "grad_norm": 1.525267243385315, "learning_rate": 3.015259351697835e-05, "loss": 0.3432, "num_input_tokens_seen": 57433328, "step": 60135 }, { "epoch": 4.905783505995595, "grad_norm": 0.9091727137565613, "learning_rate": 3.0149110600569775e-05, "loss": 0.3336, "num_input_tokens_seen": 57438192, "step": 60140 }, { "epoch": 4.906191369606004, "grad_norm": 1.047644853591919, "learning_rate": 3.0145627579790914e-05, "loss": 0.331, "num_input_tokens_seen": 57443424, "step": 60145 }, { "epoch": 4.906599233216412, "grad_norm": 2.6672916412353516, "learning_rate": 3.0142144454712373e-05, "loss": 0.4497, "num_input_tokens_seen": 57447792, "step": 60150 }, { "epoch": 4.907007096826821, "grad_norm": 2.343919038772583, "learning_rate": 3.013866122540475e-05, "loss": 0.4014, "num_input_tokens_seen": 57451728, "step": 60155 }, { "epoch": 4.9074149604372295, "grad_norm": 3.74249005317688, "learning_rate": 3.013517789193865e-05, "loss": 0.3059, "num_input_tokens_seen": 57456800, "step": 60160 }, { "epoch": 4.9078228240476385, "grad_norm": 1.6303913593292236, "learning_rate": 3.0131694454384674e-05, "loss": 0.2637, "num_input_tokens_seen": 57462144, "step": 60165 }, { "epoch": 4.9082306876580475, "grad_norm": 1.6737526655197144, "learning_rate": 3.0128210912813426e-05, "loss": 0.4156, "num_input_tokens_seen": 57467200, "step": 60170 }, { "epoch": 4.908638551268456, "grad_norm": 2.044433832168579, "learning_rate": 3.012472726729552e-05, "loss": 0.3546, "num_input_tokens_seen": 57471776, "step": 60175 }, { "epoch": 4.909046414878865, "grad_norm": 3.73897647857666, "learning_rate": 3.0121243517901575e-05, "loss": 0.3005, "num_input_tokens_seen": 57476688, "step": 60180 }, { "epoch": 4.909454278489273, "grad_norm": 2.817432403564453, "learning_rate": 3.0117759664702206e-05, "loss": 0.3776, "num_input_tokens_seen": 57481120, "step": 60185 }, { "epoch": 4.909862142099682, "grad_norm": 3.4665756225585938, "learning_rate": 3.0114275707768007e-05, "loss": 0.348, "num_input_tokens_seen": 57485792, "step": 60190 }, { "epoch": 4.910270005710091, "grad_norm": 3.405850410461426, "learning_rate": 3.011079164716963e-05, "loss": 0.3153, "num_input_tokens_seen": 57490784, "step": 60195 }, { "epoch": 4.910677869320499, "grad_norm": 3.0394833087921143, "learning_rate": 3.0107307482977666e-05, "loss": 0.3453, "num_input_tokens_seen": 57495968, "step": 60200 }, { "epoch": 4.911085732930908, "grad_norm": 3.571377754211426, "learning_rate": 3.0103823215262755e-05, "loss": 0.3897, "num_input_tokens_seen": 57501696, "step": 60205 }, { "epoch": 4.911493596541317, "grad_norm": 2.3303632736206055, "learning_rate": 3.0100338844095523e-05, "loss": 0.3809, "num_input_tokens_seen": 57506928, "step": 60210 }, { "epoch": 4.911901460151725, "grad_norm": 1.3830405473709106, "learning_rate": 3.0096854369546574e-05, "loss": 0.3301, "num_input_tokens_seen": 57512064, "step": 60215 }, { "epoch": 4.912309323762134, "grad_norm": 3.381094217300415, "learning_rate": 3.0093369791686565e-05, "loss": 0.2905, "num_input_tokens_seen": 57516768, "step": 60220 }, { "epoch": 4.912717187372543, "grad_norm": 1.4927325248718262, "learning_rate": 3.0089885110586115e-05, "loss": 0.3703, "num_input_tokens_seen": 57522384, "step": 60225 }, { "epoch": 4.913125050982951, "grad_norm": 1.3599534034729004, "learning_rate": 3.008640032631585e-05, "loss": 0.2813, "num_input_tokens_seen": 57526944, "step": 60230 }, { "epoch": 4.91353291459336, "grad_norm": 2.130415678024292, "learning_rate": 3.0082915438946413e-05, "loss": 0.2489, "num_input_tokens_seen": 57532112, "step": 60235 }, { "epoch": 4.913940778203768, "grad_norm": 3.461796998977661, "learning_rate": 3.007943044854844e-05, "loss": 0.2872, "num_input_tokens_seen": 57536976, "step": 60240 }, { "epoch": 4.914348641814177, "grad_norm": 3.7291409969329834, "learning_rate": 3.007594535519257e-05, "loss": 0.3334, "num_input_tokens_seen": 57541520, "step": 60245 }, { "epoch": 4.914756505424586, "grad_norm": 3.586872100830078, "learning_rate": 3.0072460158949444e-05, "loss": 0.4127, "num_input_tokens_seen": 57547072, "step": 60250 }, { "epoch": 4.915164369034994, "grad_norm": 6.787745475769043, "learning_rate": 3.0068974859889713e-05, "loss": 0.3165, "num_input_tokens_seen": 57552320, "step": 60255 }, { "epoch": 4.915572232645403, "grad_norm": 5.271803379058838, "learning_rate": 3.0065489458084005e-05, "loss": 0.4113, "num_input_tokens_seen": 57557536, "step": 60260 }, { "epoch": 4.915980096255812, "grad_norm": 3.9739763736724854, "learning_rate": 3.0062003953602986e-05, "loss": 0.4336, "num_input_tokens_seen": 57562768, "step": 60265 }, { "epoch": 4.916387959866221, "grad_norm": 1.4650176763534546, "learning_rate": 3.0058518346517295e-05, "loss": 0.3369, "num_input_tokens_seen": 57567296, "step": 60270 }, { "epoch": 4.91679582347663, "grad_norm": 4.413017749786377, "learning_rate": 3.0055032636897585e-05, "loss": 0.3282, "num_input_tokens_seen": 57571536, "step": 60275 }, { "epoch": 4.917203687087039, "grad_norm": 4.127519607543945, "learning_rate": 3.0051546824814515e-05, "loss": 0.4525, "num_input_tokens_seen": 57576144, "step": 60280 }, { "epoch": 4.917611550697447, "grad_norm": 2.516026496887207, "learning_rate": 3.0048060910338738e-05, "loss": 0.3928, "num_input_tokens_seen": 57580800, "step": 60285 }, { "epoch": 4.918019414307856, "grad_norm": 5.61155891418457, "learning_rate": 3.004457489354091e-05, "loss": 0.3312, "num_input_tokens_seen": 57586192, "step": 60290 }, { "epoch": 4.918427277918264, "grad_norm": 1.9311572313308716, "learning_rate": 3.0041088774491694e-05, "loss": 0.32, "num_input_tokens_seen": 57590976, "step": 60295 }, { "epoch": 4.918835141528673, "grad_norm": 5.633139610290527, "learning_rate": 3.0037602553261745e-05, "loss": 0.3966, "num_input_tokens_seen": 57596368, "step": 60300 }, { "epoch": 4.919243005139082, "grad_norm": 2.8994927406311035, "learning_rate": 3.003411622992174e-05, "loss": 0.3371, "num_input_tokens_seen": 57601024, "step": 60305 }, { "epoch": 4.91965086874949, "grad_norm": 3.521589994430542, "learning_rate": 3.003062980454233e-05, "loss": 0.3431, "num_input_tokens_seen": 57605840, "step": 60310 }, { "epoch": 4.920058732359899, "grad_norm": 2.5819091796875, "learning_rate": 3.0027143277194192e-05, "loss": 0.3362, "num_input_tokens_seen": 57610704, "step": 60315 }, { "epoch": 4.920466595970307, "grad_norm": 4.383622646331787, "learning_rate": 3.0023656647948005e-05, "loss": 0.28, "num_input_tokens_seen": 57615792, "step": 60320 }, { "epoch": 4.920874459580716, "grad_norm": 3.028803586959839, "learning_rate": 3.0020169916874425e-05, "loss": 0.3022, "num_input_tokens_seen": 57619936, "step": 60325 }, { "epoch": 4.921282323191125, "grad_norm": 4.883628845214844, "learning_rate": 3.0016683084044134e-05, "loss": 0.384, "num_input_tokens_seen": 57623776, "step": 60330 }, { "epoch": 4.921690186801533, "grad_norm": 0.5739901065826416, "learning_rate": 3.0013196149527812e-05, "loss": 0.3373, "num_input_tokens_seen": 57628352, "step": 60335 }, { "epoch": 4.922098050411942, "grad_norm": 2.9231295585632324, "learning_rate": 3.0009709113396132e-05, "loss": 0.3942, "num_input_tokens_seen": 57633696, "step": 60340 }, { "epoch": 4.922505914022351, "grad_norm": 3.7084405422210693, "learning_rate": 3.000622197571977e-05, "loss": 0.4112, "num_input_tokens_seen": 57638336, "step": 60345 }, { "epoch": 4.922913777632759, "grad_norm": 3.3666422367095947, "learning_rate": 3.0002734736569432e-05, "loss": 0.4211, "num_input_tokens_seen": 57642688, "step": 60350 }, { "epoch": 4.923321641243168, "grad_norm": 4.572760581970215, "learning_rate": 2.9999247396015778e-05, "loss": 0.3385, "num_input_tokens_seen": 57647312, "step": 60355 }, { "epoch": 4.923729504853577, "grad_norm": 4.079047679901123, "learning_rate": 2.999575995412951e-05, "loss": 0.2888, "num_input_tokens_seen": 57652032, "step": 60360 }, { "epoch": 4.9241373684639855, "grad_norm": 1.29647696018219, "learning_rate": 2.99922724109813e-05, "loss": 0.29, "num_input_tokens_seen": 57656320, "step": 60365 }, { "epoch": 4.9245452320743945, "grad_norm": 0.798821210861206, "learning_rate": 2.9988784766641853e-05, "loss": 0.2777, "num_input_tokens_seen": 57661200, "step": 60370 }, { "epoch": 4.924953095684803, "grad_norm": 2.2770752906799316, "learning_rate": 2.9985297021181862e-05, "loss": 0.3437, "num_input_tokens_seen": 57666064, "step": 60375 }, { "epoch": 4.925360959295212, "grad_norm": 1.890251636505127, "learning_rate": 2.9981809174672017e-05, "loss": 0.4224, "num_input_tokens_seen": 57671136, "step": 60380 }, { "epoch": 4.925768822905621, "grad_norm": 2.979377508163452, "learning_rate": 2.9978321227183017e-05, "loss": 0.313, "num_input_tokens_seen": 57675456, "step": 60385 }, { "epoch": 4.926176686516029, "grad_norm": 2.5833582878112793, "learning_rate": 2.9974833178785555e-05, "loss": 0.4187, "num_input_tokens_seen": 57679648, "step": 60390 }, { "epoch": 4.926584550126438, "grad_norm": 3.794768810272217, "learning_rate": 2.9971345029550346e-05, "loss": 0.4337, "num_input_tokens_seen": 57684576, "step": 60395 }, { "epoch": 4.926992413736846, "grad_norm": 7.426989555358887, "learning_rate": 2.9967856779548082e-05, "loss": 0.3147, "num_input_tokens_seen": 57688960, "step": 60400 }, { "epoch": 4.927400277347255, "grad_norm": 2.6192877292633057, "learning_rate": 2.9964368428849478e-05, "loss": 0.3822, "num_input_tokens_seen": 57693584, "step": 60405 }, { "epoch": 4.927808140957664, "grad_norm": 1.471469521522522, "learning_rate": 2.996087997752523e-05, "loss": 0.3444, "num_input_tokens_seen": 57698288, "step": 60410 }, { "epoch": 4.928216004568072, "grad_norm": 1.3994675874710083, "learning_rate": 2.9957391425646054e-05, "loss": 0.3561, "num_input_tokens_seen": 57703536, "step": 60415 }, { "epoch": 4.928623868178481, "grad_norm": 2.4589059352874756, "learning_rate": 2.995390277328266e-05, "loss": 0.3495, "num_input_tokens_seen": 57708288, "step": 60420 }, { "epoch": 4.92903173178889, "grad_norm": 2.1533429622650146, "learning_rate": 2.995041402050576e-05, "loss": 0.3576, "num_input_tokens_seen": 57713344, "step": 60425 }, { "epoch": 4.929439595399298, "grad_norm": 2.1095566749572754, "learning_rate": 2.9946925167386075e-05, "loss": 0.3616, "num_input_tokens_seen": 57717888, "step": 60430 }, { "epoch": 4.929847459009707, "grad_norm": 1.5644359588623047, "learning_rate": 2.994343621399432e-05, "loss": 0.3408, "num_input_tokens_seen": 57723072, "step": 60435 }, { "epoch": 4.930255322620116, "grad_norm": 1.4940201044082642, "learning_rate": 2.9939947160401216e-05, "loss": 0.3612, "num_input_tokens_seen": 57727776, "step": 60440 }, { "epoch": 4.930663186230524, "grad_norm": 1.4083397388458252, "learning_rate": 2.993645800667747e-05, "loss": 0.4482, "num_input_tokens_seen": 57732720, "step": 60445 }, { "epoch": 4.931071049840933, "grad_norm": 1.7333343029022217, "learning_rate": 2.993296875289383e-05, "loss": 0.2914, "num_input_tokens_seen": 57738160, "step": 60450 }, { "epoch": 4.9314789134513415, "grad_norm": 1.297519564628601, "learning_rate": 2.9929479399121008e-05, "loss": 0.2756, "num_input_tokens_seen": 57742288, "step": 60455 }, { "epoch": 4.9318867770617505, "grad_norm": 1.6491142511367798, "learning_rate": 2.9925989945429723e-05, "loss": 0.2997, "num_input_tokens_seen": 57746976, "step": 60460 }, { "epoch": 4.9322946406721595, "grad_norm": 0.6317053437232971, "learning_rate": 2.9922500391890723e-05, "loss": 0.3458, "num_input_tokens_seen": 57752064, "step": 60465 }, { "epoch": 4.932702504282568, "grad_norm": 1.1769709587097168, "learning_rate": 2.9919010738574725e-05, "loss": 0.2958, "num_input_tokens_seen": 57756768, "step": 60470 }, { "epoch": 4.933110367892977, "grad_norm": 0.7891080379486084, "learning_rate": 2.991552098555248e-05, "loss": 0.2441, "num_input_tokens_seen": 57761696, "step": 60475 }, { "epoch": 4.933518231503386, "grad_norm": 1.2617353200912476, "learning_rate": 2.9912031132894708e-05, "loss": 0.361, "num_input_tokens_seen": 57766032, "step": 60480 }, { "epoch": 4.933926095113794, "grad_norm": 1.311802625656128, "learning_rate": 2.9908541180672157e-05, "loss": 0.3792, "num_input_tokens_seen": 57769936, "step": 60485 }, { "epoch": 4.934333958724203, "grad_norm": 2.061293840408325, "learning_rate": 2.9905051128955554e-05, "loss": 0.4676, "num_input_tokens_seen": 57774320, "step": 60490 }, { "epoch": 4.934741822334612, "grad_norm": 3.092750072479248, "learning_rate": 2.990156097781565e-05, "loss": 0.2928, "num_input_tokens_seen": 57778608, "step": 60495 }, { "epoch": 4.93514968594502, "grad_norm": 0.7176551818847656, "learning_rate": 2.9898070727323196e-05, "loss": 0.2843, "num_input_tokens_seen": 57783456, "step": 60500 }, { "epoch": 4.935557549555429, "grad_norm": 1.1126840114593506, "learning_rate": 2.9894580377548926e-05, "loss": 0.4036, "num_input_tokens_seen": 57788432, "step": 60505 }, { "epoch": 4.935965413165837, "grad_norm": 3.359213352203369, "learning_rate": 2.9891089928563588e-05, "loss": 0.3928, "num_input_tokens_seen": 57793760, "step": 60510 }, { "epoch": 4.936373276776246, "grad_norm": 3.5560240745544434, "learning_rate": 2.9887599380437935e-05, "loss": 0.3307, "num_input_tokens_seen": 57798528, "step": 60515 }, { "epoch": 4.936781140386655, "grad_norm": 3.935739278793335, "learning_rate": 2.988410873324272e-05, "loss": 0.3486, "num_input_tokens_seen": 57804000, "step": 60520 }, { "epoch": 4.937189003997063, "grad_norm": 2.9515740871429443, "learning_rate": 2.98806179870487e-05, "loss": 0.3052, "num_input_tokens_seen": 57808720, "step": 60525 }, { "epoch": 4.937596867607472, "grad_norm": 2.525836944580078, "learning_rate": 2.9877127141926624e-05, "loss": 0.2481, "num_input_tokens_seen": 57813280, "step": 60530 }, { "epoch": 4.93800473121788, "grad_norm": 2.603100061416626, "learning_rate": 2.9873636197947253e-05, "loss": 0.2716, "num_input_tokens_seen": 57818688, "step": 60535 }, { "epoch": 4.938412594828289, "grad_norm": 3.4128565788269043, "learning_rate": 2.987014515518135e-05, "loss": 0.3471, "num_input_tokens_seen": 57823056, "step": 60540 }, { "epoch": 4.938820458438698, "grad_norm": 4.783059120178223, "learning_rate": 2.9866654013699668e-05, "loss": 0.4099, "num_input_tokens_seen": 57827664, "step": 60545 }, { "epoch": 4.9392283220491064, "grad_norm": 3.1642372608184814, "learning_rate": 2.986316277357299e-05, "loss": 0.4683, "num_input_tokens_seen": 57832080, "step": 60550 }, { "epoch": 4.9396361856595155, "grad_norm": 3.355630397796631, "learning_rate": 2.985967143487206e-05, "loss": 0.2755, "num_input_tokens_seen": 57837328, "step": 60555 }, { "epoch": 4.9400440492699245, "grad_norm": 1.3691942691802979, "learning_rate": 2.9856179997667653e-05, "loss": 0.2863, "num_input_tokens_seen": 57842048, "step": 60560 }, { "epoch": 4.940451912880333, "grad_norm": 2.032266139984131, "learning_rate": 2.9852688462030544e-05, "loss": 0.4287, "num_input_tokens_seen": 57847520, "step": 60565 }, { "epoch": 4.940859776490742, "grad_norm": 3.2677550315856934, "learning_rate": 2.9849196828031506e-05, "loss": 0.275, "num_input_tokens_seen": 57852768, "step": 60570 }, { "epoch": 4.941267640101151, "grad_norm": 2.9463629722595215, "learning_rate": 2.9845705095741305e-05, "loss": 0.3373, "num_input_tokens_seen": 57857808, "step": 60575 }, { "epoch": 4.941675503711559, "grad_norm": 2.626237392425537, "learning_rate": 2.9842213265230718e-05, "loss": 0.4122, "num_input_tokens_seen": 57862864, "step": 60580 }, { "epoch": 4.942083367321968, "grad_norm": 3.054682970046997, "learning_rate": 2.983872133657053e-05, "loss": 0.3925, "num_input_tokens_seen": 57868384, "step": 60585 }, { "epoch": 4.942491230932376, "grad_norm": 3.383347988128662, "learning_rate": 2.9835229309831514e-05, "loss": 0.3374, "num_input_tokens_seen": 57872384, "step": 60590 }, { "epoch": 4.942899094542785, "grad_norm": 4.545836448669434, "learning_rate": 2.9831737185084457e-05, "loss": 0.3231, "num_input_tokens_seen": 57877616, "step": 60595 }, { "epoch": 4.943306958153194, "grad_norm": 3.5204179286956787, "learning_rate": 2.9828244962400133e-05, "loss": 0.2939, "num_input_tokens_seen": 57882640, "step": 60600 }, { "epoch": 4.943714821763602, "grad_norm": 3.69169545173645, "learning_rate": 2.9824752641849348e-05, "loss": 0.2927, "num_input_tokens_seen": 57887456, "step": 60605 }, { "epoch": 4.944122685374011, "grad_norm": 2.3471224308013916, "learning_rate": 2.9821260223502866e-05, "loss": 0.3668, "num_input_tokens_seen": 57892032, "step": 60610 }, { "epoch": 4.944530548984419, "grad_norm": 1.3541851043701172, "learning_rate": 2.9817767707431493e-05, "loss": 0.3777, "num_input_tokens_seen": 57897216, "step": 60615 }, { "epoch": 4.944938412594828, "grad_norm": 1.5679043531417847, "learning_rate": 2.9814275093706013e-05, "loss": 0.2979, "num_input_tokens_seen": 57901920, "step": 60620 }, { "epoch": 4.945346276205237, "grad_norm": 2.613204002380371, "learning_rate": 2.9810782382397222e-05, "loss": 0.3587, "num_input_tokens_seen": 57906240, "step": 60625 }, { "epoch": 4.945754139815645, "grad_norm": 1.659445881843567, "learning_rate": 2.980728957357592e-05, "loss": 0.3941, "num_input_tokens_seen": 57910976, "step": 60630 }, { "epoch": 4.946162003426054, "grad_norm": 2.2756621837615967, "learning_rate": 2.9803796667312905e-05, "loss": 0.3977, "num_input_tokens_seen": 57915888, "step": 60635 }, { "epoch": 4.946569867036463, "grad_norm": 2.9670324325561523, "learning_rate": 2.980030366367897e-05, "loss": 0.3212, "num_input_tokens_seen": 57920304, "step": 60640 }, { "epoch": 4.946977730646871, "grad_norm": 0.7426567077636719, "learning_rate": 2.9796810562744915e-05, "loss": 0.3441, "num_input_tokens_seen": 57923920, "step": 60645 }, { "epoch": 4.94738559425728, "grad_norm": 1.437272071838379, "learning_rate": 2.9793317364581552e-05, "loss": 0.3673, "num_input_tokens_seen": 57928944, "step": 60650 }, { "epoch": 4.947793457867689, "grad_norm": 2.2700815200805664, "learning_rate": 2.9789824069259675e-05, "loss": 0.3388, "num_input_tokens_seen": 57933728, "step": 60655 }, { "epoch": 4.9482013214780975, "grad_norm": 3.1065032482147217, "learning_rate": 2.9786330676850105e-05, "loss": 0.3724, "num_input_tokens_seen": 57938624, "step": 60660 }, { "epoch": 4.948609185088507, "grad_norm": 0.9954392313957214, "learning_rate": 2.978283718742365e-05, "loss": 0.3017, "num_input_tokens_seen": 57943664, "step": 60665 }, { "epoch": 4.949017048698915, "grad_norm": 2.7578091621398926, "learning_rate": 2.9779343601051112e-05, "loss": 0.3485, "num_input_tokens_seen": 57948480, "step": 60670 }, { "epoch": 4.949424912309324, "grad_norm": 1.555796504020691, "learning_rate": 2.977584991780331e-05, "loss": 0.3509, "num_input_tokens_seen": 57952240, "step": 60675 }, { "epoch": 4.949832775919733, "grad_norm": 2.2053749561309814, "learning_rate": 2.9772356137751063e-05, "loss": 0.2923, "num_input_tokens_seen": 57956832, "step": 60680 }, { "epoch": 4.950240639530141, "grad_norm": 0.9515072107315063, "learning_rate": 2.9768862260965184e-05, "loss": 0.3163, "num_input_tokens_seen": 57961456, "step": 60685 }, { "epoch": 4.95064850314055, "grad_norm": 2.2591803073883057, "learning_rate": 2.976536828751649e-05, "loss": 0.2671, "num_input_tokens_seen": 57966464, "step": 60690 }, { "epoch": 4.951056366750959, "grad_norm": 1.9980778694152832, "learning_rate": 2.976187421747581e-05, "loss": 0.315, "num_input_tokens_seen": 57970688, "step": 60695 }, { "epoch": 4.951464230361367, "grad_norm": 4.620424270629883, "learning_rate": 2.9758380050913963e-05, "loss": 0.3473, "num_input_tokens_seen": 57975408, "step": 60700 }, { "epoch": 4.951872093971776, "grad_norm": 2.942140579223633, "learning_rate": 2.9754885787901775e-05, "loss": 0.372, "num_input_tokens_seen": 57978992, "step": 60705 }, { "epoch": 4.952279957582185, "grad_norm": 3.845142126083374, "learning_rate": 2.9751391428510078e-05, "loss": 0.3662, "num_input_tokens_seen": 57984064, "step": 60710 }, { "epoch": 4.952687821192593, "grad_norm": 1.8802614212036133, "learning_rate": 2.9747896972809685e-05, "loss": 0.3687, "num_input_tokens_seen": 57988304, "step": 60715 }, { "epoch": 4.953095684803002, "grad_norm": 2.079929828643799, "learning_rate": 2.9744402420871442e-05, "loss": 0.3409, "num_input_tokens_seen": 57993296, "step": 60720 }, { "epoch": 4.95350354841341, "grad_norm": 2.6440346240997314, "learning_rate": 2.9740907772766173e-05, "loss": 0.3673, "num_input_tokens_seen": 57997568, "step": 60725 }, { "epoch": 4.953911412023819, "grad_norm": 1.7229962348937988, "learning_rate": 2.9737413028564726e-05, "loss": 0.2485, "num_input_tokens_seen": 58001968, "step": 60730 }, { "epoch": 4.954319275634228, "grad_norm": 1.797897458076477, "learning_rate": 2.9733918188337924e-05, "loss": 0.355, "num_input_tokens_seen": 58007040, "step": 60735 }, { "epoch": 4.954727139244636, "grad_norm": 2.377431869506836, "learning_rate": 2.9730423252156615e-05, "loss": 0.3794, "num_input_tokens_seen": 58012416, "step": 60740 }, { "epoch": 4.955135002855045, "grad_norm": 2.8812365531921387, "learning_rate": 2.9726928220091633e-05, "loss": 0.3203, "num_input_tokens_seen": 58017072, "step": 60745 }, { "epoch": 4.9555428664654535, "grad_norm": 2.838121175765991, "learning_rate": 2.9723433092213825e-05, "loss": 0.3534, "num_input_tokens_seen": 58022128, "step": 60750 }, { "epoch": 4.9559507300758625, "grad_norm": 1.7786602973937988, "learning_rate": 2.971993786859404e-05, "loss": 0.41, "num_input_tokens_seen": 58026688, "step": 60755 }, { "epoch": 4.9563585936862715, "grad_norm": 3.5581419467926025, "learning_rate": 2.971644254930312e-05, "loss": 0.3739, "num_input_tokens_seen": 58031936, "step": 60760 }, { "epoch": 4.95676645729668, "grad_norm": 1.6415987014770508, "learning_rate": 2.971294713441191e-05, "loss": 0.432, "num_input_tokens_seen": 58037200, "step": 60765 }, { "epoch": 4.957174320907089, "grad_norm": 1.3930391073226929, "learning_rate": 2.970945162399127e-05, "loss": 0.3371, "num_input_tokens_seen": 58042448, "step": 60770 }, { "epoch": 4.957582184517498, "grad_norm": 2.298485517501831, "learning_rate": 2.9705956018112037e-05, "loss": 0.3707, "num_input_tokens_seen": 58047872, "step": 60775 }, { "epoch": 4.957990048127906, "grad_norm": 2.1369645595550537, "learning_rate": 2.970246031684508e-05, "loss": 0.4045, "num_input_tokens_seen": 58052624, "step": 60780 }, { "epoch": 4.958397911738315, "grad_norm": 1.6286600828170776, "learning_rate": 2.9698964520261256e-05, "loss": 0.3252, "num_input_tokens_seen": 58057088, "step": 60785 }, { "epoch": 4.958805775348724, "grad_norm": 2.26617431640625, "learning_rate": 2.9695468628431412e-05, "loss": 0.2698, "num_input_tokens_seen": 58061696, "step": 60790 }, { "epoch": 4.959213638959132, "grad_norm": 1.3773295879364014, "learning_rate": 2.969197264142642e-05, "loss": 0.3193, "num_input_tokens_seen": 58066368, "step": 60795 }, { "epoch": 4.959621502569541, "grad_norm": 2.2128279209136963, "learning_rate": 2.9688476559317126e-05, "loss": 0.3418, "num_input_tokens_seen": 58071328, "step": 60800 }, { "epoch": 4.960029366179949, "grad_norm": 2.8947553634643555, "learning_rate": 2.968498038217441e-05, "loss": 0.274, "num_input_tokens_seen": 58076448, "step": 60805 }, { "epoch": 4.960437229790358, "grad_norm": 1.9351797103881836, "learning_rate": 2.9681484110069136e-05, "loss": 0.421, "num_input_tokens_seen": 58081456, "step": 60810 }, { "epoch": 4.960845093400767, "grad_norm": 1.8937287330627441, "learning_rate": 2.9677987743072163e-05, "loss": 0.3912, "num_input_tokens_seen": 58086496, "step": 60815 }, { "epoch": 4.961252957011175, "grad_norm": 0.8208326101303101, "learning_rate": 2.967449128125437e-05, "loss": 0.3992, "num_input_tokens_seen": 58091360, "step": 60820 }, { "epoch": 4.961660820621584, "grad_norm": 1.543447494506836, "learning_rate": 2.967099472468662e-05, "loss": 0.2993, "num_input_tokens_seen": 58096224, "step": 60825 }, { "epoch": 4.962068684231993, "grad_norm": 1.2333194017410278, "learning_rate": 2.966749807343979e-05, "loss": 0.3286, "num_input_tokens_seen": 58101008, "step": 60830 }, { "epoch": 4.962476547842401, "grad_norm": 1.993006706237793, "learning_rate": 2.9664001327584763e-05, "loss": 0.3803, "num_input_tokens_seen": 58105056, "step": 60835 }, { "epoch": 4.96288441145281, "grad_norm": 1.1711443662643433, "learning_rate": 2.9660504487192415e-05, "loss": 0.3346, "num_input_tokens_seen": 58110176, "step": 60840 }, { "epoch": 4.963292275063219, "grad_norm": 2.0382444858551025, "learning_rate": 2.9657007552333617e-05, "loss": 0.3095, "num_input_tokens_seen": 58114816, "step": 60845 }, { "epoch": 4.9637001386736275, "grad_norm": 1.6046768426895142, "learning_rate": 2.9653510523079254e-05, "loss": 0.3334, "num_input_tokens_seen": 58119104, "step": 60850 }, { "epoch": 4.9641080022840365, "grad_norm": 1.8389188051223755, "learning_rate": 2.965001339950021e-05, "loss": 0.3774, "num_input_tokens_seen": 58124032, "step": 60855 }, { "epoch": 4.964515865894445, "grad_norm": 1.030269742012024, "learning_rate": 2.9646516181667373e-05, "loss": 0.3901, "num_input_tokens_seen": 58128944, "step": 60860 }, { "epoch": 4.964923729504854, "grad_norm": 1.5833746194839478, "learning_rate": 2.9643018869651624e-05, "loss": 0.2763, "num_input_tokens_seen": 58133536, "step": 60865 }, { "epoch": 4.965331593115263, "grad_norm": 3.0829596519470215, "learning_rate": 2.9639521463523856e-05, "loss": 0.2944, "num_input_tokens_seen": 58138560, "step": 60870 }, { "epoch": 4.965739456725671, "grad_norm": 1.0282024145126343, "learning_rate": 2.9636023963354957e-05, "loss": 0.3363, "num_input_tokens_seen": 58143184, "step": 60875 }, { "epoch": 4.96614732033608, "grad_norm": 1.1507492065429688, "learning_rate": 2.9632526369215823e-05, "loss": 0.3083, "num_input_tokens_seen": 58146800, "step": 60880 }, { "epoch": 4.966555183946488, "grad_norm": 0.6661674976348877, "learning_rate": 2.9629028681177357e-05, "loss": 0.249, "num_input_tokens_seen": 58151776, "step": 60885 }, { "epoch": 4.966963047556897, "grad_norm": 1.1113675832748413, "learning_rate": 2.9625530899310433e-05, "loss": 0.331, "num_input_tokens_seen": 58157280, "step": 60890 }, { "epoch": 4.967370911167306, "grad_norm": 1.091368556022644, "learning_rate": 2.9622033023685974e-05, "loss": 0.2448, "num_input_tokens_seen": 58162976, "step": 60895 }, { "epoch": 4.967778774777714, "grad_norm": 0.7231336236000061, "learning_rate": 2.9618535054374863e-05, "loss": 0.3382, "num_input_tokens_seen": 58167440, "step": 60900 }, { "epoch": 4.968186638388123, "grad_norm": 2.04706072807312, "learning_rate": 2.9615036991448015e-05, "loss": 0.2542, "num_input_tokens_seen": 58172544, "step": 60905 }, { "epoch": 4.968594501998532, "grad_norm": 2.785604953765869, "learning_rate": 2.9611538834976328e-05, "loss": 0.4279, "num_input_tokens_seen": 58176736, "step": 60910 }, { "epoch": 4.96900236560894, "grad_norm": 2.0924575328826904, "learning_rate": 2.9608040585030706e-05, "loss": 0.3605, "num_input_tokens_seen": 58182096, "step": 60915 }, { "epoch": 4.969410229219349, "grad_norm": 1.7314116954803467, "learning_rate": 2.960454224168206e-05, "loss": 0.4818, "num_input_tokens_seen": 58186768, "step": 60920 }, { "epoch": 4.969818092829758, "grad_norm": 1.0768389701843262, "learning_rate": 2.9601043805001293e-05, "loss": 0.3599, "num_input_tokens_seen": 58192160, "step": 60925 }, { "epoch": 4.970225956440166, "grad_norm": 2.7692270278930664, "learning_rate": 2.959754527505933e-05, "loss": 0.3963, "num_input_tokens_seen": 58196752, "step": 60930 }, { "epoch": 4.970633820050575, "grad_norm": 3.2531239986419678, "learning_rate": 2.959404665192707e-05, "loss": 0.3312, "num_input_tokens_seen": 58201376, "step": 60935 }, { "epoch": 4.971041683660983, "grad_norm": 2.51759672164917, "learning_rate": 2.959054793567545e-05, "loss": 0.4103, "num_input_tokens_seen": 58206704, "step": 60940 }, { "epoch": 4.971449547271392, "grad_norm": 3.8069722652435303, "learning_rate": 2.958704912637536e-05, "loss": 0.427, "num_input_tokens_seen": 58211536, "step": 60945 }, { "epoch": 4.971857410881801, "grad_norm": 3.4082131385803223, "learning_rate": 2.9583550224097744e-05, "loss": 0.3378, "num_input_tokens_seen": 58216464, "step": 60950 }, { "epoch": 4.97226527449221, "grad_norm": 1.5773025751113892, "learning_rate": 2.95800512289135e-05, "loss": 0.3091, "num_input_tokens_seen": 58221712, "step": 60955 }, { "epoch": 4.972673138102619, "grad_norm": 3.558892011642456, "learning_rate": 2.9576552140893576e-05, "loss": 0.3002, "num_input_tokens_seen": 58226320, "step": 60960 }, { "epoch": 4.973081001713027, "grad_norm": 1.0976433753967285, "learning_rate": 2.9573052960108883e-05, "loss": 0.3431, "num_input_tokens_seen": 58231696, "step": 60965 }, { "epoch": 4.973488865323436, "grad_norm": 2.334763765335083, "learning_rate": 2.9569553686630346e-05, "loss": 0.3741, "num_input_tokens_seen": 58236336, "step": 60970 }, { "epoch": 4.973896728933845, "grad_norm": 2.1458816528320312, "learning_rate": 2.95660543205289e-05, "loss": 0.3447, "num_input_tokens_seen": 58240096, "step": 60975 }, { "epoch": 4.974304592544253, "grad_norm": 0.7357451915740967, "learning_rate": 2.9562554861875473e-05, "loss": 0.2944, "num_input_tokens_seen": 58244576, "step": 60980 }, { "epoch": 4.974712456154662, "grad_norm": 2.8488476276397705, "learning_rate": 2.9559055310741003e-05, "loss": 0.3239, "num_input_tokens_seen": 58249008, "step": 60985 }, { "epoch": 4.975120319765071, "grad_norm": 2.9080960750579834, "learning_rate": 2.955555566719641e-05, "loss": 0.3202, "num_input_tokens_seen": 58253712, "step": 60990 }, { "epoch": 4.975528183375479, "grad_norm": 3.6722323894500732, "learning_rate": 2.9552055931312646e-05, "loss": 0.3173, "num_input_tokens_seen": 58258240, "step": 60995 }, { "epoch": 4.975936046985888, "grad_norm": 4.751850605010986, "learning_rate": 2.9548556103160634e-05, "loss": 0.4239, "num_input_tokens_seen": 58262080, "step": 61000 }, { "epoch": 4.976343910596297, "grad_norm": 3.1246302127838135, "learning_rate": 2.9545056182811332e-05, "loss": 0.2784, "num_input_tokens_seen": 58266768, "step": 61005 }, { "epoch": 4.976751774206705, "grad_norm": 3.0712335109710693, "learning_rate": 2.954155617033567e-05, "loss": 0.3215, "num_input_tokens_seen": 58271024, "step": 61010 }, { "epoch": 4.977159637817114, "grad_norm": 1.4003353118896484, "learning_rate": 2.9538056065804588e-05, "loss": 0.2849, "num_input_tokens_seen": 58276096, "step": 61015 }, { "epoch": 4.977567501427522, "grad_norm": 1.6169064044952393, "learning_rate": 2.9534555869289043e-05, "loss": 0.467, "num_input_tokens_seen": 58280736, "step": 61020 }, { "epoch": 4.977975365037931, "grad_norm": 2.7124385833740234, "learning_rate": 2.953105558085997e-05, "loss": 0.3268, "num_input_tokens_seen": 58285152, "step": 61025 }, { "epoch": 4.97838322864834, "grad_norm": 2.776111602783203, "learning_rate": 2.952755520058834e-05, "loss": 0.4159, "num_input_tokens_seen": 58290560, "step": 61030 }, { "epoch": 4.978791092258748, "grad_norm": 2.0901947021484375, "learning_rate": 2.952405472854508e-05, "loss": 0.4075, "num_input_tokens_seen": 58294832, "step": 61035 }, { "epoch": 4.979198955869157, "grad_norm": 2.2666561603546143, "learning_rate": 2.9520554164801157e-05, "loss": 0.259, "num_input_tokens_seen": 58299952, "step": 61040 }, { "epoch": 4.979606819479566, "grad_norm": 2.0387752056121826, "learning_rate": 2.9517053509427523e-05, "loss": 0.3753, "num_input_tokens_seen": 58303664, "step": 61045 }, { "epoch": 4.9800146830899745, "grad_norm": 0.4703189432621002, "learning_rate": 2.9513552762495132e-05, "loss": 0.3623, "num_input_tokens_seen": 58308448, "step": 61050 }, { "epoch": 4.9804225467003835, "grad_norm": 0.6391854882240295, "learning_rate": 2.9510051924074937e-05, "loss": 0.2812, "num_input_tokens_seen": 58313232, "step": 61055 }, { "epoch": 4.9808304103107925, "grad_norm": 2.1636013984680176, "learning_rate": 2.950655099423792e-05, "loss": 0.3114, "num_input_tokens_seen": 58318512, "step": 61060 }, { "epoch": 4.981238273921201, "grad_norm": 3.0830821990966797, "learning_rate": 2.950304997305502e-05, "loss": 0.3912, "num_input_tokens_seen": 58323072, "step": 61065 }, { "epoch": 4.98164613753161, "grad_norm": 1.904602289199829, "learning_rate": 2.949954886059721e-05, "loss": 0.3855, "num_input_tokens_seen": 58327632, "step": 61070 }, { "epoch": 4.982054001142018, "grad_norm": 2.4363479614257812, "learning_rate": 2.949604765693546e-05, "loss": 0.2771, "num_input_tokens_seen": 58333008, "step": 61075 }, { "epoch": 4.982461864752427, "grad_norm": 2.126953601837158, "learning_rate": 2.9492546362140732e-05, "loss": 0.3507, "num_input_tokens_seen": 58338896, "step": 61080 }, { "epoch": 4.982869728362836, "grad_norm": 1.3981530666351318, "learning_rate": 2.9489044976284004e-05, "loss": 0.3154, "num_input_tokens_seen": 58343456, "step": 61085 }, { "epoch": 4.983277591973244, "grad_norm": 2.326482057571411, "learning_rate": 2.948554349943623e-05, "loss": 0.358, "num_input_tokens_seen": 58347776, "step": 61090 }, { "epoch": 4.983685455583653, "grad_norm": 1.8724623918533325, "learning_rate": 2.9482041931668407e-05, "loss": 0.3694, "num_input_tokens_seen": 58352288, "step": 61095 }, { "epoch": 4.984093319194061, "grad_norm": 1.006118893623352, "learning_rate": 2.947854027305149e-05, "loss": 0.2998, "num_input_tokens_seen": 58357744, "step": 61100 }, { "epoch": 4.98450118280447, "grad_norm": 2.715923547744751, "learning_rate": 2.947503852365647e-05, "loss": 0.3276, "num_input_tokens_seen": 58362256, "step": 61105 }, { "epoch": 4.984909046414879, "grad_norm": 3.4788124561309814, "learning_rate": 2.9471536683554324e-05, "loss": 0.3857, "num_input_tokens_seen": 58366656, "step": 61110 }, { "epoch": 4.985316910025287, "grad_norm": 1.402197003364563, "learning_rate": 2.9468034752816025e-05, "loss": 0.3453, "num_input_tokens_seen": 58371472, "step": 61115 }, { "epoch": 4.985724773635696, "grad_norm": 2.0636789798736572, "learning_rate": 2.946453273151256e-05, "loss": 0.3615, "num_input_tokens_seen": 58375584, "step": 61120 }, { "epoch": 4.986132637246105, "grad_norm": 1.1033742427825928, "learning_rate": 2.946103061971492e-05, "loss": 0.3711, "num_input_tokens_seen": 58380832, "step": 61125 }, { "epoch": 4.986540500856513, "grad_norm": 1.429918885231018, "learning_rate": 2.945752841749408e-05, "loss": 0.3257, "num_input_tokens_seen": 58385760, "step": 61130 }, { "epoch": 4.986948364466922, "grad_norm": 2.2621805667877197, "learning_rate": 2.9454026124921026e-05, "loss": 0.323, "num_input_tokens_seen": 58391568, "step": 61135 }, { "epoch": 4.987356228077331, "grad_norm": 1.4017887115478516, "learning_rate": 2.9450523742066766e-05, "loss": 0.2977, "num_input_tokens_seen": 58396816, "step": 61140 }, { "epoch": 4.9877640916877395, "grad_norm": 1.2227896451950073, "learning_rate": 2.9447021269002274e-05, "loss": 0.3124, "num_input_tokens_seen": 58401424, "step": 61145 }, { "epoch": 4.9881719552981485, "grad_norm": 0.8899888396263123, "learning_rate": 2.944351870579855e-05, "loss": 0.3296, "num_input_tokens_seen": 58406352, "step": 61150 }, { "epoch": 4.988579818908557, "grad_norm": 1.7828865051269531, "learning_rate": 2.9440016052526593e-05, "loss": 0.3725, "num_input_tokens_seen": 58411376, "step": 61155 }, { "epoch": 4.988987682518966, "grad_norm": 2.599224328994751, "learning_rate": 2.9436513309257395e-05, "loss": 0.323, "num_input_tokens_seen": 58415536, "step": 61160 }, { "epoch": 4.989395546129375, "grad_norm": 0.8847014307975769, "learning_rate": 2.943301047606196e-05, "loss": 0.366, "num_input_tokens_seen": 58420384, "step": 61165 }, { "epoch": 4.989803409739783, "grad_norm": 1.1879510879516602, "learning_rate": 2.9429507553011287e-05, "loss": 0.3771, "num_input_tokens_seen": 58425808, "step": 61170 }, { "epoch": 4.990211273350192, "grad_norm": 1.8732492923736572, "learning_rate": 2.9426004540176377e-05, "loss": 0.2708, "num_input_tokens_seen": 58430864, "step": 61175 }, { "epoch": 4.9906191369606, "grad_norm": 0.56231290102005, "learning_rate": 2.942250143762823e-05, "loss": 0.3394, "num_input_tokens_seen": 58436016, "step": 61180 }, { "epoch": 4.991027000571009, "grad_norm": 1.3985694646835327, "learning_rate": 2.9418998245437868e-05, "loss": 0.3592, "num_input_tokens_seen": 58440544, "step": 61185 }, { "epoch": 4.991434864181418, "grad_norm": 1.4683866500854492, "learning_rate": 2.941549496367629e-05, "loss": 0.3145, "num_input_tokens_seen": 58445808, "step": 61190 }, { "epoch": 4.991842727791826, "grad_norm": 4.0327324867248535, "learning_rate": 2.9411991592414502e-05, "loss": 0.2921, "num_input_tokens_seen": 58450752, "step": 61195 }, { "epoch": 4.992250591402235, "grad_norm": 1.713051199913025, "learning_rate": 2.9408488131723512e-05, "loss": 0.4144, "num_input_tokens_seen": 58455552, "step": 61200 }, { "epoch": 4.992658455012644, "grad_norm": 2.377596855163574, "learning_rate": 2.940498458167435e-05, "loss": 0.287, "num_input_tokens_seen": 58459856, "step": 61205 }, { "epoch": 4.993066318623052, "grad_norm": 1.0523396730422974, "learning_rate": 2.9401480942338016e-05, "loss": 0.3099, "num_input_tokens_seen": 58464320, "step": 61210 }, { "epoch": 4.993474182233461, "grad_norm": 2.4815914630889893, "learning_rate": 2.9397977213785538e-05, "loss": 0.3384, "num_input_tokens_seen": 58469024, "step": 61215 }, { "epoch": 4.99388204584387, "grad_norm": 3.0593101978302, "learning_rate": 2.9394473396087928e-05, "loss": 0.3086, "num_input_tokens_seen": 58473936, "step": 61220 }, { "epoch": 4.994289909454278, "grad_norm": 3.9635682106018066, "learning_rate": 2.9390969489316204e-05, "loss": 0.3573, "num_input_tokens_seen": 58478544, "step": 61225 }, { "epoch": 4.994697773064687, "grad_norm": 3.460817337036133, "learning_rate": 2.9387465493541406e-05, "loss": 0.2712, "num_input_tokens_seen": 58483520, "step": 61230 }, { "epoch": 4.995105636675095, "grad_norm": 1.5974425077438354, "learning_rate": 2.938396140883454e-05, "loss": 0.3363, "num_input_tokens_seen": 58488224, "step": 61235 }, { "epoch": 4.9955135002855044, "grad_norm": 1.880927562713623, "learning_rate": 2.938045723526664e-05, "loss": 0.4091, "num_input_tokens_seen": 58492000, "step": 61240 }, { "epoch": 4.9959213638959135, "grad_norm": 3.338113784790039, "learning_rate": 2.9376952972908734e-05, "loss": 0.4086, "num_input_tokens_seen": 58497152, "step": 61245 }, { "epoch": 4.996329227506322, "grad_norm": 1.3793805837631226, "learning_rate": 2.937344862183185e-05, "loss": 0.4298, "num_input_tokens_seen": 58502352, "step": 61250 }, { "epoch": 4.996737091116731, "grad_norm": 1.800553798675537, "learning_rate": 2.9369944182107024e-05, "loss": 0.4005, "num_input_tokens_seen": 58507280, "step": 61255 }, { "epoch": 4.99714495472714, "grad_norm": 1.1824333667755127, "learning_rate": 2.9366439653805288e-05, "loss": 0.3634, "num_input_tokens_seen": 58511712, "step": 61260 }, { "epoch": 4.997552818337548, "grad_norm": 0.4305136501789093, "learning_rate": 2.9362935036997673e-05, "loss": 0.3273, "num_input_tokens_seen": 58516528, "step": 61265 }, { "epoch": 4.997960681947957, "grad_norm": 3.7901394367218018, "learning_rate": 2.9359430331755217e-05, "loss": 0.3339, "num_input_tokens_seen": 58521136, "step": 61270 }, { "epoch": 4.998368545558366, "grad_norm": 0.8068770170211792, "learning_rate": 2.935592553814896e-05, "loss": 0.3384, "num_input_tokens_seen": 58525072, "step": 61275 }, { "epoch": 4.998776409168774, "grad_norm": 3.1970057487487793, "learning_rate": 2.935242065624994e-05, "loss": 0.2946, "num_input_tokens_seen": 58530432, "step": 61280 }, { "epoch": 4.999184272779183, "grad_norm": 1.2961288690567017, "learning_rate": 2.9348915686129207e-05, "loss": 0.3685, "num_input_tokens_seen": 58535392, "step": 61285 }, { "epoch": 4.999592136389591, "grad_norm": 2.4141247272491455, "learning_rate": 2.93454106278578e-05, "loss": 0.4875, "num_input_tokens_seen": 58540432, "step": 61290 }, { "epoch": 5.0, "grad_norm": 0.96281498670578, "learning_rate": 2.934190548150677e-05, "loss": 0.2588, "num_input_tokens_seen": 58545456, "step": 61295 }, { "epoch": 5.000407863610409, "grad_norm": 3.7156951427459717, "learning_rate": 2.933840024714716e-05, "loss": 0.4232, "num_input_tokens_seen": 58550736, "step": 61300 }, { "epoch": 5.000407863610409, "eval_loss": 0.3393753170967102, "eval_runtime": 570.8355, "eval_samples_per_second": 4.774, "eval_steps_per_second": 2.388, "num_input_tokens_seen": 58550736, "step": 61300 }, { "epoch": 5.000815727220817, "grad_norm": 1.7012147903442383, "learning_rate": 2.9334894924850016e-05, "loss": 0.3754, "num_input_tokens_seen": 58555408, "step": 61305 }, { "epoch": 5.001223590831226, "grad_norm": 1.6695102453231812, "learning_rate": 2.9331389514686392e-05, "loss": 0.3137, "num_input_tokens_seen": 58560256, "step": 61310 }, { "epoch": 5.001631454441635, "grad_norm": 0.9442352056503296, "learning_rate": 2.9327884016727353e-05, "loss": 0.3524, "num_input_tokens_seen": 58564704, "step": 61315 }, { "epoch": 5.002039318052043, "grad_norm": 1.9766756296157837, "learning_rate": 2.9324378431043943e-05, "loss": 0.3947, "num_input_tokens_seen": 58569328, "step": 61320 }, { "epoch": 5.002447181662452, "grad_norm": 6.048576831817627, "learning_rate": 2.9320872757707213e-05, "loss": 0.2999, "num_input_tokens_seen": 58574592, "step": 61325 }, { "epoch": 5.00285504527286, "grad_norm": 4.415679931640625, "learning_rate": 2.9317366996788232e-05, "loss": 0.3222, "num_input_tokens_seen": 58579504, "step": 61330 }, { "epoch": 5.003262908883269, "grad_norm": 3.527527093887329, "learning_rate": 2.9313861148358052e-05, "loss": 0.3164, "num_input_tokens_seen": 58584864, "step": 61335 }, { "epoch": 5.003670772493678, "grad_norm": 2.137282609939575, "learning_rate": 2.9310355212487746e-05, "loss": 0.292, "num_input_tokens_seen": 58589584, "step": 61340 }, { "epoch": 5.0040786361040865, "grad_norm": 1.5085158348083496, "learning_rate": 2.9306849189248363e-05, "loss": 0.3531, "num_input_tokens_seen": 58595024, "step": 61345 }, { "epoch": 5.0044864997144956, "grad_norm": 3.6130309104919434, "learning_rate": 2.9303343078710986e-05, "loss": 0.3278, "num_input_tokens_seen": 58599872, "step": 61350 }, { "epoch": 5.004894363324905, "grad_norm": 2.0446133613586426, "learning_rate": 2.929983688094666e-05, "loss": 0.366, "num_input_tokens_seen": 58604336, "step": 61355 }, { "epoch": 5.005302226935313, "grad_norm": 1.9209961891174316, "learning_rate": 2.9296330596026485e-05, "loss": 0.298, "num_input_tokens_seen": 58609184, "step": 61360 }, { "epoch": 5.005710090545722, "grad_norm": 4.5919060707092285, "learning_rate": 2.929282422402151e-05, "loss": 0.3559, "num_input_tokens_seen": 58614064, "step": 61365 }, { "epoch": 5.00611795415613, "grad_norm": 2.458059787750244, "learning_rate": 2.9289317765002805e-05, "loss": 0.3862, "num_input_tokens_seen": 58618800, "step": 61370 }, { "epoch": 5.006525817766539, "grad_norm": 4.998291492462158, "learning_rate": 2.9285811219041453e-05, "loss": 0.416, "num_input_tokens_seen": 58623232, "step": 61375 }, { "epoch": 5.006933681376948, "grad_norm": 2.1577413082122803, "learning_rate": 2.928230458620853e-05, "loss": 0.344, "num_input_tokens_seen": 58628752, "step": 61380 }, { "epoch": 5.007341544987356, "grad_norm": 2.324202537536621, "learning_rate": 2.9278797866575116e-05, "loss": 0.3657, "num_input_tokens_seen": 58634224, "step": 61385 }, { "epoch": 5.007749408597765, "grad_norm": 3.6940407752990723, "learning_rate": 2.9275291060212285e-05, "loss": 0.4494, "num_input_tokens_seen": 58638864, "step": 61390 }, { "epoch": 5.008157272208174, "grad_norm": 1.965098261833191, "learning_rate": 2.9271784167191123e-05, "loss": 0.3534, "num_input_tokens_seen": 58643424, "step": 61395 }, { "epoch": 5.008565135818582, "grad_norm": 1.1612555980682373, "learning_rate": 2.926827718758271e-05, "loss": 0.3139, "num_input_tokens_seen": 58648640, "step": 61400 }, { "epoch": 5.008972999428991, "grad_norm": 4.26917028427124, "learning_rate": 2.926477012145813e-05, "loss": 0.2993, "num_input_tokens_seen": 58653344, "step": 61405 }, { "epoch": 5.009380863039399, "grad_norm": 2.1249725818634033, "learning_rate": 2.926126296888847e-05, "loss": 0.2698, "num_input_tokens_seen": 58658048, "step": 61410 }, { "epoch": 5.009788726649808, "grad_norm": 2.0500473976135254, "learning_rate": 2.9257755729944825e-05, "loss": 0.3522, "num_input_tokens_seen": 58662336, "step": 61415 }, { "epoch": 5.010196590260217, "grad_norm": 0.7376721501350403, "learning_rate": 2.925424840469828e-05, "loss": 0.3756, "num_input_tokens_seen": 58667136, "step": 61420 }, { "epoch": 5.010604453870625, "grad_norm": 0.6794233322143555, "learning_rate": 2.9250740993219928e-05, "loss": 0.3091, "num_input_tokens_seen": 58671072, "step": 61425 }, { "epoch": 5.011012317481034, "grad_norm": 2.9976258277893066, "learning_rate": 2.924723349558086e-05, "loss": 0.3478, "num_input_tokens_seen": 58675344, "step": 61430 }, { "epoch": 5.011420181091443, "grad_norm": 0.7755669951438904, "learning_rate": 2.924372591185217e-05, "loss": 0.2792, "num_input_tokens_seen": 58679568, "step": 61435 }, { "epoch": 5.0118280447018515, "grad_norm": 1.0034550428390503, "learning_rate": 2.9240218242104965e-05, "loss": 0.3765, "num_input_tokens_seen": 58684080, "step": 61440 }, { "epoch": 5.0122359083122605, "grad_norm": 3.11515212059021, "learning_rate": 2.9236710486410335e-05, "loss": 0.3936, "num_input_tokens_seen": 58689600, "step": 61445 }, { "epoch": 5.012643771922669, "grad_norm": 0.6410999298095703, "learning_rate": 2.9233202644839385e-05, "loss": 0.3095, "num_input_tokens_seen": 58694016, "step": 61450 }, { "epoch": 5.013051635533078, "grad_norm": 1.2318679094314575, "learning_rate": 2.9229694717463218e-05, "loss": 0.2569, "num_input_tokens_seen": 58698992, "step": 61455 }, { "epoch": 5.013459499143487, "grad_norm": 1.0424904823303223, "learning_rate": 2.9226186704352944e-05, "loss": 0.3562, "num_input_tokens_seen": 58703264, "step": 61460 }, { "epoch": 5.013867362753895, "grad_norm": 0.9387893676757812, "learning_rate": 2.9222678605579647e-05, "loss": 0.2767, "num_input_tokens_seen": 58708768, "step": 61465 }, { "epoch": 5.014275226364304, "grad_norm": 1.5362848043441772, "learning_rate": 2.9219170421214453e-05, "loss": 0.3567, "num_input_tokens_seen": 58713792, "step": 61470 }, { "epoch": 5.014683089974713, "grad_norm": 1.1095257997512817, "learning_rate": 2.9215662151328472e-05, "loss": 0.3615, "num_input_tokens_seen": 58718368, "step": 61475 }, { "epoch": 5.015090953585121, "grad_norm": 0.8706168532371521, "learning_rate": 2.92121537959928e-05, "loss": 0.4439, "num_input_tokens_seen": 58722480, "step": 61480 }, { "epoch": 5.01549881719553, "grad_norm": 3.476534128189087, "learning_rate": 2.9208645355278574e-05, "loss": 0.3461, "num_input_tokens_seen": 58727376, "step": 61485 }, { "epoch": 5.015906680805939, "grad_norm": 3.1859545707702637, "learning_rate": 2.9205136829256885e-05, "loss": 0.2593, "num_input_tokens_seen": 58732640, "step": 61490 }, { "epoch": 5.016314544416347, "grad_norm": 6.08010721206665, "learning_rate": 2.9201628217998866e-05, "loss": 0.3669, "num_input_tokens_seen": 58737344, "step": 61495 }, { "epoch": 5.016722408026756, "grad_norm": 3.2563114166259766, "learning_rate": 2.9198119521575624e-05, "loss": 0.3109, "num_input_tokens_seen": 58741616, "step": 61500 }, { "epoch": 5.017130271637164, "grad_norm": 2.0810203552246094, "learning_rate": 2.919461074005828e-05, "loss": 0.2549, "num_input_tokens_seen": 58747312, "step": 61505 }, { "epoch": 5.017538135247573, "grad_norm": 1.4057527780532837, "learning_rate": 2.9191101873517962e-05, "loss": 0.4264, "num_input_tokens_seen": 58752144, "step": 61510 }, { "epoch": 5.017945998857982, "grad_norm": 2.9475064277648926, "learning_rate": 2.91875929220258e-05, "loss": 0.3397, "num_input_tokens_seen": 58757392, "step": 61515 }, { "epoch": 5.01835386246839, "grad_norm": 1.0033080577850342, "learning_rate": 2.9184083885652897e-05, "loss": 0.3075, "num_input_tokens_seen": 58763248, "step": 61520 }, { "epoch": 5.018761726078799, "grad_norm": 3.0018768310546875, "learning_rate": 2.918057476447039e-05, "loss": 0.2434, "num_input_tokens_seen": 58768336, "step": 61525 }, { "epoch": 5.019169589689208, "grad_norm": 6.132620811462402, "learning_rate": 2.9177065558549416e-05, "loss": 0.3049, "num_input_tokens_seen": 58773072, "step": 61530 }, { "epoch": 5.0195774532996165, "grad_norm": 1.726772427558899, "learning_rate": 2.91735562679611e-05, "loss": 0.386, "num_input_tokens_seen": 58777136, "step": 61535 }, { "epoch": 5.0199853169100255, "grad_norm": 5.115744590759277, "learning_rate": 2.917004689277657e-05, "loss": 0.274, "num_input_tokens_seen": 58782608, "step": 61540 }, { "epoch": 5.020393180520434, "grad_norm": 1.790155291557312, "learning_rate": 2.9166537433066954e-05, "loss": 0.3192, "num_input_tokens_seen": 58787136, "step": 61545 }, { "epoch": 5.020801044130843, "grad_norm": 9.916252136230469, "learning_rate": 2.9163027888903398e-05, "loss": 0.3488, "num_input_tokens_seen": 58791968, "step": 61550 }, { "epoch": 5.021208907741252, "grad_norm": 8.128252029418945, "learning_rate": 2.9159518260357037e-05, "loss": 0.3771, "num_input_tokens_seen": 58796000, "step": 61555 }, { "epoch": 5.02161677135166, "grad_norm": 2.3748888969421387, "learning_rate": 2.9156008547499016e-05, "loss": 0.4024, "num_input_tokens_seen": 58800656, "step": 61560 }, { "epoch": 5.022024634962069, "grad_norm": 0.4701078534126282, "learning_rate": 2.9152498750400458e-05, "loss": 0.3887, "num_input_tokens_seen": 58805776, "step": 61565 }, { "epoch": 5.022432498572478, "grad_norm": 2.206115245819092, "learning_rate": 2.9148988869132517e-05, "loss": 0.4738, "num_input_tokens_seen": 58810896, "step": 61570 }, { "epoch": 5.022840362182886, "grad_norm": 7.897062301635742, "learning_rate": 2.9145478903766334e-05, "loss": 0.3792, "num_input_tokens_seen": 58815232, "step": 61575 }, { "epoch": 5.023248225793295, "grad_norm": 0.8611997961997986, "learning_rate": 2.9141968854373053e-05, "loss": 0.2854, "num_input_tokens_seen": 58820672, "step": 61580 }, { "epoch": 5.023656089403703, "grad_norm": 2.743978261947632, "learning_rate": 2.9138458721023827e-05, "loss": 0.3552, "num_input_tokens_seen": 58826464, "step": 61585 }, { "epoch": 5.024063953014112, "grad_norm": 2.8063557147979736, "learning_rate": 2.91349485037898e-05, "loss": 0.3971, "num_input_tokens_seen": 58830352, "step": 61590 }, { "epoch": 5.024471816624521, "grad_norm": 3.168644905090332, "learning_rate": 2.9131438202742124e-05, "loss": 0.3618, "num_input_tokens_seen": 58834272, "step": 61595 }, { "epoch": 5.024879680234929, "grad_norm": 2.5614612102508545, "learning_rate": 2.9127927817951955e-05, "loss": 0.3497, "num_input_tokens_seen": 58839360, "step": 61600 }, { "epoch": 5.025287543845338, "grad_norm": 1.7594038248062134, "learning_rate": 2.912441734949044e-05, "loss": 0.3634, "num_input_tokens_seen": 58844224, "step": 61605 }, { "epoch": 5.025695407455747, "grad_norm": 3.9583892822265625, "learning_rate": 2.9120906797428737e-05, "loss": 0.3806, "num_input_tokens_seen": 58848400, "step": 61610 }, { "epoch": 5.026103271066155, "grad_norm": 2.802672863006592, "learning_rate": 2.9117396161838007e-05, "loss": 0.3528, "num_input_tokens_seen": 58852848, "step": 61615 }, { "epoch": 5.026511134676564, "grad_norm": 2.2882838249206543, "learning_rate": 2.9113885442789405e-05, "loss": 0.3187, "num_input_tokens_seen": 58857744, "step": 61620 }, { "epoch": 5.026918998286972, "grad_norm": 3.1452033519744873, "learning_rate": 2.9110374640354086e-05, "loss": 0.4401, "num_input_tokens_seen": 58863008, "step": 61625 }, { "epoch": 5.027326861897381, "grad_norm": 2.837629795074463, "learning_rate": 2.910686375460323e-05, "loss": 0.256, "num_input_tokens_seen": 58867808, "step": 61630 }, { "epoch": 5.02773472550779, "grad_norm": 1.2122939825057983, "learning_rate": 2.9103352785607983e-05, "loss": 0.3367, "num_input_tokens_seen": 58871216, "step": 61635 }, { "epoch": 5.028142589118199, "grad_norm": 3.050947904586792, "learning_rate": 2.9099841733439525e-05, "loss": 0.2948, "num_input_tokens_seen": 58875408, "step": 61640 }, { "epoch": 5.028550452728608, "grad_norm": 3.1982390880584717, "learning_rate": 2.9096330598169013e-05, "loss": 0.3417, "num_input_tokens_seen": 58880256, "step": 61645 }, { "epoch": 5.028958316339017, "grad_norm": 2.1462903022766113, "learning_rate": 2.9092819379867625e-05, "loss": 0.3804, "num_input_tokens_seen": 58884656, "step": 61650 }, { "epoch": 5.029366179949425, "grad_norm": 1.3206989765167236, "learning_rate": 2.908930807860652e-05, "loss": 0.3812, "num_input_tokens_seen": 58889712, "step": 61655 }, { "epoch": 5.029774043559834, "grad_norm": 3.7076544761657715, "learning_rate": 2.9085796694456887e-05, "loss": 0.3533, "num_input_tokens_seen": 58893296, "step": 61660 }, { "epoch": 5.030181907170242, "grad_norm": 1.7909996509552002, "learning_rate": 2.9082285227489892e-05, "loss": 0.3916, "num_input_tokens_seen": 58897360, "step": 61665 }, { "epoch": 5.030589770780651, "grad_norm": 4.039254665374756, "learning_rate": 2.9078773677776706e-05, "loss": 0.3364, "num_input_tokens_seen": 58902320, "step": 61670 }, { "epoch": 5.03099763439106, "grad_norm": 2.728776693344116, "learning_rate": 2.9075262045388506e-05, "loss": 0.3666, "num_input_tokens_seen": 58907120, "step": 61675 }, { "epoch": 5.031405498001468, "grad_norm": 1.9229589700698853, "learning_rate": 2.907175033039648e-05, "loss": 0.3166, "num_input_tokens_seen": 58912336, "step": 61680 }, { "epoch": 5.031813361611877, "grad_norm": 1.187582015991211, "learning_rate": 2.9068238532871806e-05, "loss": 0.3004, "num_input_tokens_seen": 58917328, "step": 61685 }, { "epoch": 5.032221225222286, "grad_norm": 1.1796859502792358, "learning_rate": 2.906472665288566e-05, "loss": 0.3037, "num_input_tokens_seen": 58921152, "step": 61690 }, { "epoch": 5.032629088832694, "grad_norm": 0.7289637327194214, "learning_rate": 2.906121469050924e-05, "loss": 0.3946, "num_input_tokens_seen": 58926416, "step": 61695 }, { "epoch": 5.033036952443103, "grad_norm": 0.6902880072593689, "learning_rate": 2.9057702645813718e-05, "loss": 0.4361, "num_input_tokens_seen": 58930912, "step": 61700 }, { "epoch": 5.033444816053512, "grad_norm": 0.5614417195320129, "learning_rate": 2.905419051887029e-05, "loss": 0.3331, "num_input_tokens_seen": 58935584, "step": 61705 }, { "epoch": 5.03385267966392, "grad_norm": 2.084550142288208, "learning_rate": 2.905067830975014e-05, "loss": 0.3373, "num_input_tokens_seen": 58939984, "step": 61710 }, { "epoch": 5.034260543274329, "grad_norm": 3.3156886100769043, "learning_rate": 2.9047166018524464e-05, "loss": 0.3137, "num_input_tokens_seen": 58945056, "step": 61715 }, { "epoch": 5.034668406884737, "grad_norm": 0.7094321846961975, "learning_rate": 2.9043653645264457e-05, "loss": 0.3124, "num_input_tokens_seen": 58950160, "step": 61720 }, { "epoch": 5.035076270495146, "grad_norm": 1.005232334136963, "learning_rate": 2.9040141190041304e-05, "loss": 0.421, "num_input_tokens_seen": 58955264, "step": 61725 }, { "epoch": 5.035484134105555, "grad_norm": 0.7395258545875549, "learning_rate": 2.9036628652926205e-05, "loss": 0.3273, "num_input_tokens_seen": 58960160, "step": 61730 }, { "epoch": 5.0358919977159635, "grad_norm": 2.844914436340332, "learning_rate": 2.9033116033990364e-05, "loss": 0.34, "num_input_tokens_seen": 58964672, "step": 61735 }, { "epoch": 5.0362998613263725, "grad_norm": 2.842327356338501, "learning_rate": 2.9029603333304973e-05, "loss": 0.3858, "num_input_tokens_seen": 58969568, "step": 61740 }, { "epoch": 5.0367077249367815, "grad_norm": 0.757477879524231, "learning_rate": 2.9026090550941233e-05, "loss": 0.3338, "num_input_tokens_seen": 58975104, "step": 61745 }, { "epoch": 5.03711558854719, "grad_norm": 2.4925360679626465, "learning_rate": 2.9022577686970347e-05, "loss": 0.3787, "num_input_tokens_seen": 58980048, "step": 61750 }, { "epoch": 5.037523452157599, "grad_norm": 4.4979071617126465, "learning_rate": 2.901906474146352e-05, "loss": 0.3681, "num_input_tokens_seen": 58984112, "step": 61755 }, { "epoch": 5.037931315768007, "grad_norm": 4.293688774108887, "learning_rate": 2.9015551714491962e-05, "loss": 0.3393, "num_input_tokens_seen": 58988688, "step": 61760 }, { "epoch": 5.038339179378416, "grad_norm": 1.2044769525527954, "learning_rate": 2.9012038606126874e-05, "loss": 0.2905, "num_input_tokens_seen": 58994464, "step": 61765 }, { "epoch": 5.038747042988825, "grad_norm": 1.126387596130371, "learning_rate": 2.900852541643947e-05, "loss": 0.3223, "num_input_tokens_seen": 58999584, "step": 61770 }, { "epoch": 5.039154906599233, "grad_norm": 2.789912700653076, "learning_rate": 2.900501214550096e-05, "loss": 0.3426, "num_input_tokens_seen": 59003632, "step": 61775 }, { "epoch": 5.039562770209642, "grad_norm": 0.8796001076698303, "learning_rate": 2.900149879338255e-05, "loss": 0.3472, "num_input_tokens_seen": 59008624, "step": 61780 }, { "epoch": 5.039970633820051, "grad_norm": 1.5827053785324097, "learning_rate": 2.8997985360155466e-05, "loss": 0.3769, "num_input_tokens_seen": 59013008, "step": 61785 }, { "epoch": 5.040378497430459, "grad_norm": 0.7992948293685913, "learning_rate": 2.8994471845890912e-05, "loss": 0.3921, "num_input_tokens_seen": 59016880, "step": 61790 }, { "epoch": 5.040786361040868, "grad_norm": 3.247884750366211, "learning_rate": 2.8990958250660115e-05, "loss": 0.35, "num_input_tokens_seen": 59021824, "step": 61795 }, { "epoch": 5.041194224651276, "grad_norm": 2.4845387935638428, "learning_rate": 2.8987444574534296e-05, "loss": 0.3512, "num_input_tokens_seen": 59026256, "step": 61800 }, { "epoch": 5.041602088261685, "grad_norm": 0.7390751838684082, "learning_rate": 2.898393081758466e-05, "loss": 0.3241, "num_input_tokens_seen": 59031168, "step": 61805 }, { "epoch": 5.042009951872094, "grad_norm": 0.904089629650116, "learning_rate": 2.898041697988244e-05, "loss": 0.3095, "num_input_tokens_seen": 59035856, "step": 61810 }, { "epoch": 5.042417815482502, "grad_norm": 0.6608301997184753, "learning_rate": 2.8976903061498866e-05, "loss": 0.3867, "num_input_tokens_seen": 59040608, "step": 61815 }, { "epoch": 5.042825679092911, "grad_norm": 1.5987218618392944, "learning_rate": 2.8973389062505152e-05, "loss": 0.2413, "num_input_tokens_seen": 59044496, "step": 61820 }, { "epoch": 5.04323354270332, "grad_norm": 2.659085273742676, "learning_rate": 2.8969874982972527e-05, "loss": 0.3574, "num_input_tokens_seen": 59050080, "step": 61825 }, { "epoch": 5.0436414063137285, "grad_norm": 0.7370951771736145, "learning_rate": 2.8966360822972233e-05, "loss": 0.4185, "num_input_tokens_seen": 59055280, "step": 61830 }, { "epoch": 5.0440492699241375, "grad_norm": 1.2448772192001343, "learning_rate": 2.896284658257548e-05, "loss": 0.3708, "num_input_tokens_seen": 59060160, "step": 61835 }, { "epoch": 5.044457133534546, "grad_norm": 0.6390364766120911, "learning_rate": 2.8959332261853517e-05, "loss": 0.3669, "num_input_tokens_seen": 59065424, "step": 61840 }, { "epoch": 5.044864997144955, "grad_norm": 5.186641693115234, "learning_rate": 2.8955817860877567e-05, "loss": 0.3275, "num_input_tokens_seen": 59070064, "step": 61845 }, { "epoch": 5.045272860755364, "grad_norm": 0.37803781032562256, "learning_rate": 2.8952303379718875e-05, "loss": 0.3152, "num_input_tokens_seen": 59075296, "step": 61850 }, { "epoch": 5.045680724365772, "grad_norm": 1.1173536777496338, "learning_rate": 2.894878881844867e-05, "loss": 0.2942, "num_input_tokens_seen": 59079408, "step": 61855 }, { "epoch": 5.046088587976181, "grad_norm": 0.7244669198989868, "learning_rate": 2.8945274177138192e-05, "loss": 0.3262, "num_input_tokens_seen": 59084512, "step": 61860 }, { "epoch": 5.04649645158659, "grad_norm": 1.3449240922927856, "learning_rate": 2.8941759455858684e-05, "loss": 0.3559, "num_input_tokens_seen": 59089440, "step": 61865 }, { "epoch": 5.046904315196998, "grad_norm": 0.9848493337631226, "learning_rate": 2.8938244654681395e-05, "loss": 0.4111, "num_input_tokens_seen": 59093456, "step": 61870 }, { "epoch": 5.047312178807407, "grad_norm": 0.7634024024009705, "learning_rate": 2.8934729773677548e-05, "loss": 0.3674, "num_input_tokens_seen": 59098864, "step": 61875 }, { "epoch": 5.047720042417816, "grad_norm": 4.148778438568115, "learning_rate": 2.8931214812918407e-05, "loss": 0.323, "num_input_tokens_seen": 59104544, "step": 61880 }, { "epoch": 5.048127906028224, "grad_norm": 1.297579288482666, "learning_rate": 2.8927699772475215e-05, "loss": 0.3345, "num_input_tokens_seen": 59109824, "step": 61885 }, { "epoch": 5.048535769638633, "grad_norm": 1.6147449016571045, "learning_rate": 2.8924184652419206e-05, "loss": 0.3691, "num_input_tokens_seen": 59115008, "step": 61890 }, { "epoch": 5.048943633249041, "grad_norm": 1.773158311843872, "learning_rate": 2.8920669452821653e-05, "loss": 0.3962, "num_input_tokens_seen": 59119328, "step": 61895 }, { "epoch": 5.04935149685945, "grad_norm": 2.917515516281128, "learning_rate": 2.8917154173753792e-05, "loss": 0.348, "num_input_tokens_seen": 59124352, "step": 61900 }, { "epoch": 5.049759360469859, "grad_norm": 1.877976894378662, "learning_rate": 2.8913638815286882e-05, "loss": 0.3432, "num_input_tokens_seen": 59128576, "step": 61905 }, { "epoch": 5.050167224080267, "grad_norm": 1.943186640739441, "learning_rate": 2.8910123377492172e-05, "loss": 0.2922, "num_input_tokens_seen": 59133056, "step": 61910 }, { "epoch": 5.050575087690676, "grad_norm": 2.159937858581543, "learning_rate": 2.8906607860440928e-05, "loss": 0.375, "num_input_tokens_seen": 59137824, "step": 61915 }, { "epoch": 5.050982951301085, "grad_norm": 1.2203319072723389, "learning_rate": 2.8903092264204402e-05, "loss": 0.3144, "num_input_tokens_seen": 59143120, "step": 61920 }, { "epoch": 5.051390814911493, "grad_norm": 3.392721652984619, "learning_rate": 2.8899576588853844e-05, "loss": 0.353, "num_input_tokens_seen": 59149008, "step": 61925 }, { "epoch": 5.0517986785219025, "grad_norm": 2.343195676803589, "learning_rate": 2.8896060834460535e-05, "loss": 0.3314, "num_input_tokens_seen": 59153712, "step": 61930 }, { "epoch": 5.052206542132311, "grad_norm": 3.5037949085235596, "learning_rate": 2.8892545001095727e-05, "loss": 0.3398, "num_input_tokens_seen": 59158544, "step": 61935 }, { "epoch": 5.05261440574272, "grad_norm": 2.8892300128936768, "learning_rate": 2.8889029088830686e-05, "loss": 0.3147, "num_input_tokens_seen": 59164064, "step": 61940 }, { "epoch": 5.053022269353129, "grad_norm": 0.47470077872276306, "learning_rate": 2.8885513097736684e-05, "loss": 0.3313, "num_input_tokens_seen": 59169024, "step": 61945 }, { "epoch": 5.053430132963537, "grad_norm": 1.2061091661453247, "learning_rate": 2.8881997027884976e-05, "loss": 0.4019, "num_input_tokens_seen": 59173328, "step": 61950 }, { "epoch": 5.053837996573946, "grad_norm": 1.602811574935913, "learning_rate": 2.8878480879346837e-05, "loss": 0.3274, "num_input_tokens_seen": 59178736, "step": 61955 }, { "epoch": 5.054245860184355, "grad_norm": 1.544775366783142, "learning_rate": 2.8874964652193542e-05, "loss": 0.2433, "num_input_tokens_seen": 59183424, "step": 61960 }, { "epoch": 5.054653723794763, "grad_norm": 3.187211036682129, "learning_rate": 2.8871448346496355e-05, "loss": 0.32, "num_input_tokens_seen": 59188112, "step": 61965 }, { "epoch": 5.055061587405172, "grad_norm": 3.536503314971924, "learning_rate": 2.8867931962326562e-05, "loss": 0.3991, "num_input_tokens_seen": 59193008, "step": 61970 }, { "epoch": 5.05546945101558, "grad_norm": 2.9204413890838623, "learning_rate": 2.8864415499755428e-05, "loss": 0.2736, "num_input_tokens_seen": 59197536, "step": 61975 }, { "epoch": 5.055877314625989, "grad_norm": 6.676260471343994, "learning_rate": 2.8860898958854233e-05, "loss": 0.3511, "num_input_tokens_seen": 59202416, "step": 61980 }, { "epoch": 5.056285178236398, "grad_norm": 2.381669282913208, "learning_rate": 2.8857382339694256e-05, "loss": 0.3911, "num_input_tokens_seen": 59207792, "step": 61985 }, { "epoch": 5.056693041846806, "grad_norm": 2.2392208576202393, "learning_rate": 2.8853865642346777e-05, "loss": 0.407, "num_input_tokens_seen": 59212208, "step": 61990 }, { "epoch": 5.057100905457215, "grad_norm": 0.9655089974403381, "learning_rate": 2.885034886688308e-05, "loss": 0.431, "num_input_tokens_seen": 59216912, "step": 61995 }, { "epoch": 5.057508769067624, "grad_norm": 2.9720897674560547, "learning_rate": 2.8846832013374448e-05, "loss": 0.3348, "num_input_tokens_seen": 59221488, "step": 62000 }, { "epoch": 5.057916632678032, "grad_norm": 1.3963900804519653, "learning_rate": 2.884331508189217e-05, "loss": 0.3401, "num_input_tokens_seen": 59225696, "step": 62005 }, { "epoch": 5.058324496288441, "grad_norm": 3.2667324542999268, "learning_rate": 2.883979807250753e-05, "loss": 0.363, "num_input_tokens_seen": 59230576, "step": 62010 }, { "epoch": 5.058732359898849, "grad_norm": 3.1223225593566895, "learning_rate": 2.8836280985291807e-05, "loss": 0.2934, "num_input_tokens_seen": 59235744, "step": 62015 }, { "epoch": 5.059140223509258, "grad_norm": 2.0205540657043457, "learning_rate": 2.8832763820316307e-05, "loss": 0.3466, "num_input_tokens_seen": 59240704, "step": 62020 }, { "epoch": 5.059548087119667, "grad_norm": 1.135500192642212, "learning_rate": 2.8829246577652303e-05, "loss": 0.3087, "num_input_tokens_seen": 59245712, "step": 62025 }, { "epoch": 5.0599559507300755, "grad_norm": 1.9421638250350952, "learning_rate": 2.882572925737111e-05, "loss": 0.2534, "num_input_tokens_seen": 59250656, "step": 62030 }, { "epoch": 5.0603638143404845, "grad_norm": 0.5526593327522278, "learning_rate": 2.8822211859543997e-05, "loss": 0.1736, "num_input_tokens_seen": 59255408, "step": 62035 }, { "epoch": 5.0607716779508936, "grad_norm": 1.4964158535003662, "learning_rate": 2.8818694384242283e-05, "loss": 0.2866, "num_input_tokens_seen": 59258752, "step": 62040 }, { "epoch": 5.061179541561302, "grad_norm": 2.1295785903930664, "learning_rate": 2.8815176831537248e-05, "loss": 0.4353, "num_input_tokens_seen": 59263760, "step": 62045 }, { "epoch": 5.061587405171711, "grad_norm": 1.8946921825408936, "learning_rate": 2.881165920150021e-05, "loss": 0.4975, "num_input_tokens_seen": 59268672, "step": 62050 }, { "epoch": 5.06199526878212, "grad_norm": 2.2034120559692383, "learning_rate": 2.8808141494202452e-05, "loss": 0.4134, "num_input_tokens_seen": 59272784, "step": 62055 }, { "epoch": 5.062403132392528, "grad_norm": 2.7092974185943604, "learning_rate": 2.8804623709715288e-05, "loss": 0.2624, "num_input_tokens_seen": 59278112, "step": 62060 }, { "epoch": 5.062810996002937, "grad_norm": 3.7756006717681885, "learning_rate": 2.8801105848110012e-05, "loss": 0.3685, "num_input_tokens_seen": 59283280, "step": 62065 }, { "epoch": 5.063218859613345, "grad_norm": 4.0223002433776855, "learning_rate": 2.8797587909457945e-05, "loss": 0.3513, "num_input_tokens_seen": 59286528, "step": 62070 }, { "epoch": 5.063626723223754, "grad_norm": 0.7630754113197327, "learning_rate": 2.8794069893830384e-05, "loss": 0.3814, "num_input_tokens_seen": 59291312, "step": 62075 }, { "epoch": 5.064034586834163, "grad_norm": 1.0011773109436035, "learning_rate": 2.8790551801298638e-05, "loss": 0.3131, "num_input_tokens_seen": 59296384, "step": 62080 }, { "epoch": 5.064442450444571, "grad_norm": 1.4620532989501953, "learning_rate": 2.8787033631934013e-05, "loss": 0.3114, "num_input_tokens_seen": 59300880, "step": 62085 }, { "epoch": 5.06485031405498, "grad_norm": 1.8324068784713745, "learning_rate": 2.8783515385807825e-05, "loss": 0.3846, "num_input_tokens_seen": 59305376, "step": 62090 }, { "epoch": 5.065258177665389, "grad_norm": 0.6928085088729858, "learning_rate": 2.877999706299139e-05, "loss": 0.2437, "num_input_tokens_seen": 59310368, "step": 62095 }, { "epoch": 5.065666041275797, "grad_norm": 2.957822799682617, "learning_rate": 2.8776478663556017e-05, "loss": 0.2752, "num_input_tokens_seen": 59315504, "step": 62100 }, { "epoch": 5.066073904886206, "grad_norm": 6.3102006912231445, "learning_rate": 2.8772960187573035e-05, "loss": 0.3071, "num_input_tokens_seen": 59320192, "step": 62105 }, { "epoch": 5.066481768496614, "grad_norm": 1.134216070175171, "learning_rate": 2.876944163511375e-05, "loss": 0.2085, "num_input_tokens_seen": 59324512, "step": 62110 }, { "epoch": 5.066889632107023, "grad_norm": 0.8784603476524353, "learning_rate": 2.8765923006249478e-05, "loss": 0.4003, "num_input_tokens_seen": 59329520, "step": 62115 }, { "epoch": 5.067297495717432, "grad_norm": 1.8479059934616089, "learning_rate": 2.8762404301051554e-05, "loss": 0.4454, "num_input_tokens_seen": 59334384, "step": 62120 }, { "epoch": 5.0677053593278405, "grad_norm": 0.7905091047286987, "learning_rate": 2.875888551959129e-05, "loss": 0.3552, "num_input_tokens_seen": 59339344, "step": 62125 }, { "epoch": 5.0681132229382495, "grad_norm": 0.6249197721481323, "learning_rate": 2.875536666194002e-05, "loss": 0.3636, "num_input_tokens_seen": 59344176, "step": 62130 }, { "epoch": 5.0685210865486585, "grad_norm": 0.9337932467460632, "learning_rate": 2.875184772816905e-05, "loss": 0.3039, "num_input_tokens_seen": 59348912, "step": 62135 }, { "epoch": 5.068928950159067, "grad_norm": 2.311391830444336, "learning_rate": 2.8748328718349737e-05, "loss": 0.3884, "num_input_tokens_seen": 59353760, "step": 62140 }, { "epoch": 5.069336813769476, "grad_norm": 1.6902912855148315, "learning_rate": 2.8744809632553388e-05, "loss": 0.2984, "num_input_tokens_seen": 59358928, "step": 62145 }, { "epoch": 5.069744677379884, "grad_norm": 4.558055400848389, "learning_rate": 2.8741290470851336e-05, "loss": 0.2878, "num_input_tokens_seen": 59363712, "step": 62150 }, { "epoch": 5.070152540990293, "grad_norm": 3.316322088241577, "learning_rate": 2.8737771233314916e-05, "loss": 0.2599, "num_input_tokens_seen": 59368624, "step": 62155 }, { "epoch": 5.070560404600702, "grad_norm": 1.1183092594146729, "learning_rate": 2.8734251920015464e-05, "loss": 0.4017, "num_input_tokens_seen": 59373296, "step": 62160 }, { "epoch": 5.07096826821111, "grad_norm": 1.8779785633087158, "learning_rate": 2.8730732531024308e-05, "loss": 0.3543, "num_input_tokens_seen": 59377840, "step": 62165 }, { "epoch": 5.071376131821519, "grad_norm": 2.839439868927002, "learning_rate": 2.8727213066412794e-05, "loss": 0.4162, "num_input_tokens_seen": 59382544, "step": 62170 }, { "epoch": 5.071783995431928, "grad_norm": 0.9660068154335022, "learning_rate": 2.8723693526252254e-05, "loss": 0.329, "num_input_tokens_seen": 59387280, "step": 62175 }, { "epoch": 5.072191859042336, "grad_norm": 3.301236867904663, "learning_rate": 2.8720173910614025e-05, "loss": 0.362, "num_input_tokens_seen": 59392256, "step": 62180 }, { "epoch": 5.072599722652745, "grad_norm": 0.609183669090271, "learning_rate": 2.8716654219569456e-05, "loss": 0.3064, "num_input_tokens_seen": 59397136, "step": 62185 }, { "epoch": 5.073007586263153, "grad_norm": 2.4936327934265137, "learning_rate": 2.871313445318988e-05, "loss": 0.2572, "num_input_tokens_seen": 59402096, "step": 62190 }, { "epoch": 5.073415449873562, "grad_norm": 1.4502644538879395, "learning_rate": 2.8709614611546646e-05, "loss": 0.3507, "num_input_tokens_seen": 59407456, "step": 62195 }, { "epoch": 5.073823313483971, "grad_norm": 1.4044486284255981, "learning_rate": 2.8706094694711107e-05, "loss": 0.4102, "num_input_tokens_seen": 59412400, "step": 62200 }, { "epoch": 5.074231177094379, "grad_norm": 3.410597562789917, "learning_rate": 2.8702574702754597e-05, "loss": 0.2665, "num_input_tokens_seen": 59417632, "step": 62205 }, { "epoch": 5.074639040704788, "grad_norm": 4.43959903717041, "learning_rate": 2.8699054635748478e-05, "loss": 0.3587, "num_input_tokens_seen": 59422352, "step": 62210 }, { "epoch": 5.075046904315197, "grad_norm": 2.6326236724853516, "learning_rate": 2.8695534493764094e-05, "loss": 0.3471, "num_input_tokens_seen": 59427328, "step": 62215 }, { "epoch": 5.0754547679256055, "grad_norm": 1.3001649379730225, "learning_rate": 2.8692014276872787e-05, "loss": 0.3329, "num_input_tokens_seen": 59431552, "step": 62220 }, { "epoch": 5.0758626315360145, "grad_norm": 0.6264640092849731, "learning_rate": 2.8688493985145927e-05, "loss": 0.2884, "num_input_tokens_seen": 59436544, "step": 62225 }, { "epoch": 5.076270495146423, "grad_norm": 1.0764046907424927, "learning_rate": 2.868497361865486e-05, "loss": 0.3436, "num_input_tokens_seen": 59441120, "step": 62230 }, { "epoch": 5.076678358756832, "grad_norm": 0.6469579339027405, "learning_rate": 2.868145317747094e-05, "loss": 0.3551, "num_input_tokens_seen": 59445648, "step": 62235 }, { "epoch": 5.077086222367241, "grad_norm": 2.3617005348205566, "learning_rate": 2.867793266166553e-05, "loss": 0.336, "num_input_tokens_seen": 59450288, "step": 62240 }, { "epoch": 5.077494085977649, "grad_norm": 3.2309699058532715, "learning_rate": 2.8674412071309987e-05, "loss": 0.3413, "num_input_tokens_seen": 59454784, "step": 62245 }, { "epoch": 5.077901949588058, "grad_norm": 2.1712799072265625, "learning_rate": 2.8670891406475674e-05, "loss": 0.3387, "num_input_tokens_seen": 59459024, "step": 62250 }, { "epoch": 5.078309813198467, "grad_norm": 7.315144062042236, "learning_rate": 2.8667370667233952e-05, "loss": 0.4369, "num_input_tokens_seen": 59463424, "step": 62255 }, { "epoch": 5.078717676808875, "grad_norm": 1.2191470861434937, "learning_rate": 2.866384985365618e-05, "loss": 0.4073, "num_input_tokens_seen": 59467904, "step": 62260 }, { "epoch": 5.079125540419284, "grad_norm": 9.89559555053711, "learning_rate": 2.8660328965813733e-05, "loss": 0.3307, "num_input_tokens_seen": 59472736, "step": 62265 }, { "epoch": 5.079533404029693, "grad_norm": 2.8698387145996094, "learning_rate": 2.8656808003777975e-05, "loss": 0.2835, "num_input_tokens_seen": 59477296, "step": 62270 }, { "epoch": 5.079941267640101, "grad_norm": 1.3255261182785034, "learning_rate": 2.8653286967620275e-05, "loss": 0.3541, "num_input_tokens_seen": 59481616, "step": 62275 }, { "epoch": 5.08034913125051, "grad_norm": 3.029057025909424, "learning_rate": 2.8649765857411993e-05, "loss": 0.3645, "num_input_tokens_seen": 59486704, "step": 62280 }, { "epoch": 5.080756994860918, "grad_norm": 2.6739420890808105, "learning_rate": 2.8646244673224515e-05, "loss": 0.414, "num_input_tokens_seen": 59492176, "step": 62285 }, { "epoch": 5.081164858471327, "grad_norm": 1.246601939201355, "learning_rate": 2.8642723415129203e-05, "loss": 0.3562, "num_input_tokens_seen": 59496944, "step": 62290 }, { "epoch": 5.081572722081736, "grad_norm": 1.1811391115188599, "learning_rate": 2.8639202083197437e-05, "loss": 0.2966, "num_input_tokens_seen": 59501264, "step": 62295 }, { "epoch": 5.081980585692144, "grad_norm": 3.80546498298645, "learning_rate": 2.8635680677500587e-05, "loss": 0.3303, "num_input_tokens_seen": 59505920, "step": 62300 }, { "epoch": 5.082388449302553, "grad_norm": 1.4314982891082764, "learning_rate": 2.8632159198110044e-05, "loss": 0.3254, "num_input_tokens_seen": 59510704, "step": 62305 }, { "epoch": 5.082796312912962, "grad_norm": 1.3428806066513062, "learning_rate": 2.8628637645097162e-05, "loss": 0.373, "num_input_tokens_seen": 59515600, "step": 62310 }, { "epoch": 5.08320417652337, "grad_norm": 7.870498180389404, "learning_rate": 2.862511601853335e-05, "loss": 0.3989, "num_input_tokens_seen": 59520160, "step": 62315 }, { "epoch": 5.083612040133779, "grad_norm": 0.6997183561325073, "learning_rate": 2.8621594318489976e-05, "loss": 0.3538, "num_input_tokens_seen": 59523952, "step": 62320 }, { "epoch": 5.0840199037441876, "grad_norm": 1.0200504064559937, "learning_rate": 2.8618072545038417e-05, "loss": 0.4077, "num_input_tokens_seen": 59528512, "step": 62325 }, { "epoch": 5.084427767354597, "grad_norm": 1.61964750289917, "learning_rate": 2.861455069825007e-05, "loss": 0.4179, "num_input_tokens_seen": 59532992, "step": 62330 }, { "epoch": 5.084835630965006, "grad_norm": 2.049570083618164, "learning_rate": 2.861102877819631e-05, "loss": 0.3253, "num_input_tokens_seen": 59537712, "step": 62335 }, { "epoch": 5.085243494575414, "grad_norm": 2.002518653869629, "learning_rate": 2.860750678494854e-05, "loss": 0.3545, "num_input_tokens_seen": 59543360, "step": 62340 }, { "epoch": 5.085651358185823, "grad_norm": 0.9538434743881226, "learning_rate": 2.860398471857813e-05, "loss": 0.3485, "num_input_tokens_seen": 59548656, "step": 62345 }, { "epoch": 5.086059221796232, "grad_norm": 2.164076566696167, "learning_rate": 2.860046257915649e-05, "loss": 0.3674, "num_input_tokens_seen": 59553392, "step": 62350 }, { "epoch": 5.08646708540664, "grad_norm": 1.765303611755371, "learning_rate": 2.8596940366755004e-05, "loss": 0.3737, "num_input_tokens_seen": 59558544, "step": 62355 }, { "epoch": 5.086874949017049, "grad_norm": 0.2687585651874542, "learning_rate": 2.8593418081445063e-05, "loss": 0.3242, "num_input_tokens_seen": 59562480, "step": 62360 }, { "epoch": 5.087282812627457, "grad_norm": 0.4591938257217407, "learning_rate": 2.858989572329806e-05, "loss": 0.3366, "num_input_tokens_seen": 59567248, "step": 62365 }, { "epoch": 5.087690676237866, "grad_norm": 0.6792935729026794, "learning_rate": 2.8586373292385403e-05, "loss": 0.3716, "num_input_tokens_seen": 59571952, "step": 62370 }, { "epoch": 5.088098539848275, "grad_norm": 1.711483359336853, "learning_rate": 2.858285078877848e-05, "loss": 0.3303, "num_input_tokens_seen": 59577232, "step": 62375 }, { "epoch": 5.088506403458683, "grad_norm": 0.811604380607605, "learning_rate": 2.857932821254869e-05, "loss": 0.3708, "num_input_tokens_seen": 59582160, "step": 62380 }, { "epoch": 5.088914267069092, "grad_norm": 1.871502161026001, "learning_rate": 2.8575805563767442e-05, "loss": 0.3309, "num_input_tokens_seen": 59586976, "step": 62385 }, { "epoch": 5.089322130679501, "grad_norm": 1.4729245901107788, "learning_rate": 2.857228284250613e-05, "loss": 0.3723, "num_input_tokens_seen": 59592016, "step": 62390 }, { "epoch": 5.089729994289909, "grad_norm": 2.2207415103912354, "learning_rate": 2.8568760048836173e-05, "loss": 0.3415, "num_input_tokens_seen": 59597296, "step": 62395 }, { "epoch": 5.090137857900318, "grad_norm": 1.963150978088379, "learning_rate": 2.8565237182828958e-05, "loss": 0.3418, "num_input_tokens_seen": 59602528, "step": 62400 }, { "epoch": 5.090545721510727, "grad_norm": 1.167304515838623, "learning_rate": 2.8561714244555904e-05, "loss": 0.3414, "num_input_tokens_seen": 59607584, "step": 62405 }, { "epoch": 5.090953585121135, "grad_norm": 2.2798099517822266, "learning_rate": 2.855819123408841e-05, "loss": 0.3376, "num_input_tokens_seen": 59611856, "step": 62410 }, { "epoch": 5.091361448731544, "grad_norm": 1.1616817712783813, "learning_rate": 2.85546681514979e-05, "loss": 0.3335, "num_input_tokens_seen": 59616800, "step": 62415 }, { "epoch": 5.0917693123419525, "grad_norm": 1.235804796218872, "learning_rate": 2.855114499685577e-05, "loss": 0.3225, "num_input_tokens_seen": 59621104, "step": 62420 }, { "epoch": 5.0921771759523615, "grad_norm": 0.5532292723655701, "learning_rate": 2.854762177023345e-05, "loss": 0.3077, "num_input_tokens_seen": 59626928, "step": 62425 }, { "epoch": 5.0925850395627705, "grad_norm": 1.206352710723877, "learning_rate": 2.854409847170234e-05, "loss": 0.2956, "num_input_tokens_seen": 59631536, "step": 62430 }, { "epoch": 5.092992903173179, "grad_norm": 1.6737136840820312, "learning_rate": 2.8540575101333856e-05, "loss": 0.404, "num_input_tokens_seen": 59636992, "step": 62435 }, { "epoch": 5.093400766783588, "grad_norm": 0.46214061975479126, "learning_rate": 2.853705165919942e-05, "loss": 0.3319, "num_input_tokens_seen": 59642176, "step": 62440 }, { "epoch": 5.093808630393997, "grad_norm": 2.062474489212036, "learning_rate": 2.8533528145370448e-05, "loss": 0.3543, "num_input_tokens_seen": 59646976, "step": 62445 }, { "epoch": 5.094216494004405, "grad_norm": 0.3940490484237671, "learning_rate": 2.8530004559918367e-05, "loss": 0.3096, "num_input_tokens_seen": 59651936, "step": 62450 }, { "epoch": 5.094624357614814, "grad_norm": 1.3452701568603516, "learning_rate": 2.8526480902914588e-05, "loss": 0.3459, "num_input_tokens_seen": 59657008, "step": 62455 }, { "epoch": 5.095032221225222, "grad_norm": 1.0966438055038452, "learning_rate": 2.8522957174430542e-05, "loss": 0.3159, "num_input_tokens_seen": 59661296, "step": 62460 }, { "epoch": 5.095440084835631, "grad_norm": 1.0900245904922485, "learning_rate": 2.8519433374537648e-05, "loss": 0.426, "num_input_tokens_seen": 59666464, "step": 62465 }, { "epoch": 5.09584794844604, "grad_norm": 1.4850236177444458, "learning_rate": 2.8515909503307344e-05, "loss": 0.3374, "num_input_tokens_seen": 59671568, "step": 62470 }, { "epoch": 5.096255812056448, "grad_norm": 1.9095810651779175, "learning_rate": 2.851238556081104e-05, "loss": 0.3814, "num_input_tokens_seen": 59675664, "step": 62475 }, { "epoch": 5.096663675666857, "grad_norm": 2.347015142440796, "learning_rate": 2.8508861547120174e-05, "loss": 0.3471, "num_input_tokens_seen": 59680384, "step": 62480 }, { "epoch": 5.097071539277266, "grad_norm": 8.116655349731445, "learning_rate": 2.8505337462306182e-05, "loss": 0.3951, "num_input_tokens_seen": 59685344, "step": 62485 }, { "epoch": 5.097479402887674, "grad_norm": 0.7593744397163391, "learning_rate": 2.8501813306440482e-05, "loss": 0.3549, "num_input_tokens_seen": 59689584, "step": 62490 }, { "epoch": 5.097887266498083, "grad_norm": 1.506258249282837, "learning_rate": 2.849828907959452e-05, "loss": 0.3436, "num_input_tokens_seen": 59694656, "step": 62495 }, { "epoch": 5.098295130108491, "grad_norm": 2.4123189449310303, "learning_rate": 2.8494764781839723e-05, "loss": 0.3053, "num_input_tokens_seen": 59699616, "step": 62500 }, { "epoch": 5.0987029937189, "grad_norm": 0.9069919586181641, "learning_rate": 2.8491240413247526e-05, "loss": 0.3367, "num_input_tokens_seen": 59704976, "step": 62505 }, { "epoch": 5.099110857329309, "grad_norm": 1.9100302457809448, "learning_rate": 2.8487715973889368e-05, "loss": 0.3286, "num_input_tokens_seen": 59709664, "step": 62510 }, { "epoch": 5.0995187209397175, "grad_norm": 3.409003257751465, "learning_rate": 2.8484191463836695e-05, "loss": 0.3322, "num_input_tokens_seen": 59715072, "step": 62515 }, { "epoch": 5.0999265845501265, "grad_norm": 0.7451040744781494, "learning_rate": 2.8480666883160932e-05, "loss": 0.3718, "num_input_tokens_seen": 59720032, "step": 62520 }, { "epoch": 5.1003344481605355, "grad_norm": 1.8502596616744995, "learning_rate": 2.847714223193354e-05, "loss": 0.2992, "num_input_tokens_seen": 59724240, "step": 62525 }, { "epoch": 5.100742311770944, "grad_norm": 0.7441559433937073, "learning_rate": 2.8473617510225953e-05, "loss": 0.335, "num_input_tokens_seen": 59728592, "step": 62530 }, { "epoch": 5.101150175381353, "grad_norm": 2.0991947650909424, "learning_rate": 2.8470092718109608e-05, "loss": 0.3367, "num_input_tokens_seen": 59733008, "step": 62535 }, { "epoch": 5.101558038991761, "grad_norm": 5.665555953979492, "learning_rate": 2.846656785565597e-05, "loss": 0.3189, "num_input_tokens_seen": 59737456, "step": 62540 }, { "epoch": 5.10196590260217, "grad_norm": 1.2951831817626953, "learning_rate": 2.8463042922936457e-05, "loss": 0.2791, "num_input_tokens_seen": 59742080, "step": 62545 }, { "epoch": 5.102373766212579, "grad_norm": 1.8546051979064941, "learning_rate": 2.845951792002255e-05, "loss": 0.3874, "num_input_tokens_seen": 59746720, "step": 62550 }, { "epoch": 5.102781629822987, "grad_norm": 0.9545558094978333, "learning_rate": 2.845599284698568e-05, "loss": 0.2755, "num_input_tokens_seen": 59751360, "step": 62555 }, { "epoch": 5.103189493433396, "grad_norm": 6.858096599578857, "learning_rate": 2.8452467703897307e-05, "loss": 0.3084, "num_input_tokens_seen": 59756256, "step": 62560 }, { "epoch": 5.103597357043805, "grad_norm": 1.4241530895233154, "learning_rate": 2.8448942490828874e-05, "loss": 0.3933, "num_input_tokens_seen": 59761024, "step": 62565 }, { "epoch": 5.104005220654213, "grad_norm": 4.018624782562256, "learning_rate": 2.8445417207851843e-05, "loss": 0.341, "num_input_tokens_seen": 59765520, "step": 62570 }, { "epoch": 5.104413084264622, "grad_norm": 0.3822086453437805, "learning_rate": 2.844189185503767e-05, "loss": 0.3569, "num_input_tokens_seen": 59769280, "step": 62575 }, { "epoch": 5.10482094787503, "grad_norm": 2.350165605545044, "learning_rate": 2.8438366432457813e-05, "loss": 0.3341, "num_input_tokens_seen": 59774432, "step": 62580 }, { "epoch": 5.105228811485439, "grad_norm": 3.6083099842071533, "learning_rate": 2.8434840940183732e-05, "loss": 0.3364, "num_input_tokens_seen": 59778864, "step": 62585 }, { "epoch": 5.105636675095848, "grad_norm": 1.1491901874542236, "learning_rate": 2.8431315378286884e-05, "loss": 0.3407, "num_input_tokens_seen": 59783408, "step": 62590 }, { "epoch": 5.106044538706256, "grad_norm": 1.1026383638381958, "learning_rate": 2.842778974683873e-05, "loss": 0.3469, "num_input_tokens_seen": 59788144, "step": 62595 }, { "epoch": 5.106452402316665, "grad_norm": 1.232582449913025, "learning_rate": 2.842426404591073e-05, "loss": 0.2995, "num_input_tokens_seen": 59793088, "step": 62600 }, { "epoch": 5.106860265927074, "grad_norm": 2.599256992340088, "learning_rate": 2.8420738275574354e-05, "loss": 0.3253, "num_input_tokens_seen": 59798080, "step": 62605 }, { "epoch": 5.107268129537482, "grad_norm": 0.5603761672973633, "learning_rate": 2.841721243590107e-05, "loss": 0.3369, "num_input_tokens_seen": 59802752, "step": 62610 }, { "epoch": 5.1076759931478914, "grad_norm": 0.677066445350647, "learning_rate": 2.841368652696234e-05, "loss": 0.3009, "num_input_tokens_seen": 59807744, "step": 62615 }, { "epoch": 5.1080838567583005, "grad_norm": 1.7785248756408691, "learning_rate": 2.841016054882964e-05, "loss": 0.3633, "num_input_tokens_seen": 59812464, "step": 62620 }, { "epoch": 5.108491720368709, "grad_norm": 0.46000707149505615, "learning_rate": 2.8406634501574432e-05, "loss": 0.2731, "num_input_tokens_seen": 59817152, "step": 62625 }, { "epoch": 5.108899583979118, "grad_norm": 2.9678657054901123, "learning_rate": 2.840310838526819e-05, "loss": 0.3056, "num_input_tokens_seen": 59822480, "step": 62630 }, { "epoch": 5.109307447589526, "grad_norm": 1.8881261348724365, "learning_rate": 2.8399582199982387e-05, "loss": 0.4052, "num_input_tokens_seen": 59827568, "step": 62635 }, { "epoch": 5.109715311199935, "grad_norm": 0.6573877334594727, "learning_rate": 2.8396055945788496e-05, "loss": 0.3221, "num_input_tokens_seen": 59832096, "step": 62640 }, { "epoch": 5.110123174810344, "grad_norm": 2.4034695625305176, "learning_rate": 2.8392529622757992e-05, "loss": 0.2847, "num_input_tokens_seen": 59838032, "step": 62645 }, { "epoch": 5.110531038420752, "grad_norm": 3.403402805328369, "learning_rate": 2.8389003230962358e-05, "loss": 0.3409, "num_input_tokens_seen": 59842640, "step": 62650 }, { "epoch": 5.110938902031161, "grad_norm": 1.0652261972427368, "learning_rate": 2.8385476770473063e-05, "loss": 0.3976, "num_input_tokens_seen": 59846864, "step": 62655 }, { "epoch": 5.11134676564157, "grad_norm": 2.60957407951355, "learning_rate": 2.8381950241361598e-05, "loss": 0.3437, "num_input_tokens_seen": 59851744, "step": 62660 }, { "epoch": 5.111754629251978, "grad_norm": 8.616374015808105, "learning_rate": 2.8378423643699438e-05, "loss": 0.3293, "num_input_tokens_seen": 59856704, "step": 62665 }, { "epoch": 5.112162492862387, "grad_norm": 0.8334367275238037, "learning_rate": 2.837489697755807e-05, "loss": 0.2964, "num_input_tokens_seen": 59861280, "step": 62670 }, { "epoch": 5.112570356472795, "grad_norm": 4.193596839904785, "learning_rate": 2.837137024300897e-05, "loss": 0.3329, "num_input_tokens_seen": 59865840, "step": 62675 }, { "epoch": 5.112978220083204, "grad_norm": 0.9978124499320984, "learning_rate": 2.8367843440123626e-05, "loss": 0.3349, "num_input_tokens_seen": 59870752, "step": 62680 }, { "epoch": 5.113386083693613, "grad_norm": 6.9880170822143555, "learning_rate": 2.836431656897353e-05, "loss": 0.3323, "num_input_tokens_seen": 59875424, "step": 62685 }, { "epoch": 5.113793947304021, "grad_norm": 2.076430082321167, "learning_rate": 2.836078962963016e-05, "loss": 0.4005, "num_input_tokens_seen": 59880736, "step": 62690 }, { "epoch": 5.11420181091443, "grad_norm": 1.0071102380752563, "learning_rate": 2.8357262622165022e-05, "loss": 0.3399, "num_input_tokens_seen": 59885680, "step": 62695 }, { "epoch": 5.114609674524839, "grad_norm": 1.9286597967147827, "learning_rate": 2.8353735546649597e-05, "loss": 0.3735, "num_input_tokens_seen": 59890928, "step": 62700 }, { "epoch": 5.115017538135247, "grad_norm": 3.984192371368408, "learning_rate": 2.835020840315537e-05, "loss": 0.3982, "num_input_tokens_seen": 59895808, "step": 62705 }, { "epoch": 5.115425401745656, "grad_norm": 1.112377405166626, "learning_rate": 2.834668119175384e-05, "loss": 0.3158, "num_input_tokens_seen": 59901216, "step": 62710 }, { "epoch": 5.1158332653560645, "grad_norm": 4.262672424316406, "learning_rate": 2.8343153912516517e-05, "loss": 0.4168, "num_input_tokens_seen": 59906176, "step": 62715 }, { "epoch": 5.1162411289664735, "grad_norm": 3.264893054962158, "learning_rate": 2.833962656551487e-05, "loss": 0.3315, "num_input_tokens_seen": 59911136, "step": 62720 }, { "epoch": 5.1166489925768825, "grad_norm": 4.132962703704834, "learning_rate": 2.8336099150820422e-05, "loss": 0.349, "num_input_tokens_seen": 59915968, "step": 62725 }, { "epoch": 5.117056856187291, "grad_norm": 3.1335699558258057, "learning_rate": 2.8332571668504664e-05, "loss": 0.3986, "num_input_tokens_seen": 59921248, "step": 62730 }, { "epoch": 5.1174647197977, "grad_norm": 6.966286659240723, "learning_rate": 2.8329044118639087e-05, "loss": 0.4612, "num_input_tokens_seen": 59924896, "step": 62735 }, { "epoch": 5.117872583408109, "grad_norm": 1.8212605714797974, "learning_rate": 2.8325516501295202e-05, "loss": 0.2779, "num_input_tokens_seen": 59929360, "step": 62740 }, { "epoch": 5.118280447018517, "grad_norm": 0.6436547040939331, "learning_rate": 2.832198881654451e-05, "loss": 0.2677, "num_input_tokens_seen": 59933408, "step": 62745 }, { "epoch": 5.118688310628926, "grad_norm": 2.127866744995117, "learning_rate": 2.831846106445852e-05, "loss": 0.345, "num_input_tokens_seen": 59938096, "step": 62750 }, { "epoch": 5.119096174239334, "grad_norm": 4.9898600578308105, "learning_rate": 2.831493324510873e-05, "loss": 0.3246, "num_input_tokens_seen": 59942960, "step": 62755 }, { "epoch": 5.119504037849743, "grad_norm": 0.7478012442588806, "learning_rate": 2.8311405358566656e-05, "loss": 0.2086, "num_input_tokens_seen": 59947792, "step": 62760 }, { "epoch": 5.119911901460152, "grad_norm": 5.863081932067871, "learning_rate": 2.83078774049038e-05, "loss": 0.3963, "num_input_tokens_seen": 59952384, "step": 62765 }, { "epoch": 5.12031976507056, "grad_norm": 0.4770318865776062, "learning_rate": 2.8304349384191675e-05, "loss": 0.2726, "num_input_tokens_seen": 59956992, "step": 62770 }, { "epoch": 5.120727628680969, "grad_norm": 3.065725564956665, "learning_rate": 2.8300821296501795e-05, "loss": 0.2365, "num_input_tokens_seen": 59962032, "step": 62775 }, { "epoch": 5.121135492291378, "grad_norm": 8.73963451385498, "learning_rate": 2.829729314190566e-05, "loss": 0.3375, "num_input_tokens_seen": 59966832, "step": 62780 }, { "epoch": 5.121543355901786, "grad_norm": 3.5271689891815186, "learning_rate": 2.829376492047481e-05, "loss": 0.1939, "num_input_tokens_seen": 59971616, "step": 62785 }, { "epoch": 5.121951219512195, "grad_norm": 8.380963325500488, "learning_rate": 2.829023663228073e-05, "loss": 0.5208, "num_input_tokens_seen": 59975856, "step": 62790 }, { "epoch": 5.122359083122603, "grad_norm": 2.51454496383667, "learning_rate": 2.828670827739496e-05, "loss": 0.479, "num_input_tokens_seen": 59980816, "step": 62795 }, { "epoch": 5.122766946733012, "grad_norm": 1.2506117820739746, "learning_rate": 2.8283179855889002e-05, "loss": 0.3065, "num_input_tokens_seen": 59985296, "step": 62800 }, { "epoch": 5.123174810343421, "grad_norm": 9.50806999206543, "learning_rate": 2.8279651367834393e-05, "loss": 0.4656, "num_input_tokens_seen": 59989616, "step": 62805 }, { "epoch": 5.1235826739538295, "grad_norm": 7.69466495513916, "learning_rate": 2.8276122813302637e-05, "loss": 0.3514, "num_input_tokens_seen": 59994576, "step": 62810 }, { "epoch": 5.1239905375642385, "grad_norm": 9.622153282165527, "learning_rate": 2.8272594192365266e-05, "loss": 0.4084, "num_input_tokens_seen": 59999760, "step": 62815 }, { "epoch": 5.1243984011746475, "grad_norm": 2.980318307876587, "learning_rate": 2.8269065505093794e-05, "loss": 0.33, "num_input_tokens_seen": 60004624, "step": 62820 }, { "epoch": 5.124806264785056, "grad_norm": 6.648176193237305, "learning_rate": 2.826553675155977e-05, "loss": 0.4045, "num_input_tokens_seen": 60009904, "step": 62825 }, { "epoch": 5.125214128395465, "grad_norm": 5.75263786315918, "learning_rate": 2.8262007931834695e-05, "loss": 0.3846, "num_input_tokens_seen": 60014592, "step": 62830 }, { "epoch": 5.125621992005874, "grad_norm": 2.545058012008667, "learning_rate": 2.8258479045990104e-05, "loss": 0.4164, "num_input_tokens_seen": 60020064, "step": 62835 }, { "epoch": 5.126029855616282, "grad_norm": 3.186237096786499, "learning_rate": 2.8254950094097533e-05, "loss": 0.2963, "num_input_tokens_seen": 60025296, "step": 62840 }, { "epoch": 5.126437719226691, "grad_norm": 4.393021106719971, "learning_rate": 2.8251421076228496e-05, "loss": 0.4216, "num_input_tokens_seen": 60029552, "step": 62845 }, { "epoch": 5.126845582837099, "grad_norm": 3.156254291534424, "learning_rate": 2.8247891992454545e-05, "loss": 0.3267, "num_input_tokens_seen": 60033008, "step": 62850 }, { "epoch": 5.127253446447508, "grad_norm": 12.099163055419922, "learning_rate": 2.8244362842847195e-05, "loss": 0.3788, "num_input_tokens_seen": 60037472, "step": 62855 }, { "epoch": 5.127661310057917, "grad_norm": 2.264442205429077, "learning_rate": 2.8240833627478e-05, "loss": 0.3023, "num_input_tokens_seen": 60042064, "step": 62860 }, { "epoch": 5.128069173668325, "grad_norm": 2.4030637741088867, "learning_rate": 2.8237304346418473e-05, "loss": 0.282, "num_input_tokens_seen": 60046752, "step": 62865 }, { "epoch": 5.128477037278734, "grad_norm": 14.443207740783691, "learning_rate": 2.823377499974017e-05, "loss": 0.4569, "num_input_tokens_seen": 60050816, "step": 62870 }, { "epoch": 5.128884900889143, "grad_norm": 2.540529251098633, "learning_rate": 2.823024558751462e-05, "loss": 0.3427, "num_input_tokens_seen": 60055344, "step": 62875 }, { "epoch": 5.129292764499551, "grad_norm": 0.4950350821018219, "learning_rate": 2.8226716109813362e-05, "loss": 0.3142, "num_input_tokens_seen": 60059984, "step": 62880 }, { "epoch": 5.12970062810996, "grad_norm": 2.6341471672058105, "learning_rate": 2.822318656670794e-05, "loss": 0.3405, "num_input_tokens_seen": 60064864, "step": 62885 }, { "epoch": 5.130108491720368, "grad_norm": 6.248004913330078, "learning_rate": 2.8219656958269898e-05, "loss": 0.4368, "num_input_tokens_seen": 60069072, "step": 62890 }, { "epoch": 5.130516355330777, "grad_norm": 10.41618537902832, "learning_rate": 2.821612728457078e-05, "loss": 0.2879, "num_input_tokens_seen": 60073744, "step": 62895 }, { "epoch": 5.130924218941186, "grad_norm": 5.360042572021484, "learning_rate": 2.8212597545682128e-05, "loss": 0.4673, "num_input_tokens_seen": 60077648, "step": 62900 }, { "epoch": 5.1313320825515945, "grad_norm": 1.782402753829956, "learning_rate": 2.8209067741675486e-05, "loss": 0.3568, "num_input_tokens_seen": 60081888, "step": 62905 }, { "epoch": 5.1317399461620035, "grad_norm": 0.8486356139183044, "learning_rate": 2.8205537872622406e-05, "loss": 0.5132, "num_input_tokens_seen": 60086048, "step": 62910 }, { "epoch": 5.1321478097724125, "grad_norm": 2.4593513011932373, "learning_rate": 2.8202007938594432e-05, "loss": 0.4302, "num_input_tokens_seen": 60089216, "step": 62915 }, { "epoch": 5.132555673382821, "grad_norm": 4.629002571105957, "learning_rate": 2.819847793966312e-05, "loss": 0.4339, "num_input_tokens_seen": 60094464, "step": 62920 }, { "epoch": 5.13296353699323, "grad_norm": 9.561020851135254, "learning_rate": 2.8194947875900025e-05, "loss": 0.6415, "num_input_tokens_seen": 60099328, "step": 62925 }, { "epoch": 5.133371400603638, "grad_norm": 3.102423667907715, "learning_rate": 2.8191417747376688e-05, "loss": 0.2895, "num_input_tokens_seen": 60104192, "step": 62930 }, { "epoch": 5.133779264214047, "grad_norm": 4.29079008102417, "learning_rate": 2.8187887554164664e-05, "loss": 0.5266, "num_input_tokens_seen": 60108432, "step": 62935 }, { "epoch": 5.134187127824456, "grad_norm": 5.555455684661865, "learning_rate": 2.8184357296335523e-05, "loss": 0.3056, "num_input_tokens_seen": 60113856, "step": 62940 }, { "epoch": 5.134594991434864, "grad_norm": 5.13003396987915, "learning_rate": 2.8180826973960805e-05, "loss": 0.3341, "num_input_tokens_seen": 60118896, "step": 62945 }, { "epoch": 5.135002855045273, "grad_norm": 2.993776798248291, "learning_rate": 2.8177296587112083e-05, "loss": 0.2987, "num_input_tokens_seen": 60123984, "step": 62950 }, { "epoch": 5.135410718655682, "grad_norm": 3.817890167236328, "learning_rate": 2.8173766135860908e-05, "loss": 0.3024, "num_input_tokens_seen": 60129648, "step": 62955 }, { "epoch": 5.13581858226609, "grad_norm": 32.28988265991211, "learning_rate": 2.8170235620278838e-05, "loss": 0.5206, "num_input_tokens_seen": 60134400, "step": 62960 }, { "epoch": 5.136226445876499, "grad_norm": 1.66667902469635, "learning_rate": 2.8166705040437447e-05, "loss": 0.2169, "num_input_tokens_seen": 60139120, "step": 62965 }, { "epoch": 5.136634309486908, "grad_norm": 3.5349862575531006, "learning_rate": 2.8163174396408286e-05, "loss": 0.4525, "num_input_tokens_seen": 60142720, "step": 62970 }, { "epoch": 5.137042173097316, "grad_norm": 14.785161018371582, "learning_rate": 2.815964368826292e-05, "loss": 0.4774, "num_input_tokens_seen": 60147424, "step": 62975 }, { "epoch": 5.137450036707725, "grad_norm": 4.134587287902832, "learning_rate": 2.8156112916072925e-05, "loss": 0.3805, "num_input_tokens_seen": 60152672, "step": 62980 }, { "epoch": 5.137857900318133, "grad_norm": 2.7411460876464844, "learning_rate": 2.815258207990986e-05, "loss": 0.4855, "num_input_tokens_seen": 60156720, "step": 62985 }, { "epoch": 5.138265763928542, "grad_norm": 3.2010152339935303, "learning_rate": 2.8149051179845292e-05, "loss": 0.5093, "num_input_tokens_seen": 60162320, "step": 62990 }, { "epoch": 5.138673627538951, "grad_norm": 6.888751983642578, "learning_rate": 2.8145520215950804e-05, "loss": 0.422, "num_input_tokens_seen": 60166192, "step": 62995 }, { "epoch": 5.139081491149359, "grad_norm": 6.40943717956543, "learning_rate": 2.8141989188297947e-05, "loss": 0.2539, "num_input_tokens_seen": 60170976, "step": 63000 }, { "epoch": 5.139489354759768, "grad_norm": 1.1234359741210938, "learning_rate": 2.8138458096958313e-05, "loss": 0.2534, "num_input_tokens_seen": 60174880, "step": 63005 }, { "epoch": 5.139897218370177, "grad_norm": 2.5265166759490967, "learning_rate": 2.813492694200346e-05, "loss": 0.5596, "num_input_tokens_seen": 60180208, "step": 63010 }, { "epoch": 5.1403050819805856, "grad_norm": 0.6501914858818054, "learning_rate": 2.813139572350498e-05, "loss": 0.4281, "num_input_tokens_seen": 60184416, "step": 63015 }, { "epoch": 5.140712945590995, "grad_norm": 27.726945877075195, "learning_rate": 2.8127864441534423e-05, "loss": 0.4906, "num_input_tokens_seen": 60189024, "step": 63020 }, { "epoch": 5.141120809201403, "grad_norm": 1.4243777990341187, "learning_rate": 2.8124333096163398e-05, "loss": 0.4102, "num_input_tokens_seen": 60194176, "step": 63025 }, { "epoch": 5.141528672811812, "grad_norm": 2.7006869316101074, "learning_rate": 2.8120801687463466e-05, "loss": 0.3367, "num_input_tokens_seen": 60199312, "step": 63030 }, { "epoch": 5.141936536422221, "grad_norm": 1.182701826095581, "learning_rate": 2.8117270215506202e-05, "loss": 0.3565, "num_input_tokens_seen": 60204480, "step": 63035 }, { "epoch": 5.142344400032629, "grad_norm": 3.042243719100952, "learning_rate": 2.8113738680363205e-05, "loss": 0.3126, "num_input_tokens_seen": 60209536, "step": 63040 }, { "epoch": 5.142752263643038, "grad_norm": 10.74599552154541, "learning_rate": 2.8110207082106044e-05, "loss": 0.382, "num_input_tokens_seen": 60213792, "step": 63045 }, { "epoch": 5.143160127253447, "grad_norm": 5.707936763763428, "learning_rate": 2.8106675420806312e-05, "loss": 0.3534, "num_input_tokens_seen": 60218720, "step": 63050 }, { "epoch": 5.143567990863855, "grad_norm": 2.550056219100952, "learning_rate": 2.810314369653558e-05, "loss": 0.298, "num_input_tokens_seen": 60222576, "step": 63055 }, { "epoch": 5.143975854474264, "grad_norm": 3.8203556537628174, "learning_rate": 2.809961190936545e-05, "loss": 0.3044, "num_input_tokens_seen": 60227552, "step": 63060 }, { "epoch": 5.144383718084672, "grad_norm": 3.313136100769043, "learning_rate": 2.8096080059367498e-05, "loss": 0.3681, "num_input_tokens_seen": 60232560, "step": 63065 }, { "epoch": 5.144791581695081, "grad_norm": 1.357529878616333, "learning_rate": 2.8092548146613323e-05, "loss": 0.3013, "num_input_tokens_seen": 60236912, "step": 63070 }, { "epoch": 5.14519944530549, "grad_norm": 4.215899467468262, "learning_rate": 2.8089016171174515e-05, "loss": 0.338, "num_input_tokens_seen": 60241712, "step": 63075 }, { "epoch": 5.145607308915898, "grad_norm": 2.6824951171875, "learning_rate": 2.8085484133122653e-05, "loss": 0.3506, "num_input_tokens_seen": 60246512, "step": 63080 }, { "epoch": 5.146015172526307, "grad_norm": 3.589226722717285, "learning_rate": 2.8081952032529347e-05, "loss": 0.4001, "num_input_tokens_seen": 60251968, "step": 63085 }, { "epoch": 5.146423036136716, "grad_norm": 3.91137957572937, "learning_rate": 2.8078419869466172e-05, "loss": 0.325, "num_input_tokens_seen": 60256768, "step": 63090 }, { "epoch": 5.146830899747124, "grad_norm": 5.1849493980407715, "learning_rate": 2.8074887644004743e-05, "loss": 0.3608, "num_input_tokens_seen": 60261936, "step": 63095 }, { "epoch": 5.147238763357533, "grad_norm": 1.7290940284729004, "learning_rate": 2.8071355356216644e-05, "loss": 0.3184, "num_input_tokens_seen": 60266816, "step": 63100 }, { "epoch": 5.1476466269679415, "grad_norm": 2.5759942531585693, "learning_rate": 2.8067823006173483e-05, "loss": 0.3954, "num_input_tokens_seen": 60271952, "step": 63105 }, { "epoch": 5.1480544905783505, "grad_norm": 0.6380490660667419, "learning_rate": 2.8064290593946852e-05, "loss": 0.2715, "num_input_tokens_seen": 60276640, "step": 63110 }, { "epoch": 5.1484623541887595, "grad_norm": 2.8242440223693848, "learning_rate": 2.8060758119608356e-05, "loss": 0.2457, "num_input_tokens_seen": 60280992, "step": 63115 }, { "epoch": 5.148870217799168, "grad_norm": 2.992570638656616, "learning_rate": 2.805722558322958e-05, "loss": 0.2646, "num_input_tokens_seen": 60285872, "step": 63120 }, { "epoch": 5.149278081409577, "grad_norm": 3.498326063156128, "learning_rate": 2.8053692984882152e-05, "loss": 0.3959, "num_input_tokens_seen": 60290192, "step": 63125 }, { "epoch": 5.149685945019986, "grad_norm": 2.0307440757751465, "learning_rate": 2.8050160324637664e-05, "loss": 0.4118, "num_input_tokens_seen": 60295504, "step": 63130 }, { "epoch": 5.150093808630394, "grad_norm": 4.299921035766602, "learning_rate": 2.8046627602567716e-05, "loss": 0.366, "num_input_tokens_seen": 60299264, "step": 63135 }, { "epoch": 5.150501672240803, "grad_norm": 1.8402411937713623, "learning_rate": 2.8043094818743927e-05, "loss": 0.3968, "num_input_tokens_seen": 60304496, "step": 63140 }, { "epoch": 5.150909535851211, "grad_norm": 1.9793390035629272, "learning_rate": 2.8039561973237893e-05, "loss": 0.3955, "num_input_tokens_seen": 60309712, "step": 63145 }, { "epoch": 5.15131739946162, "grad_norm": 1.6472221612930298, "learning_rate": 2.8036029066121235e-05, "loss": 0.3213, "num_input_tokens_seen": 60313632, "step": 63150 }, { "epoch": 5.151725263072029, "grad_norm": 2.592992067337036, "learning_rate": 2.8032496097465556e-05, "loss": 0.3094, "num_input_tokens_seen": 60317632, "step": 63155 }, { "epoch": 5.152133126682437, "grad_norm": 2.6462299823760986, "learning_rate": 2.8028963067342468e-05, "loss": 0.338, "num_input_tokens_seen": 60322752, "step": 63160 }, { "epoch": 5.152540990292846, "grad_norm": 1.5685393810272217, "learning_rate": 2.8025429975823586e-05, "loss": 0.2602, "num_input_tokens_seen": 60327360, "step": 63165 }, { "epoch": 5.152948853903255, "grad_norm": 1.3397669792175293, "learning_rate": 2.8021896822980524e-05, "loss": 0.3201, "num_input_tokens_seen": 60332224, "step": 63170 }, { "epoch": 5.153356717513663, "grad_norm": 0.6343717575073242, "learning_rate": 2.8018363608884902e-05, "loss": 0.3184, "num_input_tokens_seen": 60336624, "step": 63175 }, { "epoch": 5.153764581124072, "grad_norm": 3.2758820056915283, "learning_rate": 2.8014830333608334e-05, "loss": 0.3209, "num_input_tokens_seen": 60341680, "step": 63180 }, { "epoch": 5.154172444734481, "grad_norm": 1.9350996017456055, "learning_rate": 2.801129699722243e-05, "loss": 0.3065, "num_input_tokens_seen": 60346256, "step": 63185 }, { "epoch": 5.154580308344889, "grad_norm": 0.7721691727638245, "learning_rate": 2.8007763599798814e-05, "loss": 0.2848, "num_input_tokens_seen": 60350400, "step": 63190 }, { "epoch": 5.154988171955298, "grad_norm": 1.7124876976013184, "learning_rate": 2.8004230141409116e-05, "loss": 0.2747, "num_input_tokens_seen": 60354928, "step": 63195 }, { "epoch": 5.1553960355657065, "grad_norm": 3.379601001739502, "learning_rate": 2.8000696622124937e-05, "loss": 0.3213, "num_input_tokens_seen": 60359184, "step": 63200 }, { "epoch": 5.1558038991761155, "grad_norm": 3.883269786834717, "learning_rate": 2.7997163042017926e-05, "loss": 0.3636, "num_input_tokens_seen": 60364816, "step": 63205 }, { "epoch": 5.1562117627865245, "grad_norm": 5.361817836761475, "learning_rate": 2.799362940115968e-05, "loss": 0.2857, "num_input_tokens_seen": 60368992, "step": 63210 }, { "epoch": 5.156619626396933, "grad_norm": 0.9164261817932129, "learning_rate": 2.7990095699621853e-05, "loss": 0.2808, "num_input_tokens_seen": 60373232, "step": 63215 }, { "epoch": 5.157027490007342, "grad_norm": 3.5439157485961914, "learning_rate": 2.7986561937476046e-05, "loss": 0.3133, "num_input_tokens_seen": 60376880, "step": 63220 }, { "epoch": 5.157435353617751, "grad_norm": 2.741161823272705, "learning_rate": 2.7983028114793903e-05, "loss": 0.3355, "num_input_tokens_seen": 60381680, "step": 63225 }, { "epoch": 5.157843217228159, "grad_norm": 4.964249134063721, "learning_rate": 2.7979494231647047e-05, "loss": 0.2696, "num_input_tokens_seen": 60386432, "step": 63230 }, { "epoch": 5.158251080838568, "grad_norm": 2.4783921241760254, "learning_rate": 2.7975960288107105e-05, "loss": 0.3158, "num_input_tokens_seen": 60390592, "step": 63235 }, { "epoch": 5.158658944448976, "grad_norm": 4.019497871398926, "learning_rate": 2.797242628424572e-05, "loss": 0.3628, "num_input_tokens_seen": 60395488, "step": 63240 }, { "epoch": 5.159066808059385, "grad_norm": 1.213468074798584, "learning_rate": 2.7968892220134512e-05, "loss": 0.3707, "num_input_tokens_seen": 60400000, "step": 63245 }, { "epoch": 5.159474671669794, "grad_norm": 1.4210543632507324, "learning_rate": 2.7965358095845122e-05, "loss": 0.258, "num_input_tokens_seen": 60404720, "step": 63250 }, { "epoch": 5.159882535280202, "grad_norm": 7.517911434173584, "learning_rate": 2.796182391144918e-05, "loss": 0.3938, "num_input_tokens_seen": 60409312, "step": 63255 }, { "epoch": 5.160290398890611, "grad_norm": 1.1117535829544067, "learning_rate": 2.7958289667018327e-05, "loss": 0.3753, "num_input_tokens_seen": 60414208, "step": 63260 }, { "epoch": 5.16069826250102, "grad_norm": 6.522140026092529, "learning_rate": 2.7954755362624202e-05, "loss": 0.4168, "num_input_tokens_seen": 60419344, "step": 63265 }, { "epoch": 5.161106126111428, "grad_norm": 0.6565452814102173, "learning_rate": 2.7951220998338438e-05, "loss": 0.3145, "num_input_tokens_seen": 60424064, "step": 63270 }, { "epoch": 5.161513989721837, "grad_norm": 0.35113760828971863, "learning_rate": 2.794768657423268e-05, "loss": 0.3408, "num_input_tokens_seen": 60429056, "step": 63275 }, { "epoch": 5.161921853332245, "grad_norm": 73.9974136352539, "learning_rate": 2.7944152090378573e-05, "loss": 0.4557, "num_input_tokens_seen": 60434080, "step": 63280 }, { "epoch": 5.162329716942654, "grad_norm": 2.5031332969665527, "learning_rate": 2.794061754684775e-05, "loss": 0.389, "num_input_tokens_seen": 60439216, "step": 63285 }, { "epoch": 5.162737580553063, "grad_norm": 6.732127666473389, "learning_rate": 2.793708294371185e-05, "loss": 0.5072, "num_input_tokens_seen": 60443952, "step": 63290 }, { "epoch": 5.163145444163471, "grad_norm": 3.3896732330322266, "learning_rate": 2.7933548281042544e-05, "loss": 0.378, "num_input_tokens_seen": 60449584, "step": 63295 }, { "epoch": 5.16355330777388, "grad_norm": 5.109633922576904, "learning_rate": 2.7930013558911445e-05, "loss": 0.372, "num_input_tokens_seen": 60454432, "step": 63300 }, { "epoch": 5.1639611713842895, "grad_norm": 4.623714923858643, "learning_rate": 2.7926478777390224e-05, "loss": 0.3506, "num_input_tokens_seen": 60458944, "step": 63305 }, { "epoch": 5.164369034994698, "grad_norm": 4.484335422515869, "learning_rate": 2.792294393655052e-05, "loss": 0.3158, "num_input_tokens_seen": 60463152, "step": 63310 }, { "epoch": 5.164776898605107, "grad_norm": 2.859053373336792, "learning_rate": 2.791940903646399e-05, "loss": 0.3471, "num_input_tokens_seen": 60467376, "step": 63315 }, { "epoch": 5.165184762215515, "grad_norm": 4.403772830963135, "learning_rate": 2.791587407720227e-05, "loss": 0.4835, "num_input_tokens_seen": 60471984, "step": 63320 }, { "epoch": 5.165592625825924, "grad_norm": 1.684280514717102, "learning_rate": 2.7912339058837027e-05, "loss": 0.2675, "num_input_tokens_seen": 60477056, "step": 63325 }, { "epoch": 5.166000489436333, "grad_norm": 3.1104068756103516, "learning_rate": 2.7908803981439913e-05, "loss": 0.2939, "num_input_tokens_seen": 60481968, "step": 63330 }, { "epoch": 5.166408353046741, "grad_norm": 2.5780181884765625, "learning_rate": 2.790526884508257e-05, "loss": 0.373, "num_input_tokens_seen": 60487296, "step": 63335 }, { "epoch": 5.16681621665715, "grad_norm": 4.867471218109131, "learning_rate": 2.7901733649836668e-05, "loss": 0.3569, "num_input_tokens_seen": 60490688, "step": 63340 }, { "epoch": 5.167224080267559, "grad_norm": 5.203207492828369, "learning_rate": 2.789819839577385e-05, "loss": 0.3199, "num_input_tokens_seen": 60495728, "step": 63345 }, { "epoch": 5.167631943877967, "grad_norm": 4.436010837554932, "learning_rate": 2.7894663082965787e-05, "loss": 0.3314, "num_input_tokens_seen": 60499696, "step": 63350 }, { "epoch": 5.168039807488376, "grad_norm": 6.184319496154785, "learning_rate": 2.7891127711484133e-05, "loss": 0.4096, "num_input_tokens_seen": 60503888, "step": 63355 }, { "epoch": 5.168447671098784, "grad_norm": 1.6922165155410767, "learning_rate": 2.7887592281400556e-05, "loss": 0.3478, "num_input_tokens_seen": 60508800, "step": 63360 }, { "epoch": 5.168855534709193, "grad_norm": 3.271174430847168, "learning_rate": 2.7884056792786693e-05, "loss": 0.3812, "num_input_tokens_seen": 60513936, "step": 63365 }, { "epoch": 5.169263398319602, "grad_norm": 1.4904221296310425, "learning_rate": 2.7880521245714242e-05, "loss": 0.3911, "num_input_tokens_seen": 60519152, "step": 63370 }, { "epoch": 5.16967126193001, "grad_norm": 1.2181440591812134, "learning_rate": 2.7876985640254844e-05, "loss": 0.2898, "num_input_tokens_seen": 60524688, "step": 63375 }, { "epoch": 5.170079125540419, "grad_norm": 4.024445056915283, "learning_rate": 2.7873449976480165e-05, "loss": 0.3293, "num_input_tokens_seen": 60528032, "step": 63380 }, { "epoch": 5.170486989150828, "grad_norm": 4.978294849395752, "learning_rate": 2.786991425446188e-05, "loss": 0.3458, "num_input_tokens_seen": 60532864, "step": 63385 }, { "epoch": 5.170894852761236, "grad_norm": 1.6199601888656616, "learning_rate": 2.7866378474271655e-05, "loss": 0.3351, "num_input_tokens_seen": 60537824, "step": 63390 }, { "epoch": 5.171302716371645, "grad_norm": 3.0627129077911377, "learning_rate": 2.7862842635981158e-05, "loss": 0.3644, "num_input_tokens_seen": 60541776, "step": 63395 }, { "epoch": 5.171710579982054, "grad_norm": 0.993030309677124, "learning_rate": 2.7859306739662044e-05, "loss": 0.3122, "num_input_tokens_seen": 60547328, "step": 63400 }, { "epoch": 5.1721184435924625, "grad_norm": 3.1869876384735107, "learning_rate": 2.785577078538601e-05, "loss": 0.3186, "num_input_tokens_seen": 60552016, "step": 63405 }, { "epoch": 5.1725263072028715, "grad_norm": 3.298290967941284, "learning_rate": 2.7852234773224706e-05, "loss": 0.3424, "num_input_tokens_seen": 60556768, "step": 63410 }, { "epoch": 5.17293417081328, "grad_norm": 3.261324644088745, "learning_rate": 2.784869870324982e-05, "loss": 0.4433, "num_input_tokens_seen": 60561584, "step": 63415 }, { "epoch": 5.173342034423689, "grad_norm": 3.526170492172241, "learning_rate": 2.7845162575533024e-05, "loss": 0.3372, "num_input_tokens_seen": 60567056, "step": 63420 }, { "epoch": 5.173749898034098, "grad_norm": 4.602758884429932, "learning_rate": 2.784162639014599e-05, "loss": 0.3856, "num_input_tokens_seen": 60571504, "step": 63425 }, { "epoch": 5.174157761644506, "grad_norm": 2.9314255714416504, "learning_rate": 2.78380901471604e-05, "loss": 0.3552, "num_input_tokens_seen": 60576368, "step": 63430 }, { "epoch": 5.174565625254915, "grad_norm": 1.27556312084198, "learning_rate": 2.783455384664792e-05, "loss": 0.4716, "num_input_tokens_seen": 60580608, "step": 63435 }, { "epoch": 5.174973488865324, "grad_norm": 2.0050253868103027, "learning_rate": 2.7831017488680244e-05, "loss": 0.3378, "num_input_tokens_seen": 60584208, "step": 63440 }, { "epoch": 5.175381352475732, "grad_norm": 0.9627823233604431, "learning_rate": 2.7827481073329044e-05, "loss": 0.3241, "num_input_tokens_seen": 60589072, "step": 63445 }, { "epoch": 5.175789216086141, "grad_norm": 2.4863393306732178, "learning_rate": 2.7823944600666008e-05, "loss": 0.2942, "num_input_tokens_seen": 60594320, "step": 63450 }, { "epoch": 5.176197079696549, "grad_norm": 0.9223365187644958, "learning_rate": 2.782040807076281e-05, "loss": 0.389, "num_input_tokens_seen": 60598752, "step": 63455 }, { "epoch": 5.176604943306958, "grad_norm": 1.0574215650558472, "learning_rate": 2.781687148369115e-05, "loss": 0.4024, "num_input_tokens_seen": 60603392, "step": 63460 }, { "epoch": 5.177012806917367, "grad_norm": 5.149667739868164, "learning_rate": 2.7813334839522685e-05, "loss": 0.3515, "num_input_tokens_seen": 60608304, "step": 63465 }, { "epoch": 5.177420670527775, "grad_norm": 4.383205413818359, "learning_rate": 2.780979813832913e-05, "loss": 0.3985, "num_input_tokens_seen": 60612928, "step": 63470 }, { "epoch": 5.177828534138184, "grad_norm": 3.3084192276000977, "learning_rate": 2.780626138018215e-05, "loss": 0.3041, "num_input_tokens_seen": 60618160, "step": 63475 }, { "epoch": 5.178236397748593, "grad_norm": 1.3768072128295898, "learning_rate": 2.780272456515346e-05, "loss": 0.3617, "num_input_tokens_seen": 60622112, "step": 63480 }, { "epoch": 5.178644261359001, "grad_norm": 2.90610408782959, "learning_rate": 2.7799187693314727e-05, "loss": 0.4184, "num_input_tokens_seen": 60626560, "step": 63485 }, { "epoch": 5.17905212496941, "grad_norm": 1.0398603677749634, "learning_rate": 2.779565076473764e-05, "loss": 0.3359, "num_input_tokens_seen": 60631248, "step": 63490 }, { "epoch": 5.1794599885798185, "grad_norm": 0.8710589408874512, "learning_rate": 2.779211377949391e-05, "loss": 0.4064, "num_input_tokens_seen": 60635120, "step": 63495 }, { "epoch": 5.1798678521902275, "grad_norm": 4.010515213012695, "learning_rate": 2.7788576737655215e-05, "loss": 0.3159, "num_input_tokens_seen": 60639456, "step": 63500 }, { "epoch": 5.1802757158006365, "grad_norm": 2.1780951023101807, "learning_rate": 2.7785039639293257e-05, "loss": 0.3581, "num_input_tokens_seen": 60642896, "step": 63505 }, { "epoch": 5.180683579411045, "grad_norm": 2.646090269088745, "learning_rate": 2.7781502484479726e-05, "loss": 0.3771, "num_input_tokens_seen": 60647456, "step": 63510 }, { "epoch": 5.181091443021454, "grad_norm": 1.3359096050262451, "learning_rate": 2.7777965273286325e-05, "loss": 0.3291, "num_input_tokens_seen": 60652048, "step": 63515 }, { "epoch": 5.181499306631863, "grad_norm": 0.482844740152359, "learning_rate": 2.7774428005784753e-05, "loss": 0.3082, "num_input_tokens_seen": 60656464, "step": 63520 }, { "epoch": 5.181907170242271, "grad_norm": 1.6040165424346924, "learning_rate": 2.7770890682046702e-05, "loss": 0.4108, "num_input_tokens_seen": 60660992, "step": 63525 }, { "epoch": 5.18231503385268, "grad_norm": 1.6481468677520752, "learning_rate": 2.776735330214387e-05, "loss": 0.3309, "num_input_tokens_seen": 60664992, "step": 63530 }, { "epoch": 5.182722897463089, "grad_norm": 1.311539649963379, "learning_rate": 2.7763815866147965e-05, "loss": 0.3049, "num_input_tokens_seen": 60669648, "step": 63535 }, { "epoch": 5.183130761073497, "grad_norm": 3.1963605880737305, "learning_rate": 2.7760278374130688e-05, "loss": 0.3258, "num_input_tokens_seen": 60673984, "step": 63540 }, { "epoch": 5.183538624683906, "grad_norm": 2.230292558670044, "learning_rate": 2.7756740826163737e-05, "loss": 0.3493, "num_input_tokens_seen": 60678992, "step": 63545 }, { "epoch": 5.183946488294314, "grad_norm": 2.1978299617767334, "learning_rate": 2.7753203222318825e-05, "loss": 0.3073, "num_input_tokens_seen": 60683424, "step": 63550 }, { "epoch": 5.184354351904723, "grad_norm": 1.0568779706954956, "learning_rate": 2.774966556266765e-05, "loss": 0.3034, "num_input_tokens_seen": 60688208, "step": 63555 }, { "epoch": 5.184762215515132, "grad_norm": 3.00005841255188, "learning_rate": 2.7746127847281923e-05, "loss": 0.3115, "num_input_tokens_seen": 60693536, "step": 63560 }, { "epoch": 5.18517007912554, "grad_norm": 3.126417875289917, "learning_rate": 2.774259007623335e-05, "loss": 0.331, "num_input_tokens_seen": 60697968, "step": 63565 }, { "epoch": 5.185577942735949, "grad_norm": 2.4895949363708496, "learning_rate": 2.7739052249593646e-05, "loss": 0.3742, "num_input_tokens_seen": 60702848, "step": 63570 }, { "epoch": 5.185985806346358, "grad_norm": 1.958740234375, "learning_rate": 2.7735514367434508e-05, "loss": 0.2872, "num_input_tokens_seen": 60708304, "step": 63575 }, { "epoch": 5.186393669956766, "grad_norm": 0.9322640895843506, "learning_rate": 2.7731976429827668e-05, "loss": 0.3987, "num_input_tokens_seen": 60713792, "step": 63580 }, { "epoch": 5.186801533567175, "grad_norm": 0.8897093534469604, "learning_rate": 2.7728438436844823e-05, "loss": 0.2006, "num_input_tokens_seen": 60719488, "step": 63585 }, { "epoch": 5.1872093971775834, "grad_norm": 1.2754591703414917, "learning_rate": 2.772490038855769e-05, "loss": 0.3406, "num_input_tokens_seen": 60724880, "step": 63590 }, { "epoch": 5.1876172607879925, "grad_norm": 2.2004005908966064, "learning_rate": 2.7721362285037982e-05, "loss": 0.3012, "num_input_tokens_seen": 60729584, "step": 63595 }, { "epoch": 5.1880251243984015, "grad_norm": 1.9801667928695679, "learning_rate": 2.7717824126357417e-05, "loss": 0.4226, "num_input_tokens_seen": 60734112, "step": 63600 }, { "epoch": 5.18843298800881, "grad_norm": 6.384108543395996, "learning_rate": 2.7714285912587713e-05, "loss": 0.2704, "num_input_tokens_seen": 60739424, "step": 63605 }, { "epoch": 5.188840851619219, "grad_norm": 4.029847621917725, "learning_rate": 2.7710747643800582e-05, "loss": 0.2615, "num_input_tokens_seen": 60744240, "step": 63610 }, { "epoch": 5.189248715229628, "grad_norm": 3.3409311771392822, "learning_rate": 2.7707209320067756e-05, "loss": 0.3225, "num_input_tokens_seen": 60748512, "step": 63615 }, { "epoch": 5.189656578840036, "grad_norm": 0.6575065851211548, "learning_rate": 2.770367094146094e-05, "loss": 0.2821, "num_input_tokens_seen": 60753184, "step": 63620 }, { "epoch": 5.190064442450445, "grad_norm": 1.4409301280975342, "learning_rate": 2.770013250805187e-05, "loss": 0.3382, "num_input_tokens_seen": 60757392, "step": 63625 }, { "epoch": 5.190472306060853, "grad_norm": 4.435014724731445, "learning_rate": 2.769659401991226e-05, "loss": 0.3101, "num_input_tokens_seen": 60762288, "step": 63630 }, { "epoch": 5.190880169671262, "grad_norm": 3.2517952919006348, "learning_rate": 2.7693055477113826e-05, "loss": 0.3607, "num_input_tokens_seen": 60766896, "step": 63635 }, { "epoch": 5.191288033281671, "grad_norm": 3.3058605194091797, "learning_rate": 2.7689516879728318e-05, "loss": 0.3925, "num_input_tokens_seen": 60771584, "step": 63640 }, { "epoch": 5.191695896892079, "grad_norm": 2.692941188812256, "learning_rate": 2.768597822782743e-05, "loss": 0.3728, "num_input_tokens_seen": 60775952, "step": 63645 }, { "epoch": 5.192103760502488, "grad_norm": 2.094304323196411, "learning_rate": 2.7682439521482916e-05, "loss": 0.2495, "num_input_tokens_seen": 60780384, "step": 63650 }, { "epoch": 5.192511624112897, "grad_norm": 4.880365371704102, "learning_rate": 2.767890076076649e-05, "loss": 0.3893, "num_input_tokens_seen": 60785680, "step": 63655 }, { "epoch": 5.192919487723305, "grad_norm": 0.700549840927124, "learning_rate": 2.7675361945749883e-05, "loss": 0.3048, "num_input_tokens_seen": 60790656, "step": 63660 }, { "epoch": 5.193327351333714, "grad_norm": 2.3972809314727783, "learning_rate": 2.767182307650482e-05, "loss": 0.4203, "num_input_tokens_seen": 60795264, "step": 63665 }, { "epoch": 5.193735214944122, "grad_norm": 0.64823979139328, "learning_rate": 2.7668284153103045e-05, "loss": 0.2634, "num_input_tokens_seen": 60800336, "step": 63670 }, { "epoch": 5.194143078554531, "grad_norm": 0.6718860268592834, "learning_rate": 2.766474517561628e-05, "loss": 0.4212, "num_input_tokens_seen": 60804736, "step": 63675 }, { "epoch": 5.19455094216494, "grad_norm": 2.479780673980713, "learning_rate": 2.766120614411627e-05, "loss": 0.2928, "num_input_tokens_seen": 60810256, "step": 63680 }, { "epoch": 5.194958805775348, "grad_norm": 2.060123920440674, "learning_rate": 2.7657667058674742e-05, "loss": 0.3379, "num_input_tokens_seen": 60814624, "step": 63685 }, { "epoch": 5.195366669385757, "grad_norm": 1.7154347896575928, "learning_rate": 2.7654127919363422e-05, "loss": 0.3476, "num_input_tokens_seen": 60819296, "step": 63690 }, { "epoch": 5.195774532996166, "grad_norm": 0.5592583417892456, "learning_rate": 2.765058872625407e-05, "loss": 0.291, "num_input_tokens_seen": 60822832, "step": 63695 }, { "epoch": 5.1961823966065745, "grad_norm": 0.7709327340126038, "learning_rate": 2.7647049479418397e-05, "loss": 0.2592, "num_input_tokens_seen": 60828656, "step": 63700 }, { "epoch": 5.196590260216984, "grad_norm": 5.971943378448486, "learning_rate": 2.7643510178928163e-05, "loss": 0.3162, "num_input_tokens_seen": 60833008, "step": 63705 }, { "epoch": 5.196998123827392, "grad_norm": 3.319561719894409, "learning_rate": 2.7639970824855098e-05, "loss": 0.3886, "num_input_tokens_seen": 60837520, "step": 63710 }, { "epoch": 5.197405987437801, "grad_norm": 1.1505732536315918, "learning_rate": 2.7636431417270947e-05, "loss": 0.3549, "num_input_tokens_seen": 60842000, "step": 63715 }, { "epoch": 5.19781385104821, "grad_norm": 2.6732945442199707, "learning_rate": 2.7632891956247457e-05, "loss": 0.3342, "num_input_tokens_seen": 60846000, "step": 63720 }, { "epoch": 5.198221714658618, "grad_norm": 3.1588425636291504, "learning_rate": 2.762935244185636e-05, "loss": 0.4364, "num_input_tokens_seen": 60850784, "step": 63725 }, { "epoch": 5.198629578269027, "grad_norm": 4.318993091583252, "learning_rate": 2.762581287416941e-05, "loss": 0.4104, "num_input_tokens_seen": 60855216, "step": 63730 }, { "epoch": 5.199037441879436, "grad_norm": 1.7031670808792114, "learning_rate": 2.7622273253258346e-05, "loss": 0.3585, "num_input_tokens_seen": 60860640, "step": 63735 }, { "epoch": 5.199445305489844, "grad_norm": 2.4780452251434326, "learning_rate": 2.7618733579194923e-05, "loss": 0.297, "num_input_tokens_seen": 60865584, "step": 63740 }, { "epoch": 5.199853169100253, "grad_norm": 4.302734375, "learning_rate": 2.7615193852050876e-05, "loss": 0.3502, "num_input_tokens_seen": 60869680, "step": 63745 }, { "epoch": 5.200261032710662, "grad_norm": 1.5161638259887695, "learning_rate": 2.7611654071897962e-05, "loss": 0.3667, "num_input_tokens_seen": 60875376, "step": 63750 }, { "epoch": 5.20066889632107, "grad_norm": 2.2554221153259277, "learning_rate": 2.760811423880793e-05, "loss": 0.3218, "num_input_tokens_seen": 60879808, "step": 63755 }, { "epoch": 5.201076759931479, "grad_norm": 1.9500770568847656, "learning_rate": 2.7604574352852535e-05, "loss": 0.3971, "num_input_tokens_seen": 60884672, "step": 63760 }, { "epoch": 5.201484623541887, "grad_norm": 3.185849666595459, "learning_rate": 2.7601034414103517e-05, "loss": 0.333, "num_input_tokens_seen": 60888352, "step": 63765 }, { "epoch": 5.201892487152296, "grad_norm": 0.9471885561943054, "learning_rate": 2.759749442263264e-05, "loss": 0.3644, "num_input_tokens_seen": 60892992, "step": 63770 }, { "epoch": 5.202300350762705, "grad_norm": 1.7293695211410522, "learning_rate": 2.7593954378511656e-05, "loss": 0.3255, "num_input_tokens_seen": 60897968, "step": 63775 }, { "epoch": 5.202708214373113, "grad_norm": 2.4991464614868164, "learning_rate": 2.759041428181232e-05, "loss": 0.2798, "num_input_tokens_seen": 60902400, "step": 63780 }, { "epoch": 5.203116077983522, "grad_norm": 3.2802865505218506, "learning_rate": 2.758687413260639e-05, "loss": 0.4131, "num_input_tokens_seen": 60906800, "step": 63785 }, { "epoch": 5.203523941593931, "grad_norm": 1.625633955001831, "learning_rate": 2.758333393096561e-05, "loss": 0.4295, "num_input_tokens_seen": 60911440, "step": 63790 }, { "epoch": 5.2039318052043395, "grad_norm": 1.593650460243225, "learning_rate": 2.7579793676961756e-05, "loss": 0.4326, "num_input_tokens_seen": 60915232, "step": 63795 }, { "epoch": 5.2043396688147485, "grad_norm": 1.5277175903320312, "learning_rate": 2.757625337066658e-05, "loss": 0.3302, "num_input_tokens_seen": 60920272, "step": 63800 }, { "epoch": 5.204747532425157, "grad_norm": 1.5718470811843872, "learning_rate": 2.7572713012151853e-05, "loss": 0.3495, "num_input_tokens_seen": 60924928, "step": 63805 }, { "epoch": 5.205155396035566, "grad_norm": 1.8639521598815918, "learning_rate": 2.7569172601489314e-05, "loss": 0.3323, "num_input_tokens_seen": 60930256, "step": 63810 }, { "epoch": 5.205563259645975, "grad_norm": 5.1269121170043945, "learning_rate": 2.7565632138750743e-05, "loss": 0.3849, "num_input_tokens_seen": 60935872, "step": 63815 }, { "epoch": 5.205971123256383, "grad_norm": 2.034088134765625, "learning_rate": 2.7562091624007892e-05, "loss": 0.3533, "num_input_tokens_seen": 60940608, "step": 63820 }, { "epoch": 5.206378986866792, "grad_norm": 3.5551769733428955, "learning_rate": 2.7558551057332533e-05, "loss": 0.398, "num_input_tokens_seen": 60945824, "step": 63825 }, { "epoch": 5.206786850477201, "grad_norm": 2.024786949157715, "learning_rate": 2.7555010438796437e-05, "loss": 0.3598, "num_input_tokens_seen": 60950096, "step": 63830 }, { "epoch": 5.207194714087609, "grad_norm": 1.2311210632324219, "learning_rate": 2.755146976847136e-05, "loss": 0.3475, "num_input_tokens_seen": 60954752, "step": 63835 }, { "epoch": 5.207602577698018, "grad_norm": 1.9888851642608643, "learning_rate": 2.754792904642908e-05, "loss": 0.3432, "num_input_tokens_seen": 60959072, "step": 63840 }, { "epoch": 5.208010441308426, "grad_norm": 1.1753616333007812, "learning_rate": 2.7544388272741357e-05, "loss": 0.3386, "num_input_tokens_seen": 60963904, "step": 63845 }, { "epoch": 5.208418304918835, "grad_norm": 1.4039300680160522, "learning_rate": 2.754084744747997e-05, "loss": 0.3218, "num_input_tokens_seen": 60969632, "step": 63850 }, { "epoch": 5.208826168529244, "grad_norm": 0.7351853251457214, "learning_rate": 2.7537306570716676e-05, "loss": 0.3264, "num_input_tokens_seen": 60975280, "step": 63855 }, { "epoch": 5.209234032139652, "grad_norm": 4.672470569610596, "learning_rate": 2.7533765642523264e-05, "loss": 0.3808, "num_input_tokens_seen": 60979856, "step": 63860 }, { "epoch": 5.209641895750061, "grad_norm": 1.3742845058441162, "learning_rate": 2.75302246629715e-05, "loss": 0.2946, "num_input_tokens_seen": 60984560, "step": 63865 }, { "epoch": 5.21004975936047, "grad_norm": 0.7446562051773071, "learning_rate": 2.7526683632133155e-05, "loss": 0.3328, "num_input_tokens_seen": 60989808, "step": 63870 }, { "epoch": 5.210457622970878, "grad_norm": 0.5203513503074646, "learning_rate": 2.752314255008e-05, "loss": 0.3791, "num_input_tokens_seen": 60994400, "step": 63875 }, { "epoch": 5.210865486581287, "grad_norm": 0.9664270877838135, "learning_rate": 2.751960141688383e-05, "loss": 0.3641, "num_input_tokens_seen": 60998976, "step": 63880 }, { "epoch": 5.211273350191696, "grad_norm": 0.711134135723114, "learning_rate": 2.7516060232616402e-05, "loss": 0.3389, "num_input_tokens_seen": 61004160, "step": 63885 }, { "epoch": 5.2116812138021045, "grad_norm": 0.9118515849113464, "learning_rate": 2.75125189973495e-05, "loss": 0.3654, "num_input_tokens_seen": 61008624, "step": 63890 }, { "epoch": 5.2120890774125135, "grad_norm": 1.6941790580749512, "learning_rate": 2.750897771115491e-05, "loss": 0.4215, "num_input_tokens_seen": 61013760, "step": 63895 }, { "epoch": 5.212496941022922, "grad_norm": 2.104569673538208, "learning_rate": 2.7505436374104405e-05, "loss": 0.3085, "num_input_tokens_seen": 61018784, "step": 63900 }, { "epoch": 5.212904804633331, "grad_norm": 2.099684238433838, "learning_rate": 2.750189498626977e-05, "loss": 0.38, "num_input_tokens_seen": 61023888, "step": 63905 }, { "epoch": 5.21331266824374, "grad_norm": 1.1612070798873901, "learning_rate": 2.749835354772279e-05, "loss": 0.3423, "num_input_tokens_seen": 61027680, "step": 63910 }, { "epoch": 5.213720531854148, "grad_norm": 0.9875631332397461, "learning_rate": 2.749481205853524e-05, "loss": 0.3272, "num_input_tokens_seen": 61032064, "step": 63915 }, { "epoch": 5.214128395464557, "grad_norm": 0.9869650602340698, "learning_rate": 2.7491270518778912e-05, "loss": 0.3382, "num_input_tokens_seen": 61036864, "step": 63920 }, { "epoch": 5.214536259074965, "grad_norm": 5.140778064727783, "learning_rate": 2.7487728928525592e-05, "loss": 0.326, "num_input_tokens_seen": 61041728, "step": 63925 }, { "epoch": 5.214944122685374, "grad_norm": 0.975005567073822, "learning_rate": 2.7484187287847062e-05, "loss": 0.3392, "num_input_tokens_seen": 61046288, "step": 63930 }, { "epoch": 5.215351986295783, "grad_norm": 1.380165696144104, "learning_rate": 2.7480645596815114e-05, "loss": 0.3477, "num_input_tokens_seen": 61051648, "step": 63935 }, { "epoch": 5.215759849906191, "grad_norm": 2.483889579772949, "learning_rate": 2.7477103855501528e-05, "loss": 0.3723, "num_input_tokens_seen": 61056528, "step": 63940 }, { "epoch": 5.2161677135166, "grad_norm": 0.8747181296348572, "learning_rate": 2.7473562063978103e-05, "loss": 0.3353, "num_input_tokens_seen": 61060944, "step": 63945 }, { "epoch": 5.216575577127009, "grad_norm": 1.9444935321807861, "learning_rate": 2.747002022231663e-05, "loss": 0.3099, "num_input_tokens_seen": 61066352, "step": 63950 }, { "epoch": 5.216983440737417, "grad_norm": 2.3731143474578857, "learning_rate": 2.7466478330588884e-05, "loss": 0.3034, "num_input_tokens_seen": 61071120, "step": 63955 }, { "epoch": 5.217391304347826, "grad_norm": 0.7388631105422974, "learning_rate": 2.7462936388866683e-05, "loss": 0.3449, "num_input_tokens_seen": 61075840, "step": 63960 }, { "epoch": 5.217799167958235, "grad_norm": 0.8918030858039856, "learning_rate": 2.74593943972218e-05, "loss": 0.3677, "num_input_tokens_seen": 61079280, "step": 63965 }, { "epoch": 5.218207031568643, "grad_norm": 4.340728759765625, "learning_rate": 2.7455852355726043e-05, "loss": 0.403, "num_input_tokens_seen": 61084000, "step": 63970 }, { "epoch": 5.218614895179052, "grad_norm": 2.684330940246582, "learning_rate": 2.7452310264451193e-05, "loss": 0.2742, "num_input_tokens_seen": 61088240, "step": 63975 }, { "epoch": 5.21902275878946, "grad_norm": 4.695930004119873, "learning_rate": 2.7448768123469065e-05, "loss": 0.3559, "num_input_tokens_seen": 61093280, "step": 63980 }, { "epoch": 5.219430622399869, "grad_norm": 1.3262087106704712, "learning_rate": 2.7445225932851447e-05, "loss": 0.3355, "num_input_tokens_seen": 61097456, "step": 63985 }, { "epoch": 5.2198384860102784, "grad_norm": 1.9656060934066772, "learning_rate": 2.7441683692670133e-05, "loss": 0.361, "num_input_tokens_seen": 61102880, "step": 63990 }, { "epoch": 5.220246349620687, "grad_norm": 1.3997251987457275, "learning_rate": 2.7438141402996937e-05, "loss": 0.4155, "num_input_tokens_seen": 61107744, "step": 63995 }, { "epoch": 5.220654213231096, "grad_norm": 2.854567766189575, "learning_rate": 2.743459906390365e-05, "loss": 0.3904, "num_input_tokens_seen": 61112352, "step": 64000 }, { "epoch": 5.221062076841505, "grad_norm": 1.6759910583496094, "learning_rate": 2.743105667546207e-05, "loss": 0.2775, "num_input_tokens_seen": 61117024, "step": 64005 }, { "epoch": 5.221469940451913, "grad_norm": 2.000000238418579, "learning_rate": 2.7427514237744e-05, "loss": 0.3549, "num_input_tokens_seen": 61122144, "step": 64010 }, { "epoch": 5.221877804062322, "grad_norm": 0.9160823225975037, "learning_rate": 2.7423971750821254e-05, "loss": 0.2889, "num_input_tokens_seen": 61127760, "step": 64015 }, { "epoch": 5.22228566767273, "grad_norm": 2.300999164581299, "learning_rate": 2.742042921476563e-05, "loss": 0.2957, "num_input_tokens_seen": 61132864, "step": 64020 }, { "epoch": 5.222693531283139, "grad_norm": 2.5579841136932373, "learning_rate": 2.7416886629648935e-05, "loss": 0.3363, "num_input_tokens_seen": 61137024, "step": 64025 }, { "epoch": 5.223101394893548, "grad_norm": 0.9956862926483154, "learning_rate": 2.7413343995542968e-05, "loss": 0.3526, "num_input_tokens_seen": 61142000, "step": 64030 }, { "epoch": 5.223509258503956, "grad_norm": 2.628431797027588, "learning_rate": 2.740980131251955e-05, "loss": 0.3247, "num_input_tokens_seen": 61147568, "step": 64035 }, { "epoch": 5.223917122114365, "grad_norm": 2.9951512813568115, "learning_rate": 2.7406258580650485e-05, "loss": 0.4193, "num_input_tokens_seen": 61152096, "step": 64040 }, { "epoch": 5.224324985724774, "grad_norm": 2.5486345291137695, "learning_rate": 2.7402715800007578e-05, "loss": 0.3485, "num_input_tokens_seen": 61157632, "step": 64045 }, { "epoch": 5.224732849335182, "grad_norm": 1.6227920055389404, "learning_rate": 2.7399172970662644e-05, "loss": 0.3422, "num_input_tokens_seen": 61162240, "step": 64050 }, { "epoch": 5.225140712945591, "grad_norm": 3.3955471515655518, "learning_rate": 2.739563009268749e-05, "loss": 0.3434, "num_input_tokens_seen": 61167024, "step": 64055 }, { "epoch": 5.225548576555999, "grad_norm": 1.8511186838150024, "learning_rate": 2.7392087166153935e-05, "loss": 0.3336, "num_input_tokens_seen": 61171488, "step": 64060 }, { "epoch": 5.225956440166408, "grad_norm": 0.8545489311218262, "learning_rate": 2.7388544191133792e-05, "loss": 0.3622, "num_input_tokens_seen": 61176144, "step": 64065 }, { "epoch": 5.226364303776817, "grad_norm": 2.9576215744018555, "learning_rate": 2.7385001167698875e-05, "loss": 0.36, "num_input_tokens_seen": 61181760, "step": 64070 }, { "epoch": 5.226772167387225, "grad_norm": 5.258298397064209, "learning_rate": 2.7381458095920986e-05, "loss": 0.3618, "num_input_tokens_seen": 61187024, "step": 64075 }, { "epoch": 5.227180030997634, "grad_norm": 2.222630500793457, "learning_rate": 2.7377914975871967e-05, "loss": 0.3359, "num_input_tokens_seen": 61191472, "step": 64080 }, { "epoch": 5.227587894608043, "grad_norm": 1.3954519033432007, "learning_rate": 2.7374371807623615e-05, "loss": 0.2959, "num_input_tokens_seen": 61195888, "step": 64085 }, { "epoch": 5.2279957582184515, "grad_norm": 1.7768903970718384, "learning_rate": 2.7370828591247756e-05, "loss": 0.3357, "num_input_tokens_seen": 61200784, "step": 64090 }, { "epoch": 5.2284036218288605, "grad_norm": 4.434187889099121, "learning_rate": 2.7367285326816204e-05, "loss": 0.3156, "num_input_tokens_seen": 61205712, "step": 64095 }, { "epoch": 5.2288114854392695, "grad_norm": 1.9386364221572876, "learning_rate": 2.736374201440079e-05, "loss": 0.2672, "num_input_tokens_seen": 61210096, "step": 64100 }, { "epoch": 5.229219349049678, "grad_norm": 2.072727680206299, "learning_rate": 2.7360198654073333e-05, "loss": 0.3402, "num_input_tokens_seen": 61215088, "step": 64105 }, { "epoch": 5.229627212660087, "grad_norm": 1.5912268161773682, "learning_rate": 2.7356655245905644e-05, "loss": 0.3926, "num_input_tokens_seen": 61219728, "step": 64110 }, { "epoch": 5.230035076270495, "grad_norm": 2.1219987869262695, "learning_rate": 2.735311178996956e-05, "loss": 0.3197, "num_input_tokens_seen": 61225072, "step": 64115 }, { "epoch": 5.230442939880904, "grad_norm": 1.9500881433486938, "learning_rate": 2.734956828633689e-05, "loss": 0.2502, "num_input_tokens_seen": 61230336, "step": 64120 }, { "epoch": 5.230850803491313, "grad_norm": 0.7451227307319641, "learning_rate": 2.7346024735079486e-05, "loss": 0.2588, "num_input_tokens_seen": 61235744, "step": 64125 }, { "epoch": 5.231258667101721, "grad_norm": 1.8924754858016968, "learning_rate": 2.734248113626915e-05, "loss": 0.3581, "num_input_tokens_seen": 61240864, "step": 64130 }, { "epoch": 5.23166653071213, "grad_norm": 2.3877856731414795, "learning_rate": 2.7338937489977712e-05, "loss": 0.1917, "num_input_tokens_seen": 61246320, "step": 64135 }, { "epoch": 5.232074394322539, "grad_norm": 5.120668888092041, "learning_rate": 2.733539379627701e-05, "loss": 0.2319, "num_input_tokens_seen": 61250704, "step": 64140 }, { "epoch": 5.232482257932947, "grad_norm": 6.851952075958252, "learning_rate": 2.7331850055238872e-05, "loss": 0.3701, "num_input_tokens_seen": 61256016, "step": 64145 }, { "epoch": 5.232890121543356, "grad_norm": 0.7591164112091064, "learning_rate": 2.7328306266935122e-05, "loss": 0.3839, "num_input_tokens_seen": 61261520, "step": 64150 }, { "epoch": 5.233297985153764, "grad_norm": 3.4630160331726074, "learning_rate": 2.7324762431437583e-05, "loss": 0.2603, "num_input_tokens_seen": 61265808, "step": 64155 }, { "epoch": 5.233705848764173, "grad_norm": 2.5425753593444824, "learning_rate": 2.7321218548818106e-05, "loss": 0.3785, "num_input_tokens_seen": 61270224, "step": 64160 }, { "epoch": 5.234113712374582, "grad_norm": 3.3430352210998535, "learning_rate": 2.7317674619148514e-05, "loss": 0.4351, "num_input_tokens_seen": 61276128, "step": 64165 }, { "epoch": 5.23452157598499, "grad_norm": 1.8960626125335693, "learning_rate": 2.7314130642500646e-05, "loss": 0.3156, "num_input_tokens_seen": 61280368, "step": 64170 }, { "epoch": 5.234929439595399, "grad_norm": 5.729445934295654, "learning_rate": 2.731058661894633e-05, "loss": 0.3972, "num_input_tokens_seen": 61285184, "step": 64175 }, { "epoch": 5.235337303205808, "grad_norm": 4.94917106628418, "learning_rate": 2.7307042548557405e-05, "loss": 0.3549, "num_input_tokens_seen": 61289808, "step": 64180 }, { "epoch": 5.2357451668162165, "grad_norm": 4.027055263519287, "learning_rate": 2.7303498431405706e-05, "loss": 0.2761, "num_input_tokens_seen": 61293984, "step": 64185 }, { "epoch": 5.2361530304266255, "grad_norm": 4.788071632385254, "learning_rate": 2.729995426756307e-05, "loss": 0.4745, "num_input_tokens_seen": 61298992, "step": 64190 }, { "epoch": 5.236560894037034, "grad_norm": 5.3549394607543945, "learning_rate": 2.7296410057101346e-05, "loss": 0.3969, "num_input_tokens_seen": 61303920, "step": 64195 }, { "epoch": 5.236968757647443, "grad_norm": 3.097850799560547, "learning_rate": 2.729286580009236e-05, "loss": 0.2546, "num_input_tokens_seen": 61309232, "step": 64200 }, { "epoch": 5.237376621257852, "grad_norm": 3.5144801139831543, "learning_rate": 2.728932149660796e-05, "loss": 0.2522, "num_input_tokens_seen": 61314816, "step": 64205 }, { "epoch": 5.23778448486826, "grad_norm": 9.892308235168457, "learning_rate": 2.7285777146719994e-05, "loss": 0.3386, "num_input_tokens_seen": 61319808, "step": 64210 }, { "epoch": 5.238192348478669, "grad_norm": 2.435756206512451, "learning_rate": 2.728223275050029e-05, "loss": 0.4387, "num_input_tokens_seen": 61324272, "step": 64215 }, { "epoch": 5.238600212089078, "grad_norm": 6.232942581176758, "learning_rate": 2.7278688308020696e-05, "loss": 0.3616, "num_input_tokens_seen": 61327920, "step": 64220 }, { "epoch": 5.239008075699486, "grad_norm": 8.687285423278809, "learning_rate": 2.727514381935306e-05, "loss": 0.3809, "num_input_tokens_seen": 61333040, "step": 64225 }, { "epoch": 5.239415939309895, "grad_norm": 9.330517768859863, "learning_rate": 2.727159928456922e-05, "loss": 0.4001, "num_input_tokens_seen": 61338128, "step": 64230 }, { "epoch": 5.239823802920303, "grad_norm": 0.5029776096343994, "learning_rate": 2.7268054703741036e-05, "loss": 0.3571, "num_input_tokens_seen": 61343184, "step": 64235 }, { "epoch": 5.240231666530712, "grad_norm": 3.346072196960449, "learning_rate": 2.7264510076940346e-05, "loss": 0.3766, "num_input_tokens_seen": 61347808, "step": 64240 }, { "epoch": 5.240639530141121, "grad_norm": 4.582188606262207, "learning_rate": 2.7260965404238996e-05, "loss": 0.4087, "num_input_tokens_seen": 61352656, "step": 64245 }, { "epoch": 5.241047393751529, "grad_norm": 2.5037896633148193, "learning_rate": 2.725742068570884e-05, "loss": 0.3124, "num_input_tokens_seen": 61357424, "step": 64250 }, { "epoch": 5.241455257361938, "grad_norm": 3.4468538761138916, "learning_rate": 2.7253875921421724e-05, "loss": 0.3732, "num_input_tokens_seen": 61362656, "step": 64255 }, { "epoch": 5.241863120972347, "grad_norm": 3.1584150791168213, "learning_rate": 2.7250331111449502e-05, "loss": 0.3626, "num_input_tokens_seen": 61367632, "step": 64260 }, { "epoch": 5.242270984582755, "grad_norm": 2.014876127243042, "learning_rate": 2.724678625586402e-05, "loss": 0.3627, "num_input_tokens_seen": 61372080, "step": 64265 }, { "epoch": 5.242678848193164, "grad_norm": 2.54337739944458, "learning_rate": 2.7243241354737143e-05, "loss": 0.3747, "num_input_tokens_seen": 61378096, "step": 64270 }, { "epoch": 5.243086711803572, "grad_norm": 4.93571662902832, "learning_rate": 2.723969640814072e-05, "loss": 0.3771, "num_input_tokens_seen": 61382176, "step": 64275 }, { "epoch": 5.2434945754139815, "grad_norm": 2.750157356262207, "learning_rate": 2.7236151416146594e-05, "loss": 0.3728, "num_input_tokens_seen": 61386880, "step": 64280 }, { "epoch": 5.2439024390243905, "grad_norm": 2.924638271331787, "learning_rate": 2.7232606378826636e-05, "loss": 0.3298, "num_input_tokens_seen": 61391584, "step": 64285 }, { "epoch": 5.244310302634799, "grad_norm": 0.7724999189376831, "learning_rate": 2.722906129625269e-05, "loss": 0.3063, "num_input_tokens_seen": 61396352, "step": 64290 }, { "epoch": 5.244718166245208, "grad_norm": 1.6879392862319946, "learning_rate": 2.7225516168496618e-05, "loss": 0.3651, "num_input_tokens_seen": 61400864, "step": 64295 }, { "epoch": 5.245126029855617, "grad_norm": 1.2173618078231812, "learning_rate": 2.7221970995630274e-05, "loss": 0.3625, "num_input_tokens_seen": 61404912, "step": 64300 }, { "epoch": 5.245533893466025, "grad_norm": 1.522632122039795, "learning_rate": 2.7218425777725532e-05, "loss": 0.317, "num_input_tokens_seen": 61410112, "step": 64305 }, { "epoch": 5.245941757076434, "grad_norm": 1.407866358757019, "learning_rate": 2.7214880514854236e-05, "loss": 0.3451, "num_input_tokens_seen": 61414976, "step": 64310 }, { "epoch": 5.246349620686843, "grad_norm": 1.5131027698516846, "learning_rate": 2.7211335207088257e-05, "loss": 0.3186, "num_input_tokens_seen": 61419360, "step": 64315 }, { "epoch": 5.246757484297251, "grad_norm": 0.8933187127113342, "learning_rate": 2.720778985449945e-05, "loss": 0.327, "num_input_tokens_seen": 61424224, "step": 64320 }, { "epoch": 5.24716534790766, "grad_norm": 1.1900100708007812, "learning_rate": 2.7204244457159684e-05, "loss": 0.2883, "num_input_tokens_seen": 61429728, "step": 64325 }, { "epoch": 5.247573211518068, "grad_norm": 2.9689204692840576, "learning_rate": 2.7200699015140818e-05, "loss": 0.353, "num_input_tokens_seen": 61434576, "step": 64330 }, { "epoch": 5.247981075128477, "grad_norm": 2.8762755393981934, "learning_rate": 2.7197153528514718e-05, "loss": 0.3978, "num_input_tokens_seen": 61438896, "step": 64335 }, { "epoch": 5.248388938738886, "grad_norm": 0.8365936875343323, "learning_rate": 2.7193607997353254e-05, "loss": 0.3397, "num_input_tokens_seen": 61442976, "step": 64340 }, { "epoch": 5.248796802349294, "grad_norm": 1.8673592805862427, "learning_rate": 2.7190062421728284e-05, "loss": 0.3418, "num_input_tokens_seen": 61447328, "step": 64345 }, { "epoch": 5.249204665959703, "grad_norm": 0.6911375522613525, "learning_rate": 2.7186516801711682e-05, "loss": 0.3311, "num_input_tokens_seen": 61451728, "step": 64350 }, { "epoch": 5.249612529570112, "grad_norm": 6.848876476287842, "learning_rate": 2.7182971137375308e-05, "loss": 0.3557, "num_input_tokens_seen": 61456240, "step": 64355 }, { "epoch": 5.25002039318052, "grad_norm": 1.0224192142486572, "learning_rate": 2.717942542879104e-05, "loss": 0.3466, "num_input_tokens_seen": 61461264, "step": 64360 }, { "epoch": 5.250428256790929, "grad_norm": 2.6119208335876465, "learning_rate": 2.7175879676030742e-05, "loss": 0.277, "num_input_tokens_seen": 61466832, "step": 64365 }, { "epoch": 5.250836120401337, "grad_norm": 1.3774536848068237, "learning_rate": 2.7172333879166288e-05, "loss": 0.4039, "num_input_tokens_seen": 61471840, "step": 64370 }, { "epoch": 5.251243984011746, "grad_norm": 3.4806342124938965, "learning_rate": 2.716878803826955e-05, "loss": 0.4173, "num_input_tokens_seen": 61476640, "step": 64375 }, { "epoch": 5.251651847622155, "grad_norm": 1.0558433532714844, "learning_rate": 2.7165242153412402e-05, "loss": 0.4274, "num_input_tokens_seen": 61481824, "step": 64380 }, { "epoch": 5.2520597112325635, "grad_norm": 2.949540853500366, "learning_rate": 2.716169622466671e-05, "loss": 0.3126, "num_input_tokens_seen": 61486000, "step": 64385 }, { "epoch": 5.2524675748429726, "grad_norm": 3.9936411380767822, "learning_rate": 2.7158150252104358e-05, "loss": 0.3921, "num_input_tokens_seen": 61490544, "step": 64390 }, { "epoch": 5.252875438453382, "grad_norm": 1.9849315881729126, "learning_rate": 2.7154604235797222e-05, "loss": 0.3387, "num_input_tokens_seen": 61495136, "step": 64395 }, { "epoch": 5.25328330206379, "grad_norm": 0.8759344220161438, "learning_rate": 2.715105817581717e-05, "loss": 0.3126, "num_input_tokens_seen": 61499584, "step": 64400 }, { "epoch": 5.253691165674199, "grad_norm": 1.8562040328979492, "learning_rate": 2.714751207223608e-05, "loss": 0.2669, "num_input_tokens_seen": 61503776, "step": 64405 }, { "epoch": 5.254099029284607, "grad_norm": 1.1045913696289062, "learning_rate": 2.714396592512584e-05, "loss": 0.3558, "num_input_tokens_seen": 61507616, "step": 64410 }, { "epoch": 5.254506892895016, "grad_norm": 2.210820436477661, "learning_rate": 2.714041973455832e-05, "loss": 0.3963, "num_input_tokens_seen": 61511984, "step": 64415 }, { "epoch": 5.254914756505425, "grad_norm": 1.142930507659912, "learning_rate": 2.7136873500605393e-05, "loss": 0.3609, "num_input_tokens_seen": 61516784, "step": 64420 }, { "epoch": 5.255322620115833, "grad_norm": 0.660218358039856, "learning_rate": 2.7133327223338956e-05, "loss": 0.2828, "num_input_tokens_seen": 61521040, "step": 64425 }, { "epoch": 5.255730483726242, "grad_norm": 2.544177770614624, "learning_rate": 2.7129780902830874e-05, "loss": 0.5246, "num_input_tokens_seen": 61526192, "step": 64430 }, { "epoch": 5.256138347336651, "grad_norm": 1.5401705503463745, "learning_rate": 2.712623453915305e-05, "loss": 0.2861, "num_input_tokens_seen": 61530944, "step": 64435 }, { "epoch": 5.256546210947059, "grad_norm": 1.4418143033981323, "learning_rate": 2.712268813237735e-05, "loss": 0.4148, "num_input_tokens_seen": 61536448, "step": 64440 }, { "epoch": 5.256954074557468, "grad_norm": 2.374018907546997, "learning_rate": 2.711914168257566e-05, "loss": 0.3142, "num_input_tokens_seen": 61540848, "step": 64445 }, { "epoch": 5.257361938167877, "grad_norm": 0.8483805656433105, "learning_rate": 2.7115595189819875e-05, "loss": 0.3547, "num_input_tokens_seen": 61546128, "step": 64450 }, { "epoch": 5.257769801778285, "grad_norm": 0.9648324847221375, "learning_rate": 2.711204865418187e-05, "loss": 0.319, "num_input_tokens_seen": 61551056, "step": 64455 }, { "epoch": 5.258177665388694, "grad_norm": 1.0081968307495117, "learning_rate": 2.710850207573354e-05, "loss": 0.3196, "num_input_tokens_seen": 61556528, "step": 64460 }, { "epoch": 5.258585528999102, "grad_norm": 1.1566648483276367, "learning_rate": 2.710495545454676e-05, "loss": 0.3141, "num_input_tokens_seen": 61560976, "step": 64465 }, { "epoch": 5.258993392609511, "grad_norm": 1.7974547147750854, "learning_rate": 2.7101408790693438e-05, "loss": 0.2852, "num_input_tokens_seen": 61565344, "step": 64470 }, { "epoch": 5.25940125621992, "grad_norm": 3.063192844390869, "learning_rate": 2.7097862084245453e-05, "loss": 0.356, "num_input_tokens_seen": 61570752, "step": 64475 }, { "epoch": 5.2598091198303285, "grad_norm": 2.2319586277008057, "learning_rate": 2.7094315335274685e-05, "loss": 0.3294, "num_input_tokens_seen": 61575984, "step": 64480 }, { "epoch": 5.2602169834407375, "grad_norm": 1.7795217037200928, "learning_rate": 2.709076854385304e-05, "loss": 0.2899, "num_input_tokens_seen": 61581232, "step": 64485 }, { "epoch": 5.260624847051146, "grad_norm": 0.8137083053588867, "learning_rate": 2.7087221710052413e-05, "loss": 0.3535, "num_input_tokens_seen": 61585616, "step": 64490 }, { "epoch": 5.261032710661555, "grad_norm": 4.04307222366333, "learning_rate": 2.7083674833944684e-05, "loss": 0.296, "num_input_tokens_seen": 61590192, "step": 64495 }, { "epoch": 5.261440574271964, "grad_norm": 1.0940887928009033, "learning_rate": 2.7080127915601744e-05, "loss": 0.3305, "num_input_tokens_seen": 61594240, "step": 64500 }, { "epoch": 5.261848437882372, "grad_norm": 2.811614990234375, "learning_rate": 2.7076580955095503e-05, "loss": 0.2772, "num_input_tokens_seen": 61598976, "step": 64505 }, { "epoch": 5.262256301492781, "grad_norm": 4.650230884552002, "learning_rate": 2.707303395249784e-05, "loss": 0.451, "num_input_tokens_seen": 61604672, "step": 64510 }, { "epoch": 5.26266416510319, "grad_norm": 1.133531928062439, "learning_rate": 2.706948690788067e-05, "loss": 0.4385, "num_input_tokens_seen": 61610096, "step": 64515 }, { "epoch": 5.263072028713598, "grad_norm": 6.351141452789307, "learning_rate": 2.7065939821315873e-05, "loss": 0.342, "num_input_tokens_seen": 61614816, "step": 64520 }, { "epoch": 5.263479892324007, "grad_norm": 0.9852806925773621, "learning_rate": 2.7062392692875355e-05, "loss": 0.3039, "num_input_tokens_seen": 61619184, "step": 64525 }, { "epoch": 5.263887755934416, "grad_norm": 1.0440810918807983, "learning_rate": 2.705884552263101e-05, "loss": 0.2696, "num_input_tokens_seen": 61624528, "step": 64530 }, { "epoch": 5.264295619544824, "grad_norm": 4.437736988067627, "learning_rate": 2.7055298310654747e-05, "loss": 0.3587, "num_input_tokens_seen": 61628928, "step": 64535 }, { "epoch": 5.264703483155233, "grad_norm": 3.8688340187072754, "learning_rate": 2.705175105701846e-05, "loss": 0.3398, "num_input_tokens_seen": 61634016, "step": 64540 }, { "epoch": 5.265111346765641, "grad_norm": 1.2545771598815918, "learning_rate": 2.704820376179405e-05, "loss": 0.3774, "num_input_tokens_seen": 61638112, "step": 64545 }, { "epoch": 5.26551921037605, "grad_norm": 5.861461639404297, "learning_rate": 2.7044656425053423e-05, "loss": 0.3236, "num_input_tokens_seen": 61642816, "step": 64550 }, { "epoch": 5.265927073986459, "grad_norm": 4.141334056854248, "learning_rate": 2.7041109046868473e-05, "loss": 0.3633, "num_input_tokens_seen": 61647760, "step": 64555 }, { "epoch": 5.266334937596867, "grad_norm": 2.5827653408050537, "learning_rate": 2.703756162731112e-05, "loss": 0.2873, "num_input_tokens_seen": 61651760, "step": 64560 }, { "epoch": 5.266742801207276, "grad_norm": 5.768069267272949, "learning_rate": 2.7034014166453247e-05, "loss": 0.3792, "num_input_tokens_seen": 61657120, "step": 64565 }, { "epoch": 5.267150664817685, "grad_norm": 2.8487679958343506, "learning_rate": 2.7030466664366777e-05, "loss": 0.3124, "num_input_tokens_seen": 61662416, "step": 64570 }, { "epoch": 5.2675585284280935, "grad_norm": 3.216207504272461, "learning_rate": 2.7026919121123606e-05, "loss": 0.3994, "num_input_tokens_seen": 61667408, "step": 64575 }, { "epoch": 5.2679663920385025, "grad_norm": 5.421244144439697, "learning_rate": 2.7023371536795648e-05, "loss": 0.4138, "num_input_tokens_seen": 61672752, "step": 64580 }, { "epoch": 5.268374255648911, "grad_norm": 0.7609596848487854, "learning_rate": 2.7019823911454812e-05, "loss": 0.3494, "num_input_tokens_seen": 61677920, "step": 64585 }, { "epoch": 5.26878211925932, "grad_norm": 4.385813236236572, "learning_rate": 2.7016276245173e-05, "loss": 0.3182, "num_input_tokens_seen": 61681424, "step": 64590 }, { "epoch": 5.269189982869729, "grad_norm": 4.565379619598389, "learning_rate": 2.701272853802213e-05, "loss": 0.4007, "num_input_tokens_seen": 61685408, "step": 64595 }, { "epoch": 5.269597846480137, "grad_norm": 12.320712089538574, "learning_rate": 2.7009180790074102e-05, "loss": 0.3593, "num_input_tokens_seen": 61690464, "step": 64600 }, { "epoch": 5.270005710090546, "grad_norm": 3.3261821269989014, "learning_rate": 2.7005633001400838e-05, "loss": 0.4383, "num_input_tokens_seen": 61695376, "step": 64605 }, { "epoch": 5.270413573700955, "grad_norm": 1.8885818719863892, "learning_rate": 2.7002085172074238e-05, "loss": 0.2963, "num_input_tokens_seen": 61700112, "step": 64610 }, { "epoch": 5.270821437311363, "grad_norm": 2.287919521331787, "learning_rate": 2.6998537302166228e-05, "loss": 0.2986, "num_input_tokens_seen": 61705616, "step": 64615 }, { "epoch": 5.271229300921772, "grad_norm": 3.707897901535034, "learning_rate": 2.699498939174872e-05, "loss": 0.431, "num_input_tokens_seen": 61710352, "step": 64620 }, { "epoch": 5.27163716453218, "grad_norm": 4.764702796936035, "learning_rate": 2.699144144089362e-05, "loss": 0.3117, "num_input_tokens_seen": 61714928, "step": 64625 }, { "epoch": 5.272045028142589, "grad_norm": 4.434325695037842, "learning_rate": 2.6987893449672848e-05, "loss": 0.3222, "num_input_tokens_seen": 61719808, "step": 64630 }, { "epoch": 5.272452891752998, "grad_norm": 3.9323253631591797, "learning_rate": 2.6984345418158318e-05, "loss": 0.2745, "num_input_tokens_seen": 61724304, "step": 64635 }, { "epoch": 5.272860755363406, "grad_norm": 3.962066650390625, "learning_rate": 2.6980797346421953e-05, "loss": 0.2996, "num_input_tokens_seen": 61729376, "step": 64640 }, { "epoch": 5.273268618973815, "grad_norm": 1.3452625274658203, "learning_rate": 2.6977249234535668e-05, "loss": 0.3105, "num_input_tokens_seen": 61734672, "step": 64645 }, { "epoch": 5.273676482584224, "grad_norm": 1.4925376176834106, "learning_rate": 2.697370108257138e-05, "loss": 0.3298, "num_input_tokens_seen": 61739840, "step": 64650 }, { "epoch": 5.274084346194632, "grad_norm": 6.16137170791626, "learning_rate": 2.697015289060101e-05, "loss": 0.473, "num_input_tokens_seen": 61744128, "step": 64655 }, { "epoch": 5.274492209805041, "grad_norm": 1.6570477485656738, "learning_rate": 2.6966604658696477e-05, "loss": 0.4397, "num_input_tokens_seen": 61749824, "step": 64660 }, { "epoch": 5.27490007341545, "grad_norm": 8.034547805786133, "learning_rate": 2.69630563869297e-05, "loss": 0.2806, "num_input_tokens_seen": 61753664, "step": 64665 }, { "epoch": 5.275307937025858, "grad_norm": 2.496485948562622, "learning_rate": 2.6959508075372607e-05, "loss": 0.4154, "num_input_tokens_seen": 61758672, "step": 64670 }, { "epoch": 5.275715800636267, "grad_norm": 7.488378047943115, "learning_rate": 2.6955959724097118e-05, "loss": 0.3552, "num_input_tokens_seen": 61764016, "step": 64675 }, { "epoch": 5.276123664246676, "grad_norm": 7.848288059234619, "learning_rate": 2.6952411333175158e-05, "loss": 0.3076, "num_input_tokens_seen": 61769280, "step": 64680 }, { "epoch": 5.276531527857085, "grad_norm": 5.666046142578125, "learning_rate": 2.6948862902678656e-05, "loss": 0.3898, "num_input_tokens_seen": 61773200, "step": 64685 }, { "epoch": 5.276939391467494, "grad_norm": 2.53458571434021, "learning_rate": 2.6945314432679526e-05, "loss": 0.3444, "num_input_tokens_seen": 61777200, "step": 64690 }, { "epoch": 5.277347255077902, "grad_norm": 3.8533248901367188, "learning_rate": 2.69417659232497e-05, "loss": 0.2688, "num_input_tokens_seen": 61781440, "step": 64695 }, { "epoch": 5.277755118688311, "grad_norm": 5.330580711364746, "learning_rate": 2.69382173744611e-05, "loss": 0.3616, "num_input_tokens_seen": 61787040, "step": 64700 }, { "epoch": 5.27816298229872, "grad_norm": 6.688582420349121, "learning_rate": 2.693466878638566e-05, "loss": 0.2779, "num_input_tokens_seen": 61792112, "step": 64705 }, { "epoch": 5.278570845909128, "grad_norm": 5.265024185180664, "learning_rate": 2.6931120159095303e-05, "loss": 0.343, "num_input_tokens_seen": 61797248, "step": 64710 }, { "epoch": 5.278978709519537, "grad_norm": 0.8731058835983276, "learning_rate": 2.6927571492661962e-05, "loss": 0.2564, "num_input_tokens_seen": 61802784, "step": 64715 }, { "epoch": 5.279386573129945, "grad_norm": 1.1950910091400146, "learning_rate": 2.6924022787157566e-05, "loss": 0.2951, "num_input_tokens_seen": 61807808, "step": 64720 }, { "epoch": 5.279794436740354, "grad_norm": 9.919418334960938, "learning_rate": 2.6920474042654048e-05, "loss": 0.2725, "num_input_tokens_seen": 61812720, "step": 64725 }, { "epoch": 5.280202300350763, "grad_norm": 2.1679582595825195, "learning_rate": 2.691692525922333e-05, "loss": 0.3421, "num_input_tokens_seen": 61817312, "step": 64730 }, { "epoch": 5.280610163961171, "grad_norm": 1.6731338500976562, "learning_rate": 2.6913376436937358e-05, "loss": 0.3015, "num_input_tokens_seen": 61821760, "step": 64735 }, { "epoch": 5.28101802757158, "grad_norm": 2.598727226257324, "learning_rate": 2.6909827575868063e-05, "loss": 0.3721, "num_input_tokens_seen": 61825552, "step": 64740 }, { "epoch": 5.281425891181989, "grad_norm": 3.529916286468506, "learning_rate": 2.6906278676087367e-05, "loss": 0.5081, "num_input_tokens_seen": 61830048, "step": 64745 }, { "epoch": 5.281833754792397, "grad_norm": 4.011013031005859, "learning_rate": 2.6902729737667216e-05, "loss": 0.4209, "num_input_tokens_seen": 61835152, "step": 64750 }, { "epoch": 5.282241618402806, "grad_norm": 2.8274965286254883, "learning_rate": 2.6899180760679538e-05, "loss": 0.3847, "num_input_tokens_seen": 61839808, "step": 64755 }, { "epoch": 5.282649482013214, "grad_norm": 6.45952844619751, "learning_rate": 2.6895631745196282e-05, "loss": 0.3365, "num_input_tokens_seen": 61844176, "step": 64760 }, { "epoch": 5.283057345623623, "grad_norm": 7.038033962249756, "learning_rate": 2.6892082691289362e-05, "loss": 0.2738, "num_input_tokens_seen": 61848944, "step": 64765 }, { "epoch": 5.283465209234032, "grad_norm": 6.398866653442383, "learning_rate": 2.6888533599030742e-05, "loss": 0.3574, "num_input_tokens_seen": 61853488, "step": 64770 }, { "epoch": 5.2838730728444405, "grad_norm": 2.3412094116210938, "learning_rate": 2.6884984468492335e-05, "loss": 0.2808, "num_input_tokens_seen": 61858848, "step": 64775 }, { "epoch": 5.2842809364548495, "grad_norm": 2.5945217609405518, "learning_rate": 2.688143529974611e-05, "loss": 0.3728, "num_input_tokens_seen": 61864160, "step": 64780 }, { "epoch": 5.2846888000652585, "grad_norm": 2.819356918334961, "learning_rate": 2.687788609286398e-05, "loss": 0.3342, "num_input_tokens_seen": 61868320, "step": 64785 }, { "epoch": 5.285096663675667, "grad_norm": 6.219566822052002, "learning_rate": 2.6874336847917903e-05, "loss": 0.3285, "num_input_tokens_seen": 61873280, "step": 64790 }, { "epoch": 5.285504527286076, "grad_norm": 0.6780275702476501, "learning_rate": 2.687078756497981e-05, "loss": 0.3413, "num_input_tokens_seen": 61878496, "step": 64795 }, { "epoch": 5.285912390896485, "grad_norm": 1.0099200010299683, "learning_rate": 2.6867238244121644e-05, "loss": 0.3608, "num_input_tokens_seen": 61883856, "step": 64800 }, { "epoch": 5.286320254506893, "grad_norm": 2.4010159969329834, "learning_rate": 2.686368888541536e-05, "loss": 0.2826, "num_input_tokens_seen": 61888688, "step": 64805 }, { "epoch": 5.286728118117302, "grad_norm": 4.921028137207031, "learning_rate": 2.6860139488932896e-05, "loss": 0.3504, "num_input_tokens_seen": 61893872, "step": 64810 }, { "epoch": 5.28713598172771, "grad_norm": 2.039546489715576, "learning_rate": 2.6856590054746188e-05, "loss": 0.3904, "num_input_tokens_seen": 61898688, "step": 64815 }, { "epoch": 5.287543845338119, "grad_norm": 5.976898670196533, "learning_rate": 2.6853040582927198e-05, "loss": 0.4666, "num_input_tokens_seen": 61902880, "step": 64820 }, { "epoch": 5.287951708948528, "grad_norm": 4.456238746643066, "learning_rate": 2.684949107354785e-05, "loss": 0.3864, "num_input_tokens_seen": 61907520, "step": 64825 }, { "epoch": 5.288359572558936, "grad_norm": 3.22686505317688, "learning_rate": 2.684594152668011e-05, "loss": 0.3619, "num_input_tokens_seen": 61912096, "step": 64830 }, { "epoch": 5.288767436169345, "grad_norm": 1.9721530675888062, "learning_rate": 2.684239194239593e-05, "loss": 0.2751, "num_input_tokens_seen": 61916560, "step": 64835 }, { "epoch": 5.289175299779753, "grad_norm": 2.549804210662842, "learning_rate": 2.6838842320767236e-05, "loss": 0.2594, "num_input_tokens_seen": 61921152, "step": 64840 }, { "epoch": 5.289583163390162, "grad_norm": 4.5799360275268555, "learning_rate": 2.6835292661865984e-05, "loss": 0.3347, "num_input_tokens_seen": 61925760, "step": 64845 }, { "epoch": 5.289991027000571, "grad_norm": 5.440235614776611, "learning_rate": 2.6831742965764145e-05, "loss": 0.3601, "num_input_tokens_seen": 61930720, "step": 64850 }, { "epoch": 5.290398890610979, "grad_norm": 1.6139822006225586, "learning_rate": 2.6828193232533644e-05, "loss": 0.3597, "num_input_tokens_seen": 61935440, "step": 64855 }, { "epoch": 5.290806754221388, "grad_norm": 2.8419175148010254, "learning_rate": 2.6824643462246447e-05, "loss": 0.3014, "num_input_tokens_seen": 61940272, "step": 64860 }, { "epoch": 5.291214617831797, "grad_norm": 0.6176894903182983, "learning_rate": 2.6821093654974495e-05, "loss": 0.3532, "num_input_tokens_seen": 61945264, "step": 64865 }, { "epoch": 5.2916224814422055, "grad_norm": 1.14674973487854, "learning_rate": 2.681754381078976e-05, "loss": 0.3073, "num_input_tokens_seen": 61949360, "step": 64870 }, { "epoch": 5.2920303450526145, "grad_norm": 1.6100404262542725, "learning_rate": 2.6813993929764175e-05, "loss": 0.4371, "num_input_tokens_seen": 61955232, "step": 64875 }, { "epoch": 5.2924382086630235, "grad_norm": 1.5994181632995605, "learning_rate": 2.681044401196971e-05, "loss": 0.3383, "num_input_tokens_seen": 61960928, "step": 64880 }, { "epoch": 5.292846072273432, "grad_norm": 8.70287036895752, "learning_rate": 2.6806894057478303e-05, "loss": 0.5141, "num_input_tokens_seen": 61965424, "step": 64885 }, { "epoch": 5.293253935883841, "grad_norm": 2.8716237545013428, "learning_rate": 2.6803344066361936e-05, "loss": 0.3225, "num_input_tokens_seen": 61969680, "step": 64890 }, { "epoch": 5.293661799494249, "grad_norm": 10.59132194519043, "learning_rate": 2.679979403869255e-05, "loss": 0.4391, "num_input_tokens_seen": 61974256, "step": 64895 }, { "epoch": 5.294069663104658, "grad_norm": 1.9360344409942627, "learning_rate": 2.6796243974542096e-05, "loss": 0.295, "num_input_tokens_seen": 61979216, "step": 64900 }, { "epoch": 5.294477526715067, "grad_norm": 0.6524059176445007, "learning_rate": 2.679269387398255e-05, "loss": 0.2868, "num_input_tokens_seen": 61984912, "step": 64905 }, { "epoch": 5.294885390325475, "grad_norm": 4.892933368682861, "learning_rate": 2.6789143737085846e-05, "loss": 0.3579, "num_input_tokens_seen": 61989104, "step": 64910 }, { "epoch": 5.295293253935884, "grad_norm": 2.1581366062164307, "learning_rate": 2.6785593563923973e-05, "loss": 0.372, "num_input_tokens_seen": 61993808, "step": 64915 }, { "epoch": 5.295701117546293, "grad_norm": 0.5810367465019226, "learning_rate": 2.6782043354568866e-05, "loss": 0.3517, "num_input_tokens_seen": 61998352, "step": 64920 }, { "epoch": 5.296108981156701, "grad_norm": 3.0730860233306885, "learning_rate": 2.67784931090925e-05, "loss": 0.2988, "num_input_tokens_seen": 62003056, "step": 64925 }, { "epoch": 5.29651684476711, "grad_norm": 0.718379020690918, "learning_rate": 2.6774942827566834e-05, "loss": 0.3408, "num_input_tokens_seen": 62007808, "step": 64930 }, { "epoch": 5.296924708377518, "grad_norm": 0.5613431334495544, "learning_rate": 2.6771392510063838e-05, "loss": 0.2709, "num_input_tokens_seen": 62012352, "step": 64935 }, { "epoch": 5.297332571987927, "grad_norm": 2.3205058574676514, "learning_rate": 2.676784215665546e-05, "loss": 0.3334, "num_input_tokens_seen": 62017536, "step": 64940 }, { "epoch": 5.297740435598336, "grad_norm": 1.5569223165512085, "learning_rate": 2.676429176741368e-05, "loss": 0.3506, "num_input_tokens_seen": 62021312, "step": 64945 }, { "epoch": 5.298148299208744, "grad_norm": 5.266844749450684, "learning_rate": 2.676074134241045e-05, "loss": 0.2991, "num_input_tokens_seen": 62025360, "step": 64950 }, { "epoch": 5.298556162819153, "grad_norm": 1.0354169607162476, "learning_rate": 2.675719088171774e-05, "loss": 0.4107, "num_input_tokens_seen": 62029824, "step": 64955 }, { "epoch": 5.298964026429562, "grad_norm": 3.207773447036743, "learning_rate": 2.6753640385407523e-05, "loss": 0.3161, "num_input_tokens_seen": 62034144, "step": 64960 }, { "epoch": 5.2993718900399704, "grad_norm": 1.74872887134552, "learning_rate": 2.6750089853551763e-05, "loss": 0.3653, "num_input_tokens_seen": 62038240, "step": 64965 }, { "epoch": 5.2997797536503795, "grad_norm": 3.9298112392425537, "learning_rate": 2.6746539286222423e-05, "loss": 0.3437, "num_input_tokens_seen": 62043136, "step": 64970 }, { "epoch": 5.300187617260788, "grad_norm": 5.393467903137207, "learning_rate": 2.6742988683491466e-05, "loss": 0.3581, "num_input_tokens_seen": 62048272, "step": 64975 }, { "epoch": 5.300595480871197, "grad_norm": 6.973228931427002, "learning_rate": 2.673943804543088e-05, "loss": 0.3334, "num_input_tokens_seen": 62053584, "step": 64980 }, { "epoch": 5.301003344481606, "grad_norm": 3.259751796722412, "learning_rate": 2.6735887372112616e-05, "loss": 0.3431, "num_input_tokens_seen": 62058432, "step": 64985 }, { "epoch": 5.301411208092014, "grad_norm": 4.281860828399658, "learning_rate": 2.6732336663608654e-05, "loss": 0.3091, "num_input_tokens_seen": 62063856, "step": 64990 }, { "epoch": 5.301819071702423, "grad_norm": 7.0866522789001465, "learning_rate": 2.6728785919990968e-05, "loss": 0.4156, "num_input_tokens_seen": 62069472, "step": 64995 }, { "epoch": 5.302226935312832, "grad_norm": 2.754239559173584, "learning_rate": 2.6725235141331522e-05, "loss": 0.2748, "num_input_tokens_seen": 62074800, "step": 65000 }, { "epoch": 5.30263479892324, "grad_norm": 2.6073455810546875, "learning_rate": 2.6721684327702297e-05, "loss": 0.4722, "num_input_tokens_seen": 62079600, "step": 65005 }, { "epoch": 5.303042662533649, "grad_norm": 3.3020026683807373, "learning_rate": 2.6718133479175257e-05, "loss": 0.3193, "num_input_tokens_seen": 62084368, "step": 65010 }, { "epoch": 5.303450526144058, "grad_norm": 2.4242753982543945, "learning_rate": 2.6714582595822386e-05, "loss": 0.3844, "num_input_tokens_seen": 62088576, "step": 65015 }, { "epoch": 5.303858389754466, "grad_norm": 8.599771499633789, "learning_rate": 2.671103167771565e-05, "loss": 0.4182, "num_input_tokens_seen": 62093776, "step": 65020 }, { "epoch": 5.304266253364875, "grad_norm": 5.834350109100342, "learning_rate": 2.6707480724927032e-05, "loss": 0.5044, "num_input_tokens_seen": 62099312, "step": 65025 }, { "epoch": 5.304674116975283, "grad_norm": 3.99943208694458, "learning_rate": 2.6703929737528504e-05, "loss": 0.3535, "num_input_tokens_seen": 62103200, "step": 65030 }, { "epoch": 5.305081980585692, "grad_norm": 8.81913948059082, "learning_rate": 2.6700378715592044e-05, "loss": 0.3687, "num_input_tokens_seen": 62107584, "step": 65035 }, { "epoch": 5.305489844196101, "grad_norm": 4.393631458282471, "learning_rate": 2.6696827659189627e-05, "loss": 0.4347, "num_input_tokens_seen": 62112672, "step": 65040 }, { "epoch": 5.305897707806509, "grad_norm": 1.05958092212677, "learning_rate": 2.6693276568393245e-05, "loss": 0.3923, "num_input_tokens_seen": 62117920, "step": 65045 }, { "epoch": 5.306305571416918, "grad_norm": 1.6932425498962402, "learning_rate": 2.668972544327486e-05, "loss": 0.2694, "num_input_tokens_seen": 62122400, "step": 65050 }, { "epoch": 5.306713435027326, "grad_norm": 4.544517517089844, "learning_rate": 2.668617428390645e-05, "loss": 0.4132, "num_input_tokens_seen": 62127616, "step": 65055 }, { "epoch": 5.307121298637735, "grad_norm": 2.4715585708618164, "learning_rate": 2.6682623090360015e-05, "loss": 0.3382, "num_input_tokens_seen": 62132544, "step": 65060 }, { "epoch": 5.307529162248144, "grad_norm": 4.050968170166016, "learning_rate": 2.6679071862707516e-05, "loss": 0.3702, "num_input_tokens_seen": 62137296, "step": 65065 }, { "epoch": 5.3079370258585525, "grad_norm": 7.222520351409912, "learning_rate": 2.6675520601020953e-05, "loss": 0.3195, "num_input_tokens_seen": 62142032, "step": 65070 }, { "epoch": 5.3083448894689615, "grad_norm": 2.612029790878296, "learning_rate": 2.667196930537229e-05, "loss": 0.2954, "num_input_tokens_seen": 62146784, "step": 65075 }, { "epoch": 5.308752753079371, "grad_norm": 2.076669216156006, "learning_rate": 2.6668417975833533e-05, "loss": 0.3468, "num_input_tokens_seen": 62151200, "step": 65080 }, { "epoch": 5.309160616689779, "grad_norm": 3.4913978576660156, "learning_rate": 2.6664866612476635e-05, "loss": 0.3345, "num_input_tokens_seen": 62156048, "step": 65085 }, { "epoch": 5.309568480300188, "grad_norm": 5.2558159828186035, "learning_rate": 2.666131521537361e-05, "loss": 0.3301, "num_input_tokens_seen": 62160432, "step": 65090 }, { "epoch": 5.309976343910597, "grad_norm": 5.060132026672363, "learning_rate": 2.6657763784596434e-05, "loss": 0.3524, "num_input_tokens_seen": 62165584, "step": 65095 }, { "epoch": 5.310384207521005, "grad_norm": 2.988103151321411, "learning_rate": 2.6654212320217088e-05, "loss": 0.3807, "num_input_tokens_seen": 62170048, "step": 65100 }, { "epoch": 5.310792071131414, "grad_norm": 4.585902214050293, "learning_rate": 2.665066082230756e-05, "loss": 0.4701, "num_input_tokens_seen": 62175008, "step": 65105 }, { "epoch": 5.311199934741822, "grad_norm": 2.4525368213653564, "learning_rate": 2.6647109290939838e-05, "loss": 0.3118, "num_input_tokens_seen": 62179824, "step": 65110 }, { "epoch": 5.311607798352231, "grad_norm": 5.0843987464904785, "learning_rate": 2.6643557726185912e-05, "loss": 0.3592, "num_input_tokens_seen": 62184576, "step": 65115 }, { "epoch": 5.31201566196264, "grad_norm": 1.8595713376998901, "learning_rate": 2.664000612811776e-05, "loss": 0.3188, "num_input_tokens_seen": 62189360, "step": 65120 }, { "epoch": 5.312423525573048, "grad_norm": 3.1798384189605713, "learning_rate": 2.6636454496807394e-05, "loss": 0.2976, "num_input_tokens_seen": 62194176, "step": 65125 }, { "epoch": 5.312831389183457, "grad_norm": 4.758949279785156, "learning_rate": 2.6632902832326784e-05, "loss": 0.3857, "num_input_tokens_seen": 62199488, "step": 65130 }, { "epoch": 5.313239252793866, "grad_norm": 1.6703944206237793, "learning_rate": 2.6629351134747928e-05, "loss": 0.4507, "num_input_tokens_seen": 62203968, "step": 65135 }, { "epoch": 5.313647116404274, "grad_norm": 3.3508358001708984, "learning_rate": 2.6625799404142817e-05, "loss": 0.274, "num_input_tokens_seen": 62209152, "step": 65140 }, { "epoch": 5.314054980014683, "grad_norm": 2.9730303287506104, "learning_rate": 2.662224764058344e-05, "loss": 0.3143, "num_input_tokens_seen": 62214528, "step": 65145 }, { "epoch": 5.314462843625091, "grad_norm": 3.945709466934204, "learning_rate": 2.66186958441418e-05, "loss": 0.3602, "num_input_tokens_seen": 62219760, "step": 65150 }, { "epoch": 5.3148707072355, "grad_norm": 1.7370967864990234, "learning_rate": 2.6615144014889876e-05, "loss": 0.3444, "num_input_tokens_seen": 62224336, "step": 65155 }, { "epoch": 5.315278570845909, "grad_norm": 4.526970386505127, "learning_rate": 2.661159215289967e-05, "loss": 0.3579, "num_input_tokens_seen": 62228880, "step": 65160 }, { "epoch": 5.3156864344563175, "grad_norm": 3.544926643371582, "learning_rate": 2.6608040258243184e-05, "loss": 0.4432, "num_input_tokens_seen": 62234128, "step": 65165 }, { "epoch": 5.3160942980667265, "grad_norm": 3.5298681259155273, "learning_rate": 2.66044883309924e-05, "loss": 0.3412, "num_input_tokens_seen": 62239696, "step": 65170 }, { "epoch": 5.3165021616771355, "grad_norm": 2.6106996536254883, "learning_rate": 2.6600936371219314e-05, "loss": 0.3431, "num_input_tokens_seen": 62244640, "step": 65175 }, { "epoch": 5.316910025287544, "grad_norm": 3.8788533210754395, "learning_rate": 2.6597384378995937e-05, "loss": 0.3882, "num_input_tokens_seen": 62249136, "step": 65180 }, { "epoch": 5.317317888897953, "grad_norm": 3.328939199447632, "learning_rate": 2.659383235439425e-05, "loss": 0.2446, "num_input_tokens_seen": 62254416, "step": 65185 }, { "epoch": 5.317725752508361, "grad_norm": 3.9907355308532715, "learning_rate": 2.6590280297486266e-05, "loss": 0.2656, "num_input_tokens_seen": 62259472, "step": 65190 }, { "epoch": 5.31813361611877, "grad_norm": 2.9879233837127686, "learning_rate": 2.658672820834397e-05, "loss": 0.3867, "num_input_tokens_seen": 62264592, "step": 65195 }, { "epoch": 5.318541479729179, "grad_norm": 3.519979238510132, "learning_rate": 2.658317608703937e-05, "loss": 0.3726, "num_input_tokens_seen": 62269552, "step": 65200 }, { "epoch": 5.318949343339587, "grad_norm": 3.9686379432678223, "learning_rate": 2.657962393364446e-05, "loss": 0.302, "num_input_tokens_seen": 62274816, "step": 65205 }, { "epoch": 5.319357206949996, "grad_norm": 0.8536873459815979, "learning_rate": 2.6576071748231245e-05, "loss": 0.2087, "num_input_tokens_seen": 62280128, "step": 65210 }, { "epoch": 5.319765070560405, "grad_norm": 1.6410880088806152, "learning_rate": 2.6572519530871725e-05, "loss": 0.3885, "num_input_tokens_seen": 62285440, "step": 65215 }, { "epoch": 5.320172934170813, "grad_norm": 2.923130750656128, "learning_rate": 2.65689672816379e-05, "loss": 0.431, "num_input_tokens_seen": 62290448, "step": 65220 }, { "epoch": 5.320580797781222, "grad_norm": 3.2888360023498535, "learning_rate": 2.6565415000601786e-05, "loss": 0.4346, "num_input_tokens_seen": 62295936, "step": 65225 }, { "epoch": 5.320988661391631, "grad_norm": 1.2462470531463623, "learning_rate": 2.656186268783537e-05, "loss": 0.3018, "num_input_tokens_seen": 62300480, "step": 65230 }, { "epoch": 5.321396525002039, "grad_norm": 5.700477600097656, "learning_rate": 2.6558310343410654e-05, "loss": 0.3084, "num_input_tokens_seen": 62305808, "step": 65235 }, { "epoch": 5.321804388612448, "grad_norm": 2.7208657264709473, "learning_rate": 2.6554757967399657e-05, "loss": 0.3026, "num_input_tokens_seen": 62311040, "step": 65240 }, { "epoch": 5.322212252222856, "grad_norm": 3.6791934967041016, "learning_rate": 2.6551205559874376e-05, "loss": 0.358, "num_input_tokens_seen": 62316080, "step": 65245 }, { "epoch": 5.322620115833265, "grad_norm": 1.0089540481567383, "learning_rate": 2.654765312090682e-05, "loss": 0.2476, "num_input_tokens_seen": 62320656, "step": 65250 }, { "epoch": 5.323027979443674, "grad_norm": 2.392547607421875, "learning_rate": 2.654410065056898e-05, "loss": 0.3632, "num_input_tokens_seen": 62324400, "step": 65255 }, { "epoch": 5.3234358430540825, "grad_norm": 5.3985395431518555, "learning_rate": 2.654054814893289e-05, "loss": 0.3485, "num_input_tokens_seen": 62329312, "step": 65260 }, { "epoch": 5.3238437066644915, "grad_norm": 4.944550514221191, "learning_rate": 2.6536995616070535e-05, "loss": 0.3027, "num_input_tokens_seen": 62334992, "step": 65265 }, { "epoch": 5.3242515702749005, "grad_norm": 2.967123031616211, "learning_rate": 2.6533443052053935e-05, "loss": 0.3135, "num_input_tokens_seen": 62340320, "step": 65270 }, { "epoch": 5.324659433885309, "grad_norm": 2.5952155590057373, "learning_rate": 2.65298904569551e-05, "loss": 0.2516, "num_input_tokens_seen": 62345232, "step": 65275 }, { "epoch": 5.325067297495718, "grad_norm": 4.492616653442383, "learning_rate": 2.652633783084603e-05, "loss": 0.3697, "num_input_tokens_seen": 62350240, "step": 65280 }, { "epoch": 5.325475161106126, "grad_norm": 2.4786994457244873, "learning_rate": 2.6522785173798735e-05, "loss": 0.3129, "num_input_tokens_seen": 62355056, "step": 65285 }, { "epoch": 5.325883024716535, "grad_norm": 5.732936859130859, "learning_rate": 2.6519232485885244e-05, "loss": 0.3438, "num_input_tokens_seen": 62359440, "step": 65290 }, { "epoch": 5.326290888326944, "grad_norm": 4.980266094207764, "learning_rate": 2.6515679767177554e-05, "loss": 0.3721, "num_input_tokens_seen": 62364752, "step": 65295 }, { "epoch": 5.326698751937352, "grad_norm": 2.0061442852020264, "learning_rate": 2.651212701774767e-05, "loss": 0.4762, "num_input_tokens_seen": 62369072, "step": 65300 }, { "epoch": 5.327106615547761, "grad_norm": 2.4544055461883545, "learning_rate": 2.6508574237667627e-05, "loss": 0.2937, "num_input_tokens_seen": 62373904, "step": 65305 }, { "epoch": 5.32751447915817, "grad_norm": 2.280686616897583, "learning_rate": 2.650502142700942e-05, "loss": 0.3192, "num_input_tokens_seen": 62377744, "step": 65310 }, { "epoch": 5.327922342768578, "grad_norm": 6.5505828857421875, "learning_rate": 2.650146858584507e-05, "loss": 0.3119, "num_input_tokens_seen": 62382816, "step": 65315 }, { "epoch": 5.328330206378987, "grad_norm": 2.9525227546691895, "learning_rate": 2.649791571424658e-05, "loss": 0.3466, "num_input_tokens_seen": 62387904, "step": 65320 }, { "epoch": 5.328738069989395, "grad_norm": 3.872097969055176, "learning_rate": 2.6494362812285983e-05, "loss": 0.4268, "num_input_tokens_seen": 62392896, "step": 65325 }, { "epoch": 5.329145933599804, "grad_norm": 2.9528746604919434, "learning_rate": 2.6490809880035284e-05, "loss": 0.301, "num_input_tokens_seen": 62397184, "step": 65330 }, { "epoch": 5.329553797210213, "grad_norm": 3.9346094131469727, "learning_rate": 2.6487256917566505e-05, "loss": 0.2785, "num_input_tokens_seen": 62401712, "step": 65335 }, { "epoch": 5.329961660820621, "grad_norm": 4.245405673980713, "learning_rate": 2.6483703924951654e-05, "loss": 0.3044, "num_input_tokens_seen": 62405920, "step": 65340 }, { "epoch": 5.33036952443103, "grad_norm": 2.9219210147857666, "learning_rate": 2.6480150902262762e-05, "loss": 0.312, "num_input_tokens_seen": 62410688, "step": 65345 }, { "epoch": 5.330777388041439, "grad_norm": 1.6336939334869385, "learning_rate": 2.647659784957184e-05, "loss": 0.3472, "num_input_tokens_seen": 62415488, "step": 65350 }, { "epoch": 5.331185251651847, "grad_norm": 4.683806896209717, "learning_rate": 2.64730447669509e-05, "loss": 0.3331, "num_input_tokens_seen": 62421040, "step": 65355 }, { "epoch": 5.331593115262256, "grad_norm": 6.586699485778809, "learning_rate": 2.6469491654471977e-05, "loss": 0.4584, "num_input_tokens_seen": 62426000, "step": 65360 }, { "epoch": 5.332000978872665, "grad_norm": 5.245700359344482, "learning_rate": 2.646593851220708e-05, "loss": 0.3422, "num_input_tokens_seen": 62430576, "step": 65365 }, { "epoch": 5.332408842483074, "grad_norm": 3.9204957485198975, "learning_rate": 2.646238534022823e-05, "loss": 0.3956, "num_input_tokens_seen": 62434992, "step": 65370 }, { "epoch": 5.332816706093483, "grad_norm": 7.197994709014893, "learning_rate": 2.6458832138607454e-05, "loss": 0.3562, "num_input_tokens_seen": 62439360, "step": 65375 }, { "epoch": 5.333224569703891, "grad_norm": 7.217317581176758, "learning_rate": 2.645527890741677e-05, "loss": 0.3366, "num_input_tokens_seen": 62445136, "step": 65380 }, { "epoch": 5.3336324333143, "grad_norm": 3.1684954166412354, "learning_rate": 2.6451725646728193e-05, "loss": 0.3748, "num_input_tokens_seen": 62450608, "step": 65385 }, { "epoch": 5.334040296924709, "grad_norm": 3.140350103378296, "learning_rate": 2.6448172356613765e-05, "loss": 0.3968, "num_input_tokens_seen": 62455936, "step": 65390 }, { "epoch": 5.334448160535117, "grad_norm": 1.1475248336791992, "learning_rate": 2.6444619037145494e-05, "loss": 0.3, "num_input_tokens_seen": 62461376, "step": 65395 }, { "epoch": 5.334856024145526, "grad_norm": 4.397027015686035, "learning_rate": 2.6441065688395407e-05, "loss": 0.3114, "num_input_tokens_seen": 62466592, "step": 65400 }, { "epoch": 5.335263887755934, "grad_norm": 4.22957706451416, "learning_rate": 2.643751231043553e-05, "loss": 0.278, "num_input_tokens_seen": 62471920, "step": 65405 }, { "epoch": 5.335671751366343, "grad_norm": 3.5856282711029053, "learning_rate": 2.643395890333789e-05, "loss": 0.4071, "num_input_tokens_seen": 62476976, "step": 65410 }, { "epoch": 5.336079614976752, "grad_norm": 2.419529914855957, "learning_rate": 2.6430405467174517e-05, "loss": 0.2335, "num_input_tokens_seen": 62481440, "step": 65415 }, { "epoch": 5.33648747858716, "grad_norm": 4.830348491668701, "learning_rate": 2.6426852002017427e-05, "loss": 0.4223, "num_input_tokens_seen": 62485792, "step": 65420 }, { "epoch": 5.336895342197569, "grad_norm": 0.8660995364189148, "learning_rate": 2.642329850793866e-05, "loss": 0.2968, "num_input_tokens_seen": 62491280, "step": 65425 }, { "epoch": 5.337303205807978, "grad_norm": 1.0461013317108154, "learning_rate": 2.641974498501023e-05, "loss": 0.4038, "num_input_tokens_seen": 62496320, "step": 65430 }, { "epoch": 5.337711069418386, "grad_norm": 3.517357587814331, "learning_rate": 2.6416191433304178e-05, "loss": 0.4137, "num_input_tokens_seen": 62500448, "step": 65435 }, { "epoch": 5.338118933028795, "grad_norm": 4.764317035675049, "learning_rate": 2.6412637852892524e-05, "loss": 0.2656, "num_input_tokens_seen": 62505232, "step": 65440 }, { "epoch": 5.338526796639204, "grad_norm": 4.713673114776611, "learning_rate": 2.6409084243847308e-05, "loss": 0.3576, "num_input_tokens_seen": 62509504, "step": 65445 }, { "epoch": 5.338934660249612, "grad_norm": 1.9054193496704102, "learning_rate": 2.6405530606240547e-05, "loss": 0.3085, "num_input_tokens_seen": 62514784, "step": 65450 }, { "epoch": 5.339342523860021, "grad_norm": 4.692177772521973, "learning_rate": 2.640197694014428e-05, "loss": 0.2507, "num_input_tokens_seen": 62519360, "step": 65455 }, { "epoch": 5.3397503874704295, "grad_norm": 2.635955333709717, "learning_rate": 2.6398423245630533e-05, "loss": 0.4162, "num_input_tokens_seen": 62523216, "step": 65460 }, { "epoch": 5.3401582510808385, "grad_norm": 2.252654790878296, "learning_rate": 2.6394869522771336e-05, "loss": 0.2582, "num_input_tokens_seen": 62527728, "step": 65465 }, { "epoch": 5.3405661146912475, "grad_norm": 3.8553709983825684, "learning_rate": 2.6391315771638735e-05, "loss": 0.4006, "num_input_tokens_seen": 62532448, "step": 65470 }, { "epoch": 5.340973978301656, "grad_norm": 2.8044326305389404, "learning_rate": 2.6387761992304754e-05, "loss": 0.3826, "num_input_tokens_seen": 62537600, "step": 65475 }, { "epoch": 5.341381841912065, "grad_norm": 2.2904350757598877, "learning_rate": 2.6384208184841426e-05, "loss": 0.3351, "num_input_tokens_seen": 62541920, "step": 65480 }, { "epoch": 5.341789705522474, "grad_norm": 3.3383169174194336, "learning_rate": 2.6380654349320782e-05, "loss": 0.3232, "num_input_tokens_seen": 62546576, "step": 65485 }, { "epoch": 5.342197569132882, "grad_norm": 4.064181804656982, "learning_rate": 2.6377100485814865e-05, "loss": 0.2772, "num_input_tokens_seen": 62550720, "step": 65490 }, { "epoch": 5.342605432743291, "grad_norm": 3.809302806854248, "learning_rate": 2.6373546594395708e-05, "loss": 0.451, "num_input_tokens_seen": 62555648, "step": 65495 }, { "epoch": 5.343013296353699, "grad_norm": 4.564980983734131, "learning_rate": 2.636999267513534e-05, "loss": 0.3051, "num_input_tokens_seen": 62560544, "step": 65500 }, { "epoch": 5.343421159964108, "grad_norm": 0.47036802768707275, "learning_rate": 2.6366438728105814e-05, "loss": 0.3054, "num_input_tokens_seen": 62565360, "step": 65505 }, { "epoch": 5.343829023574517, "grad_norm": 4.9784836769104, "learning_rate": 2.6362884753379148e-05, "loss": 0.3544, "num_input_tokens_seen": 62571408, "step": 65510 }, { "epoch": 5.344236887184925, "grad_norm": 1.3967335224151611, "learning_rate": 2.635933075102739e-05, "loss": 0.3089, "num_input_tokens_seen": 62575264, "step": 65515 }, { "epoch": 5.344644750795334, "grad_norm": 3.155034303665161, "learning_rate": 2.6355776721122567e-05, "loss": 0.3844, "num_input_tokens_seen": 62579792, "step": 65520 }, { "epoch": 5.345052614405743, "grad_norm": 1.184186577796936, "learning_rate": 2.635222266373673e-05, "loss": 0.3114, "num_input_tokens_seen": 62584464, "step": 65525 }, { "epoch": 5.345460478016151, "grad_norm": 1.5209659337997437, "learning_rate": 2.6348668578941917e-05, "loss": 0.288, "num_input_tokens_seen": 62589552, "step": 65530 }, { "epoch": 5.34586834162656, "grad_norm": 2.289583683013916, "learning_rate": 2.6345114466810162e-05, "loss": 0.3283, "num_input_tokens_seen": 62594336, "step": 65535 }, { "epoch": 5.346276205236968, "grad_norm": 1.2681853771209717, "learning_rate": 2.6341560327413507e-05, "loss": 0.3839, "num_input_tokens_seen": 62598848, "step": 65540 }, { "epoch": 5.346684068847377, "grad_norm": 4.7285590171813965, "learning_rate": 2.6338006160824e-05, "loss": 0.3137, "num_input_tokens_seen": 62604032, "step": 65545 }, { "epoch": 5.347091932457786, "grad_norm": 5.250614166259766, "learning_rate": 2.6334451967113677e-05, "loss": 0.2811, "num_input_tokens_seen": 62609024, "step": 65550 }, { "epoch": 5.3474997960681945, "grad_norm": 0.8519900441169739, "learning_rate": 2.633089774635457e-05, "loss": 0.3131, "num_input_tokens_seen": 62614960, "step": 65555 }, { "epoch": 5.3479076596786035, "grad_norm": 1.8364158868789673, "learning_rate": 2.632734349861874e-05, "loss": 0.2878, "num_input_tokens_seen": 62620112, "step": 65560 }, { "epoch": 5.3483155232890125, "grad_norm": 4.667142868041992, "learning_rate": 2.632378922397822e-05, "loss": 0.2838, "num_input_tokens_seen": 62625408, "step": 65565 }, { "epoch": 5.348723386899421, "grad_norm": 2.6787545680999756, "learning_rate": 2.632023492250505e-05, "loss": 0.2195, "num_input_tokens_seen": 62630160, "step": 65570 }, { "epoch": 5.34913125050983, "grad_norm": 5.00676965713501, "learning_rate": 2.6316680594271286e-05, "loss": 0.3018, "num_input_tokens_seen": 62634096, "step": 65575 }, { "epoch": 5.349539114120239, "grad_norm": 4.658806800842285, "learning_rate": 2.631312623934896e-05, "loss": 0.2644, "num_input_tokens_seen": 62639056, "step": 65580 }, { "epoch": 5.349946977730647, "grad_norm": 3.620760202407837, "learning_rate": 2.6309571857810127e-05, "loss": 0.3817, "num_input_tokens_seen": 62643520, "step": 65585 }, { "epoch": 5.350354841341056, "grad_norm": 6.298159599304199, "learning_rate": 2.6306017449726832e-05, "loss": 0.4961, "num_input_tokens_seen": 62647408, "step": 65590 }, { "epoch": 5.350762704951464, "grad_norm": 2.8948864936828613, "learning_rate": 2.630246301517111e-05, "loss": 0.3587, "num_input_tokens_seen": 62652256, "step": 65595 }, { "epoch": 5.351170568561873, "grad_norm": 1.7962292432785034, "learning_rate": 2.6298908554215025e-05, "loss": 0.3967, "num_input_tokens_seen": 62657104, "step": 65600 }, { "epoch": 5.351578432172282, "grad_norm": 1.4933522939682007, "learning_rate": 2.6295354066930612e-05, "loss": 0.2782, "num_input_tokens_seen": 62662032, "step": 65605 }, { "epoch": 5.35198629578269, "grad_norm": 2.8890042304992676, "learning_rate": 2.6291799553389913e-05, "loss": 0.3804, "num_input_tokens_seen": 62666496, "step": 65610 }, { "epoch": 5.352394159393099, "grad_norm": 5.153157711029053, "learning_rate": 2.6288245013664996e-05, "loss": 0.4627, "num_input_tokens_seen": 62671024, "step": 65615 }, { "epoch": 5.352802023003507, "grad_norm": 3.5756309032440186, "learning_rate": 2.6284690447827897e-05, "loss": 0.1583, "num_input_tokens_seen": 62675904, "step": 65620 }, { "epoch": 5.353209886613916, "grad_norm": 2.3664135932922363, "learning_rate": 2.6281135855950673e-05, "loss": 0.3606, "num_input_tokens_seen": 62681120, "step": 65625 }, { "epoch": 5.353617750224325, "grad_norm": 1.00822114944458, "learning_rate": 2.6277581238105358e-05, "loss": 0.27, "num_input_tokens_seen": 62685888, "step": 65630 }, { "epoch": 5.354025613834733, "grad_norm": 3.70334792137146, "learning_rate": 2.6274026594364026e-05, "loss": 0.3709, "num_input_tokens_seen": 62690752, "step": 65635 }, { "epoch": 5.354433477445142, "grad_norm": 0.6606283187866211, "learning_rate": 2.6270471924798713e-05, "loss": 0.3378, "num_input_tokens_seen": 62695184, "step": 65640 }, { "epoch": 5.354841341055551, "grad_norm": 4.505627632141113, "learning_rate": 2.6266917229481474e-05, "loss": 0.3001, "num_input_tokens_seen": 62700208, "step": 65645 }, { "epoch": 5.355249204665959, "grad_norm": 1.7067314386367798, "learning_rate": 2.6263362508484364e-05, "loss": 0.2865, "num_input_tokens_seen": 62704960, "step": 65650 }, { "epoch": 5.3556570682763684, "grad_norm": 7.441603660583496, "learning_rate": 2.625980776187943e-05, "loss": 0.415, "num_input_tokens_seen": 62710144, "step": 65655 }, { "epoch": 5.3560649318867775, "grad_norm": 6.82371711730957, "learning_rate": 2.6256252989738727e-05, "loss": 0.3855, "num_input_tokens_seen": 62714192, "step": 65660 }, { "epoch": 5.356472795497186, "grad_norm": 1.1392453908920288, "learning_rate": 2.6252698192134308e-05, "loss": 0.3317, "num_input_tokens_seen": 62719216, "step": 65665 }, { "epoch": 5.356880659107595, "grad_norm": 6.577495574951172, "learning_rate": 2.6249143369138236e-05, "loss": 0.4313, "num_input_tokens_seen": 62724464, "step": 65670 }, { "epoch": 5.357288522718003, "grad_norm": 5.7895355224609375, "learning_rate": 2.624558852082255e-05, "loss": 0.4184, "num_input_tokens_seen": 62729328, "step": 65675 }, { "epoch": 5.357696386328412, "grad_norm": 4.719120502471924, "learning_rate": 2.6242033647259316e-05, "loss": 0.3522, "num_input_tokens_seen": 62734464, "step": 65680 }, { "epoch": 5.358104249938821, "grad_norm": 3.438736915588379, "learning_rate": 2.623847874852059e-05, "loss": 0.3557, "num_input_tokens_seen": 62738608, "step": 65685 }, { "epoch": 5.358512113549229, "grad_norm": 4.348867416381836, "learning_rate": 2.6234923824678424e-05, "loss": 0.3536, "num_input_tokens_seen": 62743568, "step": 65690 }, { "epoch": 5.358919977159638, "grad_norm": 8.203655242919922, "learning_rate": 2.6231368875804878e-05, "loss": 0.4215, "num_input_tokens_seen": 62747952, "step": 65695 }, { "epoch": 5.359327840770047, "grad_norm": 2.2609524726867676, "learning_rate": 2.6227813901972008e-05, "loss": 0.2677, "num_input_tokens_seen": 62752176, "step": 65700 }, { "epoch": 5.359735704380455, "grad_norm": 2.8085873126983643, "learning_rate": 2.6224258903251873e-05, "loss": 0.3745, "num_input_tokens_seen": 62757664, "step": 65705 }, { "epoch": 5.360143567990864, "grad_norm": 4.569900989532471, "learning_rate": 2.622070387971653e-05, "loss": 0.27, "num_input_tokens_seen": 62763040, "step": 65710 }, { "epoch": 5.360551431601272, "grad_norm": 7.24866247177124, "learning_rate": 2.621714883143804e-05, "loss": 0.4139, "num_input_tokens_seen": 62767984, "step": 65715 }, { "epoch": 5.360959295211681, "grad_norm": 5.12598991394043, "learning_rate": 2.621359375848846e-05, "loss": 0.3467, "num_input_tokens_seen": 62773152, "step": 65720 }, { "epoch": 5.36136715882209, "grad_norm": 3.6966745853424072, "learning_rate": 2.621003866093985e-05, "loss": 0.3585, "num_input_tokens_seen": 62777520, "step": 65725 }, { "epoch": 5.361775022432498, "grad_norm": 0.6010119318962097, "learning_rate": 2.6206483538864262e-05, "loss": 0.3167, "num_input_tokens_seen": 62782592, "step": 65730 }, { "epoch": 5.362182886042907, "grad_norm": 4.3885369300842285, "learning_rate": 2.6202928392333775e-05, "loss": 0.2954, "num_input_tokens_seen": 62787536, "step": 65735 }, { "epoch": 5.362590749653316, "grad_norm": 1.5407516956329346, "learning_rate": 2.6199373221420438e-05, "loss": 0.4519, "num_input_tokens_seen": 62792336, "step": 65740 }, { "epoch": 5.362998613263724, "grad_norm": 3.6974081993103027, "learning_rate": 2.6195818026196317e-05, "loss": 0.3045, "num_input_tokens_seen": 62798384, "step": 65745 }, { "epoch": 5.363406476874133, "grad_norm": 1.285204291343689, "learning_rate": 2.6192262806733474e-05, "loss": 0.2895, "num_input_tokens_seen": 62802768, "step": 65750 }, { "epoch": 5.3638143404845415, "grad_norm": 2.209988832473755, "learning_rate": 2.6188707563103964e-05, "loss": 0.32, "num_input_tokens_seen": 62807168, "step": 65755 }, { "epoch": 5.3642222040949505, "grad_norm": 4.140518665313721, "learning_rate": 2.618515229537986e-05, "loss": 0.4335, "num_input_tokens_seen": 62811824, "step": 65760 }, { "epoch": 5.3646300677053596, "grad_norm": 5.084580898284912, "learning_rate": 2.6181597003633218e-05, "loss": 0.2749, "num_input_tokens_seen": 62817168, "step": 65765 }, { "epoch": 5.365037931315768, "grad_norm": 1.0424985885620117, "learning_rate": 2.6178041687936115e-05, "loss": 0.3442, "num_input_tokens_seen": 62822528, "step": 65770 }, { "epoch": 5.365445794926177, "grad_norm": 4.740236282348633, "learning_rate": 2.6174486348360606e-05, "loss": 0.3912, "num_input_tokens_seen": 62826656, "step": 65775 }, { "epoch": 5.365853658536586, "grad_norm": 4.177439212799072, "learning_rate": 2.6170930984978754e-05, "loss": 0.3492, "num_input_tokens_seen": 62831360, "step": 65780 }, { "epoch": 5.366261522146994, "grad_norm": 1.199803113937378, "learning_rate": 2.6167375597862632e-05, "loss": 0.3427, "num_input_tokens_seen": 62835584, "step": 65785 }, { "epoch": 5.366669385757403, "grad_norm": 1.3964900970458984, "learning_rate": 2.6163820187084302e-05, "loss": 0.2277, "num_input_tokens_seen": 62840752, "step": 65790 }, { "epoch": 5.367077249367812, "grad_norm": 4.7530927658081055, "learning_rate": 2.616026475271583e-05, "loss": 0.3343, "num_input_tokens_seen": 62845440, "step": 65795 }, { "epoch": 5.36748511297822, "grad_norm": 6.986172676086426, "learning_rate": 2.6156709294829283e-05, "loss": 0.4352, "num_input_tokens_seen": 62850784, "step": 65800 }, { "epoch": 5.367892976588629, "grad_norm": 0.9953150153160095, "learning_rate": 2.6153153813496738e-05, "loss": 0.2639, "num_input_tokens_seen": 62856352, "step": 65805 }, { "epoch": 5.368300840199037, "grad_norm": 8.176281929016113, "learning_rate": 2.6149598308790248e-05, "loss": 0.3937, "num_input_tokens_seen": 62860720, "step": 65810 }, { "epoch": 5.368708703809446, "grad_norm": 4.137895107269287, "learning_rate": 2.6146042780781894e-05, "loss": 0.3301, "num_input_tokens_seen": 62865968, "step": 65815 }, { "epoch": 5.369116567419855, "grad_norm": 1.134118914604187, "learning_rate": 2.6142487229543727e-05, "loss": 0.3925, "num_input_tokens_seen": 62870432, "step": 65820 }, { "epoch": 5.369524431030263, "grad_norm": 6.511207103729248, "learning_rate": 2.613893165514784e-05, "loss": 0.3918, "num_input_tokens_seen": 62875184, "step": 65825 }, { "epoch": 5.369932294640672, "grad_norm": 4.071086883544922, "learning_rate": 2.613537605766629e-05, "loss": 0.3701, "num_input_tokens_seen": 62879552, "step": 65830 }, { "epoch": 5.370340158251081, "grad_norm": 4.441572666168213, "learning_rate": 2.613182043717115e-05, "loss": 0.4123, "num_input_tokens_seen": 62884352, "step": 65835 }, { "epoch": 5.370748021861489, "grad_norm": 4.733458995819092, "learning_rate": 2.612826479373449e-05, "loss": 0.4464, "num_input_tokens_seen": 62889760, "step": 65840 }, { "epoch": 5.371155885471898, "grad_norm": 7.7827229499816895, "learning_rate": 2.6124709127428387e-05, "loss": 0.2624, "num_input_tokens_seen": 62893936, "step": 65845 }, { "epoch": 5.3715637490823065, "grad_norm": 2.874605178833008, "learning_rate": 2.61211534383249e-05, "loss": 0.3505, "num_input_tokens_seen": 62898496, "step": 65850 }, { "epoch": 5.3719716126927155, "grad_norm": 6.18162727355957, "learning_rate": 2.6117597726496117e-05, "loss": 0.3525, "num_input_tokens_seen": 62902912, "step": 65855 }, { "epoch": 5.3723794763031245, "grad_norm": 2.7735466957092285, "learning_rate": 2.6114041992014105e-05, "loss": 0.2709, "num_input_tokens_seen": 62907168, "step": 65860 }, { "epoch": 5.372787339913533, "grad_norm": 7.483617782592773, "learning_rate": 2.6110486234950926e-05, "loss": 0.3826, "num_input_tokens_seen": 62912208, "step": 65865 }, { "epoch": 5.373195203523942, "grad_norm": 5.768033504486084, "learning_rate": 2.610693045537867e-05, "loss": 0.3279, "num_input_tokens_seen": 62917728, "step": 65870 }, { "epoch": 5.373603067134351, "grad_norm": 5.068927764892578, "learning_rate": 2.6103374653369395e-05, "loss": 0.3577, "num_input_tokens_seen": 62922096, "step": 65875 }, { "epoch": 5.374010930744759, "grad_norm": 3.8032357692718506, "learning_rate": 2.6099818828995194e-05, "loss": 0.301, "num_input_tokens_seen": 62926928, "step": 65880 }, { "epoch": 5.374418794355168, "grad_norm": 3.364025354385376, "learning_rate": 2.6096262982328123e-05, "loss": 0.3403, "num_input_tokens_seen": 62930544, "step": 65885 }, { "epoch": 5.374826657965576, "grad_norm": 10.612336158752441, "learning_rate": 2.6092707113440274e-05, "loss": 0.3032, "num_input_tokens_seen": 62935488, "step": 65890 }, { "epoch": 5.375234521575985, "grad_norm": 4.085610389709473, "learning_rate": 2.6089151222403713e-05, "loss": 0.3123, "num_input_tokens_seen": 62940512, "step": 65895 }, { "epoch": 5.375642385186394, "grad_norm": 3.5693531036376953, "learning_rate": 2.6085595309290523e-05, "loss": 0.3527, "num_input_tokens_seen": 62945552, "step": 65900 }, { "epoch": 5.376050248796802, "grad_norm": 1.3936805725097656, "learning_rate": 2.6082039374172778e-05, "loss": 0.3542, "num_input_tokens_seen": 62950000, "step": 65905 }, { "epoch": 5.376458112407211, "grad_norm": 2.0324718952178955, "learning_rate": 2.6078483417122552e-05, "loss": 0.372, "num_input_tokens_seen": 62955584, "step": 65910 }, { "epoch": 5.37686597601762, "grad_norm": 1.204992651939392, "learning_rate": 2.6074927438211927e-05, "loss": 0.4251, "num_input_tokens_seen": 62961440, "step": 65915 }, { "epoch": 5.377273839628028, "grad_norm": 1.1671963930130005, "learning_rate": 2.607137143751298e-05, "loss": 0.3337, "num_input_tokens_seen": 62965808, "step": 65920 }, { "epoch": 5.377681703238437, "grad_norm": 2.1205711364746094, "learning_rate": 2.6067815415097785e-05, "loss": 0.4346, "num_input_tokens_seen": 62970480, "step": 65925 }, { "epoch": 5.378089566848846, "grad_norm": 3.2035815715789795, "learning_rate": 2.606425937103843e-05, "loss": 0.2973, "num_input_tokens_seen": 62975280, "step": 65930 }, { "epoch": 5.378497430459254, "grad_norm": 3.533628463745117, "learning_rate": 2.6060703305406993e-05, "loss": 0.3563, "num_input_tokens_seen": 62980432, "step": 65935 }, { "epoch": 5.378905294069663, "grad_norm": 2.181501626968384, "learning_rate": 2.605714721827554e-05, "loss": 0.3505, "num_input_tokens_seen": 62985744, "step": 65940 }, { "epoch": 5.3793131576800715, "grad_norm": 6.1937127113342285, "learning_rate": 2.605359110971617e-05, "loss": 0.3393, "num_input_tokens_seen": 62991200, "step": 65945 }, { "epoch": 5.3797210212904805, "grad_norm": 3.0843515396118164, "learning_rate": 2.6050034979800953e-05, "loss": 0.3883, "num_input_tokens_seen": 62996704, "step": 65950 }, { "epoch": 5.3801288849008895, "grad_norm": 3.8877809047698975, "learning_rate": 2.6046478828601973e-05, "loss": 0.2689, "num_input_tokens_seen": 63002400, "step": 65955 }, { "epoch": 5.380536748511298, "grad_norm": 2.582319736480713, "learning_rate": 2.6042922656191316e-05, "loss": 0.3107, "num_input_tokens_seen": 63007200, "step": 65960 }, { "epoch": 5.380944612121707, "grad_norm": 4.248570442199707, "learning_rate": 2.603936646264105e-05, "loss": 0.2821, "num_input_tokens_seen": 63012496, "step": 65965 }, { "epoch": 5.381352475732115, "grad_norm": 6.25634241104126, "learning_rate": 2.6035810248023274e-05, "loss": 0.2997, "num_input_tokens_seen": 63016672, "step": 65970 }, { "epoch": 5.381760339342524, "grad_norm": 3.483797788619995, "learning_rate": 2.603225401241006e-05, "loss": 0.3311, "num_input_tokens_seen": 63022320, "step": 65975 }, { "epoch": 5.382168202952933, "grad_norm": 2.393472671508789, "learning_rate": 2.6028697755873505e-05, "loss": 0.3138, "num_input_tokens_seen": 63027616, "step": 65980 }, { "epoch": 5.382576066563341, "grad_norm": 6.648128032684326, "learning_rate": 2.6025141478485678e-05, "loss": 0.4675, "num_input_tokens_seen": 63032432, "step": 65985 }, { "epoch": 5.38298393017375, "grad_norm": 2.4466392993927, "learning_rate": 2.6021585180318668e-05, "loss": 0.2911, "num_input_tokens_seen": 63038112, "step": 65990 }, { "epoch": 5.383391793784159, "grad_norm": 0.5386646389961243, "learning_rate": 2.6018028861444566e-05, "loss": 0.2777, "num_input_tokens_seen": 63042224, "step": 65995 }, { "epoch": 5.383799657394567, "grad_norm": 4.986451148986816, "learning_rate": 2.601447252193545e-05, "loss": 0.3957, "num_input_tokens_seen": 63046640, "step": 66000 }, { "epoch": 5.384207521004976, "grad_norm": 2.8786962032318115, "learning_rate": 2.6010916161863408e-05, "loss": 0.2284, "num_input_tokens_seen": 63051664, "step": 66005 }, { "epoch": 5.384615384615385, "grad_norm": 2.5521435737609863, "learning_rate": 2.6007359781300516e-05, "loss": 0.3289, "num_input_tokens_seen": 63056592, "step": 66010 }, { "epoch": 5.385023248225793, "grad_norm": 4.542372703552246, "learning_rate": 2.6003803380318876e-05, "loss": 0.3108, "num_input_tokens_seen": 63061104, "step": 66015 }, { "epoch": 5.385431111836202, "grad_norm": 2.13167142868042, "learning_rate": 2.6000246958990565e-05, "loss": 0.2994, "num_input_tokens_seen": 63065744, "step": 66020 }, { "epoch": 5.38583897544661, "grad_norm": 7.343207359313965, "learning_rate": 2.5996690517387678e-05, "loss": 0.3473, "num_input_tokens_seen": 63069888, "step": 66025 }, { "epoch": 5.386246839057019, "grad_norm": 0.6397114992141724, "learning_rate": 2.5993134055582292e-05, "loss": 0.2456, "num_input_tokens_seen": 63074384, "step": 66030 }, { "epoch": 5.386654702667428, "grad_norm": 2.9453859329223633, "learning_rate": 2.5989577573646507e-05, "loss": 0.2198, "num_input_tokens_seen": 63079408, "step": 66035 }, { "epoch": 5.387062566277836, "grad_norm": 6.719377040863037, "learning_rate": 2.5986021071652393e-05, "loss": 0.3565, "num_input_tokens_seen": 63083504, "step": 66040 }, { "epoch": 5.387470429888245, "grad_norm": 6.502688884735107, "learning_rate": 2.5982464549672065e-05, "loss": 0.3501, "num_input_tokens_seen": 63088736, "step": 66045 }, { "epoch": 5.387878293498654, "grad_norm": 5.548454284667969, "learning_rate": 2.597890800777759e-05, "loss": 0.3453, "num_input_tokens_seen": 63093776, "step": 66050 }, { "epoch": 5.388286157109063, "grad_norm": 2.8514058589935303, "learning_rate": 2.597535144604107e-05, "loss": 0.2397, "num_input_tokens_seen": 63098848, "step": 66055 }, { "epoch": 5.388694020719472, "grad_norm": 2.979100227355957, "learning_rate": 2.5971794864534587e-05, "loss": 0.4692, "num_input_tokens_seen": 63104096, "step": 66060 }, { "epoch": 5.38910188432988, "grad_norm": 4.291395664215088, "learning_rate": 2.596823826333024e-05, "loss": 0.4015, "num_input_tokens_seen": 63109168, "step": 66065 }, { "epoch": 5.389509747940289, "grad_norm": 4.139344215393066, "learning_rate": 2.5964681642500115e-05, "loss": 0.3359, "num_input_tokens_seen": 63114288, "step": 66070 }, { "epoch": 5.389917611550698, "grad_norm": 2.988595485687256, "learning_rate": 2.5961125002116298e-05, "loss": 0.3011, "num_input_tokens_seen": 63118560, "step": 66075 }, { "epoch": 5.390325475161106, "grad_norm": 1.2285902500152588, "learning_rate": 2.595756834225089e-05, "loss": 0.3956, "num_input_tokens_seen": 63123472, "step": 66080 }, { "epoch": 5.390733338771515, "grad_norm": 8.792069435119629, "learning_rate": 2.595401166297597e-05, "loss": 0.3369, "num_input_tokens_seen": 63129056, "step": 66085 }, { "epoch": 5.391141202381924, "grad_norm": 12.482776641845703, "learning_rate": 2.5950454964363645e-05, "loss": 0.3504, "num_input_tokens_seen": 63133616, "step": 66090 }, { "epoch": 5.391549065992332, "grad_norm": 6.692117691040039, "learning_rate": 2.5946898246486e-05, "loss": 0.2497, "num_input_tokens_seen": 63138128, "step": 66095 }, { "epoch": 5.391956929602741, "grad_norm": 1.456749439239502, "learning_rate": 2.5943341509415137e-05, "loss": 0.4564, "num_input_tokens_seen": 63143008, "step": 66100 }, { "epoch": 5.392364793213149, "grad_norm": 10.600716590881348, "learning_rate": 2.593978475322314e-05, "loss": 0.3745, "num_input_tokens_seen": 63148080, "step": 66105 }, { "epoch": 5.392772656823558, "grad_norm": 0.7668575048446655, "learning_rate": 2.5936227977982098e-05, "loss": 0.2292, "num_input_tokens_seen": 63153280, "step": 66110 }, { "epoch": 5.393180520433967, "grad_norm": 1.184862494468689, "learning_rate": 2.5932671183764117e-05, "loss": 0.3083, "num_input_tokens_seen": 63157472, "step": 66115 }, { "epoch": 5.393588384044375, "grad_norm": 1.7496849298477173, "learning_rate": 2.5929114370641283e-05, "loss": 0.3215, "num_input_tokens_seen": 63160944, "step": 66120 }, { "epoch": 5.393996247654784, "grad_norm": 7.37935209274292, "learning_rate": 2.59255575386857e-05, "loss": 0.3289, "num_input_tokens_seen": 63165456, "step": 66125 }, { "epoch": 5.394404111265193, "grad_norm": 3.1251721382141113, "learning_rate": 2.5922000687969467e-05, "loss": 0.3724, "num_input_tokens_seen": 63170864, "step": 66130 }, { "epoch": 5.394811974875601, "grad_norm": 8.136841773986816, "learning_rate": 2.5918443818564665e-05, "loss": 0.2761, "num_input_tokens_seen": 63175216, "step": 66135 }, { "epoch": 5.39521983848601, "grad_norm": 5.848969459533691, "learning_rate": 2.5914886930543392e-05, "loss": 0.3138, "num_input_tokens_seen": 63180928, "step": 66140 }, { "epoch": 5.395627702096419, "grad_norm": 8.260095596313477, "learning_rate": 2.5911330023977754e-05, "loss": 0.3578, "num_input_tokens_seen": 63185280, "step": 66145 }, { "epoch": 5.3960355657068275, "grad_norm": 3.3597567081451416, "learning_rate": 2.590777309893984e-05, "loss": 0.3978, "num_input_tokens_seen": 63189840, "step": 66150 }, { "epoch": 5.3964434293172365, "grad_norm": 8.60145092010498, "learning_rate": 2.590421615550176e-05, "loss": 0.3175, "num_input_tokens_seen": 63193776, "step": 66155 }, { "epoch": 5.396851292927645, "grad_norm": 1.2470015287399292, "learning_rate": 2.59006591937356e-05, "loss": 0.2761, "num_input_tokens_seen": 63198768, "step": 66160 }, { "epoch": 5.397259156538054, "grad_norm": 9.357428550720215, "learning_rate": 2.589710221371345e-05, "loss": 0.2429, "num_input_tokens_seen": 63203712, "step": 66165 }, { "epoch": 5.397667020148463, "grad_norm": 0.5800327062606812, "learning_rate": 2.589354521550743e-05, "loss": 0.3077, "num_input_tokens_seen": 63208176, "step": 66170 }, { "epoch": 5.398074883758871, "grad_norm": 0.5501692891120911, "learning_rate": 2.5889988199189625e-05, "loss": 0.2543, "num_input_tokens_seen": 63213712, "step": 66175 }, { "epoch": 5.39848274736928, "grad_norm": 9.676135063171387, "learning_rate": 2.5886431164832135e-05, "loss": 0.3011, "num_input_tokens_seen": 63218464, "step": 66180 }, { "epoch": 5.398890610979688, "grad_norm": 0.9729948043823242, "learning_rate": 2.5882874112507065e-05, "loss": 0.3019, "num_input_tokens_seen": 63223936, "step": 66185 }, { "epoch": 5.399298474590097, "grad_norm": 15.419934272766113, "learning_rate": 2.5879317042286512e-05, "loss": 0.2996, "num_input_tokens_seen": 63227584, "step": 66190 }, { "epoch": 5.399706338200506, "grad_norm": 5.018681526184082, "learning_rate": 2.587575995424258e-05, "loss": 0.5551, "num_input_tokens_seen": 63232560, "step": 66195 }, { "epoch": 5.400114201810914, "grad_norm": 0.42107537388801575, "learning_rate": 2.5872202848447362e-05, "loss": 0.4411, "num_input_tokens_seen": 63237136, "step": 66200 }, { "epoch": 5.400522065421323, "grad_norm": 7.516005992889404, "learning_rate": 2.5868645724972967e-05, "loss": 0.3645, "num_input_tokens_seen": 63241680, "step": 66205 }, { "epoch": 5.400929929031732, "grad_norm": 5.000680923461914, "learning_rate": 2.5865088583891484e-05, "loss": 0.4488, "num_input_tokens_seen": 63246992, "step": 66210 }, { "epoch": 5.40133779264214, "grad_norm": 0.5764078497886658, "learning_rate": 2.5861531425275027e-05, "loss": 0.3801, "num_input_tokens_seen": 63251536, "step": 66215 }, { "epoch": 5.401745656252549, "grad_norm": 6.641988754272461, "learning_rate": 2.5857974249195695e-05, "loss": 0.5109, "num_input_tokens_seen": 63255840, "step": 66220 }, { "epoch": 5.402153519862958, "grad_norm": 0.6334072351455688, "learning_rate": 2.5854417055725587e-05, "loss": 0.256, "num_input_tokens_seen": 63260816, "step": 66225 }, { "epoch": 5.402561383473366, "grad_norm": 8.603106498718262, "learning_rate": 2.585085984493681e-05, "loss": 0.2498, "num_input_tokens_seen": 63264672, "step": 66230 }, { "epoch": 5.402969247083775, "grad_norm": 11.43634033203125, "learning_rate": 2.5847302616901466e-05, "loss": 0.3777, "num_input_tokens_seen": 63269264, "step": 66235 }, { "epoch": 5.4033771106941835, "grad_norm": 7.892618179321289, "learning_rate": 2.584374537169165e-05, "loss": 0.419, "num_input_tokens_seen": 63273728, "step": 66240 }, { "epoch": 5.4037849743045925, "grad_norm": 1.989015817642212, "learning_rate": 2.5840188109379486e-05, "loss": 0.4261, "num_input_tokens_seen": 63278224, "step": 66245 }, { "epoch": 5.4041928379150015, "grad_norm": 6.576587677001953, "learning_rate": 2.5836630830037056e-05, "loss": 0.3884, "num_input_tokens_seen": 63282832, "step": 66250 }, { "epoch": 5.40460070152541, "grad_norm": 7.091525077819824, "learning_rate": 2.5833073533736485e-05, "loss": 0.3239, "num_input_tokens_seen": 63287968, "step": 66255 }, { "epoch": 5.405008565135819, "grad_norm": 0.6280041933059692, "learning_rate": 2.5829516220549864e-05, "loss": 0.4775, "num_input_tokens_seen": 63292928, "step": 66260 }, { "epoch": 5.405416428746228, "grad_norm": 2.631857395172119, "learning_rate": 2.58259588905493e-05, "loss": 0.2858, "num_input_tokens_seen": 63297264, "step": 66265 }, { "epoch": 5.405824292356636, "grad_norm": 0.5894966125488281, "learning_rate": 2.58224015438069e-05, "loss": 0.2937, "num_input_tokens_seen": 63301856, "step": 66270 }, { "epoch": 5.406232155967045, "grad_norm": 2.636906862258911, "learning_rate": 2.581884418039477e-05, "loss": 0.3601, "num_input_tokens_seen": 63306784, "step": 66275 }, { "epoch": 5.406640019577454, "grad_norm": 13.485602378845215, "learning_rate": 2.581528680038502e-05, "loss": 0.472, "num_input_tokens_seen": 63311488, "step": 66280 }, { "epoch": 5.407047883187862, "grad_norm": 3.059555768966675, "learning_rate": 2.5811729403849742e-05, "loss": 0.3076, "num_input_tokens_seen": 63316768, "step": 66285 }, { "epoch": 5.407455746798271, "grad_norm": 3.520730495452881, "learning_rate": 2.5808171990861068e-05, "loss": 0.4057, "num_input_tokens_seen": 63320400, "step": 66290 }, { "epoch": 5.407863610408679, "grad_norm": 7.289705276489258, "learning_rate": 2.5804614561491082e-05, "loss": 0.3622, "num_input_tokens_seen": 63324208, "step": 66295 }, { "epoch": 5.408271474019088, "grad_norm": 9.093647956848145, "learning_rate": 2.5801057115811906e-05, "loss": 0.2988, "num_input_tokens_seen": 63329136, "step": 66300 }, { "epoch": 5.408679337629497, "grad_norm": 0.7039731740951538, "learning_rate": 2.579749965389564e-05, "loss": 0.2413, "num_input_tokens_seen": 63333344, "step": 66305 }, { "epoch": 5.409087201239905, "grad_norm": 4.970104217529297, "learning_rate": 2.5793942175814396e-05, "loss": 0.4167, "num_input_tokens_seen": 63338080, "step": 66310 }, { "epoch": 5.409495064850314, "grad_norm": 1.809016466140747, "learning_rate": 2.5790384681640277e-05, "loss": 0.3154, "num_input_tokens_seen": 63342512, "step": 66315 }, { "epoch": 5.409902928460722, "grad_norm": 6.398077487945557, "learning_rate": 2.57868271714454e-05, "loss": 0.2526, "num_input_tokens_seen": 63346864, "step": 66320 }, { "epoch": 5.410310792071131, "grad_norm": 5.2117743492126465, "learning_rate": 2.5783269645301872e-05, "loss": 0.4566, "num_input_tokens_seen": 63351712, "step": 66325 }, { "epoch": 5.41071865568154, "grad_norm": 5.741287708282471, "learning_rate": 2.57797121032818e-05, "loss": 0.4488, "num_input_tokens_seen": 63357040, "step": 66330 }, { "epoch": 5.411126519291948, "grad_norm": 6.526323318481445, "learning_rate": 2.577615454545729e-05, "loss": 0.2761, "num_input_tokens_seen": 63361312, "step": 66335 }, { "epoch": 5.411534382902357, "grad_norm": 2.1821980476379395, "learning_rate": 2.5772596971900464e-05, "loss": 0.3082, "num_input_tokens_seen": 63365760, "step": 66340 }, { "epoch": 5.4119422465127665, "grad_norm": 4.88969087600708, "learning_rate": 2.576903938268343e-05, "loss": 0.3152, "num_input_tokens_seen": 63370032, "step": 66345 }, { "epoch": 5.412350110123175, "grad_norm": 9.166566848754883, "learning_rate": 2.576548177787828e-05, "loss": 0.3546, "num_input_tokens_seen": 63374560, "step": 66350 }, { "epoch": 5.412757973733584, "grad_norm": 1.031874656677246, "learning_rate": 2.5761924157557155e-05, "loss": 0.3573, "num_input_tokens_seen": 63379968, "step": 66355 }, { "epoch": 5.413165837343993, "grad_norm": 1.2518755197525024, "learning_rate": 2.5758366521792144e-05, "loss": 0.2347, "num_input_tokens_seen": 63385296, "step": 66360 }, { "epoch": 5.413573700954401, "grad_norm": 1.783541202545166, "learning_rate": 2.5754808870655363e-05, "loss": 0.3456, "num_input_tokens_seen": 63389248, "step": 66365 }, { "epoch": 5.41398156456481, "grad_norm": 6.361268520355225, "learning_rate": 2.575125120421893e-05, "loss": 0.4802, "num_input_tokens_seen": 63394416, "step": 66370 }, { "epoch": 5.414389428175218, "grad_norm": 2.1921396255493164, "learning_rate": 2.574769352255495e-05, "loss": 0.4069, "num_input_tokens_seen": 63399552, "step": 66375 }, { "epoch": 5.414797291785627, "grad_norm": 7.74777889251709, "learning_rate": 2.574413582573555e-05, "loss": 0.2953, "num_input_tokens_seen": 63403952, "step": 66380 }, { "epoch": 5.415205155396036, "grad_norm": 1.1250958442687988, "learning_rate": 2.574057811383282e-05, "loss": 0.4022, "num_input_tokens_seen": 63407760, "step": 66385 }, { "epoch": 5.415613019006444, "grad_norm": 2.592496633529663, "learning_rate": 2.5737020386918896e-05, "loss": 0.3672, "num_input_tokens_seen": 63412864, "step": 66390 }, { "epoch": 5.416020882616853, "grad_norm": 5.695375919342041, "learning_rate": 2.5733462645065876e-05, "loss": 0.2822, "num_input_tokens_seen": 63416752, "step": 66395 }, { "epoch": 5.416428746227262, "grad_norm": 1.3376834392547607, "learning_rate": 2.5729904888345885e-05, "loss": 0.2936, "num_input_tokens_seen": 63421920, "step": 66400 }, { "epoch": 5.41683660983767, "grad_norm": 5.1226115226745605, "learning_rate": 2.5726347116831034e-05, "loss": 0.4274, "num_input_tokens_seen": 63425984, "step": 66405 }, { "epoch": 5.417244473448079, "grad_norm": 4.913010120391846, "learning_rate": 2.5722789330593433e-05, "loss": 0.3421, "num_input_tokens_seen": 63430384, "step": 66410 }, { "epoch": 5.417652337058487, "grad_norm": 4.9166059494018555, "learning_rate": 2.57192315297052e-05, "loss": 0.3496, "num_input_tokens_seen": 63435024, "step": 66415 }, { "epoch": 5.418060200668896, "grad_norm": 0.7257733941078186, "learning_rate": 2.5715673714238443e-05, "loss": 0.2101, "num_input_tokens_seen": 63439776, "step": 66420 }, { "epoch": 5.418468064279305, "grad_norm": 2.2867743968963623, "learning_rate": 2.5712115884265292e-05, "loss": 0.2753, "num_input_tokens_seen": 63444336, "step": 66425 }, { "epoch": 5.418875927889713, "grad_norm": 4.287849426269531, "learning_rate": 2.5708558039857845e-05, "loss": 0.3838, "num_input_tokens_seen": 63448672, "step": 66430 }, { "epoch": 5.419283791500122, "grad_norm": 1.7255321741104126, "learning_rate": 2.5705000181088233e-05, "loss": 0.3715, "num_input_tokens_seen": 63452224, "step": 66435 }, { "epoch": 5.419691655110531, "grad_norm": 3.219944715499878, "learning_rate": 2.570144230802857e-05, "loss": 0.3886, "num_input_tokens_seen": 63456800, "step": 66440 }, { "epoch": 5.4200995187209395, "grad_norm": 1.7363883256912231, "learning_rate": 2.5697884420750968e-05, "loss": 0.3417, "num_input_tokens_seen": 63462016, "step": 66445 }, { "epoch": 5.4205073823313485, "grad_norm": 8.524985313415527, "learning_rate": 2.5694326519327538e-05, "loss": 0.3012, "num_input_tokens_seen": 63467120, "step": 66450 }, { "epoch": 5.420915245941757, "grad_norm": 1.033011555671692, "learning_rate": 2.569076860383042e-05, "loss": 0.4172, "num_input_tokens_seen": 63472576, "step": 66455 }, { "epoch": 5.421323109552166, "grad_norm": 3.3200111389160156, "learning_rate": 2.5687210674331702e-05, "loss": 0.3573, "num_input_tokens_seen": 63477424, "step": 66460 }, { "epoch": 5.421730973162575, "grad_norm": 5.969977378845215, "learning_rate": 2.568365273090352e-05, "loss": 0.3974, "num_input_tokens_seen": 63481600, "step": 66465 }, { "epoch": 5.422138836772983, "grad_norm": 4.973843574523926, "learning_rate": 2.568009477361799e-05, "loss": 0.4196, "num_input_tokens_seen": 63486368, "step": 66470 }, { "epoch": 5.422546700383392, "grad_norm": 7.551977157592773, "learning_rate": 2.567653680254723e-05, "loss": 0.2905, "num_input_tokens_seen": 63491184, "step": 66475 }, { "epoch": 5.422954563993801, "grad_norm": 1.577623724937439, "learning_rate": 2.5672978817763354e-05, "loss": 0.2996, "num_input_tokens_seen": 63495888, "step": 66480 }, { "epoch": 5.423362427604209, "grad_norm": 6.350038051605225, "learning_rate": 2.5669420819338473e-05, "loss": 0.439, "num_input_tokens_seen": 63500320, "step": 66485 }, { "epoch": 5.423770291214618, "grad_norm": 7.095316410064697, "learning_rate": 2.566586280734473e-05, "loss": 0.4143, "num_input_tokens_seen": 63505056, "step": 66490 }, { "epoch": 5.424178154825027, "grad_norm": 2.29290771484375, "learning_rate": 2.5662304781854224e-05, "loss": 0.3587, "num_input_tokens_seen": 63510096, "step": 66495 }, { "epoch": 5.424586018435435, "grad_norm": 3.0943186283111572, "learning_rate": 2.5658746742939088e-05, "loss": 0.3381, "num_input_tokens_seen": 63514912, "step": 66500 }, { "epoch": 5.424993882045844, "grad_norm": 1.6713402271270752, "learning_rate": 2.5655188690671433e-05, "loss": 0.2941, "num_input_tokens_seen": 63520560, "step": 66505 }, { "epoch": 5.425401745656252, "grad_norm": 3.2830801010131836, "learning_rate": 2.5651630625123375e-05, "loss": 0.2688, "num_input_tokens_seen": 63525440, "step": 66510 }, { "epoch": 5.425809609266661, "grad_norm": 5.775928020477295, "learning_rate": 2.564807254636705e-05, "loss": 0.2556, "num_input_tokens_seen": 63530464, "step": 66515 }, { "epoch": 5.42621747287707, "grad_norm": 3.9460110664367676, "learning_rate": 2.5644514454474562e-05, "loss": 0.3843, "num_input_tokens_seen": 63534912, "step": 66520 }, { "epoch": 5.426625336487478, "grad_norm": 0.7679042220115662, "learning_rate": 2.5640956349518043e-05, "loss": 0.2791, "num_input_tokens_seen": 63539616, "step": 66525 }, { "epoch": 5.427033200097887, "grad_norm": 4.9059062004089355, "learning_rate": 2.563739823156961e-05, "loss": 0.3351, "num_input_tokens_seen": 63544784, "step": 66530 }, { "epoch": 5.4274410637082955, "grad_norm": 2.348177909851074, "learning_rate": 2.563384010070139e-05, "loss": 0.4066, "num_input_tokens_seen": 63549360, "step": 66535 }, { "epoch": 5.4278489273187045, "grad_norm": 8.143789291381836, "learning_rate": 2.5630281956985497e-05, "loss": 0.3605, "num_input_tokens_seen": 63554240, "step": 66540 }, { "epoch": 5.4282567909291135, "grad_norm": 2.9636332988739014, "learning_rate": 2.5626723800494063e-05, "loss": 0.278, "num_input_tokens_seen": 63559136, "step": 66545 }, { "epoch": 5.428664654539522, "grad_norm": 3.9074199199676514, "learning_rate": 2.5623165631299195e-05, "loss": 0.3139, "num_input_tokens_seen": 63563792, "step": 66550 }, { "epoch": 5.429072518149931, "grad_norm": 4.022609233856201, "learning_rate": 2.5619607449473026e-05, "loss": 0.3194, "num_input_tokens_seen": 63568448, "step": 66555 }, { "epoch": 5.42948038176034, "grad_norm": 0.8048085570335388, "learning_rate": 2.5616049255087683e-05, "loss": 0.2943, "num_input_tokens_seen": 63572688, "step": 66560 }, { "epoch": 5.429888245370748, "grad_norm": 1.0748995542526245, "learning_rate": 2.5612491048215275e-05, "loss": 0.3279, "num_input_tokens_seen": 63577712, "step": 66565 }, { "epoch": 5.430296108981157, "grad_norm": 5.692739009857178, "learning_rate": 2.560893282892794e-05, "loss": 0.3394, "num_input_tokens_seen": 63581712, "step": 66570 }, { "epoch": 5.430703972591566, "grad_norm": 4.332658290863037, "learning_rate": 2.5605374597297795e-05, "loss": 0.2875, "num_input_tokens_seen": 63586240, "step": 66575 }, { "epoch": 5.431111836201974, "grad_norm": 2.3522939682006836, "learning_rate": 2.5601816353396962e-05, "loss": 0.1978, "num_input_tokens_seen": 63590944, "step": 66580 }, { "epoch": 5.431519699812383, "grad_norm": 2.0324695110321045, "learning_rate": 2.559825809729756e-05, "loss": 0.2243, "num_input_tokens_seen": 63595680, "step": 66585 }, { "epoch": 5.431927563422791, "grad_norm": 1.1841788291931152, "learning_rate": 2.5594699829071734e-05, "loss": 0.4973, "num_input_tokens_seen": 63600016, "step": 66590 }, { "epoch": 5.4323354270332, "grad_norm": 1.8723863363265991, "learning_rate": 2.5591141548791588e-05, "loss": 0.2722, "num_input_tokens_seen": 63604736, "step": 66595 }, { "epoch": 5.432743290643609, "grad_norm": 0.7805678248405457, "learning_rate": 2.5587583256529257e-05, "loss": 0.2794, "num_input_tokens_seen": 63609184, "step": 66600 }, { "epoch": 5.433151154254017, "grad_norm": 2.1198952198028564, "learning_rate": 2.5584024952356866e-05, "loss": 0.3379, "num_input_tokens_seen": 63613856, "step": 66605 }, { "epoch": 5.433559017864426, "grad_norm": 4.067028522491455, "learning_rate": 2.5580466636346533e-05, "loss": 0.4304, "num_input_tokens_seen": 63619200, "step": 66610 }, { "epoch": 5.433966881474835, "grad_norm": 8.157546997070312, "learning_rate": 2.557690830857039e-05, "loss": 0.3048, "num_input_tokens_seen": 63624672, "step": 66615 }, { "epoch": 5.434374745085243, "grad_norm": 4.18731689453125, "learning_rate": 2.5573349969100558e-05, "loss": 0.356, "num_input_tokens_seen": 63628800, "step": 66620 }, { "epoch": 5.434782608695652, "grad_norm": 3.533022880554199, "learning_rate": 2.5569791618009166e-05, "loss": 0.2918, "num_input_tokens_seen": 63633776, "step": 66625 }, { "epoch": 5.4351904723060604, "grad_norm": 1.7086089849472046, "learning_rate": 2.5566233255368338e-05, "loss": 0.3943, "num_input_tokens_seen": 63638224, "step": 66630 }, { "epoch": 5.4355983359164695, "grad_norm": 3.421560764312744, "learning_rate": 2.556267488125021e-05, "loss": 0.3435, "num_input_tokens_seen": 63643344, "step": 66635 }, { "epoch": 5.4360061995268785, "grad_norm": 6.08819580078125, "learning_rate": 2.5559116495726887e-05, "loss": 0.3765, "num_input_tokens_seen": 63648224, "step": 66640 }, { "epoch": 5.436414063137287, "grad_norm": 7.007836818695068, "learning_rate": 2.5555558098870525e-05, "loss": 0.3499, "num_input_tokens_seen": 63652368, "step": 66645 }, { "epoch": 5.436821926747696, "grad_norm": 2.502901554107666, "learning_rate": 2.5551999690753226e-05, "loss": 0.2562, "num_input_tokens_seen": 63656784, "step": 66650 }, { "epoch": 5.437229790358105, "grad_norm": 1.6012779474258423, "learning_rate": 2.5548441271447137e-05, "loss": 0.3116, "num_input_tokens_seen": 63662288, "step": 66655 }, { "epoch": 5.437637653968513, "grad_norm": 0.5487096309661865, "learning_rate": 2.5544882841024374e-05, "loss": 0.2594, "num_input_tokens_seen": 63667008, "step": 66660 }, { "epoch": 5.438045517578922, "grad_norm": 5.962356090545654, "learning_rate": 2.554132439955706e-05, "loss": 0.1785, "num_input_tokens_seen": 63671856, "step": 66665 }, { "epoch": 5.43845338118933, "grad_norm": 1.2533162832260132, "learning_rate": 2.5537765947117336e-05, "loss": 0.4134, "num_input_tokens_seen": 63676448, "step": 66670 }, { "epoch": 5.438861244799739, "grad_norm": 1.0794119834899902, "learning_rate": 2.5534207483777328e-05, "loss": 0.3358, "num_input_tokens_seen": 63681264, "step": 66675 }, { "epoch": 5.439269108410148, "grad_norm": 9.382620811462402, "learning_rate": 2.553064900960916e-05, "loss": 0.6318, "num_input_tokens_seen": 63685760, "step": 66680 }, { "epoch": 5.439676972020556, "grad_norm": 4.451755046844482, "learning_rate": 2.552709052468496e-05, "loss": 0.3287, "num_input_tokens_seen": 63691216, "step": 66685 }, { "epoch": 5.440084835630965, "grad_norm": 3.0404484272003174, "learning_rate": 2.5523532029076864e-05, "loss": 0.3119, "num_input_tokens_seen": 63696496, "step": 66690 }, { "epoch": 5.440492699241374, "grad_norm": 3.720553159713745, "learning_rate": 2.551997352285698e-05, "loss": 0.5022, "num_input_tokens_seen": 63701520, "step": 66695 }, { "epoch": 5.440900562851782, "grad_norm": 3.1707923412323, "learning_rate": 2.551641500609747e-05, "loss": 0.2421, "num_input_tokens_seen": 63705968, "step": 66700 }, { "epoch": 5.441308426462191, "grad_norm": 9.596963882446289, "learning_rate": 2.5512856478870443e-05, "loss": 0.4828, "num_input_tokens_seen": 63711184, "step": 66705 }, { "epoch": 5.4417162900726, "grad_norm": 2.835294485092163, "learning_rate": 2.5509297941248034e-05, "loss": 0.4137, "num_input_tokens_seen": 63715120, "step": 66710 }, { "epoch": 5.442124153683008, "grad_norm": 1.9128994941711426, "learning_rate": 2.550573939330237e-05, "loss": 0.3185, "num_input_tokens_seen": 63720496, "step": 66715 }, { "epoch": 5.442532017293417, "grad_norm": 1.1086407899856567, "learning_rate": 2.550218083510558e-05, "loss": 0.3642, "num_input_tokens_seen": 63725072, "step": 66720 }, { "epoch": 5.442939880903825, "grad_norm": 4.911913871765137, "learning_rate": 2.5498622266729805e-05, "loss": 0.2734, "num_input_tokens_seen": 63730336, "step": 66725 }, { "epoch": 5.443347744514234, "grad_norm": 1.3791011571884155, "learning_rate": 2.5495063688247163e-05, "loss": 0.3536, "num_input_tokens_seen": 63734912, "step": 66730 }, { "epoch": 5.443755608124643, "grad_norm": 10.004592895507812, "learning_rate": 2.5491505099729797e-05, "loss": 0.3633, "num_input_tokens_seen": 63740160, "step": 66735 }, { "epoch": 5.4441634717350516, "grad_norm": 0.9177548289299011, "learning_rate": 2.5487946501249822e-05, "loss": 0.4336, "num_input_tokens_seen": 63745200, "step": 66740 }, { "epoch": 5.444571335345461, "grad_norm": 3.823497772216797, "learning_rate": 2.5484387892879385e-05, "loss": 0.3005, "num_input_tokens_seen": 63750288, "step": 66745 }, { "epoch": 5.44497919895587, "grad_norm": 6.365275859832764, "learning_rate": 2.5480829274690614e-05, "loss": 0.364, "num_input_tokens_seen": 63754624, "step": 66750 }, { "epoch": 5.445387062566278, "grad_norm": 4.530864715576172, "learning_rate": 2.5477270646755634e-05, "loss": 0.3241, "num_input_tokens_seen": 63758384, "step": 66755 }, { "epoch": 5.445794926176687, "grad_norm": 6.137711524963379, "learning_rate": 2.5473712009146583e-05, "loss": 0.2615, "num_input_tokens_seen": 63763184, "step": 66760 }, { "epoch": 5.446202789787095, "grad_norm": 9.077417373657227, "learning_rate": 2.547015336193559e-05, "loss": 0.4177, "num_input_tokens_seen": 63768784, "step": 66765 }, { "epoch": 5.446610653397504, "grad_norm": 3.8313632011413574, "learning_rate": 2.5466594705194785e-05, "loss": 0.3717, "num_input_tokens_seen": 63773568, "step": 66770 }, { "epoch": 5.447018517007913, "grad_norm": 8.352352142333984, "learning_rate": 2.54630360389963e-05, "loss": 0.3261, "num_input_tokens_seen": 63777888, "step": 66775 }, { "epoch": 5.447426380618321, "grad_norm": 4.4051103591918945, "learning_rate": 2.5459477363412282e-05, "loss": 0.3489, "num_input_tokens_seen": 63782736, "step": 66780 }, { "epoch": 5.44783424422873, "grad_norm": 4.3340020179748535, "learning_rate": 2.5455918678514845e-05, "loss": 0.3279, "num_input_tokens_seen": 63787488, "step": 66785 }, { "epoch": 5.448242107839139, "grad_norm": 4.7493085861206055, "learning_rate": 2.5452359984376133e-05, "loss": 0.2605, "num_input_tokens_seen": 63791888, "step": 66790 }, { "epoch": 5.448649971449547, "grad_norm": 2.813800096511841, "learning_rate": 2.5448801281068275e-05, "loss": 0.2989, "num_input_tokens_seen": 63796128, "step": 66795 }, { "epoch": 5.449057835059956, "grad_norm": 1.2332165241241455, "learning_rate": 2.544524256866341e-05, "loss": 0.4601, "num_input_tokens_seen": 63800608, "step": 66800 }, { "epoch": 5.449465698670364, "grad_norm": 2.2128353118896484, "learning_rate": 2.544168384723366e-05, "loss": 0.3696, "num_input_tokens_seen": 63805072, "step": 66805 }, { "epoch": 5.449873562280773, "grad_norm": 7.417605400085449, "learning_rate": 2.5438125116851175e-05, "loss": 0.323, "num_input_tokens_seen": 63809888, "step": 66810 }, { "epoch": 5.450281425891182, "grad_norm": 5.738135814666748, "learning_rate": 2.5434566377588076e-05, "loss": 0.3611, "num_input_tokens_seen": 63814576, "step": 66815 }, { "epoch": 5.45068928950159, "grad_norm": 1.6388108730316162, "learning_rate": 2.5431007629516503e-05, "loss": 0.3438, "num_input_tokens_seen": 63818576, "step": 66820 }, { "epoch": 5.451097153111999, "grad_norm": 3.7699623107910156, "learning_rate": 2.5427448872708587e-05, "loss": 0.3068, "num_input_tokens_seen": 63823392, "step": 66825 }, { "epoch": 5.451505016722408, "grad_norm": 1.9967964887619019, "learning_rate": 2.542389010723646e-05, "loss": 0.237, "num_input_tokens_seen": 63828752, "step": 66830 }, { "epoch": 5.4519128803328165, "grad_norm": 0.8963451981544495, "learning_rate": 2.5420331333172264e-05, "loss": 0.392, "num_input_tokens_seen": 63833168, "step": 66835 }, { "epoch": 5.4523207439432255, "grad_norm": 2.920471668243408, "learning_rate": 2.541677255058813e-05, "loss": 0.4015, "num_input_tokens_seen": 63837696, "step": 66840 }, { "epoch": 5.4527286075536345, "grad_norm": 1.2768535614013672, "learning_rate": 2.5413213759556197e-05, "loss": 0.2802, "num_input_tokens_seen": 63842688, "step": 66845 }, { "epoch": 5.453136471164043, "grad_norm": 4.62457275390625, "learning_rate": 2.540965496014859e-05, "loss": 0.3312, "num_input_tokens_seen": 63847808, "step": 66850 }, { "epoch": 5.453544334774452, "grad_norm": 4.195269584655762, "learning_rate": 2.5406096152437453e-05, "loss": 0.4223, "num_input_tokens_seen": 63852240, "step": 66855 }, { "epoch": 5.45395219838486, "grad_norm": 2.1180830001831055, "learning_rate": 2.5402537336494925e-05, "loss": 0.3505, "num_input_tokens_seen": 63857520, "step": 66860 }, { "epoch": 5.454360061995269, "grad_norm": 6.666465759277344, "learning_rate": 2.539897851239313e-05, "loss": 0.3367, "num_input_tokens_seen": 63862560, "step": 66865 }, { "epoch": 5.454767925605678, "grad_norm": 1.4646676778793335, "learning_rate": 2.5395419680204214e-05, "loss": 0.3946, "num_input_tokens_seen": 63867984, "step": 66870 }, { "epoch": 5.455175789216086, "grad_norm": 2.646998882293701, "learning_rate": 2.5391860840000304e-05, "loss": 0.3859, "num_input_tokens_seen": 63872240, "step": 66875 }, { "epoch": 5.455583652826495, "grad_norm": 1.903769850730896, "learning_rate": 2.5388301991853546e-05, "loss": 0.3421, "num_input_tokens_seen": 63876736, "step": 66880 }, { "epoch": 5.455991516436903, "grad_norm": 1.5953365564346313, "learning_rate": 2.5384743135836077e-05, "loss": 0.3269, "num_input_tokens_seen": 63881456, "step": 66885 }, { "epoch": 5.456399380047312, "grad_norm": 3.513174295425415, "learning_rate": 2.538118427202002e-05, "loss": 0.3956, "num_input_tokens_seen": 63886128, "step": 66890 }, { "epoch": 5.456807243657721, "grad_norm": 9.195717811584473, "learning_rate": 2.5377625400477517e-05, "loss": 0.3464, "num_input_tokens_seen": 63890384, "step": 66895 }, { "epoch": 5.457215107268129, "grad_norm": 0.8953927755355835, "learning_rate": 2.5374066521280713e-05, "loss": 0.3064, "num_input_tokens_seen": 63895472, "step": 66900 }, { "epoch": 5.457622970878538, "grad_norm": 1.775068998336792, "learning_rate": 2.5370507634501738e-05, "loss": 0.346, "num_input_tokens_seen": 63899456, "step": 66905 }, { "epoch": 5.458030834488947, "grad_norm": 3.284313917160034, "learning_rate": 2.5366948740212732e-05, "loss": 0.3103, "num_input_tokens_seen": 63903280, "step": 66910 }, { "epoch": 5.458438698099355, "grad_norm": 9.76848316192627, "learning_rate": 2.5363389838485834e-05, "loss": 0.3188, "num_input_tokens_seen": 63908272, "step": 66915 }, { "epoch": 5.458846561709764, "grad_norm": 3.1299326419830322, "learning_rate": 2.535983092939317e-05, "loss": 0.3021, "num_input_tokens_seen": 63911968, "step": 66920 }, { "epoch": 5.459254425320173, "grad_norm": 9.097023010253906, "learning_rate": 2.5356272013006892e-05, "loss": 0.298, "num_input_tokens_seen": 63917312, "step": 66925 }, { "epoch": 5.4596622889305815, "grad_norm": 4.416143894195557, "learning_rate": 2.5352713089399128e-05, "loss": 0.3625, "num_input_tokens_seen": 63921312, "step": 66930 }, { "epoch": 5.4600701525409905, "grad_norm": 3.326206684112549, "learning_rate": 2.5349154158642026e-05, "loss": 0.3869, "num_input_tokens_seen": 63927392, "step": 66935 }, { "epoch": 5.460478016151399, "grad_norm": 8.038920402526855, "learning_rate": 2.534559522080771e-05, "loss": 0.3294, "num_input_tokens_seen": 63932464, "step": 66940 }, { "epoch": 5.460885879761808, "grad_norm": 2.271993398666382, "learning_rate": 2.534203627596833e-05, "loss": 0.37, "num_input_tokens_seen": 63937408, "step": 66945 }, { "epoch": 5.461293743372217, "grad_norm": 1.2543237209320068, "learning_rate": 2.5338477324196018e-05, "loss": 0.2865, "num_input_tokens_seen": 63941776, "step": 66950 }, { "epoch": 5.461701606982625, "grad_norm": 4.066614627838135, "learning_rate": 2.533491836556292e-05, "loss": 0.3507, "num_input_tokens_seen": 63947136, "step": 66955 }, { "epoch": 5.462109470593034, "grad_norm": 3.915776491165161, "learning_rate": 2.533135940014117e-05, "loss": 0.31, "num_input_tokens_seen": 63952032, "step": 66960 }, { "epoch": 5.462517334203443, "grad_norm": 2.9722416400909424, "learning_rate": 2.5327800428002897e-05, "loss": 0.3478, "num_input_tokens_seen": 63957536, "step": 66965 }, { "epoch": 5.462925197813851, "grad_norm": 8.066269874572754, "learning_rate": 2.5324241449220254e-05, "loss": 0.323, "num_input_tokens_seen": 63962704, "step": 66970 }, { "epoch": 5.46333306142426, "grad_norm": 1.2847527265548706, "learning_rate": 2.532068246386537e-05, "loss": 0.2776, "num_input_tokens_seen": 63967792, "step": 66975 }, { "epoch": 5.463740925034668, "grad_norm": 3.442166328430176, "learning_rate": 2.531712347201039e-05, "loss": 0.3531, "num_input_tokens_seen": 63972032, "step": 66980 }, { "epoch": 5.464148788645077, "grad_norm": 3.4772582054138184, "learning_rate": 2.531356447372745e-05, "loss": 0.3826, "num_input_tokens_seen": 63975840, "step": 66985 }, { "epoch": 5.464556652255486, "grad_norm": 5.091733932495117, "learning_rate": 2.53100054690887e-05, "loss": 0.3116, "num_input_tokens_seen": 63980544, "step": 66990 }, { "epoch": 5.464964515865894, "grad_norm": 2.804842710494995, "learning_rate": 2.5306446458166262e-05, "loss": 0.3829, "num_input_tokens_seen": 63985728, "step": 66995 }, { "epoch": 5.465372379476303, "grad_norm": 14.356965065002441, "learning_rate": 2.5302887441032287e-05, "loss": 0.467, "num_input_tokens_seen": 63991136, "step": 67000 }, { "epoch": 5.465780243086712, "grad_norm": 3.9663374423980713, "learning_rate": 2.529932841775891e-05, "loss": 0.3619, "num_input_tokens_seen": 63994704, "step": 67005 }, { "epoch": 5.46618810669712, "grad_norm": 7.1361541748046875, "learning_rate": 2.529576938841828e-05, "loss": 0.3208, "num_input_tokens_seen": 63999328, "step": 67010 }, { "epoch": 5.466595970307529, "grad_norm": 4.02170467376709, "learning_rate": 2.529221035308253e-05, "loss": 0.3718, "num_input_tokens_seen": 64003856, "step": 67015 }, { "epoch": 5.467003833917937, "grad_norm": 3.8326432704925537, "learning_rate": 2.5288651311823795e-05, "loss": 0.4404, "num_input_tokens_seen": 64008272, "step": 67020 }, { "epoch": 5.467411697528346, "grad_norm": 1.4063388109207153, "learning_rate": 2.5285092264714223e-05, "loss": 0.3899, "num_input_tokens_seen": 64012880, "step": 67025 }, { "epoch": 5.4678195611387554, "grad_norm": 0.7871237397193909, "learning_rate": 2.5281533211825954e-05, "loss": 0.3075, "num_input_tokens_seen": 64017952, "step": 67030 }, { "epoch": 5.468227424749164, "grad_norm": 2.752816677093506, "learning_rate": 2.527797415323112e-05, "loss": 0.2891, "num_input_tokens_seen": 64022496, "step": 67035 }, { "epoch": 5.468635288359573, "grad_norm": 1.2973726987838745, "learning_rate": 2.5274415089001873e-05, "loss": 0.3587, "num_input_tokens_seen": 64027392, "step": 67040 }, { "epoch": 5.469043151969982, "grad_norm": 3.355839729309082, "learning_rate": 2.5270856019210347e-05, "loss": 0.3516, "num_input_tokens_seen": 64032672, "step": 67045 }, { "epoch": 5.46945101558039, "grad_norm": 1.3275741338729858, "learning_rate": 2.5267296943928685e-05, "loss": 0.3331, "num_input_tokens_seen": 64037232, "step": 67050 }, { "epoch": 5.469858879190799, "grad_norm": 2.3149008750915527, "learning_rate": 2.526373786322903e-05, "loss": 0.3109, "num_input_tokens_seen": 64042544, "step": 67055 }, { "epoch": 5.470266742801208, "grad_norm": 2.201634168624878, "learning_rate": 2.526017877718352e-05, "loss": 0.3348, "num_input_tokens_seen": 64047296, "step": 67060 }, { "epoch": 5.470674606411616, "grad_norm": 2.279067277908325, "learning_rate": 2.5256619685864292e-05, "loss": 0.3812, "num_input_tokens_seen": 64052480, "step": 67065 }, { "epoch": 5.471082470022025, "grad_norm": 6.012924671173096, "learning_rate": 2.5253060589343496e-05, "loss": 0.3264, "num_input_tokens_seen": 64057536, "step": 67070 }, { "epoch": 5.471490333632433, "grad_norm": 2.5914320945739746, "learning_rate": 2.5249501487693265e-05, "loss": 0.3377, "num_input_tokens_seen": 64061664, "step": 67075 }, { "epoch": 5.471898197242842, "grad_norm": 4.179947853088379, "learning_rate": 2.524594238098575e-05, "loss": 0.3701, "num_input_tokens_seen": 64065648, "step": 67080 }, { "epoch": 5.472306060853251, "grad_norm": 3.7231650352478027, "learning_rate": 2.5242383269293086e-05, "loss": 0.3481, "num_input_tokens_seen": 64070304, "step": 67085 }, { "epoch": 5.472713924463659, "grad_norm": 4.0666375160217285, "learning_rate": 2.5238824152687418e-05, "loss": 0.2761, "num_input_tokens_seen": 64074240, "step": 67090 }, { "epoch": 5.473121788074068, "grad_norm": 1.5258784294128418, "learning_rate": 2.5235265031240885e-05, "loss": 0.4156, "num_input_tokens_seen": 64079008, "step": 67095 }, { "epoch": 5.473529651684476, "grad_norm": 4.54345703125, "learning_rate": 2.5231705905025625e-05, "loss": 0.3221, "num_input_tokens_seen": 64084160, "step": 67100 }, { "epoch": 5.473937515294885, "grad_norm": 2.5374011993408203, "learning_rate": 2.5228146774113787e-05, "loss": 0.3008, "num_input_tokens_seen": 64089376, "step": 67105 }, { "epoch": 5.474345378905294, "grad_norm": 0.6864334344863892, "learning_rate": 2.522458763857752e-05, "loss": 0.2515, "num_input_tokens_seen": 64095104, "step": 67110 }, { "epoch": 5.474753242515702, "grad_norm": 1.5734541416168213, "learning_rate": 2.5221028498488947e-05, "loss": 0.3827, "num_input_tokens_seen": 64100128, "step": 67115 }, { "epoch": 5.475161106126111, "grad_norm": 7.310810089111328, "learning_rate": 2.521746935392022e-05, "loss": 0.3797, "num_input_tokens_seen": 64105568, "step": 67120 }, { "epoch": 5.47556896973652, "grad_norm": 5.433363914489746, "learning_rate": 2.5213910204943487e-05, "loss": 0.3926, "num_input_tokens_seen": 64110256, "step": 67125 }, { "epoch": 5.4759768333469285, "grad_norm": 5.173296928405762, "learning_rate": 2.5210351051630877e-05, "loss": 0.3642, "num_input_tokens_seen": 64115376, "step": 67130 }, { "epoch": 5.4763846969573375, "grad_norm": 4.197601795196533, "learning_rate": 2.5206791894054547e-05, "loss": 0.3427, "num_input_tokens_seen": 64120384, "step": 67135 }, { "epoch": 5.4767925605677465, "grad_norm": 0.922336220741272, "learning_rate": 2.520323273228663e-05, "loss": 0.4109, "num_input_tokens_seen": 64124816, "step": 67140 }, { "epoch": 5.477200424178155, "grad_norm": 5.951833248138428, "learning_rate": 2.5199673566399278e-05, "loss": 0.3858, "num_input_tokens_seen": 64130144, "step": 67145 }, { "epoch": 5.477608287788564, "grad_norm": 5.422423362731934, "learning_rate": 2.5196114396464622e-05, "loss": 0.4182, "num_input_tokens_seen": 64134992, "step": 67150 }, { "epoch": 5.478016151398972, "grad_norm": 1.936557412147522, "learning_rate": 2.519255522255481e-05, "loss": 0.2612, "num_input_tokens_seen": 64140144, "step": 67155 }, { "epoch": 5.478424015009381, "grad_norm": 0.8604207634925842, "learning_rate": 2.5188996044741993e-05, "loss": 0.3243, "num_input_tokens_seen": 64145536, "step": 67160 }, { "epoch": 5.47883187861979, "grad_norm": 1.0430352687835693, "learning_rate": 2.51854368630983e-05, "loss": 0.3269, "num_input_tokens_seen": 64150048, "step": 67165 }, { "epoch": 5.479239742230198, "grad_norm": 3.2008426189422607, "learning_rate": 2.5181877677695886e-05, "loss": 0.3743, "num_input_tokens_seen": 64154576, "step": 67170 }, { "epoch": 5.479647605840607, "grad_norm": 3.804727792739868, "learning_rate": 2.5178318488606878e-05, "loss": 0.4144, "num_input_tokens_seen": 64158880, "step": 67175 }, { "epoch": 5.480055469451016, "grad_norm": 1.6002529859542847, "learning_rate": 2.5174759295903438e-05, "loss": 0.3301, "num_input_tokens_seen": 64163056, "step": 67180 }, { "epoch": 5.480463333061424, "grad_norm": 3.3182408809661865, "learning_rate": 2.51712000996577e-05, "loss": 0.286, "num_input_tokens_seen": 64167888, "step": 67185 }, { "epoch": 5.480871196671833, "grad_norm": 0.8412723541259766, "learning_rate": 2.516764089994181e-05, "loss": 0.3185, "num_input_tokens_seen": 64172624, "step": 67190 }, { "epoch": 5.481279060282241, "grad_norm": 1.4224658012390137, "learning_rate": 2.51640816968279e-05, "loss": 0.335, "num_input_tokens_seen": 64177920, "step": 67195 }, { "epoch": 5.48168692389265, "grad_norm": 2.022789478302002, "learning_rate": 2.516052249038814e-05, "loss": 0.3101, "num_input_tokens_seen": 64183408, "step": 67200 }, { "epoch": 5.482094787503059, "grad_norm": 2.2899389266967773, "learning_rate": 2.5156963280694645e-05, "loss": 0.2993, "num_input_tokens_seen": 64187872, "step": 67205 }, { "epoch": 5.482502651113467, "grad_norm": 3.2571840286254883, "learning_rate": 2.5153404067819576e-05, "loss": 0.3154, "num_input_tokens_seen": 64193008, "step": 67210 }, { "epoch": 5.482910514723876, "grad_norm": 3.2241036891937256, "learning_rate": 2.5149844851835077e-05, "loss": 0.403, "num_input_tokens_seen": 64197504, "step": 67215 }, { "epoch": 5.483318378334285, "grad_norm": 1.7259875535964966, "learning_rate": 2.5146285632813272e-05, "loss": 0.3064, "num_input_tokens_seen": 64202480, "step": 67220 }, { "epoch": 5.4837262419446935, "grad_norm": 4.191437721252441, "learning_rate": 2.5142726410826335e-05, "loss": 0.3728, "num_input_tokens_seen": 64207552, "step": 67225 }, { "epoch": 5.4841341055551025, "grad_norm": 2.9968955516815186, "learning_rate": 2.513916718594639e-05, "loss": 0.3804, "num_input_tokens_seen": 64212240, "step": 67230 }, { "epoch": 5.484541969165511, "grad_norm": 3.266693592071533, "learning_rate": 2.5135607958245584e-05, "loss": 0.4096, "num_input_tokens_seen": 64218032, "step": 67235 }, { "epoch": 5.48494983277592, "grad_norm": 1.5104684829711914, "learning_rate": 2.5132048727796052e-05, "loss": 0.3782, "num_input_tokens_seen": 64222640, "step": 67240 }, { "epoch": 5.485357696386329, "grad_norm": 11.53739070892334, "learning_rate": 2.512848949466996e-05, "loss": 0.352, "num_input_tokens_seen": 64227872, "step": 67245 }, { "epoch": 5.485765559996737, "grad_norm": 1.0043233633041382, "learning_rate": 2.512493025893944e-05, "loss": 0.2742, "num_input_tokens_seen": 64233008, "step": 67250 }, { "epoch": 5.486173423607146, "grad_norm": 2.0300025939941406, "learning_rate": 2.512137102067663e-05, "loss": 0.3469, "num_input_tokens_seen": 64237776, "step": 67255 }, { "epoch": 5.486581287217555, "grad_norm": 0.8666377663612366, "learning_rate": 2.5117811779953693e-05, "loss": 0.3372, "num_input_tokens_seen": 64242960, "step": 67260 }, { "epoch": 5.486989150827963, "grad_norm": 1.3178390264511108, "learning_rate": 2.5114252536842746e-05, "loss": 0.376, "num_input_tokens_seen": 64247664, "step": 67265 }, { "epoch": 5.487397014438372, "grad_norm": 0.880743682384491, "learning_rate": 2.511069329141596e-05, "loss": 0.3308, "num_input_tokens_seen": 64253216, "step": 67270 }, { "epoch": 5.487804878048781, "grad_norm": 4.196314811706543, "learning_rate": 2.5107134043745462e-05, "loss": 0.3269, "num_input_tokens_seen": 64257584, "step": 67275 }, { "epoch": 5.488212741659189, "grad_norm": 1.5747536420822144, "learning_rate": 2.5103574793903406e-05, "loss": 0.3393, "num_input_tokens_seen": 64262128, "step": 67280 }, { "epoch": 5.488620605269598, "grad_norm": 5.516620635986328, "learning_rate": 2.510001554196193e-05, "loss": 0.3351, "num_input_tokens_seen": 64267552, "step": 67285 }, { "epoch": 5.489028468880006, "grad_norm": 2.3940236568450928, "learning_rate": 2.5096456287993186e-05, "loss": 0.3343, "num_input_tokens_seen": 64272160, "step": 67290 }, { "epoch": 5.489436332490415, "grad_norm": 1.8357285261154175, "learning_rate": 2.509289703206931e-05, "loss": 0.3109, "num_input_tokens_seen": 64276928, "step": 67295 }, { "epoch": 5.489844196100824, "grad_norm": 4.687271595001221, "learning_rate": 2.508933777426246e-05, "loss": 0.3574, "num_input_tokens_seen": 64281664, "step": 67300 }, { "epoch": 5.490252059711232, "grad_norm": 5.271717071533203, "learning_rate": 2.508577851464476e-05, "loss": 0.3327, "num_input_tokens_seen": 64286080, "step": 67305 }, { "epoch": 5.490659923321641, "grad_norm": 5.484164714813232, "learning_rate": 2.5082219253288368e-05, "loss": 0.3622, "num_input_tokens_seen": 64290768, "step": 67310 }, { "epoch": 5.49106778693205, "grad_norm": 1.4949227571487427, "learning_rate": 2.5078659990265435e-05, "loss": 0.3263, "num_input_tokens_seen": 64294720, "step": 67315 }, { "epoch": 5.4914756505424585, "grad_norm": 5.0165815353393555, "learning_rate": 2.5075100725648088e-05, "loss": 0.3708, "num_input_tokens_seen": 64299600, "step": 67320 }, { "epoch": 5.4918835141528675, "grad_norm": 0.5406811237335205, "learning_rate": 2.5071541459508487e-05, "loss": 0.3028, "num_input_tokens_seen": 64304864, "step": 67325 }, { "epoch": 5.492291377763276, "grad_norm": 2.4508023262023926, "learning_rate": 2.506798219191876e-05, "loss": 0.3225, "num_input_tokens_seen": 64309792, "step": 67330 }, { "epoch": 5.492699241373685, "grad_norm": 3.4908862113952637, "learning_rate": 2.5064422922951074e-05, "loss": 0.3786, "num_input_tokens_seen": 64314912, "step": 67335 }, { "epoch": 5.493107104984094, "grad_norm": 0.859298050403595, "learning_rate": 2.5060863652677556e-05, "loss": 0.2945, "num_input_tokens_seen": 64319264, "step": 67340 }, { "epoch": 5.493514968594502, "grad_norm": 4.999946594238281, "learning_rate": 2.505730438117036e-05, "loss": 0.3203, "num_input_tokens_seen": 64324320, "step": 67345 }, { "epoch": 5.493922832204911, "grad_norm": 0.6135810613632202, "learning_rate": 2.505374510850163e-05, "loss": 0.3439, "num_input_tokens_seen": 64328944, "step": 67350 }, { "epoch": 5.49433069581532, "grad_norm": 2.2704832553863525, "learning_rate": 2.505018583474351e-05, "loss": 0.3743, "num_input_tokens_seen": 64334032, "step": 67355 }, { "epoch": 5.494738559425728, "grad_norm": 4.156241416931152, "learning_rate": 2.5046626559968145e-05, "loss": 0.3942, "num_input_tokens_seen": 64338672, "step": 67360 }, { "epoch": 5.495146423036137, "grad_norm": 1.3181489706039429, "learning_rate": 2.504306728424768e-05, "loss": 0.3638, "num_input_tokens_seen": 64342832, "step": 67365 }, { "epoch": 5.495554286646545, "grad_norm": 3.0672242641448975, "learning_rate": 2.5039508007654262e-05, "loss": 0.3143, "num_input_tokens_seen": 64347696, "step": 67370 }, { "epoch": 5.495962150256954, "grad_norm": 1.084063172340393, "learning_rate": 2.5035948730260023e-05, "loss": 0.2676, "num_input_tokens_seen": 64351856, "step": 67375 }, { "epoch": 5.496370013867363, "grad_norm": 2.5093770027160645, "learning_rate": 2.503238945213713e-05, "loss": 0.2842, "num_input_tokens_seen": 64356016, "step": 67380 }, { "epoch": 5.496777877477771, "grad_norm": 3.5357577800750732, "learning_rate": 2.5028830173357702e-05, "loss": 0.2332, "num_input_tokens_seen": 64361040, "step": 67385 }, { "epoch": 5.49718574108818, "grad_norm": 2.764737844467163, "learning_rate": 2.5025270893993914e-05, "loss": 0.4227, "num_input_tokens_seen": 64365648, "step": 67390 }, { "epoch": 5.497593604698589, "grad_norm": 6.477660655975342, "learning_rate": 2.5021711614117882e-05, "loss": 0.376, "num_input_tokens_seen": 64370912, "step": 67395 }, { "epoch": 5.498001468308997, "grad_norm": 1.0699646472930908, "learning_rate": 2.5018152333801775e-05, "loss": 0.4053, "num_input_tokens_seen": 64375616, "step": 67400 }, { "epoch": 5.498409331919406, "grad_norm": 5.338164806365967, "learning_rate": 2.5014593053117724e-05, "loss": 0.3675, "num_input_tokens_seen": 64380096, "step": 67405 }, { "epoch": 5.498817195529815, "grad_norm": 1.8077402114868164, "learning_rate": 2.5011033772137876e-05, "loss": 0.3474, "num_input_tokens_seen": 64384928, "step": 67410 }, { "epoch": 5.499225059140223, "grad_norm": 1.5911582708358765, "learning_rate": 2.5007474490934384e-05, "loss": 0.374, "num_input_tokens_seen": 64389536, "step": 67415 }, { "epoch": 5.499632922750632, "grad_norm": 1.096626877784729, "learning_rate": 2.500391520957938e-05, "loss": 0.2381, "num_input_tokens_seen": 64394416, "step": 67420 }, { "epoch": 5.5000407863610405, "grad_norm": 2.5454635620117188, "learning_rate": 2.5000355928145024e-05, "loss": 0.4035, "num_input_tokens_seen": 64398992, "step": 67425 }, { "epoch": 5.5004486499714496, "grad_norm": 0.8104411363601685, "learning_rate": 2.4996796646703455e-05, "loss": 0.2604, "num_input_tokens_seen": 64403536, "step": 67430 }, { "epoch": 5.5004486499714496, "eval_loss": 0.3360634446144104, "eval_runtime": 570.9132, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 64403536, "step": 67430 }, { "epoch": 5.500856513581859, "grad_norm": 0.8336061835289001, "learning_rate": 2.499323736532681e-05, "loss": 0.32, "num_input_tokens_seen": 64408736, "step": 67435 }, { "epoch": 5.501264377192267, "grad_norm": 2.5182132720947266, "learning_rate": 2.4989678084087243e-05, "loss": 0.3099, "num_input_tokens_seen": 64413152, "step": 67440 }, { "epoch": 5.501672240802676, "grad_norm": 9.831584930419922, "learning_rate": 2.4986118803056894e-05, "loss": 0.4552, "num_input_tokens_seen": 64416544, "step": 67445 }, { "epoch": 5.502080104413084, "grad_norm": 4.440183162689209, "learning_rate": 2.4982559522307913e-05, "loss": 0.3786, "num_input_tokens_seen": 64420848, "step": 67450 }, { "epoch": 5.502487968023493, "grad_norm": 3.1515347957611084, "learning_rate": 2.4979000241912446e-05, "loss": 0.4038, "num_input_tokens_seen": 64424544, "step": 67455 }, { "epoch": 5.502895831633902, "grad_norm": 0.9522117376327515, "learning_rate": 2.4975440961942634e-05, "loss": 0.2691, "num_input_tokens_seen": 64429232, "step": 67460 }, { "epoch": 5.50330369524431, "grad_norm": 3.485156536102295, "learning_rate": 2.4971881682470626e-05, "loss": 0.2919, "num_input_tokens_seen": 64433232, "step": 67465 }, { "epoch": 5.503711558854719, "grad_norm": 2.5359504222869873, "learning_rate": 2.496832240356855e-05, "loss": 0.3526, "num_input_tokens_seen": 64437456, "step": 67470 }, { "epoch": 5.504119422465128, "grad_norm": 2.1036198139190674, "learning_rate": 2.4964763125308578e-05, "loss": 0.4337, "num_input_tokens_seen": 64442480, "step": 67475 }, { "epoch": 5.504527286075536, "grad_norm": 5.511847972869873, "learning_rate": 2.496120384776284e-05, "loss": 0.3679, "num_input_tokens_seen": 64446992, "step": 67480 }, { "epoch": 5.504935149685945, "grad_norm": 5.160215854644775, "learning_rate": 2.4957644571003487e-05, "loss": 0.2975, "num_input_tokens_seen": 64452368, "step": 67485 }, { "epoch": 5.505343013296354, "grad_norm": 5.094608306884766, "learning_rate": 2.4954085295102655e-05, "loss": 0.3281, "num_input_tokens_seen": 64457024, "step": 67490 }, { "epoch": 5.505750876906762, "grad_norm": 12.276761054992676, "learning_rate": 2.495052602013249e-05, "loss": 0.3265, "num_input_tokens_seen": 64462464, "step": 67495 }, { "epoch": 5.506158740517171, "grad_norm": 1.6900739669799805, "learning_rate": 2.4946966746165154e-05, "loss": 0.2974, "num_input_tokens_seen": 64467200, "step": 67500 }, { "epoch": 5.506566604127579, "grad_norm": 6.0751776695251465, "learning_rate": 2.4943407473272776e-05, "loss": 0.3253, "num_input_tokens_seen": 64471824, "step": 67505 }, { "epoch": 5.506974467737988, "grad_norm": 2.2999985218048096, "learning_rate": 2.4939848201527498e-05, "loss": 0.3073, "num_input_tokens_seen": 64475792, "step": 67510 }, { "epoch": 5.507382331348397, "grad_norm": 2.3682923316955566, "learning_rate": 2.493628893100148e-05, "loss": 0.2672, "num_input_tokens_seen": 64480416, "step": 67515 }, { "epoch": 5.5077901949588055, "grad_norm": 20.252849578857422, "learning_rate": 2.4932729661766854e-05, "loss": 0.4474, "num_input_tokens_seen": 64485536, "step": 67520 }, { "epoch": 5.5081980585692145, "grad_norm": 5.5508832931518555, "learning_rate": 2.4929170393895772e-05, "loss": 0.4135, "num_input_tokens_seen": 64490352, "step": 67525 }, { "epoch": 5.508605922179623, "grad_norm": 0.5427389144897461, "learning_rate": 2.4925611127460373e-05, "loss": 0.327, "num_input_tokens_seen": 64495504, "step": 67530 }, { "epoch": 5.509013785790032, "grad_norm": 2.1706032752990723, "learning_rate": 2.4922051862532795e-05, "loss": 0.4466, "num_input_tokens_seen": 64500560, "step": 67535 }, { "epoch": 5.509421649400441, "grad_norm": 10.137948036193848, "learning_rate": 2.4918492599185205e-05, "loss": 0.4159, "num_input_tokens_seen": 64504864, "step": 67540 }, { "epoch": 5.50982951301085, "grad_norm": 7.631271839141846, "learning_rate": 2.491493333748974e-05, "loss": 0.512, "num_input_tokens_seen": 64509344, "step": 67545 }, { "epoch": 5.510237376621258, "grad_norm": 5.9334869384765625, "learning_rate": 2.4911374077518533e-05, "loss": 0.3286, "num_input_tokens_seen": 64514368, "step": 67550 }, { "epoch": 5.510645240231667, "grad_norm": 4.739426136016846, "learning_rate": 2.4907814819343725e-05, "loss": 0.3404, "num_input_tokens_seen": 64519168, "step": 67555 }, { "epoch": 5.511053103842075, "grad_norm": 2.7380669116973877, "learning_rate": 2.4904255563037486e-05, "loss": 0.4088, "num_input_tokens_seen": 64523696, "step": 67560 }, { "epoch": 5.511460967452484, "grad_norm": 3.2206366062164307, "learning_rate": 2.4900696308671944e-05, "loss": 0.3166, "num_input_tokens_seen": 64527904, "step": 67565 }, { "epoch": 5.511868831062893, "grad_norm": 1.3132357597351074, "learning_rate": 2.4897137056319248e-05, "loss": 0.359, "num_input_tokens_seen": 64532960, "step": 67570 }, { "epoch": 5.512276694673301, "grad_norm": 2.1756415367126465, "learning_rate": 2.4893577806051537e-05, "loss": 0.3842, "num_input_tokens_seen": 64537200, "step": 67575 }, { "epoch": 5.51268455828371, "grad_norm": 9.691692352294922, "learning_rate": 2.4890018557940955e-05, "loss": 0.3251, "num_input_tokens_seen": 64542208, "step": 67580 }, { "epoch": 5.513092421894118, "grad_norm": 3.817922592163086, "learning_rate": 2.4886459312059654e-05, "loss": 0.3638, "num_input_tokens_seen": 64546544, "step": 67585 }, { "epoch": 5.513500285504527, "grad_norm": 1.1547572612762451, "learning_rate": 2.4882900068479776e-05, "loss": 0.2838, "num_input_tokens_seen": 64551552, "step": 67590 }, { "epoch": 5.513908149114936, "grad_norm": 1.1108635663986206, "learning_rate": 2.487934082727346e-05, "loss": 0.2264, "num_input_tokens_seen": 64556256, "step": 67595 }, { "epoch": 5.514316012725344, "grad_norm": 5.613996505737305, "learning_rate": 2.4875781588512852e-05, "loss": 0.3494, "num_input_tokens_seen": 64561296, "step": 67600 }, { "epoch": 5.514723876335753, "grad_norm": 3.238154649734497, "learning_rate": 2.4872222352270102e-05, "loss": 0.3345, "num_input_tokens_seen": 64566528, "step": 67605 }, { "epoch": 5.515131739946162, "grad_norm": 4.372758865356445, "learning_rate": 2.4868663118617352e-05, "loss": 0.3336, "num_input_tokens_seen": 64571200, "step": 67610 }, { "epoch": 5.5155396035565705, "grad_norm": 4.586879730224609, "learning_rate": 2.4865103887626745e-05, "loss": 0.4426, "num_input_tokens_seen": 64576144, "step": 67615 }, { "epoch": 5.5159474671669795, "grad_norm": 6.217690944671631, "learning_rate": 2.4861544659370426e-05, "loss": 0.385, "num_input_tokens_seen": 64581552, "step": 67620 }, { "epoch": 5.5163553307773885, "grad_norm": 7.4131855964660645, "learning_rate": 2.4857985433920526e-05, "loss": 0.2865, "num_input_tokens_seen": 64586784, "step": 67625 }, { "epoch": 5.516763194387797, "grad_norm": 11.541834831237793, "learning_rate": 2.485442621134921e-05, "loss": 0.2678, "num_input_tokens_seen": 64592000, "step": 67630 }, { "epoch": 5.517171057998206, "grad_norm": 1.2032907009124756, "learning_rate": 2.4850866991728617e-05, "loss": 0.242, "num_input_tokens_seen": 64596768, "step": 67635 }, { "epoch": 5.517578921608614, "grad_norm": 7.784059524536133, "learning_rate": 2.4847307775130886e-05, "loss": 0.3875, "num_input_tokens_seen": 64602544, "step": 67640 }, { "epoch": 5.517986785219023, "grad_norm": 5.28513240814209, "learning_rate": 2.4843748561628152e-05, "loss": 0.4246, "num_input_tokens_seen": 64607536, "step": 67645 }, { "epoch": 5.518394648829432, "grad_norm": 9.826380729675293, "learning_rate": 2.484018935129258e-05, "loss": 0.4468, "num_input_tokens_seen": 64612880, "step": 67650 }, { "epoch": 5.51880251243984, "grad_norm": 2.591625928878784, "learning_rate": 2.4836630144196298e-05, "loss": 0.2494, "num_input_tokens_seen": 64618112, "step": 67655 }, { "epoch": 5.519210376050249, "grad_norm": 1.0687083005905151, "learning_rate": 2.483307094041145e-05, "loss": 0.4478, "num_input_tokens_seen": 64623280, "step": 67660 }, { "epoch": 5.519618239660657, "grad_norm": 5.627254009246826, "learning_rate": 2.482951174001019e-05, "loss": 0.3413, "num_input_tokens_seen": 64627600, "step": 67665 }, { "epoch": 5.520026103271066, "grad_norm": 2.7726335525512695, "learning_rate": 2.482595254306464e-05, "loss": 0.4109, "num_input_tokens_seen": 64632736, "step": 67670 }, { "epoch": 5.520433966881475, "grad_norm": 7.20261287689209, "learning_rate": 2.482239334964697e-05, "loss": 0.3474, "num_input_tokens_seen": 64638192, "step": 67675 }, { "epoch": 5.520841830491883, "grad_norm": 0.9266654849052429, "learning_rate": 2.481883415982931e-05, "loss": 0.2785, "num_input_tokens_seen": 64643616, "step": 67680 }, { "epoch": 5.521249694102292, "grad_norm": 2.569110631942749, "learning_rate": 2.4815274973683805e-05, "loss": 0.3636, "num_input_tokens_seen": 64648304, "step": 67685 }, { "epoch": 5.521657557712701, "grad_norm": 1.1463927030563354, "learning_rate": 2.4811715791282586e-05, "loss": 0.2998, "num_input_tokens_seen": 64653872, "step": 67690 }, { "epoch": 5.522065421323109, "grad_norm": 5.453666687011719, "learning_rate": 2.4808156612697824e-05, "loss": 0.2757, "num_input_tokens_seen": 64658512, "step": 67695 }, { "epoch": 5.522473284933518, "grad_norm": 1.5340498685836792, "learning_rate": 2.480459743800164e-05, "loss": 0.3322, "num_input_tokens_seen": 64663600, "step": 67700 }, { "epoch": 5.522881148543927, "grad_norm": 4.4552321434021, "learning_rate": 2.4801038267266185e-05, "loss": 0.334, "num_input_tokens_seen": 64668704, "step": 67705 }, { "epoch": 5.523289012154335, "grad_norm": 2.1399292945861816, "learning_rate": 2.4797479100563594e-05, "loss": 0.2696, "num_input_tokens_seen": 64673488, "step": 67710 }, { "epoch": 5.523696875764744, "grad_norm": 3.5901284217834473, "learning_rate": 2.479391993796602e-05, "loss": 0.3789, "num_input_tokens_seen": 64678656, "step": 67715 }, { "epoch": 5.524104739375153, "grad_norm": 12.045571327209473, "learning_rate": 2.4790360779545604e-05, "loss": 0.3046, "num_input_tokens_seen": 64683344, "step": 67720 }, { "epoch": 5.524512602985562, "grad_norm": 2.2633538246154785, "learning_rate": 2.4786801625374485e-05, "loss": 0.3968, "num_input_tokens_seen": 64687600, "step": 67725 }, { "epoch": 5.524920466595971, "grad_norm": 2.5924901962280273, "learning_rate": 2.4783242475524803e-05, "loss": 0.384, "num_input_tokens_seen": 64692768, "step": 67730 }, { "epoch": 5.525328330206379, "grad_norm": 3.2653236389160156, "learning_rate": 2.4779683330068698e-05, "loss": 0.3719, "num_input_tokens_seen": 64698096, "step": 67735 }, { "epoch": 5.525736193816788, "grad_norm": 1.9065946340560913, "learning_rate": 2.4776124189078327e-05, "loss": 0.3899, "num_input_tokens_seen": 64703136, "step": 67740 }, { "epoch": 5.526144057427197, "grad_norm": 1.9380742311477661, "learning_rate": 2.4772565052625823e-05, "loss": 0.2808, "num_input_tokens_seen": 64707360, "step": 67745 }, { "epoch": 5.526551921037605, "grad_norm": 0.6208946704864502, "learning_rate": 2.476900592078333e-05, "loss": 0.3024, "num_input_tokens_seen": 64712336, "step": 67750 }, { "epoch": 5.526959784648014, "grad_norm": 5.057799816131592, "learning_rate": 2.4765446793622977e-05, "loss": 0.3777, "num_input_tokens_seen": 64717104, "step": 67755 }, { "epoch": 5.527367648258423, "grad_norm": 8.903226852416992, "learning_rate": 2.476188767121693e-05, "loss": 0.3421, "num_input_tokens_seen": 64722416, "step": 67760 }, { "epoch": 5.527775511868831, "grad_norm": 2.4846601486206055, "learning_rate": 2.475832855363732e-05, "loss": 0.3299, "num_input_tokens_seen": 64727952, "step": 67765 }, { "epoch": 5.52818337547924, "grad_norm": 1.0637791156768799, "learning_rate": 2.4754769440956283e-05, "loss": 0.2723, "num_input_tokens_seen": 64733152, "step": 67770 }, { "epoch": 5.528591239089648, "grad_norm": 3.354682445526123, "learning_rate": 2.475121033324597e-05, "loss": 0.2959, "num_input_tokens_seen": 64737856, "step": 67775 }, { "epoch": 5.528999102700057, "grad_norm": 4.532473087310791, "learning_rate": 2.474765123057851e-05, "loss": 0.377, "num_input_tokens_seen": 64742656, "step": 67780 }, { "epoch": 5.529406966310466, "grad_norm": 7.24763298034668, "learning_rate": 2.4744092133026057e-05, "loss": 0.4242, "num_input_tokens_seen": 64746720, "step": 67785 }, { "epoch": 5.529814829920874, "grad_norm": 1.8721418380737305, "learning_rate": 2.4740533040660748e-05, "loss": 0.3084, "num_input_tokens_seen": 64750704, "step": 67790 }, { "epoch": 5.530222693531283, "grad_norm": 1.2115212678909302, "learning_rate": 2.4736973953554723e-05, "loss": 0.3279, "num_input_tokens_seen": 64755168, "step": 67795 }, { "epoch": 5.530630557141691, "grad_norm": 1.6683748960494995, "learning_rate": 2.4733414871780117e-05, "loss": 0.3422, "num_input_tokens_seen": 64760816, "step": 67800 }, { "epoch": 5.5310384207521, "grad_norm": 4.104167461395264, "learning_rate": 2.4729855795409086e-05, "loss": 0.3598, "num_input_tokens_seen": 64765856, "step": 67805 }, { "epoch": 5.531446284362509, "grad_norm": 0.5957997441291809, "learning_rate": 2.4726296724513765e-05, "loss": 0.2782, "num_input_tokens_seen": 64770352, "step": 67810 }, { "epoch": 5.5318541479729175, "grad_norm": 0.7696822881698608, "learning_rate": 2.4722737659166292e-05, "loss": 0.288, "num_input_tokens_seen": 64775040, "step": 67815 }, { "epoch": 5.5322620115833265, "grad_norm": 1.6781947612762451, "learning_rate": 2.471917859943881e-05, "loss": 0.3591, "num_input_tokens_seen": 64779264, "step": 67820 }, { "epoch": 5.5326698751937355, "grad_norm": 1.75790536403656, "learning_rate": 2.4715619545403447e-05, "loss": 0.2864, "num_input_tokens_seen": 64783520, "step": 67825 }, { "epoch": 5.533077738804144, "grad_norm": 1.5758538246154785, "learning_rate": 2.4712060497132365e-05, "loss": 0.2946, "num_input_tokens_seen": 64788016, "step": 67830 }, { "epoch": 5.533485602414553, "grad_norm": 4.1044816970825195, "learning_rate": 2.4708501454697693e-05, "loss": 0.3986, "num_input_tokens_seen": 64792304, "step": 67835 }, { "epoch": 5.533893466024962, "grad_norm": 0.9030531644821167, "learning_rate": 2.4704942418171577e-05, "loss": 0.3284, "num_input_tokens_seen": 64796976, "step": 67840 }, { "epoch": 5.53430132963537, "grad_norm": 0.9566595554351807, "learning_rate": 2.4701383387626143e-05, "loss": 0.3259, "num_input_tokens_seen": 64802256, "step": 67845 }, { "epoch": 5.534709193245779, "grad_norm": 0.9148054718971252, "learning_rate": 2.469782436313355e-05, "loss": 0.2681, "num_input_tokens_seen": 64806224, "step": 67850 }, { "epoch": 5.535117056856187, "grad_norm": 2.5430281162261963, "learning_rate": 2.4694265344765926e-05, "loss": 0.4072, "num_input_tokens_seen": 64811136, "step": 67855 }, { "epoch": 5.535524920466596, "grad_norm": 1.954877257347107, "learning_rate": 2.4690706332595415e-05, "loss": 0.3322, "num_input_tokens_seen": 64815888, "step": 67860 }, { "epoch": 5.535932784077005, "grad_norm": 3.451077699661255, "learning_rate": 2.4687147326694155e-05, "loss": 0.2611, "num_input_tokens_seen": 64820992, "step": 67865 }, { "epoch": 5.536340647687413, "grad_norm": 6.05141019821167, "learning_rate": 2.4683588327134274e-05, "loss": 0.311, "num_input_tokens_seen": 64825808, "step": 67870 }, { "epoch": 5.536748511297822, "grad_norm": 4.951204776763916, "learning_rate": 2.4680029333987934e-05, "loss": 0.2789, "num_input_tokens_seen": 64831488, "step": 67875 }, { "epoch": 5.53715637490823, "grad_norm": 4.157509803771973, "learning_rate": 2.4676470347327267e-05, "loss": 0.3829, "num_input_tokens_seen": 64836112, "step": 67880 }, { "epoch": 5.537564238518639, "grad_norm": 1.6644145250320435, "learning_rate": 2.46729113672244e-05, "loss": 0.3432, "num_input_tokens_seen": 64841120, "step": 67885 }, { "epoch": 5.537972102129048, "grad_norm": 4.866354465484619, "learning_rate": 2.466935239375148e-05, "loss": 0.3921, "num_input_tokens_seen": 64846768, "step": 67890 }, { "epoch": 5.538379965739456, "grad_norm": 6.83745813369751, "learning_rate": 2.4665793426980653e-05, "loss": 0.3433, "num_input_tokens_seen": 64852096, "step": 67895 }, { "epoch": 5.538787829349865, "grad_norm": 4.127307891845703, "learning_rate": 2.466223446698405e-05, "loss": 0.3479, "num_input_tokens_seen": 64855552, "step": 67900 }, { "epoch": 5.539195692960274, "grad_norm": 1.3412343263626099, "learning_rate": 2.465867551383381e-05, "loss": 0.3353, "num_input_tokens_seen": 64860176, "step": 67905 }, { "epoch": 5.5396035565706825, "grad_norm": 3.9398627281188965, "learning_rate": 2.465511656760207e-05, "loss": 0.28, "num_input_tokens_seen": 64863680, "step": 67910 }, { "epoch": 5.5400114201810915, "grad_norm": 2.5240578651428223, "learning_rate": 2.4651557628360975e-05, "loss": 0.3154, "num_input_tokens_seen": 64868272, "step": 67915 }, { "epoch": 5.5404192837915005, "grad_norm": 2.738159656524658, "learning_rate": 2.4647998696182657e-05, "loss": 0.3034, "num_input_tokens_seen": 64872928, "step": 67920 }, { "epoch": 5.540827147401909, "grad_norm": 4.551543235778809, "learning_rate": 2.4644439771139257e-05, "loss": 0.3257, "num_input_tokens_seen": 64878704, "step": 67925 }, { "epoch": 5.541235011012318, "grad_norm": 11.466474533081055, "learning_rate": 2.4640880853302914e-05, "loss": 0.3117, "num_input_tokens_seen": 64883424, "step": 67930 }, { "epoch": 5.541642874622726, "grad_norm": 7.552706718444824, "learning_rate": 2.463732194274575e-05, "loss": 0.2594, "num_input_tokens_seen": 64888400, "step": 67935 }, { "epoch": 5.542050738233135, "grad_norm": 2.923569917678833, "learning_rate": 2.4633763039539928e-05, "loss": 0.3697, "num_input_tokens_seen": 64893328, "step": 67940 }, { "epoch": 5.542458601843544, "grad_norm": 5.451505661010742, "learning_rate": 2.4630204143757572e-05, "loss": 0.254, "num_input_tokens_seen": 64898320, "step": 67945 }, { "epoch": 5.542866465453952, "grad_norm": 9.043807029724121, "learning_rate": 2.4626645255470824e-05, "loss": 0.2801, "num_input_tokens_seen": 64904016, "step": 67950 }, { "epoch": 5.543274329064361, "grad_norm": 2.4690093994140625, "learning_rate": 2.4623086374751804e-05, "loss": 0.248, "num_input_tokens_seen": 64909344, "step": 67955 }, { "epoch": 5.54368219267477, "grad_norm": 9.028903007507324, "learning_rate": 2.4619527501672676e-05, "loss": 0.3712, "num_input_tokens_seen": 64914048, "step": 67960 }, { "epoch": 5.544090056285178, "grad_norm": 1.8315017223358154, "learning_rate": 2.4615968636305565e-05, "loss": 0.4585, "num_input_tokens_seen": 64918864, "step": 67965 }, { "epoch": 5.544497919895587, "grad_norm": 1.3475151062011719, "learning_rate": 2.4612409778722603e-05, "loss": 0.3084, "num_input_tokens_seen": 64923840, "step": 67970 }, { "epoch": 5.544905783505996, "grad_norm": 2.3062028884887695, "learning_rate": 2.4608850928995932e-05, "loss": 0.4111, "num_input_tokens_seen": 64929792, "step": 67975 }, { "epoch": 5.545313647116404, "grad_norm": 1.418715238571167, "learning_rate": 2.4605292087197683e-05, "loss": 0.3599, "num_input_tokens_seen": 64934704, "step": 67980 }, { "epoch": 5.545721510726813, "grad_norm": 0.3707513213157654, "learning_rate": 2.4601733253399998e-05, "loss": 0.2933, "num_input_tokens_seen": 64940032, "step": 67985 }, { "epoch": 5.546129374337221, "grad_norm": 8.515524864196777, "learning_rate": 2.4598174427675012e-05, "loss": 0.4149, "num_input_tokens_seen": 64945072, "step": 67990 }, { "epoch": 5.54653723794763, "grad_norm": 1.1322654485702515, "learning_rate": 2.4594615610094858e-05, "loss": 0.3634, "num_input_tokens_seen": 64949408, "step": 67995 }, { "epoch": 5.546945101558039, "grad_norm": 1.6944434642791748, "learning_rate": 2.4591056800731662e-05, "loss": 0.2752, "num_input_tokens_seen": 64953968, "step": 68000 }, { "epoch": 5.5473529651684474, "grad_norm": 2.709806203842163, "learning_rate": 2.4587497999657584e-05, "loss": 0.3806, "num_input_tokens_seen": 64958992, "step": 68005 }, { "epoch": 5.5477608287788565, "grad_norm": 8.930643081665039, "learning_rate": 2.458393920694475e-05, "loss": 0.349, "num_input_tokens_seen": 64964144, "step": 68010 }, { "epoch": 5.548168692389265, "grad_norm": 2.3610846996307373, "learning_rate": 2.4580380422665285e-05, "loss": 0.3455, "num_input_tokens_seen": 64968928, "step": 68015 }, { "epoch": 5.548576555999674, "grad_norm": 7.030532360076904, "learning_rate": 2.4576821646891333e-05, "loss": 0.3793, "num_input_tokens_seen": 64973312, "step": 68020 }, { "epoch": 5.548984419610083, "grad_norm": 5.325316905975342, "learning_rate": 2.4573262879695012e-05, "loss": 0.3468, "num_input_tokens_seen": 64978400, "step": 68025 }, { "epoch": 5.549392283220491, "grad_norm": 13.967020034790039, "learning_rate": 2.4569704121148487e-05, "loss": 0.3951, "num_input_tokens_seen": 64982880, "step": 68030 }, { "epoch": 5.5498001468309, "grad_norm": 1.5623688697814941, "learning_rate": 2.4566145371323877e-05, "loss": 0.2554, "num_input_tokens_seen": 64987440, "step": 68035 }, { "epoch": 5.550208010441309, "grad_norm": 7.9644036293029785, "learning_rate": 2.456258663029331e-05, "loss": 0.3267, "num_input_tokens_seen": 64991296, "step": 68040 }, { "epoch": 5.550615874051717, "grad_norm": 3.1604931354522705, "learning_rate": 2.4559027898128926e-05, "loss": 0.2819, "num_input_tokens_seen": 64995920, "step": 68045 }, { "epoch": 5.551023737662126, "grad_norm": 1.5321271419525146, "learning_rate": 2.4555469174902862e-05, "loss": 0.3531, "num_input_tokens_seen": 64999808, "step": 68050 }, { "epoch": 5.551431601272535, "grad_norm": 2.235229969024658, "learning_rate": 2.455191046068725e-05, "loss": 0.4435, "num_input_tokens_seen": 65004208, "step": 68055 }, { "epoch": 5.551839464882943, "grad_norm": 6.967586517333984, "learning_rate": 2.4548351755554225e-05, "loss": 0.3251, "num_input_tokens_seen": 65008960, "step": 68060 }, { "epoch": 5.552247328493352, "grad_norm": 0.7245627045631409, "learning_rate": 2.4544793059575903e-05, "loss": 0.2834, "num_input_tokens_seen": 65013856, "step": 68065 }, { "epoch": 5.55265519210376, "grad_norm": 7.870152473449707, "learning_rate": 2.4541234372824446e-05, "loss": 0.3333, "num_input_tokens_seen": 65018608, "step": 68070 }, { "epoch": 5.553063055714169, "grad_norm": 8.950033187866211, "learning_rate": 2.453767569537197e-05, "loss": 0.4034, "num_input_tokens_seen": 65023280, "step": 68075 }, { "epoch": 5.553470919324578, "grad_norm": 7.2142181396484375, "learning_rate": 2.4534117027290615e-05, "loss": 0.3537, "num_input_tokens_seen": 65027664, "step": 68080 }, { "epoch": 5.553878782934986, "grad_norm": 10.219374656677246, "learning_rate": 2.4530558368652507e-05, "loss": 0.3607, "num_input_tokens_seen": 65032720, "step": 68085 }, { "epoch": 5.554286646545395, "grad_norm": 3.4180660247802734, "learning_rate": 2.4526999719529774e-05, "loss": 0.319, "num_input_tokens_seen": 65037072, "step": 68090 }, { "epoch": 5.554694510155803, "grad_norm": 5.380715370178223, "learning_rate": 2.4523441079994563e-05, "loss": 0.2762, "num_input_tokens_seen": 65041776, "step": 68095 }, { "epoch": 5.555102373766212, "grad_norm": 1.9433953762054443, "learning_rate": 2.4519882450119e-05, "loss": 0.4367, "num_input_tokens_seen": 65045952, "step": 68100 }, { "epoch": 5.555510237376621, "grad_norm": 1.8238478899002075, "learning_rate": 2.4516323829975215e-05, "loss": 0.3628, "num_input_tokens_seen": 65050672, "step": 68105 }, { "epoch": 5.55591810098703, "grad_norm": 5.12174129486084, "learning_rate": 2.4512765219635336e-05, "loss": 0.4233, "num_input_tokens_seen": 65055328, "step": 68110 }, { "epoch": 5.5563259645974385, "grad_norm": 5.113334655761719, "learning_rate": 2.4509206619171505e-05, "loss": 0.3635, "num_input_tokens_seen": 65060816, "step": 68115 }, { "epoch": 5.556733828207848, "grad_norm": 4.308104038238525, "learning_rate": 2.4505648028655846e-05, "loss": 0.3856, "num_input_tokens_seen": 65065840, "step": 68120 }, { "epoch": 5.557141691818256, "grad_norm": 2.6067299842834473, "learning_rate": 2.4502089448160488e-05, "loss": 0.2893, "num_input_tokens_seen": 65071648, "step": 68125 }, { "epoch": 5.557549555428665, "grad_norm": 0.9068809747695923, "learning_rate": 2.4498530877757568e-05, "loss": 0.3827, "num_input_tokens_seen": 65076544, "step": 68130 }, { "epoch": 5.557957419039074, "grad_norm": 2.0111148357391357, "learning_rate": 2.449497231751921e-05, "loss": 0.356, "num_input_tokens_seen": 65082160, "step": 68135 }, { "epoch": 5.558365282649482, "grad_norm": 5.9004082679748535, "learning_rate": 2.4491413767517553e-05, "loss": 0.3834, "num_input_tokens_seen": 65086928, "step": 68140 }, { "epoch": 5.558773146259891, "grad_norm": 0.8381807804107666, "learning_rate": 2.4487855227824724e-05, "loss": 0.4127, "num_input_tokens_seen": 65091104, "step": 68145 }, { "epoch": 5.559181009870299, "grad_norm": 1.064889669418335, "learning_rate": 2.448429669851285e-05, "loss": 0.3834, "num_input_tokens_seen": 65095552, "step": 68150 }, { "epoch": 5.559588873480708, "grad_norm": 6.298399925231934, "learning_rate": 2.4480738179654057e-05, "loss": 0.4139, "num_input_tokens_seen": 65100688, "step": 68155 }, { "epoch": 5.559996737091117, "grad_norm": 4.946349143981934, "learning_rate": 2.4477179671320485e-05, "loss": 0.335, "num_input_tokens_seen": 65105472, "step": 68160 }, { "epoch": 5.560404600701525, "grad_norm": 1.6923810243606567, "learning_rate": 2.4473621173584264e-05, "loss": 0.4239, "num_input_tokens_seen": 65110480, "step": 68165 }, { "epoch": 5.560812464311934, "grad_norm": 1.7748639583587646, "learning_rate": 2.4470062686517514e-05, "loss": 0.3665, "num_input_tokens_seen": 65115392, "step": 68170 }, { "epoch": 5.561220327922343, "grad_norm": 1.3771047592163086, "learning_rate": 2.446650421019237e-05, "loss": 0.3802, "num_input_tokens_seen": 65119664, "step": 68175 }, { "epoch": 5.561628191532751, "grad_norm": 1.5126736164093018, "learning_rate": 2.4462945744680954e-05, "loss": 0.2702, "num_input_tokens_seen": 65124496, "step": 68180 }, { "epoch": 5.56203605514316, "grad_norm": 1.3646705150604248, "learning_rate": 2.4459387290055404e-05, "loss": 0.2743, "num_input_tokens_seen": 65129328, "step": 68185 }, { "epoch": 5.562443918753569, "grad_norm": 1.1473537683486938, "learning_rate": 2.4455828846387837e-05, "loss": 0.394, "num_input_tokens_seen": 65134016, "step": 68190 }, { "epoch": 5.562851782363977, "grad_norm": 4.196527004241943, "learning_rate": 2.4452270413750398e-05, "loss": 0.2726, "num_input_tokens_seen": 65139408, "step": 68195 }, { "epoch": 5.563259645974386, "grad_norm": 2.277621269226074, "learning_rate": 2.4448711992215196e-05, "loss": 0.3046, "num_input_tokens_seen": 65143184, "step": 68200 }, { "epoch": 5.5636675095847945, "grad_norm": 1.1506361961364746, "learning_rate": 2.4445153581854374e-05, "loss": 0.3386, "num_input_tokens_seen": 65147456, "step": 68205 }, { "epoch": 5.5640753731952035, "grad_norm": 3.3674910068511963, "learning_rate": 2.444159518274005e-05, "loss": 0.3911, "num_input_tokens_seen": 65151808, "step": 68210 }, { "epoch": 5.5644832368056125, "grad_norm": 2.3881964683532715, "learning_rate": 2.4438036794944362e-05, "loss": 0.3471, "num_input_tokens_seen": 65156048, "step": 68215 }, { "epoch": 5.564891100416021, "grad_norm": 1.7489804029464722, "learning_rate": 2.443447841853942e-05, "loss": 0.3588, "num_input_tokens_seen": 65159808, "step": 68220 }, { "epoch": 5.56529896402643, "grad_norm": 3.173898935317993, "learning_rate": 2.4430920053597356e-05, "loss": 0.2768, "num_input_tokens_seen": 65164288, "step": 68225 }, { "epoch": 5.565706827636838, "grad_norm": 2.4602017402648926, "learning_rate": 2.442736170019031e-05, "loss": 0.3246, "num_input_tokens_seen": 65169680, "step": 68230 }, { "epoch": 5.566114691247247, "grad_norm": 1.5443792343139648, "learning_rate": 2.4423803358390397e-05, "loss": 0.2602, "num_input_tokens_seen": 65174592, "step": 68235 }, { "epoch": 5.566522554857656, "grad_norm": 7.992013931274414, "learning_rate": 2.4420245028269752e-05, "loss": 0.3216, "num_input_tokens_seen": 65179344, "step": 68240 }, { "epoch": 5.566930418468064, "grad_norm": 4.293177127838135, "learning_rate": 2.441668670990048e-05, "loss": 0.3266, "num_input_tokens_seen": 65184128, "step": 68245 }, { "epoch": 5.567338282078473, "grad_norm": 10.087575912475586, "learning_rate": 2.4413128403354723e-05, "loss": 0.3364, "num_input_tokens_seen": 65188144, "step": 68250 }, { "epoch": 5.567746145688882, "grad_norm": 2.0612969398498535, "learning_rate": 2.4409570108704612e-05, "loss": 0.3552, "num_input_tokens_seen": 65193200, "step": 68255 }, { "epoch": 5.56815400929929, "grad_norm": 6.3882060050964355, "learning_rate": 2.440601182602226e-05, "loss": 0.3749, "num_input_tokens_seen": 65198848, "step": 68260 }, { "epoch": 5.568561872909699, "grad_norm": 2.027853012084961, "learning_rate": 2.4402453555379795e-05, "loss": 0.3577, "num_input_tokens_seen": 65202576, "step": 68265 }, { "epoch": 5.568969736520108, "grad_norm": 1.4217205047607422, "learning_rate": 2.4398895296849345e-05, "loss": 0.238, "num_input_tokens_seen": 65207584, "step": 68270 }, { "epoch": 5.569377600130516, "grad_norm": 1.1294254064559937, "learning_rate": 2.4395337050503033e-05, "loss": 0.3165, "num_input_tokens_seen": 65212320, "step": 68275 }, { "epoch": 5.569785463740925, "grad_norm": 4.3909173011779785, "learning_rate": 2.439177881641298e-05, "loss": 0.2554, "num_input_tokens_seen": 65216176, "step": 68280 }, { "epoch": 5.570193327351333, "grad_norm": 3.6030006408691406, "learning_rate": 2.4388220594651315e-05, "loss": 0.2882, "num_input_tokens_seen": 65222160, "step": 68285 }, { "epoch": 5.570601190961742, "grad_norm": 3.02426815032959, "learning_rate": 2.4384662385290148e-05, "loss": 0.3059, "num_input_tokens_seen": 65227360, "step": 68290 }, { "epoch": 5.571009054572151, "grad_norm": 3.100998878479004, "learning_rate": 2.4381104188401622e-05, "loss": 0.2877, "num_input_tokens_seen": 65232080, "step": 68295 }, { "epoch": 5.5714169181825595, "grad_norm": 6.463664531707764, "learning_rate": 2.437754600405785e-05, "loss": 0.441, "num_input_tokens_seen": 65237360, "step": 68300 }, { "epoch": 5.5718247817929685, "grad_norm": 2.4003946781158447, "learning_rate": 2.4373987832330954e-05, "loss": 0.319, "num_input_tokens_seen": 65241968, "step": 68305 }, { "epoch": 5.5722326454033775, "grad_norm": 3.2104387283325195, "learning_rate": 2.437042967329305e-05, "loss": 0.2406, "num_input_tokens_seen": 65246672, "step": 68310 }, { "epoch": 5.572640509013786, "grad_norm": 7.9766340255737305, "learning_rate": 2.4366871527016283e-05, "loss": 0.3029, "num_input_tokens_seen": 65251280, "step": 68315 }, { "epoch": 5.573048372624195, "grad_norm": 2.8699326515197754, "learning_rate": 2.4363313393572758e-05, "loss": 0.2566, "num_input_tokens_seen": 65256704, "step": 68320 }, { "epoch": 5.573456236234604, "grad_norm": 7.949285507202148, "learning_rate": 2.43597552730346e-05, "loss": 0.5272, "num_input_tokens_seen": 65260432, "step": 68325 }, { "epoch": 5.573864099845012, "grad_norm": 5.610367298126221, "learning_rate": 2.4356197165473935e-05, "loss": 0.2218, "num_input_tokens_seen": 65264464, "step": 68330 }, { "epoch": 5.574271963455421, "grad_norm": 11.562127113342285, "learning_rate": 2.435263907096287e-05, "loss": 0.2163, "num_input_tokens_seen": 65269760, "step": 68335 }, { "epoch": 5.574679827065829, "grad_norm": 12.386895179748535, "learning_rate": 2.434908098957354e-05, "loss": 0.2693, "num_input_tokens_seen": 65274736, "step": 68340 }, { "epoch": 5.575087690676238, "grad_norm": 7.154600620269775, "learning_rate": 2.4345522921378065e-05, "loss": 0.269, "num_input_tokens_seen": 65279920, "step": 68345 }, { "epoch": 5.575495554286647, "grad_norm": 4.45448112487793, "learning_rate": 2.4341964866448562e-05, "loss": 0.2392, "num_input_tokens_seen": 65284960, "step": 68350 }, { "epoch": 5.575903417897055, "grad_norm": 18.46797752380371, "learning_rate": 2.433840682485714e-05, "loss": 0.468, "num_input_tokens_seen": 65288624, "step": 68355 }, { "epoch": 5.576311281507464, "grad_norm": 0.3737645149230957, "learning_rate": 2.433484879667594e-05, "loss": 0.3392, "num_input_tokens_seen": 65293088, "step": 68360 }, { "epoch": 5.576719145117872, "grad_norm": 3.9330055713653564, "learning_rate": 2.4331290781977074e-05, "loss": 0.3276, "num_input_tokens_seen": 65298512, "step": 68365 }, { "epoch": 5.577127008728281, "grad_norm": 6.179501056671143, "learning_rate": 2.432773278083266e-05, "loss": 0.3045, "num_input_tokens_seen": 65303536, "step": 68370 }, { "epoch": 5.57753487233869, "grad_norm": 11.481308937072754, "learning_rate": 2.4324174793314815e-05, "loss": 0.2468, "num_input_tokens_seen": 65308736, "step": 68375 }, { "epoch": 5.577942735949098, "grad_norm": 4.2824249267578125, "learning_rate": 2.432061681949565e-05, "loss": 0.3612, "num_input_tokens_seen": 65313216, "step": 68380 }, { "epoch": 5.578350599559507, "grad_norm": 22.79253387451172, "learning_rate": 2.4317058859447306e-05, "loss": 0.3424, "num_input_tokens_seen": 65318080, "step": 68385 }, { "epoch": 5.578758463169916, "grad_norm": 1.641321063041687, "learning_rate": 2.4313500913241888e-05, "loss": 0.3273, "num_input_tokens_seen": 65322704, "step": 68390 }, { "epoch": 5.579166326780324, "grad_norm": 1.5355910062789917, "learning_rate": 2.4309942980951513e-05, "loss": 0.1665, "num_input_tokens_seen": 65327232, "step": 68395 }, { "epoch": 5.579574190390733, "grad_norm": 11.925792694091797, "learning_rate": 2.4306385062648295e-05, "loss": 0.2992, "num_input_tokens_seen": 65332160, "step": 68400 }, { "epoch": 5.5799820540011424, "grad_norm": 3.4459943771362305, "learning_rate": 2.4302827158404364e-05, "loss": 0.3049, "num_input_tokens_seen": 65336480, "step": 68405 }, { "epoch": 5.580389917611551, "grad_norm": 5.552301406860352, "learning_rate": 2.429926926829183e-05, "loss": 0.4611, "num_input_tokens_seen": 65340768, "step": 68410 }, { "epoch": 5.58079778122196, "grad_norm": 12.145658493041992, "learning_rate": 2.429571139238281e-05, "loss": 0.2508, "num_input_tokens_seen": 65346464, "step": 68415 }, { "epoch": 5.581205644832368, "grad_norm": 9.178521156311035, "learning_rate": 2.429215353074942e-05, "loss": 0.3849, "num_input_tokens_seen": 65350496, "step": 68420 }, { "epoch": 5.581613508442777, "grad_norm": 11.726558685302734, "learning_rate": 2.428859568346377e-05, "loss": 0.35, "num_input_tokens_seen": 65355296, "step": 68425 }, { "epoch": 5.582021372053186, "grad_norm": 2.0219266414642334, "learning_rate": 2.428503785059799e-05, "loss": 0.3735, "num_input_tokens_seen": 65359440, "step": 68430 }, { "epoch": 5.582429235663594, "grad_norm": 11.495068550109863, "learning_rate": 2.428148003222419e-05, "loss": 0.2997, "num_input_tokens_seen": 65364592, "step": 68435 }, { "epoch": 5.582837099274003, "grad_norm": 5.142479419708252, "learning_rate": 2.4277922228414483e-05, "loss": 0.3294, "num_input_tokens_seen": 65369216, "step": 68440 }, { "epoch": 5.583244962884411, "grad_norm": 11.710564613342285, "learning_rate": 2.4274364439240977e-05, "loss": 0.2895, "num_input_tokens_seen": 65374848, "step": 68445 }, { "epoch": 5.58365282649482, "grad_norm": 4.013866901397705, "learning_rate": 2.4270806664775805e-05, "loss": 0.3246, "num_input_tokens_seen": 65379376, "step": 68450 }, { "epoch": 5.584060690105229, "grad_norm": 0.39088311791419983, "learning_rate": 2.426724890509107e-05, "loss": 0.336, "num_input_tokens_seen": 65384400, "step": 68455 }, { "epoch": 5.584468553715637, "grad_norm": 1.0804160833358765, "learning_rate": 2.4263691160258893e-05, "loss": 0.3723, "num_input_tokens_seen": 65389360, "step": 68460 }, { "epoch": 5.584876417326046, "grad_norm": 2.650526762008667, "learning_rate": 2.4260133430351374e-05, "loss": 0.2718, "num_input_tokens_seen": 65393360, "step": 68465 }, { "epoch": 5.585284280936455, "grad_norm": 4.37000036239624, "learning_rate": 2.4256575715440645e-05, "loss": 0.3784, "num_input_tokens_seen": 65398960, "step": 68470 }, { "epoch": 5.585692144546863, "grad_norm": 9.919776916503906, "learning_rate": 2.425301801559881e-05, "loss": 0.3989, "num_input_tokens_seen": 65404384, "step": 68475 }, { "epoch": 5.586100008157272, "grad_norm": 1.956079363822937, "learning_rate": 2.424946033089798e-05, "loss": 0.2405, "num_input_tokens_seen": 65409472, "step": 68480 }, { "epoch": 5.586507871767681, "grad_norm": 2.1927852630615234, "learning_rate": 2.4245902661410268e-05, "loss": 0.3111, "num_input_tokens_seen": 65414496, "step": 68485 }, { "epoch": 5.586915735378089, "grad_norm": 4.125131130218506, "learning_rate": 2.4242345007207783e-05, "loss": 0.5074, "num_input_tokens_seen": 65419296, "step": 68490 }, { "epoch": 5.587323598988498, "grad_norm": 8.14553451538086, "learning_rate": 2.4238787368362652e-05, "loss": 0.3034, "num_input_tokens_seen": 65423424, "step": 68495 }, { "epoch": 5.5877314625989065, "grad_norm": 24.919418334960938, "learning_rate": 2.423522974494698e-05, "loss": 0.2803, "num_input_tokens_seen": 65427776, "step": 68500 }, { "epoch": 5.5881393262093155, "grad_norm": 10.331098556518555, "learning_rate": 2.4231672137032872e-05, "loss": 0.4225, "num_input_tokens_seen": 65432624, "step": 68505 }, { "epoch": 5.5885471898197245, "grad_norm": 7.914684772491455, "learning_rate": 2.4228114544692432e-05, "loss": 0.3711, "num_input_tokens_seen": 65437200, "step": 68510 }, { "epoch": 5.588955053430133, "grad_norm": 0.8587178587913513, "learning_rate": 2.4224556967997795e-05, "loss": 0.2306, "num_input_tokens_seen": 65442640, "step": 68515 }, { "epoch": 5.589362917040542, "grad_norm": 13.41076946258545, "learning_rate": 2.4220999407021057e-05, "loss": 0.3395, "num_input_tokens_seen": 65446480, "step": 68520 }, { "epoch": 5.589770780650951, "grad_norm": 5.364495754241943, "learning_rate": 2.421744186183433e-05, "loss": 0.3919, "num_input_tokens_seen": 65449968, "step": 68525 }, { "epoch": 5.590178644261359, "grad_norm": 0.8995818495750427, "learning_rate": 2.4213884332509727e-05, "loss": 0.3438, "num_input_tokens_seen": 65454752, "step": 68530 }, { "epoch": 5.590586507871768, "grad_norm": 12.312923431396484, "learning_rate": 2.4210326819119348e-05, "loss": 0.3163, "num_input_tokens_seen": 65459984, "step": 68535 }, { "epoch": 5.590994371482177, "grad_norm": 8.02969741821289, "learning_rate": 2.4206769321735313e-05, "loss": 0.2248, "num_input_tokens_seen": 65464496, "step": 68540 }, { "epoch": 5.591402235092585, "grad_norm": 9.900398254394531, "learning_rate": 2.4203211840429725e-05, "loss": 0.4523, "num_input_tokens_seen": 65469408, "step": 68545 }, { "epoch": 5.591810098702994, "grad_norm": 1.1812902688980103, "learning_rate": 2.41996543752747e-05, "loss": 0.2691, "num_input_tokens_seen": 65473920, "step": 68550 }, { "epoch": 5.592217962313402, "grad_norm": 6.062137126922607, "learning_rate": 2.4196096926342325e-05, "loss": 0.4134, "num_input_tokens_seen": 65478496, "step": 68555 }, { "epoch": 5.592625825923811, "grad_norm": 7.247558116912842, "learning_rate": 2.419253949370474e-05, "loss": 0.4024, "num_input_tokens_seen": 65482704, "step": 68560 }, { "epoch": 5.59303368953422, "grad_norm": 7.195115089416504, "learning_rate": 2.418898207743403e-05, "loss": 0.3184, "num_input_tokens_seen": 65487472, "step": 68565 }, { "epoch": 5.593441553144628, "grad_norm": 6.804939270019531, "learning_rate": 2.4185424677602312e-05, "loss": 0.3626, "num_input_tokens_seen": 65492192, "step": 68570 }, { "epoch": 5.593849416755037, "grad_norm": 1.5932743549346924, "learning_rate": 2.4181867294281688e-05, "loss": 0.2119, "num_input_tokens_seen": 65497712, "step": 68575 }, { "epoch": 5.594257280365445, "grad_norm": 1.9262645244598389, "learning_rate": 2.4178309927544255e-05, "loss": 0.2496, "num_input_tokens_seen": 65502544, "step": 68580 }, { "epoch": 5.594665143975854, "grad_norm": 8.249340057373047, "learning_rate": 2.4174752577462147e-05, "loss": 0.3286, "num_input_tokens_seen": 65506816, "step": 68585 }, { "epoch": 5.595073007586263, "grad_norm": 1.260935664176941, "learning_rate": 2.4171195244107448e-05, "loss": 0.3085, "num_input_tokens_seen": 65511312, "step": 68590 }, { "epoch": 5.5954808711966715, "grad_norm": 7.2512946128845215, "learning_rate": 2.416763792755227e-05, "loss": 0.2574, "num_input_tokens_seen": 65516320, "step": 68595 }, { "epoch": 5.5958887348070805, "grad_norm": 10.481937408447266, "learning_rate": 2.416408062786871e-05, "loss": 0.3225, "num_input_tokens_seen": 65521104, "step": 68600 }, { "epoch": 5.5962965984174895, "grad_norm": 9.40831184387207, "learning_rate": 2.416052334512889e-05, "loss": 0.286, "num_input_tokens_seen": 65525840, "step": 68605 }, { "epoch": 5.596704462027898, "grad_norm": 0.6773804426193237, "learning_rate": 2.4156966079404907e-05, "loss": 0.1493, "num_input_tokens_seen": 65531104, "step": 68610 }, { "epoch": 5.597112325638307, "grad_norm": 1.062601089477539, "learning_rate": 2.4153408830768858e-05, "loss": 0.2066, "num_input_tokens_seen": 65534992, "step": 68615 }, { "epoch": 5.597520189248716, "grad_norm": 3.539592981338501, "learning_rate": 2.4149851599292846e-05, "loss": 0.4348, "num_input_tokens_seen": 65539088, "step": 68620 }, { "epoch": 5.597928052859124, "grad_norm": 0.8251657485961914, "learning_rate": 2.4146294385048987e-05, "loss": 0.4009, "num_input_tokens_seen": 65543920, "step": 68625 }, { "epoch": 5.598335916469533, "grad_norm": 1.5586568117141724, "learning_rate": 2.4142737188109378e-05, "loss": 0.3682, "num_input_tokens_seen": 65549504, "step": 68630 }, { "epoch": 5.598743780079941, "grad_norm": 0.7820638418197632, "learning_rate": 2.4139180008546125e-05, "loss": 0.1877, "num_input_tokens_seen": 65554416, "step": 68635 }, { "epoch": 5.59915164369035, "grad_norm": 5.288365840911865, "learning_rate": 2.4135622846431327e-05, "loss": 0.3988, "num_input_tokens_seen": 65558768, "step": 68640 }, { "epoch": 5.599559507300759, "grad_norm": 0.8311864137649536, "learning_rate": 2.4132065701837077e-05, "loss": 0.4014, "num_input_tokens_seen": 65562768, "step": 68645 }, { "epoch": 5.599967370911167, "grad_norm": 4.0044474601745605, "learning_rate": 2.4128508574835494e-05, "loss": 0.28, "num_input_tokens_seen": 65567760, "step": 68650 }, { "epoch": 5.600375234521576, "grad_norm": 6.285884380340576, "learning_rate": 2.4124951465498675e-05, "loss": 0.4895, "num_input_tokens_seen": 65572944, "step": 68655 }, { "epoch": 5.600783098131985, "grad_norm": 1.541030764579773, "learning_rate": 2.4121394373898716e-05, "loss": 0.3321, "num_input_tokens_seen": 65577760, "step": 68660 }, { "epoch": 5.601190961742393, "grad_norm": 9.083964347839355, "learning_rate": 2.4117837300107716e-05, "loss": 0.3246, "num_input_tokens_seen": 65582608, "step": 68665 }, { "epoch": 5.601598825352802, "grad_norm": 3.081810712814331, "learning_rate": 2.4114280244197782e-05, "loss": 0.4361, "num_input_tokens_seen": 65586672, "step": 68670 }, { "epoch": 5.602006688963211, "grad_norm": 6.035749912261963, "learning_rate": 2.4110723206241012e-05, "loss": 0.3115, "num_input_tokens_seen": 65590880, "step": 68675 }, { "epoch": 5.602414552573619, "grad_norm": 4.917010307312012, "learning_rate": 2.4107166186309503e-05, "loss": 0.3253, "num_input_tokens_seen": 65595568, "step": 68680 }, { "epoch": 5.602822416184028, "grad_norm": 6.196191310882568, "learning_rate": 2.410360918447536e-05, "loss": 0.3888, "num_input_tokens_seen": 65600816, "step": 68685 }, { "epoch": 5.603230279794436, "grad_norm": 7.500821113586426, "learning_rate": 2.4100052200810667e-05, "loss": 0.3646, "num_input_tokens_seen": 65604880, "step": 68690 }, { "epoch": 5.6036381434048455, "grad_norm": 1.011026382446289, "learning_rate": 2.409649523538754e-05, "loss": 0.2418, "num_input_tokens_seen": 65609920, "step": 68695 }, { "epoch": 5.6040460070152545, "grad_norm": 1.7858859300613403, "learning_rate": 2.4092938288278073e-05, "loss": 0.2943, "num_input_tokens_seen": 65615184, "step": 68700 }, { "epoch": 5.604453870625663, "grad_norm": 8.727968215942383, "learning_rate": 2.4089381359554364e-05, "loss": 0.3977, "num_input_tokens_seen": 65619136, "step": 68705 }, { "epoch": 5.604861734236072, "grad_norm": 9.750869750976562, "learning_rate": 2.4085824449288493e-05, "loss": 0.277, "num_input_tokens_seen": 65624112, "step": 68710 }, { "epoch": 5.60526959784648, "grad_norm": 1.6455118656158447, "learning_rate": 2.4082267557552585e-05, "loss": 0.2604, "num_input_tokens_seen": 65628368, "step": 68715 }, { "epoch": 5.605677461456889, "grad_norm": 3.038731098175049, "learning_rate": 2.4078710684418723e-05, "loss": 0.5084, "num_input_tokens_seen": 65632944, "step": 68720 }, { "epoch": 5.606085325067298, "grad_norm": 10.216174125671387, "learning_rate": 2.4075153829959004e-05, "loss": 0.3178, "num_input_tokens_seen": 65638208, "step": 68725 }, { "epoch": 5.606493188677706, "grad_norm": 5.1129841804504395, "learning_rate": 2.4071596994245523e-05, "loss": 0.3183, "num_input_tokens_seen": 65642704, "step": 68730 }, { "epoch": 5.606901052288115, "grad_norm": 1.3479896783828735, "learning_rate": 2.4068040177350373e-05, "loss": 0.3099, "num_input_tokens_seen": 65648064, "step": 68735 }, { "epoch": 5.607308915898524, "grad_norm": 3.06309175491333, "learning_rate": 2.4064483379345657e-05, "loss": 0.3476, "num_input_tokens_seen": 65651680, "step": 68740 }, { "epoch": 5.607716779508932, "grad_norm": 11.38813304901123, "learning_rate": 2.4060926600303464e-05, "loss": 0.4353, "num_input_tokens_seen": 65656752, "step": 68745 }, { "epoch": 5.608124643119341, "grad_norm": 0.6875945329666138, "learning_rate": 2.4057369840295894e-05, "loss": 0.3774, "num_input_tokens_seen": 65661488, "step": 68750 }, { "epoch": 5.60853250672975, "grad_norm": 1.3458995819091797, "learning_rate": 2.405381309939502e-05, "loss": 0.2669, "num_input_tokens_seen": 65666208, "step": 68755 }, { "epoch": 5.608940370340158, "grad_norm": 7.873947620391846, "learning_rate": 2.4050256377672966e-05, "loss": 0.3451, "num_input_tokens_seen": 65671216, "step": 68760 }, { "epoch": 5.609348233950567, "grad_norm": 4.7009806632995605, "learning_rate": 2.4046699675201813e-05, "loss": 0.3256, "num_input_tokens_seen": 65676144, "step": 68765 }, { "epoch": 5.609756097560975, "grad_norm": 0.8431392312049866, "learning_rate": 2.4043142992053653e-05, "loss": 0.2789, "num_input_tokens_seen": 65681856, "step": 68770 }, { "epoch": 5.610163961171384, "grad_norm": 1.1504905223846436, "learning_rate": 2.4039586328300576e-05, "loss": 0.3116, "num_input_tokens_seen": 65686224, "step": 68775 }, { "epoch": 5.610571824781793, "grad_norm": 4.877447128295898, "learning_rate": 2.4036029684014664e-05, "loss": 0.4473, "num_input_tokens_seen": 65690768, "step": 68780 }, { "epoch": 5.610979688392201, "grad_norm": 1.8588552474975586, "learning_rate": 2.4032473059268034e-05, "loss": 0.3948, "num_input_tokens_seen": 65695488, "step": 68785 }, { "epoch": 5.61138755200261, "grad_norm": 1.1830189228057861, "learning_rate": 2.4028916454132764e-05, "loss": 0.2779, "num_input_tokens_seen": 65699632, "step": 68790 }, { "epoch": 5.6117954156130185, "grad_norm": 5.30496883392334, "learning_rate": 2.402535986868094e-05, "loss": 0.3602, "num_input_tokens_seen": 65704400, "step": 68795 }, { "epoch": 5.6122032792234275, "grad_norm": 1.875902771949768, "learning_rate": 2.4021803302984657e-05, "loss": 0.3007, "num_input_tokens_seen": 65708688, "step": 68800 }, { "epoch": 5.6126111428338366, "grad_norm": 19.11481475830078, "learning_rate": 2.401824675711601e-05, "loss": 0.419, "num_input_tokens_seen": 65713680, "step": 68805 }, { "epoch": 5.613019006444245, "grad_norm": 2.122711181640625, "learning_rate": 2.4014690231147087e-05, "loss": 0.3061, "num_input_tokens_seen": 65717552, "step": 68810 }, { "epoch": 5.613426870054654, "grad_norm": 1.6726280450820923, "learning_rate": 2.4011133725149966e-05, "loss": 0.3473, "num_input_tokens_seen": 65722032, "step": 68815 }, { "epoch": 5.613834733665063, "grad_norm": 1.3950176239013672, "learning_rate": 2.400757723919674e-05, "loss": 0.3811, "num_input_tokens_seen": 65727008, "step": 68820 }, { "epoch": 5.614242597275471, "grad_norm": 1.741388201713562, "learning_rate": 2.4004020773359513e-05, "loss": 0.251, "num_input_tokens_seen": 65732416, "step": 68825 }, { "epoch": 5.61465046088588, "grad_norm": 1.7753112316131592, "learning_rate": 2.400046432771036e-05, "loss": 0.3556, "num_input_tokens_seen": 65736768, "step": 68830 }, { "epoch": 5.615058324496289, "grad_norm": 0.6295148730278015, "learning_rate": 2.399690790232137e-05, "loss": 0.3014, "num_input_tokens_seen": 65741696, "step": 68835 }, { "epoch": 5.615466188106697, "grad_norm": 2.818726062774658, "learning_rate": 2.399335149726463e-05, "loss": 0.3916, "num_input_tokens_seen": 65746464, "step": 68840 }, { "epoch": 5.615874051717106, "grad_norm": 2.1651928424835205, "learning_rate": 2.3989795112612215e-05, "loss": 0.2626, "num_input_tokens_seen": 65751088, "step": 68845 }, { "epoch": 5.616281915327514, "grad_norm": 2.030330181121826, "learning_rate": 2.3986238748436237e-05, "loss": 0.4331, "num_input_tokens_seen": 65755648, "step": 68850 }, { "epoch": 5.616689778937923, "grad_norm": 11.952516555786133, "learning_rate": 2.398268240480877e-05, "loss": 0.346, "num_input_tokens_seen": 65760480, "step": 68855 }, { "epoch": 5.617097642548332, "grad_norm": 1.3623255491256714, "learning_rate": 2.3979126081801895e-05, "loss": 0.2927, "num_input_tokens_seen": 65765424, "step": 68860 }, { "epoch": 5.61750550615874, "grad_norm": 2.087095022201538, "learning_rate": 2.3975569779487697e-05, "loss": 0.2905, "num_input_tokens_seen": 65770432, "step": 68865 }, { "epoch": 5.617913369769149, "grad_norm": 8.121166229248047, "learning_rate": 2.397201349793827e-05, "loss": 0.2661, "num_input_tokens_seen": 65774944, "step": 68870 }, { "epoch": 5.618321233379558, "grad_norm": 1.8635082244873047, "learning_rate": 2.396845723722569e-05, "loss": 0.2592, "num_input_tokens_seen": 65779264, "step": 68875 }, { "epoch": 5.618729096989966, "grad_norm": 1.2238681316375732, "learning_rate": 2.3964900997422043e-05, "loss": 0.2995, "num_input_tokens_seen": 65784224, "step": 68880 }, { "epoch": 5.619136960600375, "grad_norm": 1.318955659866333, "learning_rate": 2.3961344778599415e-05, "loss": 0.2912, "num_input_tokens_seen": 65789872, "step": 68885 }, { "epoch": 5.619544824210784, "grad_norm": 2.644362449645996, "learning_rate": 2.3957788580829882e-05, "loss": 0.3149, "num_input_tokens_seen": 65794416, "step": 68890 }, { "epoch": 5.6199526878211925, "grad_norm": 4.427544116973877, "learning_rate": 2.3954232404185538e-05, "loss": 0.3893, "num_input_tokens_seen": 65799728, "step": 68895 }, { "epoch": 5.6203605514316015, "grad_norm": 0.8247110247612, "learning_rate": 2.3950676248738457e-05, "loss": 0.3512, "num_input_tokens_seen": 65804752, "step": 68900 }, { "epoch": 5.62076841504201, "grad_norm": 0.6308794021606445, "learning_rate": 2.3947120114560726e-05, "loss": 0.255, "num_input_tokens_seen": 65809120, "step": 68905 }, { "epoch": 5.621176278652419, "grad_norm": 1.1064928770065308, "learning_rate": 2.3943564001724407e-05, "loss": 0.2891, "num_input_tokens_seen": 65813824, "step": 68910 }, { "epoch": 5.621584142262828, "grad_norm": 5.237583160400391, "learning_rate": 2.3940007910301613e-05, "loss": 0.1758, "num_input_tokens_seen": 65818256, "step": 68915 }, { "epoch": 5.621992005873236, "grad_norm": 2.5508217811584473, "learning_rate": 2.3936451840364406e-05, "loss": 0.4148, "num_input_tokens_seen": 65823184, "step": 68920 }, { "epoch": 5.622399869483645, "grad_norm": 0.4937514364719391, "learning_rate": 2.3932895791984868e-05, "loss": 0.298, "num_input_tokens_seen": 65828704, "step": 68925 }, { "epoch": 5.622807733094053, "grad_norm": 1.9905911684036255, "learning_rate": 2.3929339765235075e-05, "loss": 0.5453, "num_input_tokens_seen": 65833792, "step": 68930 }, { "epoch": 5.623215596704462, "grad_norm": 5.365468502044678, "learning_rate": 2.392578376018711e-05, "loss": 0.431, "num_input_tokens_seen": 65838480, "step": 68935 }, { "epoch": 5.623623460314871, "grad_norm": 1.4341769218444824, "learning_rate": 2.392222777691305e-05, "loss": 0.4059, "num_input_tokens_seen": 65842912, "step": 68940 }, { "epoch": 5.624031323925279, "grad_norm": 1.9600448608398438, "learning_rate": 2.391867181548498e-05, "loss": 0.3953, "num_input_tokens_seen": 65847968, "step": 68945 }, { "epoch": 5.624439187535688, "grad_norm": 2.5753259658813477, "learning_rate": 2.391511587597497e-05, "loss": 0.3969, "num_input_tokens_seen": 65853488, "step": 68950 }, { "epoch": 5.624847051146097, "grad_norm": 5.283260345458984, "learning_rate": 2.39115599584551e-05, "loss": 0.2679, "num_input_tokens_seen": 65858912, "step": 68955 }, { "epoch": 5.625254914756505, "grad_norm": 13.263004302978516, "learning_rate": 2.3908004062997452e-05, "loss": 0.3565, "num_input_tokens_seen": 65863264, "step": 68960 }, { "epoch": 5.625662778366914, "grad_norm": 4.367330551147461, "learning_rate": 2.3904448189674095e-05, "loss": 0.4231, "num_input_tokens_seen": 65868368, "step": 68965 }, { "epoch": 5.626070641977323, "grad_norm": 2.205082416534424, "learning_rate": 2.390089233855711e-05, "loss": 0.3261, "num_input_tokens_seen": 65872352, "step": 68970 }, { "epoch": 5.626478505587731, "grad_norm": 11.224726676940918, "learning_rate": 2.3897336509718567e-05, "loss": 0.485, "num_input_tokens_seen": 65875936, "step": 68975 }, { "epoch": 5.62688636919814, "grad_norm": 2.298940896987915, "learning_rate": 2.3893780703230538e-05, "loss": 0.4245, "num_input_tokens_seen": 65880224, "step": 68980 }, { "epoch": 5.6272942328085485, "grad_norm": 0.7861528396606445, "learning_rate": 2.389022491916511e-05, "loss": 0.3163, "num_input_tokens_seen": 65885760, "step": 68985 }, { "epoch": 5.6277020964189575, "grad_norm": 0.7494428753852844, "learning_rate": 2.3886669157594354e-05, "loss": 0.306, "num_input_tokens_seen": 65890688, "step": 68990 }, { "epoch": 5.6281099600293665, "grad_norm": 4.343667984008789, "learning_rate": 2.388311341859034e-05, "loss": 0.3297, "num_input_tokens_seen": 65895152, "step": 68995 }, { "epoch": 5.628517823639775, "grad_norm": 1.2890582084655762, "learning_rate": 2.3879557702225133e-05, "loss": 0.3511, "num_input_tokens_seen": 65900304, "step": 69000 }, { "epoch": 5.628925687250184, "grad_norm": 1.7656898498535156, "learning_rate": 2.3876002008570827e-05, "loss": 0.3017, "num_input_tokens_seen": 65904864, "step": 69005 }, { "epoch": 5.629333550860592, "grad_norm": 2.343724250793457, "learning_rate": 2.387244633769948e-05, "loss": 0.3729, "num_input_tokens_seen": 65909312, "step": 69010 }, { "epoch": 5.629741414471001, "grad_norm": 0.814186155796051, "learning_rate": 2.3868890689683168e-05, "loss": 0.2762, "num_input_tokens_seen": 65912864, "step": 69015 }, { "epoch": 5.63014927808141, "grad_norm": 7.328980445861816, "learning_rate": 2.3865335064593956e-05, "loss": 0.3141, "num_input_tokens_seen": 65916960, "step": 69020 }, { "epoch": 5.630557141691818, "grad_norm": 7.999710559844971, "learning_rate": 2.3861779462503925e-05, "loss": 0.3658, "num_input_tokens_seen": 65921776, "step": 69025 }, { "epoch": 5.630965005302227, "grad_norm": 3.451054573059082, "learning_rate": 2.3858223883485143e-05, "loss": 0.2954, "num_input_tokens_seen": 65926288, "step": 69030 }, { "epoch": 5.631372868912636, "grad_norm": 4.523022174835205, "learning_rate": 2.3854668327609674e-05, "loss": 0.3961, "num_input_tokens_seen": 65931360, "step": 69035 }, { "epoch": 5.631780732523044, "grad_norm": 1.1381211280822754, "learning_rate": 2.385111279494959e-05, "loss": 0.2288, "num_input_tokens_seen": 65936336, "step": 69040 }, { "epoch": 5.632188596133453, "grad_norm": 14.739758491516113, "learning_rate": 2.3847557285576956e-05, "loss": 0.4125, "num_input_tokens_seen": 65941712, "step": 69045 }, { "epoch": 5.632596459743862, "grad_norm": 2.909487247467041, "learning_rate": 2.3844001799563852e-05, "loss": 0.31, "num_input_tokens_seen": 65945712, "step": 69050 }, { "epoch": 5.63300432335427, "grad_norm": 1.8638951778411865, "learning_rate": 2.3840446336982343e-05, "loss": 0.3313, "num_input_tokens_seen": 65950240, "step": 69055 }, { "epoch": 5.633412186964679, "grad_norm": 9.856712341308594, "learning_rate": 2.3836890897904497e-05, "loss": 0.2708, "num_input_tokens_seen": 65955600, "step": 69060 }, { "epoch": 5.633820050575087, "grad_norm": 1.0464465618133545, "learning_rate": 2.383333548240236e-05, "loss": 0.3327, "num_input_tokens_seen": 65961168, "step": 69065 }, { "epoch": 5.634227914185496, "grad_norm": 11.597063064575195, "learning_rate": 2.3829780090548034e-05, "loss": 0.2299, "num_input_tokens_seen": 65966160, "step": 69070 }, { "epoch": 5.634635777795905, "grad_norm": 5.490277290344238, "learning_rate": 2.3826224722413565e-05, "loss": 0.3949, "num_input_tokens_seen": 65971488, "step": 69075 }, { "epoch": 5.635043641406313, "grad_norm": 2.8397629261016846, "learning_rate": 2.3822669378071025e-05, "loss": 0.4409, "num_input_tokens_seen": 65976400, "step": 69080 }, { "epoch": 5.635451505016722, "grad_norm": 8.328126907348633, "learning_rate": 2.3819114057592472e-05, "loss": 0.3145, "num_input_tokens_seen": 65981440, "step": 69085 }, { "epoch": 5.635859368627131, "grad_norm": 1.1616894006729126, "learning_rate": 2.3815558761049973e-05, "loss": 0.3167, "num_input_tokens_seen": 65986720, "step": 69090 }, { "epoch": 5.63626723223754, "grad_norm": 1.901389718055725, "learning_rate": 2.3812003488515603e-05, "loss": 0.3471, "num_input_tokens_seen": 65992160, "step": 69095 }, { "epoch": 5.636675095847949, "grad_norm": 2.068720579147339, "learning_rate": 2.380844824006141e-05, "loss": 0.4431, "num_input_tokens_seen": 65997040, "step": 69100 }, { "epoch": 5.637082959458358, "grad_norm": 0.6881092190742493, "learning_rate": 2.3804893015759472e-05, "loss": 0.3374, "num_input_tokens_seen": 66001984, "step": 69105 }, { "epoch": 5.637490823068766, "grad_norm": 2.3507721424102783, "learning_rate": 2.380133781568183e-05, "loss": 0.3246, "num_input_tokens_seen": 66007328, "step": 69110 }, { "epoch": 5.637898686679175, "grad_norm": 3.4621543884277344, "learning_rate": 2.3797782639900573e-05, "loss": 0.2748, "num_input_tokens_seen": 66011296, "step": 69115 }, { "epoch": 5.638306550289583, "grad_norm": 3.0861692428588867, "learning_rate": 2.379422748848775e-05, "loss": 0.2891, "num_input_tokens_seen": 66016384, "step": 69120 }, { "epoch": 5.638714413899992, "grad_norm": 1.3829922676086426, "learning_rate": 2.3790672361515426e-05, "loss": 0.2836, "num_input_tokens_seen": 66021216, "step": 69125 }, { "epoch": 5.639122277510401, "grad_norm": 28.816490173339844, "learning_rate": 2.3787117259055657e-05, "loss": 0.3306, "num_input_tokens_seen": 66026320, "step": 69130 }, { "epoch": 5.639530141120809, "grad_norm": 1.844193696975708, "learning_rate": 2.3783562181180496e-05, "loss": 0.342, "num_input_tokens_seen": 66031472, "step": 69135 }, { "epoch": 5.639938004731218, "grad_norm": 0.6859123706817627, "learning_rate": 2.378000712796202e-05, "loss": 0.3186, "num_input_tokens_seen": 66036864, "step": 69140 }, { "epoch": 5.640345868341626, "grad_norm": 3.556368827819824, "learning_rate": 2.3776452099472287e-05, "loss": 0.3641, "num_input_tokens_seen": 66041264, "step": 69145 }, { "epoch": 5.640753731952035, "grad_norm": 1.8695201873779297, "learning_rate": 2.377289709578335e-05, "loss": 0.2951, "num_input_tokens_seen": 66045856, "step": 69150 }, { "epoch": 5.641161595562444, "grad_norm": 6.141520023345947, "learning_rate": 2.3769342116967255e-05, "loss": 0.3755, "num_input_tokens_seen": 66051200, "step": 69155 }, { "epoch": 5.641569459172852, "grad_norm": 14.02212142944336, "learning_rate": 2.3765787163096085e-05, "loss": 0.3695, "num_input_tokens_seen": 66055184, "step": 69160 }, { "epoch": 5.641977322783261, "grad_norm": 7.03411865234375, "learning_rate": 2.376223223424188e-05, "loss": 0.2576, "num_input_tokens_seen": 66059984, "step": 69165 }, { "epoch": 5.64238518639367, "grad_norm": 14.612646102905273, "learning_rate": 2.37586773304767e-05, "loss": 0.3256, "num_input_tokens_seen": 66064592, "step": 69170 }, { "epoch": 5.642793050004078, "grad_norm": 0.7407358288764954, "learning_rate": 2.375512245187261e-05, "loss": 0.3273, "num_input_tokens_seen": 66068688, "step": 69175 }, { "epoch": 5.643200913614487, "grad_norm": 12.734721183776855, "learning_rate": 2.3751567598501644e-05, "loss": 0.1871, "num_input_tokens_seen": 66073440, "step": 69180 }, { "epoch": 5.643608777224896, "grad_norm": 1.9813226461410522, "learning_rate": 2.3748012770435883e-05, "loss": 0.2511, "num_input_tokens_seen": 66077408, "step": 69185 }, { "epoch": 5.6440166408353045, "grad_norm": 8.482022285461426, "learning_rate": 2.3744457967747375e-05, "loss": 0.434, "num_input_tokens_seen": 66082400, "step": 69190 }, { "epoch": 5.6444245044457135, "grad_norm": 16.52780532836914, "learning_rate": 2.3740903190508165e-05, "loss": 0.2003, "num_input_tokens_seen": 66086912, "step": 69195 }, { "epoch": 5.644832368056122, "grad_norm": 6.415302753448486, "learning_rate": 2.3737348438790306e-05, "loss": 0.3885, "num_input_tokens_seen": 66092160, "step": 69200 }, { "epoch": 5.645240231666531, "grad_norm": 9.5619478225708, "learning_rate": 2.3733793712665865e-05, "loss": 0.3889, "num_input_tokens_seen": 66096912, "step": 69205 }, { "epoch": 5.64564809527694, "grad_norm": 1.5692920684814453, "learning_rate": 2.373023901220689e-05, "loss": 0.4115, "num_input_tokens_seen": 66101888, "step": 69210 }, { "epoch": 5.646055958887348, "grad_norm": 2.1815192699432373, "learning_rate": 2.3726684337485428e-05, "loss": 0.4693, "num_input_tokens_seen": 66106768, "step": 69215 }, { "epoch": 5.646463822497757, "grad_norm": 11.901829719543457, "learning_rate": 2.372312968857353e-05, "loss": 0.5652, "num_input_tokens_seen": 66111008, "step": 69220 }, { "epoch": 5.646871686108166, "grad_norm": 2.1834120750427246, "learning_rate": 2.3719575065543254e-05, "loss": 0.5065, "num_input_tokens_seen": 66116160, "step": 69225 }, { "epoch": 5.647279549718574, "grad_norm": 1.0979108810424805, "learning_rate": 2.3716020468466648e-05, "loss": 0.4015, "num_input_tokens_seen": 66120176, "step": 69230 }, { "epoch": 5.647687413328983, "grad_norm": 7.761170864105225, "learning_rate": 2.3712465897415766e-05, "loss": 0.3092, "num_input_tokens_seen": 66125248, "step": 69235 }, { "epoch": 5.648095276939392, "grad_norm": 6.665670394897461, "learning_rate": 2.3708911352462647e-05, "loss": 0.4113, "num_input_tokens_seen": 66130640, "step": 69240 }, { "epoch": 5.6485031405498, "grad_norm": 0.4166010916233063, "learning_rate": 2.370535683367934e-05, "loss": 0.2369, "num_input_tokens_seen": 66135360, "step": 69245 }, { "epoch": 5.648911004160209, "grad_norm": 1.2440153360366821, "learning_rate": 2.3701802341137908e-05, "loss": 0.3871, "num_input_tokens_seen": 66140624, "step": 69250 }, { "epoch": 5.649318867770617, "grad_norm": 0.9005221128463745, "learning_rate": 2.369824787491039e-05, "loss": 0.3259, "num_input_tokens_seen": 66145328, "step": 69255 }, { "epoch": 5.649726731381026, "grad_norm": 1.0336858034133911, "learning_rate": 2.3694693435068834e-05, "loss": 0.3723, "num_input_tokens_seen": 66150688, "step": 69260 }, { "epoch": 5.650134594991435, "grad_norm": 7.830057621002197, "learning_rate": 2.369113902168528e-05, "loss": 0.3283, "num_input_tokens_seen": 66155680, "step": 69265 }, { "epoch": 5.650542458601843, "grad_norm": 2.9493014812469482, "learning_rate": 2.3687584634831793e-05, "loss": 0.3139, "num_input_tokens_seen": 66160624, "step": 69270 }, { "epoch": 5.650950322212252, "grad_norm": 14.317892074584961, "learning_rate": 2.3684030274580405e-05, "loss": 0.3257, "num_input_tokens_seen": 66165296, "step": 69275 }, { "epoch": 5.6513581858226605, "grad_norm": 35.575016021728516, "learning_rate": 2.3680475941003165e-05, "loss": 0.4314, "num_input_tokens_seen": 66169728, "step": 69280 }, { "epoch": 5.6517660494330695, "grad_norm": 1.160559892654419, "learning_rate": 2.3676921634172115e-05, "loss": 0.2514, "num_input_tokens_seen": 66174880, "step": 69285 }, { "epoch": 5.6521739130434785, "grad_norm": 9.110347747802734, "learning_rate": 2.3673367354159297e-05, "loss": 0.3259, "num_input_tokens_seen": 66179968, "step": 69290 }, { "epoch": 5.652581776653887, "grad_norm": 1.501822590827942, "learning_rate": 2.3669813101036767e-05, "loss": 0.2926, "num_input_tokens_seen": 66184480, "step": 69295 }, { "epoch": 5.652989640264296, "grad_norm": 1.540284276008606, "learning_rate": 2.366625887487656e-05, "loss": 0.5299, "num_input_tokens_seen": 66189552, "step": 69300 }, { "epoch": 5.653397503874705, "grad_norm": 7.272968769073486, "learning_rate": 2.3662704675750716e-05, "loss": 0.2801, "num_input_tokens_seen": 66193600, "step": 69305 }, { "epoch": 5.653805367485113, "grad_norm": 0.7220526933670044, "learning_rate": 2.365915050373127e-05, "loss": 0.344, "num_input_tokens_seen": 66197472, "step": 69310 }, { "epoch": 5.654213231095522, "grad_norm": 1.6457139253616333, "learning_rate": 2.3655596358890286e-05, "loss": 0.4364, "num_input_tokens_seen": 66202384, "step": 69315 }, { "epoch": 5.654621094705931, "grad_norm": 1.7714293003082275, "learning_rate": 2.365204224129979e-05, "loss": 0.5211, "num_input_tokens_seen": 66207552, "step": 69320 }, { "epoch": 5.655028958316339, "grad_norm": 0.7783786058425903, "learning_rate": 2.364848815103183e-05, "loss": 0.2871, "num_input_tokens_seen": 66212160, "step": 69325 }, { "epoch": 5.655436821926748, "grad_norm": 2.3102056980133057, "learning_rate": 2.3644934088158435e-05, "loss": 0.3969, "num_input_tokens_seen": 66216768, "step": 69330 }, { "epoch": 5.655844685537156, "grad_norm": 1.349057674407959, "learning_rate": 2.3641380052751642e-05, "loss": 0.3239, "num_input_tokens_seen": 66221024, "step": 69335 }, { "epoch": 5.656252549147565, "grad_norm": 9.144198417663574, "learning_rate": 2.3637826044883507e-05, "loss": 0.5884, "num_input_tokens_seen": 66225856, "step": 69340 }, { "epoch": 5.656660412757974, "grad_norm": 0.7118490934371948, "learning_rate": 2.363427206462606e-05, "loss": 0.2742, "num_input_tokens_seen": 66230208, "step": 69345 }, { "epoch": 5.657068276368382, "grad_norm": 1.02045738697052, "learning_rate": 2.363071811205134e-05, "loss": 0.2804, "num_input_tokens_seen": 66235264, "step": 69350 }, { "epoch": 5.657476139978791, "grad_norm": 5.741731643676758, "learning_rate": 2.3627164187231376e-05, "loss": 0.3116, "num_input_tokens_seen": 66239520, "step": 69355 }, { "epoch": 5.657884003589199, "grad_norm": 9.98479175567627, "learning_rate": 2.3623610290238217e-05, "loss": 0.283, "num_input_tokens_seen": 66244016, "step": 69360 }, { "epoch": 5.658291867199608, "grad_norm": 2.934364080429077, "learning_rate": 2.3620056421143888e-05, "loss": 0.2876, "num_input_tokens_seen": 66247824, "step": 69365 }, { "epoch": 5.658699730810017, "grad_norm": 1.1143012046813965, "learning_rate": 2.3616502580020433e-05, "loss": 0.3271, "num_input_tokens_seen": 66253264, "step": 69370 }, { "epoch": 5.659107594420425, "grad_norm": 0.6313972473144531, "learning_rate": 2.361294876693987e-05, "loss": 0.3352, "num_input_tokens_seen": 66258256, "step": 69375 }, { "epoch": 5.6595154580308344, "grad_norm": 4.567271709442139, "learning_rate": 2.360939498197426e-05, "loss": 0.2604, "num_input_tokens_seen": 66263792, "step": 69380 }, { "epoch": 5.6599233216412435, "grad_norm": 1.5469577312469482, "learning_rate": 2.3605841225195617e-05, "loss": 0.2837, "num_input_tokens_seen": 66267968, "step": 69385 }, { "epoch": 5.660331185251652, "grad_norm": 17.86802864074707, "learning_rate": 2.3602287496675984e-05, "loss": 0.4957, "num_input_tokens_seen": 66272640, "step": 69390 }, { "epoch": 5.660739048862061, "grad_norm": 7.353886127471924, "learning_rate": 2.359873379648739e-05, "loss": 0.2698, "num_input_tokens_seen": 66276960, "step": 69395 }, { "epoch": 5.66114691247247, "grad_norm": 0.7238922715187073, "learning_rate": 2.3595180124701853e-05, "loss": 0.329, "num_input_tokens_seen": 66282960, "step": 69400 }, { "epoch": 5.661554776082878, "grad_norm": 0.5501623749732971, "learning_rate": 2.3591626481391434e-05, "loss": 0.2251, "num_input_tokens_seen": 66288016, "step": 69405 }, { "epoch": 5.661962639693287, "grad_norm": 4.483903884887695, "learning_rate": 2.3588072866628142e-05, "loss": 0.4193, "num_input_tokens_seen": 66292656, "step": 69410 }, { "epoch": 5.662370503303695, "grad_norm": 5.701106071472168, "learning_rate": 2.3584519280484016e-05, "loss": 0.1917, "num_input_tokens_seen": 66297808, "step": 69415 }, { "epoch": 5.662778366914104, "grad_norm": 12.552968978881836, "learning_rate": 2.3580965723031075e-05, "loss": 0.2074, "num_input_tokens_seen": 66302640, "step": 69420 }, { "epoch": 5.663186230524513, "grad_norm": 1.5522196292877197, "learning_rate": 2.3577412194341364e-05, "loss": 0.4046, "num_input_tokens_seen": 66307472, "step": 69425 }, { "epoch": 5.663594094134921, "grad_norm": 2.2724013328552246, "learning_rate": 2.3573858694486904e-05, "loss": 0.3841, "num_input_tokens_seen": 66312080, "step": 69430 }, { "epoch": 5.66400195774533, "grad_norm": 25.000314712524414, "learning_rate": 2.3570305223539717e-05, "loss": 0.4248, "num_input_tokens_seen": 66316912, "step": 69435 }, { "epoch": 5.664409821355739, "grad_norm": 7.110774517059326, "learning_rate": 2.3566751781571838e-05, "loss": 0.337, "num_input_tokens_seen": 66321936, "step": 69440 }, { "epoch": 5.664817684966147, "grad_norm": 0.6307262182235718, "learning_rate": 2.3563198368655284e-05, "loss": 0.38, "num_input_tokens_seen": 66326400, "step": 69445 }, { "epoch": 5.665225548576556, "grad_norm": 39.31126403808594, "learning_rate": 2.3559644984862095e-05, "loss": 0.387, "num_input_tokens_seen": 66330352, "step": 69450 }, { "epoch": 5.665633412186965, "grad_norm": 1.6959812641143799, "learning_rate": 2.3556091630264292e-05, "loss": 0.3725, "num_input_tokens_seen": 66335440, "step": 69455 }, { "epoch": 5.666041275797373, "grad_norm": 0.7037205100059509, "learning_rate": 2.35525383049339e-05, "loss": 0.2998, "num_input_tokens_seen": 66339568, "step": 69460 }, { "epoch": 5.666449139407782, "grad_norm": 3.0954244136810303, "learning_rate": 2.354898500894293e-05, "loss": 0.2832, "num_input_tokens_seen": 66344640, "step": 69465 }, { "epoch": 5.66685700301819, "grad_norm": 31.744169235229492, "learning_rate": 2.3545431742363424e-05, "loss": 0.3944, "num_input_tokens_seen": 66348992, "step": 69470 }, { "epoch": 5.667264866628599, "grad_norm": 0.560286283493042, "learning_rate": 2.35418785052674e-05, "loss": 0.3698, "num_input_tokens_seen": 66353808, "step": 69475 }, { "epoch": 5.667672730239008, "grad_norm": 5.221132755279541, "learning_rate": 2.3538325297726883e-05, "loss": 0.2332, "num_input_tokens_seen": 66357968, "step": 69480 }, { "epoch": 5.6680805938494165, "grad_norm": 12.238457679748535, "learning_rate": 2.3534772119813888e-05, "loss": 0.3636, "num_input_tokens_seen": 66362816, "step": 69485 }, { "epoch": 5.6684884574598255, "grad_norm": 0.9050720930099487, "learning_rate": 2.3531218971600437e-05, "loss": 0.3227, "num_input_tokens_seen": 66367984, "step": 69490 }, { "epoch": 5.668896321070234, "grad_norm": 8.819613456726074, "learning_rate": 2.3527665853158553e-05, "loss": 0.3424, "num_input_tokens_seen": 66372608, "step": 69495 }, { "epoch": 5.669304184680643, "grad_norm": 45.600059509277344, "learning_rate": 2.352411276456026e-05, "loss": 0.4459, "num_input_tokens_seen": 66376896, "step": 69500 }, { "epoch": 5.669712048291052, "grad_norm": 2.093024492263794, "learning_rate": 2.3520559705877577e-05, "loss": 0.603, "num_input_tokens_seen": 66381760, "step": 69505 }, { "epoch": 5.67011991190146, "grad_norm": 7.7928853034973145, "learning_rate": 2.3517006677182504e-05, "loss": 0.4449, "num_input_tokens_seen": 66386624, "step": 69510 }, { "epoch": 5.670527775511869, "grad_norm": 13.109663963317871, "learning_rate": 2.351345367854709e-05, "loss": 0.3863, "num_input_tokens_seen": 66391008, "step": 69515 }, { "epoch": 5.670935639122278, "grad_norm": 1.139274001121521, "learning_rate": 2.3509900710043335e-05, "loss": 0.3358, "num_input_tokens_seen": 66396160, "step": 69520 }, { "epoch": 5.671343502732686, "grad_norm": 21.062740325927734, "learning_rate": 2.3506347771743262e-05, "loss": 0.3821, "num_input_tokens_seen": 66400960, "step": 69525 }, { "epoch": 5.671751366343095, "grad_norm": 7.537483215332031, "learning_rate": 2.3502794863718885e-05, "loss": 0.4269, "num_input_tokens_seen": 66405952, "step": 69530 }, { "epoch": 5.672159229953504, "grad_norm": 3.2244558334350586, "learning_rate": 2.349924198604221e-05, "loss": 0.3619, "num_input_tokens_seen": 66410736, "step": 69535 }, { "epoch": 5.672567093563912, "grad_norm": 7.034311294555664, "learning_rate": 2.349568913878527e-05, "loss": 0.5152, "num_input_tokens_seen": 66416352, "step": 69540 }, { "epoch": 5.672974957174321, "grad_norm": 12.247941970825195, "learning_rate": 2.3492136322020073e-05, "loss": 0.3234, "num_input_tokens_seen": 66421504, "step": 69545 }, { "epoch": 5.673382820784729, "grad_norm": 30.098339080810547, "learning_rate": 2.348858353581863e-05, "loss": 0.4544, "num_input_tokens_seen": 66427264, "step": 69550 }, { "epoch": 5.673790684395138, "grad_norm": 0.3979002833366394, "learning_rate": 2.3485030780252956e-05, "loss": 0.4425, "num_input_tokens_seen": 66431840, "step": 69555 }, { "epoch": 5.674198548005547, "grad_norm": 28.078123092651367, "learning_rate": 2.348147805539507e-05, "loss": 0.4136, "num_input_tokens_seen": 66435840, "step": 69560 }, { "epoch": 5.674606411615955, "grad_norm": 20.75542640686035, "learning_rate": 2.3477925361316973e-05, "loss": 0.5161, "num_input_tokens_seen": 66440624, "step": 69565 }, { "epoch": 5.675014275226364, "grad_norm": 21.576702117919922, "learning_rate": 2.3474372698090684e-05, "loss": 0.4084, "num_input_tokens_seen": 66446032, "step": 69570 }, { "epoch": 5.6754221388367725, "grad_norm": 1.2666614055633545, "learning_rate": 2.3470820065788206e-05, "loss": 0.3126, "num_input_tokens_seen": 66450160, "step": 69575 }, { "epoch": 5.6758300024471815, "grad_norm": 3.9990553855895996, "learning_rate": 2.346726746448156e-05, "loss": 0.371, "num_input_tokens_seen": 66455552, "step": 69580 }, { "epoch": 5.6762378660575905, "grad_norm": 5.292226791381836, "learning_rate": 2.3463714894242754e-05, "loss": 0.6124, "num_input_tokens_seen": 66459536, "step": 69585 }, { "epoch": 5.6766457296679995, "grad_norm": 23.564796447753906, "learning_rate": 2.3460162355143793e-05, "loss": 0.4313, "num_input_tokens_seen": 66464128, "step": 69590 }, { "epoch": 5.677053593278408, "grad_norm": 37.955543518066406, "learning_rate": 2.345660984725669e-05, "loss": 0.3294, "num_input_tokens_seen": 66468640, "step": 69595 }, { "epoch": 5.677461456888817, "grad_norm": 16.341739654541016, "learning_rate": 2.3453057370653435e-05, "loss": 0.3845, "num_input_tokens_seen": 66473440, "step": 69600 }, { "epoch": 5.677869320499225, "grad_norm": 18.91013526916504, "learning_rate": 2.3449504925406064e-05, "loss": 0.3765, "num_input_tokens_seen": 66478048, "step": 69605 }, { "epoch": 5.678277184109634, "grad_norm": 8.110957145690918, "learning_rate": 2.3445952511586566e-05, "loss": 0.2766, "num_input_tokens_seen": 66483936, "step": 69610 }, { "epoch": 5.678685047720043, "grad_norm": 14.157593727111816, "learning_rate": 2.344240012926695e-05, "loss": 0.3895, "num_input_tokens_seen": 66489984, "step": 69615 }, { "epoch": 5.679092911330451, "grad_norm": 21.547067642211914, "learning_rate": 2.343884777851922e-05, "loss": 0.3299, "num_input_tokens_seen": 66494576, "step": 69620 }, { "epoch": 5.67950077494086, "grad_norm": 2.667672872543335, "learning_rate": 2.3435295459415385e-05, "loss": 0.349, "num_input_tokens_seen": 66499696, "step": 69625 }, { "epoch": 5.679908638551268, "grad_norm": 0.34468555450439453, "learning_rate": 2.3431743172027444e-05, "loss": 0.2672, "num_input_tokens_seen": 66504144, "step": 69630 }, { "epoch": 5.680316502161677, "grad_norm": 32.92527770996094, "learning_rate": 2.3428190916427402e-05, "loss": 0.5899, "num_input_tokens_seen": 66508544, "step": 69635 }, { "epoch": 5.680724365772086, "grad_norm": 0.5604032278060913, "learning_rate": 2.3424638692687264e-05, "loss": 0.3505, "num_input_tokens_seen": 66513072, "step": 69640 }, { "epoch": 5.681132229382494, "grad_norm": 6.569007396697998, "learning_rate": 2.342108650087903e-05, "loss": 0.2695, "num_input_tokens_seen": 66518128, "step": 69645 }, { "epoch": 5.681540092992903, "grad_norm": 6.005958080291748, "learning_rate": 2.34175343410747e-05, "loss": 0.6158, "num_input_tokens_seen": 66522160, "step": 69650 }, { "epoch": 5.681947956603312, "grad_norm": 1.5817760229110718, "learning_rate": 2.3413982213346282e-05, "loss": 0.2409, "num_input_tokens_seen": 66526096, "step": 69655 }, { "epoch": 5.68235582021372, "grad_norm": 15.145858764648438, "learning_rate": 2.3410430117765766e-05, "loss": 0.2764, "num_input_tokens_seen": 66531248, "step": 69660 }, { "epoch": 5.682763683824129, "grad_norm": 30.968795776367188, "learning_rate": 2.3406878054405147e-05, "loss": 0.4928, "num_input_tokens_seen": 66536464, "step": 69665 }, { "epoch": 5.683171547434538, "grad_norm": 4.733929634094238, "learning_rate": 2.3403326023336443e-05, "loss": 0.2683, "num_input_tokens_seen": 66541920, "step": 69670 }, { "epoch": 5.6835794110449465, "grad_norm": 7.7385945320129395, "learning_rate": 2.3399774024631645e-05, "loss": 0.2043, "num_input_tokens_seen": 66546064, "step": 69675 }, { "epoch": 5.6839872746553555, "grad_norm": 1.3446255922317505, "learning_rate": 2.3396222058362747e-05, "loss": 0.3506, "num_input_tokens_seen": 66550400, "step": 69680 }, { "epoch": 5.684395138265764, "grad_norm": 5.029801845550537, "learning_rate": 2.3392670124601747e-05, "loss": 0.469, "num_input_tokens_seen": 66555152, "step": 69685 }, { "epoch": 5.684803001876173, "grad_norm": 11.482919692993164, "learning_rate": 2.3389118223420632e-05, "loss": 0.2594, "num_input_tokens_seen": 66559136, "step": 69690 }, { "epoch": 5.685210865486582, "grad_norm": 18.427804946899414, "learning_rate": 2.3385566354891405e-05, "loss": 0.3335, "num_input_tokens_seen": 66564320, "step": 69695 }, { "epoch": 5.68561872909699, "grad_norm": 5.649832725524902, "learning_rate": 2.3382014519086072e-05, "loss": 0.392, "num_input_tokens_seen": 66569056, "step": 69700 }, { "epoch": 5.686026592707399, "grad_norm": 0.5227279663085938, "learning_rate": 2.337846271607661e-05, "loss": 0.335, "num_input_tokens_seen": 66574224, "step": 69705 }, { "epoch": 5.686434456317807, "grad_norm": 2.374413013458252, "learning_rate": 2.3374910945935024e-05, "loss": 0.3742, "num_input_tokens_seen": 66579360, "step": 69710 }, { "epoch": 5.686842319928216, "grad_norm": 0.9710273146629333, "learning_rate": 2.33713592087333e-05, "loss": 0.3645, "num_input_tokens_seen": 66584480, "step": 69715 }, { "epoch": 5.687250183538625, "grad_norm": 1.5524221658706665, "learning_rate": 2.3367807504543438e-05, "loss": 0.3871, "num_input_tokens_seen": 66588928, "step": 69720 }, { "epoch": 5.687658047149033, "grad_norm": 3.28891921043396, "learning_rate": 2.3364255833437424e-05, "loss": 0.3576, "num_input_tokens_seen": 66593232, "step": 69725 }, { "epoch": 5.688065910759442, "grad_norm": 1.9963961839675903, "learning_rate": 2.3360704195487245e-05, "loss": 0.4151, "num_input_tokens_seen": 66598752, "step": 69730 }, { "epoch": 5.688473774369851, "grad_norm": 1.141375184059143, "learning_rate": 2.335715259076489e-05, "loss": 0.439, "num_input_tokens_seen": 66603792, "step": 69735 }, { "epoch": 5.688881637980259, "grad_norm": 0.7931537628173828, "learning_rate": 2.3353601019342362e-05, "loss": 0.3568, "num_input_tokens_seen": 66609168, "step": 69740 }, { "epoch": 5.689289501590668, "grad_norm": 0.9893789887428284, "learning_rate": 2.335004948129164e-05, "loss": 0.3358, "num_input_tokens_seen": 66614272, "step": 69745 }, { "epoch": 5.689697365201077, "grad_norm": 2.134793758392334, "learning_rate": 2.3346497976684715e-05, "loss": 0.3639, "num_input_tokens_seen": 66618816, "step": 69750 }, { "epoch": 5.690105228811485, "grad_norm": 4.733846187591553, "learning_rate": 2.3342946505593562e-05, "loss": 0.3816, "num_input_tokens_seen": 66623824, "step": 69755 }, { "epoch": 5.690513092421894, "grad_norm": 6.69547700881958, "learning_rate": 2.3339395068090192e-05, "loss": 0.3901, "num_input_tokens_seen": 66629248, "step": 69760 }, { "epoch": 5.690920956032302, "grad_norm": 2.5839316844940186, "learning_rate": 2.333584366424658e-05, "loss": 0.3902, "num_input_tokens_seen": 66634656, "step": 69765 }, { "epoch": 5.691328819642711, "grad_norm": 2.956791400909424, "learning_rate": 2.3332292294134707e-05, "loss": 0.3239, "num_input_tokens_seen": 66639728, "step": 69770 }, { "epoch": 5.69173668325312, "grad_norm": 1.8270766735076904, "learning_rate": 2.3328740957826556e-05, "loss": 0.3423, "num_input_tokens_seen": 66643552, "step": 69775 }, { "epoch": 5.6921445468635286, "grad_norm": 13.051482200622559, "learning_rate": 2.332518965539412e-05, "loss": 0.3856, "num_input_tokens_seen": 66648224, "step": 69780 }, { "epoch": 5.692552410473938, "grad_norm": 0.4195135831832886, "learning_rate": 2.3321638386909378e-05, "loss": 0.2871, "num_input_tokens_seen": 66653024, "step": 69785 }, { "epoch": 5.692960274084347, "grad_norm": 0.5899463295936584, "learning_rate": 2.331808715244431e-05, "loss": 0.3696, "num_input_tokens_seen": 66657936, "step": 69790 }, { "epoch": 5.693368137694755, "grad_norm": 4.723237991333008, "learning_rate": 2.3314535952070905e-05, "loss": 0.2934, "num_input_tokens_seen": 66662880, "step": 69795 }, { "epoch": 5.693776001305164, "grad_norm": 9.739458084106445, "learning_rate": 2.331098478586113e-05, "loss": 0.2843, "num_input_tokens_seen": 66668928, "step": 69800 }, { "epoch": 5.694183864915573, "grad_norm": 0.6088104844093323, "learning_rate": 2.3307433653886986e-05, "loss": 0.3204, "num_input_tokens_seen": 66674480, "step": 69805 }, { "epoch": 5.694591728525981, "grad_norm": 2.8262217044830322, "learning_rate": 2.330388255622044e-05, "loss": 0.403, "num_input_tokens_seen": 66680128, "step": 69810 }, { "epoch": 5.69499959213639, "grad_norm": 0.8570383191108704, "learning_rate": 2.3300331492933476e-05, "loss": 0.3215, "num_input_tokens_seen": 66685184, "step": 69815 }, { "epoch": 5.695407455746798, "grad_norm": 10.571240425109863, "learning_rate": 2.3296780464098058e-05, "loss": 0.3783, "num_input_tokens_seen": 66690400, "step": 69820 }, { "epoch": 5.695815319357207, "grad_norm": 0.8267124891281128, "learning_rate": 2.329322946978619e-05, "loss": 0.2694, "num_input_tokens_seen": 66694768, "step": 69825 }, { "epoch": 5.696223182967616, "grad_norm": 2.3660166263580322, "learning_rate": 2.3289678510069832e-05, "loss": 0.2597, "num_input_tokens_seen": 66699840, "step": 69830 }, { "epoch": 5.696631046578024, "grad_norm": 11.341448783874512, "learning_rate": 2.3286127585020965e-05, "loss": 0.3477, "num_input_tokens_seen": 66704912, "step": 69835 }, { "epoch": 5.697038910188433, "grad_norm": 0.8810932636260986, "learning_rate": 2.3282576694711562e-05, "loss": 0.2708, "num_input_tokens_seen": 66709216, "step": 69840 }, { "epoch": 5.697446773798841, "grad_norm": 16.156747817993164, "learning_rate": 2.3279025839213597e-05, "loss": 0.3079, "num_input_tokens_seen": 66714352, "step": 69845 }, { "epoch": 5.69785463740925, "grad_norm": 1.3682969808578491, "learning_rate": 2.327547501859905e-05, "loss": 0.3746, "num_input_tokens_seen": 66719904, "step": 69850 }, { "epoch": 5.698262501019659, "grad_norm": 22.490995407104492, "learning_rate": 2.3271924232939894e-05, "loss": 0.3688, "num_input_tokens_seen": 66724304, "step": 69855 }, { "epoch": 5.698670364630067, "grad_norm": 18.90839195251465, "learning_rate": 2.3268373482308097e-05, "loss": 0.2298, "num_input_tokens_seen": 66729968, "step": 69860 }, { "epoch": 5.699078228240476, "grad_norm": 2.058394193649292, "learning_rate": 2.3264822766775625e-05, "loss": 0.3608, "num_input_tokens_seen": 66735088, "step": 69865 }, { "epoch": 5.699486091850885, "grad_norm": 1.1676501035690308, "learning_rate": 2.3261272086414468e-05, "loss": 0.4003, "num_input_tokens_seen": 66739968, "step": 69870 }, { "epoch": 5.6998939554612935, "grad_norm": 3.2123281955718994, "learning_rate": 2.3257721441296586e-05, "loss": 0.3192, "num_input_tokens_seen": 66744640, "step": 69875 }, { "epoch": 5.7003018190717025, "grad_norm": 5.734345436096191, "learning_rate": 2.325417083149395e-05, "loss": 0.3333, "num_input_tokens_seen": 66749936, "step": 69880 }, { "epoch": 5.7007096826821115, "grad_norm": 0.7280193567276001, "learning_rate": 2.325062025707853e-05, "loss": 0.2546, "num_input_tokens_seen": 66754368, "step": 69885 }, { "epoch": 5.70111754629252, "grad_norm": 4.358067512512207, "learning_rate": 2.3247069718122286e-05, "loss": 0.3322, "num_input_tokens_seen": 66759600, "step": 69890 }, { "epoch": 5.701525409902929, "grad_norm": 13.206740379333496, "learning_rate": 2.32435192146972e-05, "loss": 0.3434, "num_input_tokens_seen": 66765120, "step": 69895 }, { "epoch": 5.701933273513337, "grad_norm": 1.2787319421768188, "learning_rate": 2.323996874687524e-05, "loss": 0.3345, "num_input_tokens_seen": 66770288, "step": 69900 }, { "epoch": 5.702341137123746, "grad_norm": 1.5724775791168213, "learning_rate": 2.323641831472836e-05, "loss": 0.4123, "num_input_tokens_seen": 66774432, "step": 69905 }, { "epoch": 5.702749000734155, "grad_norm": 0.6106967926025391, "learning_rate": 2.3232867918328526e-05, "loss": 0.2755, "num_input_tokens_seen": 66780400, "step": 69910 }, { "epoch": 5.703156864344563, "grad_norm": 0.5501624941825867, "learning_rate": 2.3229317557747714e-05, "loss": 0.3245, "num_input_tokens_seen": 66784720, "step": 69915 }, { "epoch": 5.703564727954972, "grad_norm": 1.368272066116333, "learning_rate": 2.3225767233057887e-05, "loss": 0.3535, "num_input_tokens_seen": 66789008, "step": 69920 }, { "epoch": 5.70397259156538, "grad_norm": 0.6348151564598083, "learning_rate": 2.3222216944330998e-05, "loss": 0.3104, "num_input_tokens_seen": 66793824, "step": 69925 }, { "epoch": 5.704380455175789, "grad_norm": 4.0487871170043945, "learning_rate": 2.321866669163901e-05, "loss": 0.3599, "num_input_tokens_seen": 66798352, "step": 69930 }, { "epoch": 5.704788318786198, "grad_norm": 0.5135371685028076, "learning_rate": 2.32151164750539e-05, "loss": 0.3692, "num_input_tokens_seen": 66802000, "step": 69935 }, { "epoch": 5.705196182396606, "grad_norm": 6.416839599609375, "learning_rate": 2.321156629464762e-05, "loss": 0.3637, "num_input_tokens_seen": 66806000, "step": 69940 }, { "epoch": 5.705604046007015, "grad_norm": 10.501958847045898, "learning_rate": 2.3208016150492133e-05, "loss": 0.3757, "num_input_tokens_seen": 66810208, "step": 69945 }, { "epoch": 5.706011909617424, "grad_norm": 17.827922821044922, "learning_rate": 2.3204466042659396e-05, "loss": 0.3036, "num_input_tokens_seen": 66815616, "step": 69950 }, { "epoch": 5.706419773227832, "grad_norm": 1.5268768072128296, "learning_rate": 2.3200915971221358e-05, "loss": 0.3143, "num_input_tokens_seen": 66820944, "step": 69955 }, { "epoch": 5.706827636838241, "grad_norm": 5.861651420593262, "learning_rate": 2.3197365936250003e-05, "loss": 0.4311, "num_input_tokens_seen": 66825216, "step": 69960 }, { "epoch": 5.70723550044865, "grad_norm": 3.7910425662994385, "learning_rate": 2.3193815937817274e-05, "loss": 0.2655, "num_input_tokens_seen": 66829856, "step": 69965 }, { "epoch": 5.7076433640590585, "grad_norm": 1.4168015718460083, "learning_rate": 2.3190265975995124e-05, "loss": 0.3013, "num_input_tokens_seen": 66834736, "step": 69970 }, { "epoch": 5.7080512276694675, "grad_norm": 1.298569679260254, "learning_rate": 2.3186716050855512e-05, "loss": 0.3588, "num_input_tokens_seen": 66839840, "step": 69975 }, { "epoch": 5.708459091279876, "grad_norm": 1.8472355604171753, "learning_rate": 2.3183166162470398e-05, "loss": 0.3615, "num_input_tokens_seen": 66845520, "step": 69980 }, { "epoch": 5.708866954890285, "grad_norm": 1.615151286125183, "learning_rate": 2.3179616310911735e-05, "loss": 0.4017, "num_input_tokens_seen": 66849264, "step": 69985 }, { "epoch": 5.709274818500694, "grad_norm": 2.4809086322784424, "learning_rate": 2.3176066496251478e-05, "loss": 0.4008, "num_input_tokens_seen": 66853664, "step": 69990 }, { "epoch": 5.709682682111102, "grad_norm": 0.5308307409286499, "learning_rate": 2.3172516718561576e-05, "loss": 0.3659, "num_input_tokens_seen": 66857760, "step": 69995 }, { "epoch": 5.710090545721511, "grad_norm": 0.9232462644577026, "learning_rate": 2.3168966977913973e-05, "loss": 0.4062, "num_input_tokens_seen": 66862544, "step": 70000 }, { "epoch": 5.71049840933192, "grad_norm": 3.464588165283203, "learning_rate": 2.316541727438064e-05, "loss": 0.3056, "num_input_tokens_seen": 66867056, "step": 70005 }, { "epoch": 5.710906272942328, "grad_norm": 7.026179313659668, "learning_rate": 2.3161867608033523e-05, "loss": 0.2755, "num_input_tokens_seen": 66871184, "step": 70010 }, { "epoch": 5.711314136552737, "grad_norm": 1.7240900993347168, "learning_rate": 2.3158317978944568e-05, "loss": 0.3885, "num_input_tokens_seen": 66874912, "step": 70015 }, { "epoch": 5.711722000163146, "grad_norm": 3.489410161972046, "learning_rate": 2.315476838718571e-05, "loss": 0.3375, "num_input_tokens_seen": 66880064, "step": 70020 }, { "epoch": 5.712129863773554, "grad_norm": 3.360956907272339, "learning_rate": 2.315121883282893e-05, "loss": 0.2782, "num_input_tokens_seen": 66883920, "step": 70025 }, { "epoch": 5.712537727383963, "grad_norm": 1.8860260248184204, "learning_rate": 2.3147669315946153e-05, "loss": 0.3263, "num_input_tokens_seen": 66888752, "step": 70030 }, { "epoch": 5.712945590994371, "grad_norm": 4.890894889831543, "learning_rate": 2.3144119836609336e-05, "loss": 0.3678, "num_input_tokens_seen": 66893008, "step": 70035 }, { "epoch": 5.71335345460478, "grad_norm": 1.5858107805252075, "learning_rate": 2.3140570394890417e-05, "loss": 0.3324, "num_input_tokens_seen": 66896784, "step": 70040 }, { "epoch": 5.713761318215189, "grad_norm": 3.240311861038208, "learning_rate": 2.313702099086134e-05, "loss": 0.3807, "num_input_tokens_seen": 66901776, "step": 70045 }, { "epoch": 5.714169181825597, "grad_norm": 4.709892272949219, "learning_rate": 2.3133471624594067e-05, "loss": 0.3744, "num_input_tokens_seen": 66907328, "step": 70050 }, { "epoch": 5.714577045436006, "grad_norm": 1.1383944749832153, "learning_rate": 2.3129922296160527e-05, "loss": 0.3054, "num_input_tokens_seen": 66912592, "step": 70055 }, { "epoch": 5.714984909046414, "grad_norm": 4.691784381866455, "learning_rate": 2.312637300563267e-05, "loss": 0.2772, "num_input_tokens_seen": 66916992, "step": 70060 }, { "epoch": 5.715392772656823, "grad_norm": 0.5302135944366455, "learning_rate": 2.3122823753082422e-05, "loss": 0.3021, "num_input_tokens_seen": 66921808, "step": 70065 }, { "epoch": 5.7158006362672324, "grad_norm": 0.5586836934089661, "learning_rate": 2.3119274538581752e-05, "loss": 0.3753, "num_input_tokens_seen": 66925920, "step": 70070 }, { "epoch": 5.716208499877641, "grad_norm": 1.0005316734313965, "learning_rate": 2.311572536220259e-05, "loss": 0.2669, "num_input_tokens_seen": 66930768, "step": 70075 }, { "epoch": 5.71661636348805, "grad_norm": 2.4825971126556396, "learning_rate": 2.311217622401687e-05, "loss": 0.4073, "num_input_tokens_seen": 66935104, "step": 70080 }, { "epoch": 5.717024227098459, "grad_norm": 4.395745754241943, "learning_rate": 2.310862712409654e-05, "loss": 0.4137, "num_input_tokens_seen": 66939200, "step": 70085 }, { "epoch": 5.717432090708867, "grad_norm": 17.58699607849121, "learning_rate": 2.3105078062513518e-05, "loss": 0.3082, "num_input_tokens_seen": 66944480, "step": 70090 }, { "epoch": 5.717839954319276, "grad_norm": 1.827651023864746, "learning_rate": 2.3101529039339773e-05, "loss": 0.445, "num_input_tokens_seen": 66949904, "step": 70095 }, { "epoch": 5.718247817929685, "grad_norm": 3.1906137466430664, "learning_rate": 2.309798005464723e-05, "loss": 0.3386, "num_input_tokens_seen": 66955008, "step": 70100 }, { "epoch": 5.718655681540093, "grad_norm": 1.400130033493042, "learning_rate": 2.3094431108507816e-05, "loss": 0.3421, "num_input_tokens_seen": 66959440, "step": 70105 }, { "epoch": 5.719063545150502, "grad_norm": 4.861090183258057, "learning_rate": 2.3090882200993475e-05, "loss": 0.3529, "num_input_tokens_seen": 66965232, "step": 70110 }, { "epoch": 5.71947140876091, "grad_norm": 2.5139708518981934, "learning_rate": 2.3087333332176142e-05, "loss": 0.3337, "num_input_tokens_seen": 66970128, "step": 70115 }, { "epoch": 5.719879272371319, "grad_norm": 0.49761319160461426, "learning_rate": 2.3083784502127752e-05, "loss": 0.3034, "num_input_tokens_seen": 66975488, "step": 70120 }, { "epoch": 5.720287135981728, "grad_norm": 1.5766518115997314, "learning_rate": 2.3080235710920233e-05, "loss": 0.3538, "num_input_tokens_seen": 66979616, "step": 70125 }, { "epoch": 5.720694999592136, "grad_norm": 8.999292373657227, "learning_rate": 2.3076686958625513e-05, "loss": 0.2577, "num_input_tokens_seen": 66984896, "step": 70130 }, { "epoch": 5.721102863202545, "grad_norm": 1.15845787525177, "learning_rate": 2.307313824531554e-05, "loss": 0.2865, "num_input_tokens_seen": 66989424, "step": 70135 }, { "epoch": 5.721510726812953, "grad_norm": 3.695338249206543, "learning_rate": 2.3069589571062237e-05, "loss": 0.2911, "num_input_tokens_seen": 66994688, "step": 70140 }, { "epoch": 5.721918590423362, "grad_norm": 3.821535110473633, "learning_rate": 2.306604093593753e-05, "loss": 0.3061, "num_input_tokens_seen": 66999824, "step": 70145 }, { "epoch": 5.722326454033771, "grad_norm": 7.236593246459961, "learning_rate": 2.3062492340013356e-05, "loss": 0.3692, "num_input_tokens_seen": 67005024, "step": 70150 }, { "epoch": 5.72273431764418, "grad_norm": 0.7718627452850342, "learning_rate": 2.3058943783361624e-05, "loss": 0.3404, "num_input_tokens_seen": 67009280, "step": 70155 }, { "epoch": 5.723142181254588, "grad_norm": 0.5932870507240295, "learning_rate": 2.305539526605429e-05, "loss": 0.2735, "num_input_tokens_seen": 67013520, "step": 70160 }, { "epoch": 5.723550044864997, "grad_norm": 7.49092435836792, "learning_rate": 2.3051846788163267e-05, "loss": 0.4167, "num_input_tokens_seen": 67017920, "step": 70165 }, { "epoch": 5.7239579084754055, "grad_norm": 1.2999663352966309, "learning_rate": 2.3048298349760482e-05, "loss": 0.336, "num_input_tokens_seen": 67022448, "step": 70170 }, { "epoch": 5.7243657720858145, "grad_norm": 0.8555477857589722, "learning_rate": 2.3044749950917856e-05, "loss": 0.3249, "num_input_tokens_seen": 67028096, "step": 70175 }, { "epoch": 5.7247736356962236, "grad_norm": 1.239709734916687, "learning_rate": 2.3041201591707322e-05, "loss": 0.4245, "num_input_tokens_seen": 67033072, "step": 70180 }, { "epoch": 5.725181499306632, "grad_norm": 6.386023998260498, "learning_rate": 2.30376532722008e-05, "loss": 0.2484, "num_input_tokens_seen": 67037440, "step": 70185 }, { "epoch": 5.725589362917041, "grad_norm": 0.6096998453140259, "learning_rate": 2.303410499247021e-05, "loss": 0.3105, "num_input_tokens_seen": 67042400, "step": 70190 }, { "epoch": 5.725997226527449, "grad_norm": 0.4859774112701416, "learning_rate": 2.303055675258748e-05, "loss": 0.3415, "num_input_tokens_seen": 67047488, "step": 70195 }, { "epoch": 5.726405090137858, "grad_norm": 4.1643524169921875, "learning_rate": 2.3027008552624512e-05, "loss": 0.342, "num_input_tokens_seen": 67052608, "step": 70200 }, { "epoch": 5.726812953748267, "grad_norm": 2.5743322372436523, "learning_rate": 2.3023460392653257e-05, "loss": 0.285, "num_input_tokens_seen": 67057248, "step": 70205 }, { "epoch": 5.727220817358675, "grad_norm": 1.5447864532470703, "learning_rate": 2.301991227274562e-05, "loss": 0.2965, "num_input_tokens_seen": 67061664, "step": 70210 }, { "epoch": 5.727628680969084, "grad_norm": 2.870694875717163, "learning_rate": 2.301636419297352e-05, "loss": 0.2816, "num_input_tokens_seen": 67067216, "step": 70215 }, { "epoch": 5.728036544579493, "grad_norm": 13.435036659240723, "learning_rate": 2.3012816153408863e-05, "loss": 0.2847, "num_input_tokens_seen": 67072048, "step": 70220 }, { "epoch": 5.728444408189901, "grad_norm": 9.397197723388672, "learning_rate": 2.3009268154123587e-05, "loss": 0.287, "num_input_tokens_seen": 67076896, "step": 70225 }, { "epoch": 5.72885227180031, "grad_norm": 3.636770009994507, "learning_rate": 2.3005720195189602e-05, "loss": 0.3946, "num_input_tokens_seen": 67081808, "step": 70230 }, { "epoch": 5.729260135410719, "grad_norm": 8.228744506835938, "learning_rate": 2.3002172276678818e-05, "loss": 0.3575, "num_input_tokens_seen": 67086800, "step": 70235 }, { "epoch": 5.729667999021127, "grad_norm": 23.967529296875, "learning_rate": 2.2998624398663157e-05, "loss": 0.3248, "num_input_tokens_seen": 67090928, "step": 70240 }, { "epoch": 5.730075862631536, "grad_norm": 2.31548810005188, "learning_rate": 2.2995076561214524e-05, "loss": 0.3128, "num_input_tokens_seen": 67095312, "step": 70245 }, { "epoch": 5.730483726241944, "grad_norm": 7.136308670043945, "learning_rate": 2.299152876440484e-05, "loss": 0.4222, "num_input_tokens_seen": 67100512, "step": 70250 }, { "epoch": 5.730891589852353, "grad_norm": 6.226531505584717, "learning_rate": 2.2987981008306014e-05, "loss": 0.493, "num_input_tokens_seen": 67105456, "step": 70255 }, { "epoch": 5.731299453462762, "grad_norm": 0.46038350462913513, "learning_rate": 2.298443329298996e-05, "loss": 0.2256, "num_input_tokens_seen": 67109840, "step": 70260 }, { "epoch": 5.7317073170731705, "grad_norm": 4.510568618774414, "learning_rate": 2.2980885618528575e-05, "loss": 0.3734, "num_input_tokens_seen": 67114640, "step": 70265 }, { "epoch": 5.7321151806835795, "grad_norm": 1.9084925651550293, "learning_rate": 2.297733798499379e-05, "loss": 0.4407, "num_input_tokens_seen": 67120112, "step": 70270 }, { "epoch": 5.732523044293988, "grad_norm": 9.217167854309082, "learning_rate": 2.297379039245751e-05, "loss": 0.432, "num_input_tokens_seen": 67125584, "step": 70275 }, { "epoch": 5.732930907904397, "grad_norm": 13.354815483093262, "learning_rate": 2.297024284099163e-05, "loss": 0.4141, "num_input_tokens_seen": 67130640, "step": 70280 }, { "epoch": 5.733338771514806, "grad_norm": 4.086246967315674, "learning_rate": 2.296669533066807e-05, "loss": 0.3702, "num_input_tokens_seen": 67135648, "step": 70285 }, { "epoch": 5.733746635125214, "grad_norm": 0.6478174328804016, "learning_rate": 2.296314786155872e-05, "loss": 0.3197, "num_input_tokens_seen": 67140960, "step": 70290 }, { "epoch": 5.734154498735623, "grad_norm": 2.8866987228393555, "learning_rate": 2.295960043373551e-05, "loss": 0.4322, "num_input_tokens_seen": 67145872, "step": 70295 }, { "epoch": 5.734562362346032, "grad_norm": 6.976200103759766, "learning_rate": 2.295605304727033e-05, "loss": 0.2677, "num_input_tokens_seen": 67150896, "step": 70300 }, { "epoch": 5.73497022595644, "grad_norm": 3.2185585498809814, "learning_rate": 2.2952505702235085e-05, "loss": 0.3388, "num_input_tokens_seen": 67156768, "step": 70305 }, { "epoch": 5.735378089566849, "grad_norm": 7.3341240882873535, "learning_rate": 2.2948958398701676e-05, "loss": 0.2679, "num_input_tokens_seen": 67162144, "step": 70310 }, { "epoch": 5.735785953177258, "grad_norm": 0.8631025552749634, "learning_rate": 2.2945411136742013e-05, "loss": 0.3607, "num_input_tokens_seen": 67166480, "step": 70315 }, { "epoch": 5.736193816787666, "grad_norm": 0.8714056015014648, "learning_rate": 2.2941863916427995e-05, "loss": 0.294, "num_input_tokens_seen": 67171136, "step": 70320 }, { "epoch": 5.736601680398075, "grad_norm": 7.8446221351623535, "learning_rate": 2.2938316737831516e-05, "loss": 0.2964, "num_input_tokens_seen": 67176368, "step": 70325 }, { "epoch": 5.737009544008483, "grad_norm": 5.097020626068115, "learning_rate": 2.2934769601024474e-05, "loss": 0.3337, "num_input_tokens_seen": 67181040, "step": 70330 }, { "epoch": 5.737417407618892, "grad_norm": 14.880606651306152, "learning_rate": 2.2931222506078786e-05, "loss": 0.4104, "num_input_tokens_seen": 67186016, "step": 70335 }, { "epoch": 5.737825271229301, "grad_norm": 5.430740833282471, "learning_rate": 2.292767545306634e-05, "loss": 0.3095, "num_input_tokens_seen": 67190352, "step": 70340 }, { "epoch": 5.738233134839709, "grad_norm": 1.7464638948440552, "learning_rate": 2.2924128442059027e-05, "loss": 0.428, "num_input_tokens_seen": 67194768, "step": 70345 }, { "epoch": 5.738640998450118, "grad_norm": 5.357989311218262, "learning_rate": 2.2920581473128752e-05, "loss": 0.3511, "num_input_tokens_seen": 67199152, "step": 70350 }, { "epoch": 5.739048862060527, "grad_norm": 13.045903205871582, "learning_rate": 2.2917034546347392e-05, "loss": 0.3862, "num_input_tokens_seen": 67204160, "step": 70355 }, { "epoch": 5.7394567256709355, "grad_norm": 0.9464671015739441, "learning_rate": 2.2913487661786873e-05, "loss": 0.4828, "num_input_tokens_seen": 67209296, "step": 70360 }, { "epoch": 5.7398645892813445, "grad_norm": 9.420557975769043, "learning_rate": 2.2909940819519073e-05, "loss": 0.3087, "num_input_tokens_seen": 67213344, "step": 70365 }, { "epoch": 5.7402724528917535, "grad_norm": 4.549088001251221, "learning_rate": 2.290639401961588e-05, "loss": 0.3421, "num_input_tokens_seen": 67217872, "step": 70370 }, { "epoch": 5.740680316502162, "grad_norm": 2.154301404953003, "learning_rate": 2.2902847262149185e-05, "loss": 0.303, "num_input_tokens_seen": 67222064, "step": 70375 }, { "epoch": 5.741088180112571, "grad_norm": 5.862706184387207, "learning_rate": 2.2899300547190893e-05, "loss": 0.458, "num_input_tokens_seen": 67226144, "step": 70380 }, { "epoch": 5.741496043722979, "grad_norm": 1.6533820629119873, "learning_rate": 2.2895753874812884e-05, "loss": 0.3143, "num_input_tokens_seen": 67230576, "step": 70385 }, { "epoch": 5.741903907333388, "grad_norm": 0.707518458366394, "learning_rate": 2.2892207245087045e-05, "loss": 0.3644, "num_input_tokens_seen": 67234432, "step": 70390 }, { "epoch": 5.742311770943797, "grad_norm": 0.576181173324585, "learning_rate": 2.2888660658085274e-05, "loss": 0.3128, "num_input_tokens_seen": 67239840, "step": 70395 }, { "epoch": 5.742719634554205, "grad_norm": 8.803984642028809, "learning_rate": 2.2885114113879453e-05, "loss": 0.3062, "num_input_tokens_seen": 67244672, "step": 70400 }, { "epoch": 5.743127498164614, "grad_norm": 0.6427536010742188, "learning_rate": 2.2881567612541475e-05, "loss": 0.2839, "num_input_tokens_seen": 67249744, "step": 70405 }, { "epoch": 5.743535361775022, "grad_norm": 0.8572283983230591, "learning_rate": 2.2878021154143216e-05, "loss": 0.4031, "num_input_tokens_seen": 67254896, "step": 70410 }, { "epoch": 5.743943225385431, "grad_norm": 5.721119403839111, "learning_rate": 2.287447473875657e-05, "loss": 0.3519, "num_input_tokens_seen": 67260512, "step": 70415 }, { "epoch": 5.74435108899584, "grad_norm": 11.542490005493164, "learning_rate": 2.287092836645341e-05, "loss": 0.4117, "num_input_tokens_seen": 67266160, "step": 70420 }, { "epoch": 5.744758952606248, "grad_norm": 1.0368438959121704, "learning_rate": 2.2867382037305636e-05, "loss": 0.3589, "num_input_tokens_seen": 67270512, "step": 70425 }, { "epoch": 5.745166816216657, "grad_norm": 1.3839421272277832, "learning_rate": 2.2863835751385118e-05, "loss": 0.3421, "num_input_tokens_seen": 67275488, "step": 70430 }, { "epoch": 5.745574679827066, "grad_norm": 4.244668483734131, "learning_rate": 2.2860289508763745e-05, "loss": 0.3793, "num_input_tokens_seen": 67280400, "step": 70435 }, { "epoch": 5.745982543437474, "grad_norm": 0.5371466875076294, "learning_rate": 2.2856743309513396e-05, "loss": 0.307, "num_input_tokens_seen": 67285648, "step": 70440 }, { "epoch": 5.746390407047883, "grad_norm": 0.7626423835754395, "learning_rate": 2.285319715370594e-05, "loss": 0.2777, "num_input_tokens_seen": 67290320, "step": 70445 }, { "epoch": 5.746798270658292, "grad_norm": 0.644900381565094, "learning_rate": 2.2849651041413265e-05, "loss": 0.4076, "num_input_tokens_seen": 67295328, "step": 70450 }, { "epoch": 5.7472061342687, "grad_norm": 1.2213950157165527, "learning_rate": 2.2846104972707262e-05, "loss": 0.3242, "num_input_tokens_seen": 67299984, "step": 70455 }, { "epoch": 5.747613997879109, "grad_norm": 1.601344347000122, "learning_rate": 2.284255894765979e-05, "loss": 0.3177, "num_input_tokens_seen": 67304784, "step": 70460 }, { "epoch": 5.7480218614895175, "grad_norm": 2.8922767639160156, "learning_rate": 2.2839012966342727e-05, "loss": 0.3519, "num_input_tokens_seen": 67310432, "step": 70465 }, { "epoch": 5.748429725099927, "grad_norm": 1.2166284322738647, "learning_rate": 2.2835467028827958e-05, "loss": 0.3758, "num_input_tokens_seen": 67315152, "step": 70470 }, { "epoch": 5.748837588710336, "grad_norm": 4.976607322692871, "learning_rate": 2.2831921135187354e-05, "loss": 0.4194, "num_input_tokens_seen": 67319728, "step": 70475 }, { "epoch": 5.749245452320744, "grad_norm": 11.323800086975098, "learning_rate": 2.282837528549279e-05, "loss": 0.3414, "num_input_tokens_seen": 67325152, "step": 70480 }, { "epoch": 5.749653315931153, "grad_norm": 1.350036859512329, "learning_rate": 2.282482947981612e-05, "loss": 0.3208, "num_input_tokens_seen": 67329840, "step": 70485 }, { "epoch": 5.750061179541561, "grad_norm": 2.5931220054626465, "learning_rate": 2.2821283718229243e-05, "loss": 0.352, "num_input_tokens_seen": 67333728, "step": 70490 }, { "epoch": 5.75046904315197, "grad_norm": 0.8623517751693726, "learning_rate": 2.281773800080402e-05, "loss": 0.3588, "num_input_tokens_seen": 67338416, "step": 70495 }, { "epoch": 5.750876906762379, "grad_norm": 2.271611452102661, "learning_rate": 2.281419232761232e-05, "loss": 0.3093, "num_input_tokens_seen": 67343984, "step": 70500 }, { "epoch": 5.751284770372787, "grad_norm": 1.1365848779678345, "learning_rate": 2.2810646698726017e-05, "loss": 0.3183, "num_input_tokens_seen": 67349056, "step": 70505 }, { "epoch": 5.751692633983196, "grad_norm": 0.5384534001350403, "learning_rate": 2.2807101114216962e-05, "loss": 0.3523, "num_input_tokens_seen": 67353264, "step": 70510 }, { "epoch": 5.752100497593605, "grad_norm": 5.796545505523682, "learning_rate": 2.2803555574157045e-05, "loss": 0.3175, "num_input_tokens_seen": 67358352, "step": 70515 }, { "epoch": 5.752508361204013, "grad_norm": 1.0822110176086426, "learning_rate": 2.2800010078618125e-05, "loss": 0.3055, "num_input_tokens_seen": 67363120, "step": 70520 }, { "epoch": 5.752916224814422, "grad_norm": 1.2921679019927979, "learning_rate": 2.2796464627672064e-05, "loss": 0.3867, "num_input_tokens_seen": 67367504, "step": 70525 }, { "epoch": 5.753324088424831, "grad_norm": 0.9044039249420166, "learning_rate": 2.2792919221390728e-05, "loss": 0.2757, "num_input_tokens_seen": 67371312, "step": 70530 }, { "epoch": 5.753731952035239, "grad_norm": 6.45011568069458, "learning_rate": 2.278937385984598e-05, "loss": 0.3428, "num_input_tokens_seen": 67375792, "step": 70535 }, { "epoch": 5.754139815645648, "grad_norm": 1.9257614612579346, "learning_rate": 2.2785828543109687e-05, "loss": 0.3647, "num_input_tokens_seen": 67380144, "step": 70540 }, { "epoch": 5.754547679256056, "grad_norm": 0.7114078998565674, "learning_rate": 2.2782283271253713e-05, "loss": 0.299, "num_input_tokens_seen": 67385728, "step": 70545 }, { "epoch": 5.754955542866465, "grad_norm": 2.1119017601013184, "learning_rate": 2.277873804434991e-05, "loss": 0.3111, "num_input_tokens_seen": 67390240, "step": 70550 }, { "epoch": 5.755363406476874, "grad_norm": 3.077481508255005, "learning_rate": 2.2775192862470133e-05, "loss": 0.3168, "num_input_tokens_seen": 67394928, "step": 70555 }, { "epoch": 5.7557712700872825, "grad_norm": 2.62754487991333, "learning_rate": 2.277164772568626e-05, "loss": 0.3017, "num_input_tokens_seen": 67400272, "step": 70560 }, { "epoch": 5.7561791336976915, "grad_norm": 1.6869137287139893, "learning_rate": 2.2768102634070147e-05, "loss": 0.3228, "num_input_tokens_seen": 67404864, "step": 70565 }, { "epoch": 5.7565869973081005, "grad_norm": 16.50309181213379, "learning_rate": 2.276455758769364e-05, "loss": 0.3233, "num_input_tokens_seen": 67410464, "step": 70570 }, { "epoch": 5.756994860918509, "grad_norm": 12.667487144470215, "learning_rate": 2.2761012586628592e-05, "loss": 0.2846, "num_input_tokens_seen": 67415296, "step": 70575 }, { "epoch": 5.757402724528918, "grad_norm": 1.573581337928772, "learning_rate": 2.2757467630946877e-05, "loss": 0.3039, "num_input_tokens_seen": 67420432, "step": 70580 }, { "epoch": 5.757810588139327, "grad_norm": 13.853372573852539, "learning_rate": 2.2753922720720337e-05, "loss": 0.3424, "num_input_tokens_seen": 67424352, "step": 70585 }, { "epoch": 5.758218451749735, "grad_norm": 3.25583553314209, "learning_rate": 2.2750377856020836e-05, "loss": 0.4615, "num_input_tokens_seen": 67428368, "step": 70590 }, { "epoch": 5.758626315360144, "grad_norm": 5.291629314422607, "learning_rate": 2.274683303692022e-05, "loss": 0.3574, "num_input_tokens_seen": 67433568, "step": 70595 }, { "epoch": 5.759034178970552, "grad_norm": 4.43548059463501, "learning_rate": 2.2743288263490327e-05, "loss": 0.3476, "num_input_tokens_seen": 67438208, "step": 70600 }, { "epoch": 5.759442042580961, "grad_norm": 1.552435278892517, "learning_rate": 2.2739743535803033e-05, "loss": 0.3112, "num_input_tokens_seen": 67442640, "step": 70605 }, { "epoch": 5.75984990619137, "grad_norm": 1.7611466646194458, "learning_rate": 2.273619885393018e-05, "loss": 0.2855, "num_input_tokens_seen": 67446880, "step": 70610 }, { "epoch": 5.760257769801778, "grad_norm": 1.2264095544815063, "learning_rate": 2.2732654217943608e-05, "loss": 0.303, "num_input_tokens_seen": 67450896, "step": 70615 }, { "epoch": 5.760665633412187, "grad_norm": 5.522759437561035, "learning_rate": 2.2729109627915167e-05, "loss": 0.4319, "num_input_tokens_seen": 67455984, "step": 70620 }, { "epoch": 5.761073497022595, "grad_norm": 0.9317482709884644, "learning_rate": 2.2725565083916715e-05, "loss": 0.3636, "num_input_tokens_seen": 67460528, "step": 70625 }, { "epoch": 5.761481360633004, "grad_norm": 1.0285013914108276, "learning_rate": 2.2722020586020092e-05, "loss": 0.4053, "num_input_tokens_seen": 67464544, "step": 70630 }, { "epoch": 5.761889224243413, "grad_norm": 2.108534812927246, "learning_rate": 2.2718476134297145e-05, "loss": 0.3247, "num_input_tokens_seen": 67469696, "step": 70635 }, { "epoch": 5.762297087853821, "grad_norm": 3.3296844959259033, "learning_rate": 2.2714931728819718e-05, "loss": 0.3169, "num_input_tokens_seen": 67474560, "step": 70640 }, { "epoch": 5.76270495146423, "grad_norm": 4.11920690536499, "learning_rate": 2.2711387369659644e-05, "loss": 0.3695, "num_input_tokens_seen": 67479696, "step": 70645 }, { "epoch": 5.763112815074639, "grad_norm": 2.567183494567871, "learning_rate": 2.2707843056888783e-05, "loss": 0.3642, "num_input_tokens_seen": 67483408, "step": 70650 }, { "epoch": 5.7635206786850475, "grad_norm": 7.629391193389893, "learning_rate": 2.2704298790578974e-05, "loss": 0.343, "num_input_tokens_seen": 67487888, "step": 70655 }, { "epoch": 5.7639285422954565, "grad_norm": 3.3880057334899902, "learning_rate": 2.2700754570802048e-05, "loss": 0.3681, "num_input_tokens_seen": 67492432, "step": 70660 }, { "epoch": 5.7643364059058655, "grad_norm": 8.302173614501953, "learning_rate": 2.2697210397629847e-05, "loss": 0.3476, "num_input_tokens_seen": 67497424, "step": 70665 }, { "epoch": 5.764744269516274, "grad_norm": 1.1009516716003418, "learning_rate": 2.269366627113422e-05, "loss": 0.3446, "num_input_tokens_seen": 67502720, "step": 70670 }, { "epoch": 5.765152133126683, "grad_norm": 5.493751525878906, "learning_rate": 2.2690122191386994e-05, "loss": 0.5113, "num_input_tokens_seen": 67506544, "step": 70675 }, { "epoch": 5.765559996737091, "grad_norm": 1.838503360748291, "learning_rate": 2.268657815846001e-05, "loss": 0.3446, "num_input_tokens_seen": 67511520, "step": 70680 }, { "epoch": 5.7659678603475, "grad_norm": 6.355713844299316, "learning_rate": 2.2683034172425096e-05, "loss": 0.4027, "num_input_tokens_seen": 67516688, "step": 70685 }, { "epoch": 5.766375723957909, "grad_norm": 19.095905303955078, "learning_rate": 2.2679490233354102e-05, "loss": 0.3861, "num_input_tokens_seen": 67521344, "step": 70690 }, { "epoch": 5.766783587568317, "grad_norm": 2.6840367317199707, "learning_rate": 2.267594634131886e-05, "loss": 0.3796, "num_input_tokens_seen": 67526064, "step": 70695 }, { "epoch": 5.767191451178726, "grad_norm": 1.0101211071014404, "learning_rate": 2.2672402496391193e-05, "loss": 0.3459, "num_input_tokens_seen": 67530432, "step": 70700 }, { "epoch": 5.767599314789135, "grad_norm": 0.960278332233429, "learning_rate": 2.2668858698642944e-05, "loss": 0.3676, "num_input_tokens_seen": 67534400, "step": 70705 }, { "epoch": 5.768007178399543, "grad_norm": 0.8226656913757324, "learning_rate": 2.2665314948145923e-05, "loss": 0.4028, "num_input_tokens_seen": 67538800, "step": 70710 }, { "epoch": 5.768415042009952, "grad_norm": 2.1393911838531494, "learning_rate": 2.2661771244971985e-05, "loss": 0.3125, "num_input_tokens_seen": 67543120, "step": 70715 }, { "epoch": 5.768822905620361, "grad_norm": 6.9830451011657715, "learning_rate": 2.2658227589192953e-05, "loss": 0.4184, "num_input_tokens_seen": 67548608, "step": 70720 }, { "epoch": 5.769230769230769, "grad_norm": 1.6782439947128296, "learning_rate": 2.2654683980880652e-05, "loss": 0.3065, "num_input_tokens_seen": 67553424, "step": 70725 }, { "epoch": 5.769638632841178, "grad_norm": 1.776911973953247, "learning_rate": 2.2651140420106905e-05, "loss": 0.3809, "num_input_tokens_seen": 67558048, "step": 70730 }, { "epoch": 5.770046496451586, "grad_norm": 13.07746696472168, "learning_rate": 2.2647596906943553e-05, "loss": 0.309, "num_input_tokens_seen": 67563184, "step": 70735 }, { "epoch": 5.770454360061995, "grad_norm": 2.3999760150909424, "learning_rate": 2.2644053441462405e-05, "loss": 0.3769, "num_input_tokens_seen": 67568304, "step": 70740 }, { "epoch": 5.770862223672404, "grad_norm": 0.7628822326660156, "learning_rate": 2.2640510023735298e-05, "loss": 0.3466, "num_input_tokens_seen": 67573536, "step": 70745 }, { "epoch": 5.771270087282812, "grad_norm": 11.801140785217285, "learning_rate": 2.263696665383405e-05, "loss": 0.3303, "num_input_tokens_seen": 67578656, "step": 70750 }, { "epoch": 5.771677950893221, "grad_norm": 3.598893880844116, "learning_rate": 2.263342333183047e-05, "loss": 0.3208, "num_input_tokens_seen": 67583152, "step": 70755 }, { "epoch": 5.77208581450363, "grad_norm": 12.465336799621582, "learning_rate": 2.2629880057796406e-05, "loss": 0.3536, "num_input_tokens_seen": 67588400, "step": 70760 }, { "epoch": 5.772493678114039, "grad_norm": 2.2213923931121826, "learning_rate": 2.2626336831803667e-05, "loss": 0.3963, "num_input_tokens_seen": 67593920, "step": 70765 }, { "epoch": 5.772901541724448, "grad_norm": 4.390316486358643, "learning_rate": 2.262279365392407e-05, "loss": 0.3552, "num_input_tokens_seen": 67597936, "step": 70770 }, { "epoch": 5.773309405334856, "grad_norm": 0.4611906111240387, "learning_rate": 2.2619250524229425e-05, "loss": 0.3507, "num_input_tokens_seen": 67601920, "step": 70775 }, { "epoch": 5.773717268945265, "grad_norm": 1.7014464139938354, "learning_rate": 2.261570744279157e-05, "loss": 0.3286, "num_input_tokens_seen": 67607456, "step": 70780 }, { "epoch": 5.774125132555674, "grad_norm": 0.734609067440033, "learning_rate": 2.2612164409682313e-05, "loss": 0.3191, "num_input_tokens_seen": 67611920, "step": 70785 }, { "epoch": 5.774532996166082, "grad_norm": 1.0922818183898926, "learning_rate": 2.260862142497347e-05, "loss": 0.353, "num_input_tokens_seen": 67617536, "step": 70790 }, { "epoch": 5.774940859776491, "grad_norm": 1.367830514907837, "learning_rate": 2.2605078488736853e-05, "loss": 0.32, "num_input_tokens_seen": 67622224, "step": 70795 }, { "epoch": 5.7753487233869, "grad_norm": 4.361760139465332, "learning_rate": 2.2601535601044272e-05, "loss": 0.2904, "num_input_tokens_seen": 67626832, "step": 70800 }, { "epoch": 5.775756586997308, "grad_norm": 5.753479480743408, "learning_rate": 2.259799276196755e-05, "loss": 0.3126, "num_input_tokens_seen": 67631568, "step": 70805 }, { "epoch": 5.776164450607717, "grad_norm": 1.055543303489685, "learning_rate": 2.2594449971578496e-05, "loss": 0.3271, "num_input_tokens_seen": 67636048, "step": 70810 }, { "epoch": 5.776572314218125, "grad_norm": 10.4088773727417, "learning_rate": 2.259090722994892e-05, "loss": 0.3471, "num_input_tokens_seen": 67641296, "step": 70815 }, { "epoch": 5.776980177828534, "grad_norm": 0.6862629055976868, "learning_rate": 2.2587364537150614e-05, "loss": 0.347, "num_input_tokens_seen": 67645952, "step": 70820 }, { "epoch": 5.777388041438943, "grad_norm": 1.3150863647460938, "learning_rate": 2.258382189325542e-05, "loss": 0.3194, "num_input_tokens_seen": 67650304, "step": 70825 }, { "epoch": 5.777795905049351, "grad_norm": 1.616695761680603, "learning_rate": 2.2580279298335128e-05, "loss": 0.3363, "num_input_tokens_seen": 67654672, "step": 70830 }, { "epoch": 5.77820376865976, "grad_norm": 1.0101183652877808, "learning_rate": 2.2576736752461548e-05, "loss": 0.2796, "num_input_tokens_seen": 67660032, "step": 70835 }, { "epoch": 5.778611632270168, "grad_norm": 7.350654602050781, "learning_rate": 2.257319425570648e-05, "loss": 0.3258, "num_input_tokens_seen": 67664464, "step": 70840 }, { "epoch": 5.779019495880577, "grad_norm": 2.5160746574401855, "learning_rate": 2.2569651808141725e-05, "loss": 0.2611, "num_input_tokens_seen": 67668352, "step": 70845 }, { "epoch": 5.779427359490986, "grad_norm": 1.0344939231872559, "learning_rate": 2.256610940983911e-05, "loss": 0.3001, "num_input_tokens_seen": 67673696, "step": 70850 }, { "epoch": 5.7798352231013945, "grad_norm": 0.9817555546760559, "learning_rate": 2.2562567060870416e-05, "loss": 0.2402, "num_input_tokens_seen": 67678144, "step": 70855 }, { "epoch": 5.7802430867118035, "grad_norm": 3.4980380535125732, "learning_rate": 2.2559024761307458e-05, "loss": 0.3697, "num_input_tokens_seen": 67682640, "step": 70860 }, { "epoch": 5.7806509503222125, "grad_norm": 5.471381187438965, "learning_rate": 2.255548251122202e-05, "loss": 0.2925, "num_input_tokens_seen": 67687568, "step": 70865 }, { "epoch": 5.781058813932621, "grad_norm": 1.6948504447937012, "learning_rate": 2.255194031068592e-05, "loss": 0.3498, "num_input_tokens_seen": 67691840, "step": 70870 }, { "epoch": 5.78146667754303, "grad_norm": 0.48102957010269165, "learning_rate": 2.254839815977095e-05, "loss": 0.5316, "num_input_tokens_seen": 67696528, "step": 70875 }, { "epoch": 5.781874541153439, "grad_norm": 1.6488218307495117, "learning_rate": 2.2544856058548907e-05, "loss": 0.3568, "num_input_tokens_seen": 67700960, "step": 70880 }, { "epoch": 5.782282404763847, "grad_norm": 3.0455265045166016, "learning_rate": 2.254131400709158e-05, "loss": 0.3849, "num_input_tokens_seen": 67706112, "step": 70885 }, { "epoch": 5.782690268374256, "grad_norm": 1.5832804441452026, "learning_rate": 2.2537772005470782e-05, "loss": 0.3144, "num_input_tokens_seen": 67711216, "step": 70890 }, { "epoch": 5.783098131984664, "grad_norm": 0.6727370023727417, "learning_rate": 2.2534230053758302e-05, "loss": 0.2981, "num_input_tokens_seen": 67716128, "step": 70895 }, { "epoch": 5.783505995595073, "grad_norm": 4.448075294494629, "learning_rate": 2.253068815202593e-05, "loss": 0.3074, "num_input_tokens_seen": 67721904, "step": 70900 }, { "epoch": 5.783913859205482, "grad_norm": 3.7432782649993896, "learning_rate": 2.2527146300345454e-05, "loss": 0.3674, "num_input_tokens_seen": 67726080, "step": 70905 }, { "epoch": 5.78432172281589, "grad_norm": 0.9633559584617615, "learning_rate": 2.252360449878867e-05, "loss": 0.3457, "num_input_tokens_seen": 67730832, "step": 70910 }, { "epoch": 5.784729586426299, "grad_norm": 0.8926204442977905, "learning_rate": 2.2520062747427376e-05, "loss": 0.4441, "num_input_tokens_seen": 67736160, "step": 70915 }, { "epoch": 5.785137450036708, "grad_norm": 8.36861801147461, "learning_rate": 2.2516521046333354e-05, "loss": 0.3519, "num_input_tokens_seen": 67740992, "step": 70920 }, { "epoch": 5.785545313647116, "grad_norm": 1.180959701538086, "learning_rate": 2.2512979395578402e-05, "loss": 0.303, "num_input_tokens_seen": 67746000, "step": 70925 }, { "epoch": 5.785953177257525, "grad_norm": 4.255415439605713, "learning_rate": 2.250943779523429e-05, "loss": 0.3511, "num_input_tokens_seen": 67751088, "step": 70930 }, { "epoch": 5.786361040867934, "grad_norm": 5.6107611656188965, "learning_rate": 2.2505896245372824e-05, "loss": 0.3365, "num_input_tokens_seen": 67755440, "step": 70935 }, { "epoch": 5.786768904478342, "grad_norm": 2.613370656967163, "learning_rate": 2.250235474606578e-05, "loss": 0.3362, "num_input_tokens_seen": 67759744, "step": 70940 }, { "epoch": 5.787176768088751, "grad_norm": 6.092951774597168, "learning_rate": 2.2498813297384944e-05, "loss": 0.318, "num_input_tokens_seen": 67764752, "step": 70945 }, { "epoch": 5.7875846316991595, "grad_norm": 1.1287990808486938, "learning_rate": 2.24952718994021e-05, "loss": 0.3341, "num_input_tokens_seen": 67768944, "step": 70950 }, { "epoch": 5.7879924953095685, "grad_norm": 2.5438172817230225, "learning_rate": 2.249173055218902e-05, "loss": 0.4298, "num_input_tokens_seen": 67773712, "step": 70955 }, { "epoch": 5.7884003589199775, "grad_norm": 0.9232624769210815, "learning_rate": 2.2488189255817505e-05, "loss": 0.4084, "num_input_tokens_seen": 67778448, "step": 70960 }, { "epoch": 5.788808222530386, "grad_norm": 2.306894302368164, "learning_rate": 2.2484648010359328e-05, "loss": 0.3303, "num_input_tokens_seen": 67782816, "step": 70965 }, { "epoch": 5.789216086140795, "grad_norm": 1.2045810222625732, "learning_rate": 2.2481106815886265e-05, "loss": 0.324, "num_input_tokens_seen": 67787760, "step": 70970 }, { "epoch": 5.789623949751203, "grad_norm": 0.7484149932861328, "learning_rate": 2.247756567247009e-05, "loss": 0.2887, "num_input_tokens_seen": 67792192, "step": 70975 }, { "epoch": 5.790031813361612, "grad_norm": 2.5498485565185547, "learning_rate": 2.2474024580182594e-05, "loss": 0.3717, "num_input_tokens_seen": 67797936, "step": 70980 }, { "epoch": 5.790439676972021, "grad_norm": 0.6562007069587708, "learning_rate": 2.2470483539095547e-05, "loss": 0.3232, "num_input_tokens_seen": 67803232, "step": 70985 }, { "epoch": 5.790847540582429, "grad_norm": 1.966581106185913, "learning_rate": 2.2466942549280722e-05, "loss": 0.3162, "num_input_tokens_seen": 67808512, "step": 70990 }, { "epoch": 5.791255404192838, "grad_norm": 9.747306823730469, "learning_rate": 2.2463401610809897e-05, "loss": 0.2326, "num_input_tokens_seen": 67813920, "step": 70995 }, { "epoch": 5.791663267803247, "grad_norm": 0.684932291507721, "learning_rate": 2.245986072375484e-05, "loss": 0.3202, "num_input_tokens_seen": 67818928, "step": 71000 }, { "epoch": 5.792071131413655, "grad_norm": 7.678856372833252, "learning_rate": 2.245631988818733e-05, "loss": 0.3073, "num_input_tokens_seen": 67823120, "step": 71005 }, { "epoch": 5.792478995024064, "grad_norm": 0.7088605165481567, "learning_rate": 2.2452779104179132e-05, "loss": 0.3466, "num_input_tokens_seen": 67828944, "step": 71010 }, { "epoch": 5.792886858634473, "grad_norm": 4.9279375076293945, "learning_rate": 2.2449238371802025e-05, "loss": 0.4062, "num_input_tokens_seen": 67832848, "step": 71015 }, { "epoch": 5.793294722244881, "grad_norm": 11.082884788513184, "learning_rate": 2.244569769112776e-05, "loss": 0.3784, "num_input_tokens_seen": 67838032, "step": 71020 }, { "epoch": 5.79370258585529, "grad_norm": 8.430652618408203, "learning_rate": 2.244215706222813e-05, "loss": 0.3305, "num_input_tokens_seen": 67842736, "step": 71025 }, { "epoch": 5.794110449465698, "grad_norm": 0.9033901691436768, "learning_rate": 2.2438616485174885e-05, "loss": 0.2646, "num_input_tokens_seen": 67847824, "step": 71030 }, { "epoch": 5.794518313076107, "grad_norm": 0.8447980880737305, "learning_rate": 2.2435075960039797e-05, "loss": 0.3781, "num_input_tokens_seen": 67852528, "step": 71035 }, { "epoch": 5.794926176686516, "grad_norm": 1.6239309310913086, "learning_rate": 2.2431535486894616e-05, "loss": 0.2369, "num_input_tokens_seen": 67856704, "step": 71040 }, { "epoch": 5.7953340402969244, "grad_norm": 20.01085662841797, "learning_rate": 2.2427995065811135e-05, "loss": 0.3376, "num_input_tokens_seen": 67861008, "step": 71045 }, { "epoch": 5.7957419039073335, "grad_norm": 1.748602271080017, "learning_rate": 2.24244546968611e-05, "loss": 0.2883, "num_input_tokens_seen": 67865952, "step": 71050 }, { "epoch": 5.796149767517742, "grad_norm": 0.3303532302379608, "learning_rate": 2.242091438011627e-05, "loss": 0.2286, "num_input_tokens_seen": 67870288, "step": 71055 }, { "epoch": 5.796557631128151, "grad_norm": 20.40006446838379, "learning_rate": 2.2417374115648416e-05, "loss": 0.2393, "num_input_tokens_seen": 67874800, "step": 71060 }, { "epoch": 5.79696549473856, "grad_norm": 23.18442726135254, "learning_rate": 2.241383390352928e-05, "loss": 0.5602, "num_input_tokens_seen": 67880432, "step": 71065 }, { "epoch": 5.797373358348968, "grad_norm": 11.196327209472656, "learning_rate": 2.241029374383064e-05, "loss": 0.4813, "num_input_tokens_seen": 67885520, "step": 71070 }, { "epoch": 5.797781221959377, "grad_norm": 4.469812393188477, "learning_rate": 2.2406753636624246e-05, "loss": 0.5247, "num_input_tokens_seen": 67890224, "step": 71075 }, { "epoch": 5.798189085569786, "grad_norm": 0.47509095072746277, "learning_rate": 2.2403213581981845e-05, "loss": 0.2116, "num_input_tokens_seen": 67895008, "step": 71080 }, { "epoch": 5.798596949180194, "grad_norm": 8.118417739868164, "learning_rate": 2.2399673579975207e-05, "loss": 0.2721, "num_input_tokens_seen": 67899104, "step": 71085 }, { "epoch": 5.799004812790603, "grad_norm": 0.7813690304756165, "learning_rate": 2.239613363067608e-05, "loss": 0.4137, "num_input_tokens_seen": 67903872, "step": 71090 }, { "epoch": 5.799412676401012, "grad_norm": 31.493370056152344, "learning_rate": 2.239259373415622e-05, "loss": 0.4611, "num_input_tokens_seen": 67909040, "step": 71095 }, { "epoch": 5.79982054001142, "grad_norm": 8.319994926452637, "learning_rate": 2.238905389048738e-05, "loss": 0.4543, "num_input_tokens_seen": 67913408, "step": 71100 }, { "epoch": 5.800228403621829, "grad_norm": 5.529242038726807, "learning_rate": 2.2385514099741307e-05, "loss": 0.327, "num_input_tokens_seen": 67917776, "step": 71105 }, { "epoch": 5.800636267232237, "grad_norm": 1.334816336631775, "learning_rate": 2.2381974361989737e-05, "loss": 0.2707, "num_input_tokens_seen": 67922912, "step": 71110 }, { "epoch": 5.801044130842646, "grad_norm": 16.808860778808594, "learning_rate": 2.237843467730445e-05, "loss": 0.2608, "num_input_tokens_seen": 67927136, "step": 71115 }, { "epoch": 5.801451994453055, "grad_norm": 13.238130569458008, "learning_rate": 2.2374895045757175e-05, "loss": 0.3172, "num_input_tokens_seen": 67931568, "step": 71120 }, { "epoch": 5.801859858063463, "grad_norm": 18.52549171447754, "learning_rate": 2.2371355467419663e-05, "loss": 0.4209, "num_input_tokens_seen": 67935808, "step": 71125 }, { "epoch": 5.802267721673872, "grad_norm": 4.247589588165283, "learning_rate": 2.2367815942363653e-05, "loss": 0.4621, "num_input_tokens_seen": 67940064, "step": 71130 }, { "epoch": 5.802675585284281, "grad_norm": 16.826828002929688, "learning_rate": 2.2364276470660904e-05, "loss": 0.3103, "num_input_tokens_seen": 67944608, "step": 71135 }, { "epoch": 5.803083448894689, "grad_norm": 1.901214361190796, "learning_rate": 2.2360737052383142e-05, "loss": 0.2787, "num_input_tokens_seen": 67948976, "step": 71140 }, { "epoch": 5.803491312505098, "grad_norm": 4.16157341003418, "learning_rate": 2.2357197687602124e-05, "loss": 0.2657, "num_input_tokens_seen": 67954592, "step": 71145 }, { "epoch": 5.803899176115507, "grad_norm": 8.856801986694336, "learning_rate": 2.2353658376389584e-05, "loss": 0.387, "num_input_tokens_seen": 67960144, "step": 71150 }, { "epoch": 5.8043070397259156, "grad_norm": 3.9413158893585205, "learning_rate": 2.2350119118817265e-05, "loss": 0.369, "num_input_tokens_seen": 67964320, "step": 71155 }, { "epoch": 5.804714903336325, "grad_norm": 1.5359845161437988, "learning_rate": 2.2346579914956905e-05, "loss": 0.2055, "num_input_tokens_seen": 67969120, "step": 71160 }, { "epoch": 5.805122766946733, "grad_norm": 12.68550968170166, "learning_rate": 2.2343040764880247e-05, "loss": 0.2512, "num_input_tokens_seen": 67974288, "step": 71165 }, { "epoch": 5.805530630557142, "grad_norm": 1.0447237491607666, "learning_rate": 2.233950166865902e-05, "loss": 0.409, "num_input_tokens_seen": 67978240, "step": 71170 }, { "epoch": 5.805938494167551, "grad_norm": 2.192171096801758, "learning_rate": 2.2335962626364958e-05, "loss": 0.3645, "num_input_tokens_seen": 67983440, "step": 71175 }, { "epoch": 5.806346357777959, "grad_norm": 12.833742141723633, "learning_rate": 2.2332423638069803e-05, "loss": 0.4197, "num_input_tokens_seen": 67988384, "step": 71180 }, { "epoch": 5.806754221388368, "grad_norm": 2.5288538932800293, "learning_rate": 2.2328884703845296e-05, "loss": 0.4885, "num_input_tokens_seen": 67992432, "step": 71185 }, { "epoch": 5.807162084998776, "grad_norm": 1.8362860679626465, "learning_rate": 2.2325345823763157e-05, "loss": 0.3415, "num_input_tokens_seen": 67997728, "step": 71190 }, { "epoch": 5.807569948609185, "grad_norm": 14.273032188415527, "learning_rate": 2.2321806997895126e-05, "loss": 0.3643, "num_input_tokens_seen": 68002304, "step": 71195 }, { "epoch": 5.807977812219594, "grad_norm": 2.915667772293091, "learning_rate": 2.2318268226312914e-05, "loss": 0.4504, "num_input_tokens_seen": 68007792, "step": 71200 }, { "epoch": 5.808385675830002, "grad_norm": 3.227055072784424, "learning_rate": 2.2314729509088276e-05, "loss": 0.3154, "num_input_tokens_seen": 68012080, "step": 71205 }, { "epoch": 5.808793539440411, "grad_norm": 2.3505642414093018, "learning_rate": 2.2311190846292932e-05, "loss": 0.4742, "num_input_tokens_seen": 68017456, "step": 71210 }, { "epoch": 5.80920140305082, "grad_norm": 2.883314847946167, "learning_rate": 2.2307652237998605e-05, "loss": 0.2926, "num_input_tokens_seen": 68022416, "step": 71215 }, { "epoch": 5.809609266661228, "grad_norm": 2.0640735626220703, "learning_rate": 2.230411368427702e-05, "loss": 0.3835, "num_input_tokens_seen": 68027280, "step": 71220 }, { "epoch": 5.810017130271637, "grad_norm": 21.632007598876953, "learning_rate": 2.2300575185199908e-05, "loss": 0.3943, "num_input_tokens_seen": 68032816, "step": 71225 }, { "epoch": 5.810424993882046, "grad_norm": 1.1491122245788574, "learning_rate": 2.2297036740838992e-05, "loss": 0.3824, "num_input_tokens_seen": 68037152, "step": 71230 }, { "epoch": 5.810832857492454, "grad_norm": 12.59643268585205, "learning_rate": 2.2293498351265993e-05, "loss": 0.2876, "num_input_tokens_seen": 68041424, "step": 71235 }, { "epoch": 5.811240721102863, "grad_norm": 0.8284943699836731, "learning_rate": 2.228996001655262e-05, "loss": 0.2115, "num_input_tokens_seen": 68046800, "step": 71240 }, { "epoch": 5.8116485847132715, "grad_norm": 4.663750171661377, "learning_rate": 2.2286421736770618e-05, "loss": 0.4934, "num_input_tokens_seen": 68051936, "step": 71245 }, { "epoch": 5.8120564483236805, "grad_norm": 15.285749435424805, "learning_rate": 2.228288351199169e-05, "loss": 0.382, "num_input_tokens_seen": 68056288, "step": 71250 }, { "epoch": 5.8124643119340895, "grad_norm": 1.570369005203247, "learning_rate": 2.2279345342287567e-05, "loss": 0.364, "num_input_tokens_seen": 68060832, "step": 71255 }, { "epoch": 5.812872175544498, "grad_norm": 3.566349983215332, "learning_rate": 2.227580722772995e-05, "loss": 0.4262, "num_input_tokens_seen": 68065216, "step": 71260 }, { "epoch": 5.813280039154907, "grad_norm": 7.218744277954102, "learning_rate": 2.227226916839056e-05, "loss": 0.5343, "num_input_tokens_seen": 68069376, "step": 71265 }, { "epoch": 5.813687902765316, "grad_norm": 4.8748650550842285, "learning_rate": 2.2268731164341116e-05, "loss": 0.397, "num_input_tokens_seen": 68073920, "step": 71270 }, { "epoch": 5.814095766375724, "grad_norm": 0.9470487236976624, "learning_rate": 2.2265193215653337e-05, "loss": 0.3473, "num_input_tokens_seen": 68078864, "step": 71275 }, { "epoch": 5.814503629986133, "grad_norm": 0.3947233259677887, "learning_rate": 2.2261655322398928e-05, "loss": 0.2283, "num_input_tokens_seen": 68083184, "step": 71280 }, { "epoch": 5.814911493596542, "grad_norm": 1.2786719799041748, "learning_rate": 2.2258117484649595e-05, "loss": 0.2978, "num_input_tokens_seen": 68087824, "step": 71285 }, { "epoch": 5.81531935720695, "grad_norm": 28.187801361083984, "learning_rate": 2.2254579702477066e-05, "loss": 0.313, "num_input_tokens_seen": 68092336, "step": 71290 }, { "epoch": 5.815727220817359, "grad_norm": 9.264849662780762, "learning_rate": 2.2251041975953035e-05, "loss": 0.5157, "num_input_tokens_seen": 68096992, "step": 71295 }, { "epoch": 5.816135084427767, "grad_norm": 1.2021633386611938, "learning_rate": 2.2247504305149217e-05, "loss": 0.3593, "num_input_tokens_seen": 68101280, "step": 71300 }, { "epoch": 5.816542948038176, "grad_norm": 0.5925488471984863, "learning_rate": 2.2243966690137315e-05, "loss": 0.4068, "num_input_tokens_seen": 68105728, "step": 71305 }, { "epoch": 5.816950811648585, "grad_norm": 2.562397003173828, "learning_rate": 2.2240429130989028e-05, "loss": 0.3196, "num_input_tokens_seen": 68109872, "step": 71310 }, { "epoch": 5.817358675258993, "grad_norm": 29.035717010498047, "learning_rate": 2.223689162777608e-05, "loss": 0.429, "num_input_tokens_seen": 68114480, "step": 71315 }, { "epoch": 5.817766538869402, "grad_norm": 1.838912844657898, "learning_rate": 2.2233354180570166e-05, "loss": 0.3046, "num_input_tokens_seen": 68119856, "step": 71320 }, { "epoch": 5.81817440247981, "grad_norm": 0.80654376745224, "learning_rate": 2.2229816789442986e-05, "loss": 0.261, "num_input_tokens_seen": 68125424, "step": 71325 }, { "epoch": 5.818582266090219, "grad_norm": 0.7277140617370605, "learning_rate": 2.222627945446623e-05, "loss": 0.3072, "num_input_tokens_seen": 68129536, "step": 71330 }, { "epoch": 5.818990129700628, "grad_norm": 0.5990562438964844, "learning_rate": 2.2222742175711626e-05, "loss": 0.3452, "num_input_tokens_seen": 68134000, "step": 71335 }, { "epoch": 5.8193979933110365, "grad_norm": 0.43691903352737427, "learning_rate": 2.2219204953250855e-05, "loss": 0.3728, "num_input_tokens_seen": 68138848, "step": 71340 }, { "epoch": 5.8198058569214455, "grad_norm": 0.6565340161323547, "learning_rate": 2.221566778715562e-05, "loss": 0.3674, "num_input_tokens_seen": 68143712, "step": 71345 }, { "epoch": 5.8202137205318545, "grad_norm": 1.2291313409805298, "learning_rate": 2.2212130677497615e-05, "loss": 0.4275, "num_input_tokens_seen": 68149024, "step": 71350 }, { "epoch": 5.820621584142263, "grad_norm": 6.06151819229126, "learning_rate": 2.2208593624348528e-05, "loss": 0.2932, "num_input_tokens_seen": 68153360, "step": 71355 }, { "epoch": 5.821029447752672, "grad_norm": 1.0162975788116455, "learning_rate": 2.2205056627780073e-05, "loss": 0.3744, "num_input_tokens_seen": 68158400, "step": 71360 }, { "epoch": 5.821437311363081, "grad_norm": 0.8019331693649292, "learning_rate": 2.220151968786393e-05, "loss": 0.2208, "num_input_tokens_seen": 68164048, "step": 71365 }, { "epoch": 5.821845174973489, "grad_norm": 0.8927148580551147, "learning_rate": 2.219798280467179e-05, "loss": 0.331, "num_input_tokens_seen": 68168992, "step": 71370 }, { "epoch": 5.822253038583898, "grad_norm": 0.559447169303894, "learning_rate": 2.219444597827534e-05, "loss": 0.3505, "num_input_tokens_seen": 68173168, "step": 71375 }, { "epoch": 5.822660902194306, "grad_norm": 0.5776879787445068, "learning_rate": 2.2190909208746292e-05, "loss": 0.2885, "num_input_tokens_seen": 68177904, "step": 71380 }, { "epoch": 5.823068765804715, "grad_norm": 23.608274459838867, "learning_rate": 2.2187372496156315e-05, "loss": 0.295, "num_input_tokens_seen": 68183328, "step": 71385 }, { "epoch": 5.823476629415124, "grad_norm": 0.23767448961734772, "learning_rate": 2.2183835840577104e-05, "loss": 0.3172, "num_input_tokens_seen": 68186832, "step": 71390 }, { "epoch": 5.823884493025532, "grad_norm": 0.5331069231033325, "learning_rate": 2.218029924208035e-05, "loss": 0.2907, "num_input_tokens_seen": 68190320, "step": 71395 }, { "epoch": 5.824292356635941, "grad_norm": 0.6004487872123718, "learning_rate": 2.2176762700737713e-05, "loss": 0.3356, "num_input_tokens_seen": 68195632, "step": 71400 }, { "epoch": 5.824700220246349, "grad_norm": 1.932492971420288, "learning_rate": 2.217322621662091e-05, "loss": 0.4015, "num_input_tokens_seen": 68200000, "step": 71405 }, { "epoch": 5.825108083856758, "grad_norm": 1.0001262426376343, "learning_rate": 2.2169689789801613e-05, "loss": 0.3971, "num_input_tokens_seen": 68205536, "step": 71410 }, { "epoch": 5.825515947467167, "grad_norm": 2.0767159461975098, "learning_rate": 2.2166153420351497e-05, "loss": 0.356, "num_input_tokens_seen": 68211200, "step": 71415 }, { "epoch": 5.825923811077575, "grad_norm": 4.473176002502441, "learning_rate": 2.2162617108342246e-05, "loss": 0.3446, "num_input_tokens_seen": 68216816, "step": 71420 }, { "epoch": 5.826331674687984, "grad_norm": 1.5947197675704956, "learning_rate": 2.2159080853845543e-05, "loss": 0.3501, "num_input_tokens_seen": 68221264, "step": 71425 }, { "epoch": 5.826739538298393, "grad_norm": 1.8571585416793823, "learning_rate": 2.2155544656933064e-05, "loss": 0.2889, "num_input_tokens_seen": 68226304, "step": 71430 }, { "epoch": 5.827147401908801, "grad_norm": 2.447707414627075, "learning_rate": 2.2152008517676487e-05, "loss": 0.2973, "num_input_tokens_seen": 68232032, "step": 71435 }, { "epoch": 5.82755526551921, "grad_norm": 33.48662567138672, "learning_rate": 2.2148472436147476e-05, "loss": 0.3862, "num_input_tokens_seen": 68238176, "step": 71440 }, { "epoch": 5.8279631291296194, "grad_norm": 0.612312376499176, "learning_rate": 2.214493641241773e-05, "loss": 0.3541, "num_input_tokens_seen": 68242752, "step": 71445 }, { "epoch": 5.828370992740028, "grad_norm": 1.3617416620254517, "learning_rate": 2.214140044655891e-05, "loss": 0.3105, "num_input_tokens_seen": 68246896, "step": 71450 }, { "epoch": 5.828778856350437, "grad_norm": 3.75750994682312, "learning_rate": 2.2137864538642683e-05, "loss": 0.3838, "num_input_tokens_seen": 68251360, "step": 71455 }, { "epoch": 5.829186719960845, "grad_norm": 0.976650059223175, "learning_rate": 2.213432868874073e-05, "loss": 0.3675, "num_input_tokens_seen": 68257200, "step": 71460 }, { "epoch": 5.829594583571254, "grad_norm": 1.173872947692871, "learning_rate": 2.2130792896924702e-05, "loss": 0.3874, "num_input_tokens_seen": 68262368, "step": 71465 }, { "epoch": 5.830002447181663, "grad_norm": 1.428368091583252, "learning_rate": 2.2127257163266295e-05, "loss": 0.3451, "num_input_tokens_seen": 68267456, "step": 71470 }, { "epoch": 5.830410310792071, "grad_norm": 0.7828216552734375, "learning_rate": 2.2123721487837168e-05, "loss": 0.3429, "num_input_tokens_seen": 68272400, "step": 71475 }, { "epoch": 5.83081817440248, "grad_norm": 7.201313018798828, "learning_rate": 2.2120185870708983e-05, "loss": 0.2686, "num_input_tokens_seen": 68277744, "step": 71480 }, { "epoch": 5.831226038012889, "grad_norm": 1.0748536586761475, "learning_rate": 2.21166503119534e-05, "loss": 0.3379, "num_input_tokens_seen": 68282736, "step": 71485 }, { "epoch": 5.831633901623297, "grad_norm": 0.6085215210914612, "learning_rate": 2.2113114811642093e-05, "loss": 0.3451, "num_input_tokens_seen": 68287440, "step": 71490 }, { "epoch": 5.832041765233706, "grad_norm": 0.8109742403030396, "learning_rate": 2.2109579369846726e-05, "loss": 0.3093, "num_input_tokens_seen": 68292048, "step": 71495 }, { "epoch": 5.832449628844115, "grad_norm": 2.086456298828125, "learning_rate": 2.210604398663896e-05, "loss": 0.3879, "num_input_tokens_seen": 68296048, "step": 71500 }, { "epoch": 5.832857492454523, "grad_norm": 0.8815107345581055, "learning_rate": 2.2102508662090444e-05, "loss": 0.3415, "num_input_tokens_seen": 68300976, "step": 71505 }, { "epoch": 5.833265356064932, "grad_norm": 0.8883678317070007, "learning_rate": 2.209897339627284e-05, "loss": 0.3492, "num_input_tokens_seen": 68305712, "step": 71510 }, { "epoch": 5.83367321967534, "grad_norm": 1.0990657806396484, "learning_rate": 2.2095438189257822e-05, "loss": 0.3622, "num_input_tokens_seen": 68311216, "step": 71515 }, { "epoch": 5.834081083285749, "grad_norm": 2.8251445293426514, "learning_rate": 2.2091903041117042e-05, "loss": 0.2874, "num_input_tokens_seen": 68315760, "step": 71520 }, { "epoch": 5.834488946896158, "grad_norm": 0.6207906603813171, "learning_rate": 2.2088367951922145e-05, "loss": 0.3318, "num_input_tokens_seen": 68320320, "step": 71525 }, { "epoch": 5.834896810506566, "grad_norm": 1.2785868644714355, "learning_rate": 2.2084832921744787e-05, "loss": 0.3344, "num_input_tokens_seen": 68325472, "step": 71530 }, { "epoch": 5.835304674116975, "grad_norm": 4.100013732910156, "learning_rate": 2.2081297950656634e-05, "loss": 0.3646, "num_input_tokens_seen": 68330688, "step": 71535 }, { "epoch": 5.8357125377273835, "grad_norm": 3.0112972259521484, "learning_rate": 2.2077763038729334e-05, "loss": 0.2826, "num_input_tokens_seen": 68336080, "step": 71540 }, { "epoch": 5.8361204013377925, "grad_norm": 4.2034406661987305, "learning_rate": 2.2074228186034533e-05, "loss": 0.3048, "num_input_tokens_seen": 68340912, "step": 71545 }, { "epoch": 5.8365282649482015, "grad_norm": 4.782789707183838, "learning_rate": 2.2070693392643887e-05, "loss": 0.2866, "num_input_tokens_seen": 68346992, "step": 71550 }, { "epoch": 5.83693612855861, "grad_norm": 2.82395601272583, "learning_rate": 2.2067158658629035e-05, "loss": 0.2154, "num_input_tokens_seen": 68351488, "step": 71555 }, { "epoch": 5.837343992169019, "grad_norm": 0.7426298260688782, "learning_rate": 2.206362398406163e-05, "loss": 0.3147, "num_input_tokens_seen": 68356560, "step": 71560 }, { "epoch": 5.837751855779428, "grad_norm": 5.0933146476745605, "learning_rate": 2.2060089369013328e-05, "loss": 0.3128, "num_input_tokens_seen": 68360624, "step": 71565 }, { "epoch": 5.838159719389836, "grad_norm": 3.1436729431152344, "learning_rate": 2.205655481355576e-05, "loss": 0.3481, "num_input_tokens_seen": 68364272, "step": 71570 }, { "epoch": 5.838567583000245, "grad_norm": 0.37207749485969543, "learning_rate": 2.2053020317760566e-05, "loss": 0.4439, "num_input_tokens_seen": 68368336, "step": 71575 }, { "epoch": 5.838975446610654, "grad_norm": 5.086349964141846, "learning_rate": 2.204948588169941e-05, "loss": 0.4606, "num_input_tokens_seen": 68373280, "step": 71580 }, { "epoch": 5.839383310221062, "grad_norm": 7.442663669586182, "learning_rate": 2.2045951505443917e-05, "loss": 0.4624, "num_input_tokens_seen": 68379456, "step": 71585 }, { "epoch": 5.839791173831471, "grad_norm": 6.714998722076416, "learning_rate": 2.2042417189065737e-05, "loss": 0.2713, "num_input_tokens_seen": 68384688, "step": 71590 }, { "epoch": 5.840199037441879, "grad_norm": 2.255622625350952, "learning_rate": 2.20388829326365e-05, "loss": 0.4104, "num_input_tokens_seen": 68389776, "step": 71595 }, { "epoch": 5.840606901052288, "grad_norm": 29.160995483398438, "learning_rate": 2.203534873622784e-05, "loss": 0.3913, "num_input_tokens_seen": 68394864, "step": 71600 }, { "epoch": 5.841014764662697, "grad_norm": 3.105841636657715, "learning_rate": 2.203181459991141e-05, "loss": 0.4254, "num_input_tokens_seen": 68399760, "step": 71605 }, { "epoch": 5.841422628273105, "grad_norm": 2.456648111343384, "learning_rate": 2.2028280523758838e-05, "loss": 0.3121, "num_input_tokens_seen": 68404688, "step": 71610 }, { "epoch": 5.841830491883514, "grad_norm": 0.9021990299224854, "learning_rate": 2.2024746507841753e-05, "loss": 0.3006, "num_input_tokens_seen": 68409392, "step": 71615 }, { "epoch": 5.842238355493922, "grad_norm": 13.115442276000977, "learning_rate": 2.202121255223179e-05, "loss": 0.2508, "num_input_tokens_seen": 68414080, "step": 71620 }, { "epoch": 5.842646219104331, "grad_norm": 0.8388055562973022, "learning_rate": 2.201767865700059e-05, "loss": 0.3132, "num_input_tokens_seen": 68419120, "step": 71625 }, { "epoch": 5.84305408271474, "grad_norm": 2.1420867443084717, "learning_rate": 2.2014144822219775e-05, "loss": 0.3449, "num_input_tokens_seen": 68424224, "step": 71630 }, { "epoch": 5.8434619463251485, "grad_norm": 1.3651498556137085, "learning_rate": 2.2010611047960973e-05, "loss": 0.3573, "num_input_tokens_seen": 68428496, "step": 71635 }, { "epoch": 5.8438698099355575, "grad_norm": 0.772345244884491, "learning_rate": 2.2007077334295805e-05, "loss": 0.3178, "num_input_tokens_seen": 68432848, "step": 71640 }, { "epoch": 5.8442776735459665, "grad_norm": 1.5013113021850586, "learning_rate": 2.2003543681295923e-05, "loss": 0.2924, "num_input_tokens_seen": 68437344, "step": 71645 }, { "epoch": 5.844685537156375, "grad_norm": 5.337947368621826, "learning_rate": 2.200001008903293e-05, "loss": 0.3059, "num_input_tokens_seen": 68442352, "step": 71650 }, { "epoch": 5.845093400766784, "grad_norm": 2.2676849365234375, "learning_rate": 2.1996476557578465e-05, "loss": 0.3096, "num_input_tokens_seen": 68446544, "step": 71655 }, { "epoch": 5.845501264377193, "grad_norm": 2.023345708847046, "learning_rate": 2.199294308700414e-05, "loss": 0.4297, "num_input_tokens_seen": 68451472, "step": 71660 }, { "epoch": 5.845909127987601, "grad_norm": 0.973110556602478, "learning_rate": 2.198940967738157e-05, "loss": 0.3728, "num_input_tokens_seen": 68455824, "step": 71665 }, { "epoch": 5.84631699159801, "grad_norm": 0.9887554049491882, "learning_rate": 2.1985876328782398e-05, "loss": 0.333, "num_input_tokens_seen": 68459696, "step": 71670 }, { "epoch": 5.846724855208418, "grad_norm": 2.016014337539673, "learning_rate": 2.1982343041278232e-05, "loss": 0.2657, "num_input_tokens_seen": 68464112, "step": 71675 }, { "epoch": 5.847132718818827, "grad_norm": 32.88965606689453, "learning_rate": 2.197880981494069e-05, "loss": 0.4146, "num_input_tokens_seen": 68468960, "step": 71680 }, { "epoch": 5.847540582429236, "grad_norm": 2.51769757270813, "learning_rate": 2.1975276649841384e-05, "loss": 0.3593, "num_input_tokens_seen": 68473424, "step": 71685 }, { "epoch": 5.847948446039644, "grad_norm": 0.5903658270835876, "learning_rate": 2.197174354605194e-05, "loss": 0.3303, "num_input_tokens_seen": 68477440, "step": 71690 }, { "epoch": 5.848356309650053, "grad_norm": 10.608367919921875, "learning_rate": 2.196821050364397e-05, "loss": 0.3126, "num_input_tokens_seen": 68482256, "step": 71695 }, { "epoch": 5.848764173260462, "grad_norm": 7.065718173980713, "learning_rate": 2.196467752268908e-05, "loss": 0.2696, "num_input_tokens_seen": 68486736, "step": 71700 }, { "epoch": 5.84917203687087, "grad_norm": 3.047337532043457, "learning_rate": 2.196114460325889e-05, "loss": 0.3509, "num_input_tokens_seen": 68491616, "step": 71705 }, { "epoch": 5.849579900481279, "grad_norm": 35.22726821899414, "learning_rate": 2.1957611745424997e-05, "loss": 0.3377, "num_input_tokens_seen": 68496496, "step": 71710 }, { "epoch": 5.849987764091688, "grad_norm": 1.2605233192443848, "learning_rate": 2.1954078949259033e-05, "loss": 0.3367, "num_input_tokens_seen": 68501152, "step": 71715 }, { "epoch": 5.850395627702096, "grad_norm": 12.844101905822754, "learning_rate": 2.195054621483259e-05, "loss": 0.2746, "num_input_tokens_seen": 68505392, "step": 71720 }, { "epoch": 5.850803491312505, "grad_norm": 3.461897134780884, "learning_rate": 2.1947013542217283e-05, "loss": 0.368, "num_input_tokens_seen": 68510016, "step": 71725 }, { "epoch": 5.851211354922913, "grad_norm": 25.08262062072754, "learning_rate": 2.1943480931484703e-05, "loss": 0.4178, "num_input_tokens_seen": 68515520, "step": 71730 }, { "epoch": 5.8516192185333225, "grad_norm": 2.060746192932129, "learning_rate": 2.1939948382706478e-05, "loss": 0.3722, "num_input_tokens_seen": 68520080, "step": 71735 }, { "epoch": 5.8520270821437315, "grad_norm": 11.117304801940918, "learning_rate": 2.1936415895954196e-05, "loss": 0.3278, "num_input_tokens_seen": 68524320, "step": 71740 }, { "epoch": 5.85243494575414, "grad_norm": 2.470944881439209, "learning_rate": 2.193288347129946e-05, "loss": 0.2174, "num_input_tokens_seen": 68528736, "step": 71745 }, { "epoch": 5.852842809364549, "grad_norm": 1.2714345455169678, "learning_rate": 2.1929351108813877e-05, "loss": 0.3159, "num_input_tokens_seen": 68534048, "step": 71750 }, { "epoch": 5.853250672974957, "grad_norm": 1.064921498298645, "learning_rate": 2.1925818808569037e-05, "loss": 0.2694, "num_input_tokens_seen": 68539232, "step": 71755 }, { "epoch": 5.853658536585366, "grad_norm": 2.2504184246063232, "learning_rate": 2.192228657063655e-05, "loss": 0.2978, "num_input_tokens_seen": 68543728, "step": 71760 }, { "epoch": 5.854066400195775, "grad_norm": 2.6074769496917725, "learning_rate": 2.1918754395088003e-05, "loss": 0.3737, "num_input_tokens_seen": 68547744, "step": 71765 }, { "epoch": 5.854474263806183, "grad_norm": 6.066936492919922, "learning_rate": 2.1915222281994995e-05, "loss": 0.3287, "num_input_tokens_seen": 68551744, "step": 71770 }, { "epoch": 5.854882127416592, "grad_norm": 28.970993041992188, "learning_rate": 2.191169023142911e-05, "loss": 0.3164, "num_input_tokens_seen": 68556800, "step": 71775 }, { "epoch": 5.855289991027001, "grad_norm": 3.6638927459716797, "learning_rate": 2.1908158243461965e-05, "loss": 0.4446, "num_input_tokens_seen": 68561600, "step": 71780 }, { "epoch": 5.855697854637409, "grad_norm": 13.100143432617188, "learning_rate": 2.1904626318165136e-05, "loss": 0.2177, "num_input_tokens_seen": 68566560, "step": 71785 }, { "epoch": 5.856105718247818, "grad_norm": 0.6806070804595947, "learning_rate": 2.1901094455610216e-05, "loss": 0.2364, "num_input_tokens_seen": 68571168, "step": 71790 }, { "epoch": 5.856513581858227, "grad_norm": 5.888856887817383, "learning_rate": 2.1897562655868786e-05, "loss": 0.3282, "num_input_tokens_seen": 68576720, "step": 71795 }, { "epoch": 5.856921445468635, "grad_norm": 1.649440884590149, "learning_rate": 2.1894030919012455e-05, "loss": 0.3063, "num_input_tokens_seen": 68581776, "step": 71800 }, { "epoch": 5.857329309079044, "grad_norm": 3.6460418701171875, "learning_rate": 2.1890499245112792e-05, "loss": 0.2853, "num_input_tokens_seen": 68584880, "step": 71805 }, { "epoch": 5.857737172689452, "grad_norm": 12.413206100463867, "learning_rate": 2.1886967634241393e-05, "loss": 0.3267, "num_input_tokens_seen": 68589632, "step": 71810 }, { "epoch": 5.858145036299861, "grad_norm": 8.44221019744873, "learning_rate": 2.1883436086469832e-05, "loss": 0.3326, "num_input_tokens_seen": 68594192, "step": 71815 }, { "epoch": 5.85855289991027, "grad_norm": 0.6930662989616394, "learning_rate": 2.1879904601869697e-05, "loss": 0.3946, "num_input_tokens_seen": 68599184, "step": 71820 }, { "epoch": 5.858960763520678, "grad_norm": 26.56515884399414, "learning_rate": 2.1876373180512573e-05, "loss": 0.2742, "num_input_tokens_seen": 68603680, "step": 71825 }, { "epoch": 5.859368627131087, "grad_norm": 21.695293426513672, "learning_rate": 2.1872841822470036e-05, "loss": 0.3373, "num_input_tokens_seen": 68609072, "step": 71830 }, { "epoch": 5.859776490741496, "grad_norm": 0.9450149536132812, "learning_rate": 2.1869310527813665e-05, "loss": 0.412, "num_input_tokens_seen": 68613696, "step": 71835 }, { "epoch": 5.8601843543519045, "grad_norm": 14.874977111816406, "learning_rate": 2.1865779296615037e-05, "loss": 0.2652, "num_input_tokens_seen": 68617616, "step": 71840 }, { "epoch": 5.8605922179623136, "grad_norm": 1.8954418897628784, "learning_rate": 2.1862248128945733e-05, "loss": 0.246, "num_input_tokens_seen": 68621920, "step": 71845 }, { "epoch": 5.861000081572723, "grad_norm": 14.582977294921875, "learning_rate": 2.1858717024877332e-05, "loss": 0.3057, "num_input_tokens_seen": 68627120, "step": 71850 }, { "epoch": 5.861407945183131, "grad_norm": 7.02672004699707, "learning_rate": 2.18551859844814e-05, "loss": 0.3591, "num_input_tokens_seen": 68632112, "step": 71855 }, { "epoch": 5.86181580879354, "grad_norm": 13.18996524810791, "learning_rate": 2.185165500782951e-05, "loss": 0.3522, "num_input_tokens_seen": 68637200, "step": 71860 }, { "epoch": 5.862223672403948, "grad_norm": 4.2415924072265625, "learning_rate": 2.1848124094993225e-05, "loss": 0.3182, "num_input_tokens_seen": 68642336, "step": 71865 }, { "epoch": 5.862631536014357, "grad_norm": 1.4075056314468384, "learning_rate": 2.1844593246044137e-05, "loss": 0.4722, "num_input_tokens_seen": 68647808, "step": 71870 }, { "epoch": 5.863039399624766, "grad_norm": 2.7689340114593506, "learning_rate": 2.1841062461053802e-05, "loss": 0.4688, "num_input_tokens_seen": 68652304, "step": 71875 }, { "epoch": 5.863447263235174, "grad_norm": 3.0683367252349854, "learning_rate": 2.183753174009379e-05, "loss": 0.2913, "num_input_tokens_seen": 68657728, "step": 71880 }, { "epoch": 5.863855126845583, "grad_norm": 5.505722522735596, "learning_rate": 2.1834001083235662e-05, "loss": 0.3286, "num_input_tokens_seen": 68663488, "step": 71885 }, { "epoch": 5.864262990455991, "grad_norm": 0.6160699725151062, "learning_rate": 2.1830470490550985e-05, "loss": 0.3565, "num_input_tokens_seen": 68669072, "step": 71890 }, { "epoch": 5.8646708540664, "grad_norm": 20.50594711303711, "learning_rate": 2.1826939962111327e-05, "loss": 0.4123, "num_input_tokens_seen": 68673824, "step": 71895 }, { "epoch": 5.865078717676809, "grad_norm": 21.639347076416016, "learning_rate": 2.182340949798825e-05, "loss": 0.4864, "num_input_tokens_seen": 68678160, "step": 71900 }, { "epoch": 5.865486581287217, "grad_norm": 8.016559600830078, "learning_rate": 2.181987909825331e-05, "loss": 0.332, "num_input_tokens_seen": 68682960, "step": 71905 }, { "epoch": 5.865894444897626, "grad_norm": 7.39197301864624, "learning_rate": 2.1816348762978068e-05, "loss": 0.2881, "num_input_tokens_seen": 68687632, "step": 71910 }, { "epoch": 5.866302308508035, "grad_norm": 10.169767379760742, "learning_rate": 2.181281849223409e-05, "loss": 0.3106, "num_input_tokens_seen": 68692496, "step": 71915 }, { "epoch": 5.866710172118443, "grad_norm": 10.092547416687012, "learning_rate": 2.1809288286092922e-05, "loss": 0.4289, "num_input_tokens_seen": 68697392, "step": 71920 }, { "epoch": 5.867118035728852, "grad_norm": 1.6377356052398682, "learning_rate": 2.1805758144626125e-05, "loss": 0.3795, "num_input_tokens_seen": 68701904, "step": 71925 }, { "epoch": 5.867525899339261, "grad_norm": 6.566694259643555, "learning_rate": 2.180222806790525e-05, "loss": 0.4078, "num_input_tokens_seen": 68707104, "step": 71930 }, { "epoch": 5.8679337629496695, "grad_norm": 0.7666202187538147, "learning_rate": 2.1798698056001854e-05, "loss": 0.3502, "num_input_tokens_seen": 68711808, "step": 71935 }, { "epoch": 5.8683416265600785, "grad_norm": 10.407113075256348, "learning_rate": 2.1795168108987492e-05, "loss": 0.3074, "num_input_tokens_seen": 68715952, "step": 71940 }, { "epoch": 5.868749490170487, "grad_norm": 1.0798295736312866, "learning_rate": 2.1791638226933713e-05, "loss": 0.291, "num_input_tokens_seen": 68721184, "step": 71945 }, { "epoch": 5.869157353780896, "grad_norm": 14.735624313354492, "learning_rate": 2.178810840991206e-05, "loss": 0.3281, "num_input_tokens_seen": 68725840, "step": 71950 }, { "epoch": 5.869565217391305, "grad_norm": 2.1880605220794678, "learning_rate": 2.1784578657994077e-05, "loss": 0.3916, "num_input_tokens_seen": 68730624, "step": 71955 }, { "epoch": 5.869973081001713, "grad_norm": 0.9896679520606995, "learning_rate": 2.1781048971251327e-05, "loss": 0.3673, "num_input_tokens_seen": 68735200, "step": 71960 }, { "epoch": 5.870380944612122, "grad_norm": 0.6547567844390869, "learning_rate": 2.177751934975535e-05, "loss": 0.2839, "num_input_tokens_seen": 68739888, "step": 71965 }, { "epoch": 5.87078880822253, "grad_norm": 0.7133326530456543, "learning_rate": 2.1773989793577682e-05, "loss": 0.2421, "num_input_tokens_seen": 68743760, "step": 71970 }, { "epoch": 5.871196671832939, "grad_norm": 2.0125749111175537, "learning_rate": 2.177046030278987e-05, "loss": 0.2599, "num_input_tokens_seen": 68749456, "step": 71975 }, { "epoch": 5.871604535443348, "grad_norm": 15.062200546264648, "learning_rate": 2.1766930877463455e-05, "loss": 0.4132, "num_input_tokens_seen": 68754544, "step": 71980 }, { "epoch": 5.872012399053756, "grad_norm": 1.1778547763824463, "learning_rate": 2.1763401517669978e-05, "loss": 0.2232, "num_input_tokens_seen": 68759936, "step": 71985 }, { "epoch": 5.872420262664165, "grad_norm": 9.661759376525879, "learning_rate": 2.175987222348098e-05, "loss": 0.2548, "num_input_tokens_seen": 68764272, "step": 71990 }, { "epoch": 5.872828126274574, "grad_norm": 8.07242202758789, "learning_rate": 2.1756342994967982e-05, "loss": 0.3472, "num_input_tokens_seen": 68768528, "step": 71995 }, { "epoch": 5.873235989884982, "grad_norm": 3.606322765350342, "learning_rate": 2.1752813832202544e-05, "loss": 0.1852, "num_input_tokens_seen": 68773968, "step": 72000 }, { "epoch": 5.873643853495391, "grad_norm": 16.533000946044922, "learning_rate": 2.174928473525619e-05, "loss": 0.4198, "num_input_tokens_seen": 68778384, "step": 72005 }, { "epoch": 5.8740517171058, "grad_norm": 0.5399917960166931, "learning_rate": 2.1745755704200456e-05, "loss": 0.3284, "num_input_tokens_seen": 68783104, "step": 72010 }, { "epoch": 5.874459580716208, "grad_norm": 1.4139710664749146, "learning_rate": 2.174222673910687e-05, "loss": 0.4401, "num_input_tokens_seen": 68788096, "step": 72015 }, { "epoch": 5.874867444326617, "grad_norm": 36.44570541381836, "learning_rate": 2.1738697840046952e-05, "loss": 0.3703, "num_input_tokens_seen": 68793728, "step": 72020 }, { "epoch": 5.8752753079370255, "grad_norm": 5.759183883666992, "learning_rate": 2.1735169007092254e-05, "loss": 0.4449, "num_input_tokens_seen": 68799168, "step": 72025 }, { "epoch": 5.8756831715474345, "grad_norm": 4.427147388458252, "learning_rate": 2.1731640240314292e-05, "loss": 0.3424, "num_input_tokens_seen": 68804688, "step": 72030 }, { "epoch": 5.8760910351578435, "grad_norm": 1.313173770904541, "learning_rate": 2.1728111539784596e-05, "loss": 0.3196, "num_input_tokens_seen": 68809872, "step": 72035 }, { "epoch": 5.876498898768252, "grad_norm": 26.781089782714844, "learning_rate": 2.172458290557468e-05, "loss": 0.4345, "num_input_tokens_seen": 68814560, "step": 72040 }, { "epoch": 5.876906762378661, "grad_norm": 8.532730102539062, "learning_rate": 2.1721054337756084e-05, "loss": 0.2497, "num_input_tokens_seen": 68819584, "step": 72045 }, { "epoch": 5.87731462598907, "grad_norm": 1.8362897634506226, "learning_rate": 2.171752583640032e-05, "loss": 0.2902, "num_input_tokens_seen": 68823616, "step": 72050 }, { "epoch": 5.877722489599478, "grad_norm": 31.512426376342773, "learning_rate": 2.1713997401578918e-05, "loss": 0.3321, "num_input_tokens_seen": 68828368, "step": 72055 }, { "epoch": 5.878130353209887, "grad_norm": 28.828126907348633, "learning_rate": 2.171046903336339e-05, "loss": 0.3762, "num_input_tokens_seen": 68832192, "step": 72060 }, { "epoch": 5.878538216820296, "grad_norm": 5.313738822937012, "learning_rate": 2.1706940731825245e-05, "loss": 0.2263, "num_input_tokens_seen": 68837104, "step": 72065 }, { "epoch": 5.878946080430704, "grad_norm": 17.001968383789062, "learning_rate": 2.1703412497036026e-05, "loss": 0.445, "num_input_tokens_seen": 68842848, "step": 72070 }, { "epoch": 5.879353944041113, "grad_norm": 35.0078239440918, "learning_rate": 2.169988432906723e-05, "loss": 0.2876, "num_input_tokens_seen": 68848176, "step": 72075 }, { "epoch": 5.879761807651521, "grad_norm": 5.490014553070068, "learning_rate": 2.1696356227990377e-05, "loss": 0.3261, "num_input_tokens_seen": 68852944, "step": 72080 }, { "epoch": 5.88016967126193, "grad_norm": 28.888683319091797, "learning_rate": 2.169282819387697e-05, "loss": 0.2955, "num_input_tokens_seen": 68857264, "step": 72085 }, { "epoch": 5.880577534872339, "grad_norm": 4.543796062469482, "learning_rate": 2.1689300226798543e-05, "loss": 0.3044, "num_input_tokens_seen": 68861840, "step": 72090 }, { "epoch": 5.880985398482747, "grad_norm": 21.87232208251953, "learning_rate": 2.1685772326826594e-05, "loss": 0.4122, "num_input_tokens_seen": 68865984, "step": 72095 }, { "epoch": 5.881393262093156, "grad_norm": 1.1256431341171265, "learning_rate": 2.1682244494032628e-05, "loss": 0.2736, "num_input_tokens_seen": 68870928, "step": 72100 }, { "epoch": 5.881801125703564, "grad_norm": 5.900679111480713, "learning_rate": 2.167871672848816e-05, "loss": 0.4277, "num_input_tokens_seen": 68874736, "step": 72105 }, { "epoch": 5.882208989313973, "grad_norm": 8.175887107849121, "learning_rate": 2.1675189030264682e-05, "loss": 0.3484, "num_input_tokens_seen": 68879584, "step": 72110 }, { "epoch": 5.882616852924382, "grad_norm": 5.776471138000488, "learning_rate": 2.1671661399433722e-05, "loss": 0.3572, "num_input_tokens_seen": 68884432, "step": 72115 }, { "epoch": 5.88302471653479, "grad_norm": 10.074738502502441, "learning_rate": 2.1668133836066764e-05, "loss": 0.4035, "num_input_tokens_seen": 68889392, "step": 72120 }, { "epoch": 5.883432580145199, "grad_norm": 13.191173553466797, "learning_rate": 2.1664606340235322e-05, "loss": 0.2389, "num_input_tokens_seen": 68894432, "step": 72125 }, { "epoch": 5.883840443755608, "grad_norm": 4.873699188232422, "learning_rate": 2.166107891201088e-05, "loss": 0.2828, "num_input_tokens_seen": 68899296, "step": 72130 }, { "epoch": 5.884248307366017, "grad_norm": 5.234044075012207, "learning_rate": 2.165755155146496e-05, "loss": 0.4931, "num_input_tokens_seen": 68904496, "step": 72135 }, { "epoch": 5.884656170976426, "grad_norm": 15.95826244354248, "learning_rate": 2.165402425866905e-05, "loss": 0.4261, "num_input_tokens_seen": 68909728, "step": 72140 }, { "epoch": 5.885064034586835, "grad_norm": 9.72181224822998, "learning_rate": 2.1650497033694646e-05, "loss": 0.3581, "num_input_tokens_seen": 68914352, "step": 72145 }, { "epoch": 5.885471898197243, "grad_norm": 12.647704124450684, "learning_rate": 2.1646969876613245e-05, "loss": 0.3156, "num_input_tokens_seen": 68919440, "step": 72150 }, { "epoch": 5.885879761807652, "grad_norm": 37.061893463134766, "learning_rate": 2.1643442787496328e-05, "loss": 0.3973, "num_input_tokens_seen": 68924112, "step": 72155 }, { "epoch": 5.88628762541806, "grad_norm": 13.12628173828125, "learning_rate": 2.1639915766415415e-05, "loss": 0.3543, "num_input_tokens_seen": 68929376, "step": 72160 }, { "epoch": 5.886695489028469, "grad_norm": 26.85538673400879, "learning_rate": 2.1636388813441978e-05, "loss": 0.4499, "num_input_tokens_seen": 68934048, "step": 72165 }, { "epoch": 5.887103352638878, "grad_norm": 1.4227849245071411, "learning_rate": 2.1632861928647507e-05, "loss": 0.4094, "num_input_tokens_seen": 68938752, "step": 72170 }, { "epoch": 5.887511216249286, "grad_norm": 31.81072998046875, "learning_rate": 2.162933511210349e-05, "loss": 0.4381, "num_input_tokens_seen": 68943088, "step": 72175 }, { "epoch": 5.887919079859695, "grad_norm": 10.304120063781738, "learning_rate": 2.1625808363881428e-05, "loss": 0.2607, "num_input_tokens_seen": 68948080, "step": 72180 }, { "epoch": 5.888326943470103, "grad_norm": 6.466337203979492, "learning_rate": 2.1622281684052793e-05, "loss": 0.3209, "num_input_tokens_seen": 68952752, "step": 72185 }, { "epoch": 5.888734807080512, "grad_norm": 17.48092269897461, "learning_rate": 2.161875507268907e-05, "loss": 0.3332, "num_input_tokens_seen": 68958080, "step": 72190 }, { "epoch": 5.889142670690921, "grad_norm": 0.4844924509525299, "learning_rate": 2.161522852986174e-05, "loss": 0.2248, "num_input_tokens_seen": 68963056, "step": 72195 }, { "epoch": 5.88955053430133, "grad_norm": 1.091567039489746, "learning_rate": 2.1611702055642296e-05, "loss": 0.2549, "num_input_tokens_seen": 68968240, "step": 72200 }, { "epoch": 5.889958397911738, "grad_norm": 2.900670289993286, "learning_rate": 2.1608175650102218e-05, "loss": 0.3423, "num_input_tokens_seen": 68972992, "step": 72205 }, { "epoch": 5.890366261522147, "grad_norm": 110.28565216064453, "learning_rate": 2.160464931331297e-05, "loss": 0.3839, "num_input_tokens_seen": 68977856, "step": 72210 }, { "epoch": 5.890774125132555, "grad_norm": 13.643121719360352, "learning_rate": 2.1601123045346043e-05, "loss": 0.2946, "num_input_tokens_seen": 68983104, "step": 72215 }, { "epoch": 5.891181988742964, "grad_norm": 15.218689918518066, "learning_rate": 2.1597596846272895e-05, "loss": 0.3447, "num_input_tokens_seen": 68987872, "step": 72220 }, { "epoch": 5.891589852353373, "grad_norm": 0.9224116802215576, "learning_rate": 2.1594070716165024e-05, "loss": 0.371, "num_input_tokens_seen": 68992368, "step": 72225 }, { "epoch": 5.8919977159637815, "grad_norm": 2.099090337753296, "learning_rate": 2.1590544655093892e-05, "loss": 0.4491, "num_input_tokens_seen": 68996496, "step": 72230 }, { "epoch": 5.8924055795741905, "grad_norm": 11.10633373260498, "learning_rate": 2.158701866313097e-05, "loss": 0.3253, "num_input_tokens_seen": 69001664, "step": 72235 }, { "epoch": 5.892813443184599, "grad_norm": 25.13686180114746, "learning_rate": 2.1583492740347727e-05, "loss": 0.4699, "num_input_tokens_seen": 69006480, "step": 72240 }, { "epoch": 5.893221306795008, "grad_norm": 16.979541778564453, "learning_rate": 2.157996688681564e-05, "loss": 0.328, "num_input_tokens_seen": 69011136, "step": 72245 }, { "epoch": 5.893629170405417, "grad_norm": 0.5109410881996155, "learning_rate": 2.1576441102606167e-05, "loss": 0.3146, "num_input_tokens_seen": 69015664, "step": 72250 }, { "epoch": 5.894037034015825, "grad_norm": 0.9163620471954346, "learning_rate": 2.157291538779078e-05, "loss": 0.3092, "num_input_tokens_seen": 69020624, "step": 72255 }, { "epoch": 5.894444897626234, "grad_norm": 2.2726292610168457, "learning_rate": 2.1569389742440943e-05, "loss": 0.3524, "num_input_tokens_seen": 69025904, "step": 72260 }, { "epoch": 5.894852761236643, "grad_norm": 0.5963500142097473, "learning_rate": 2.156586416662811e-05, "loss": 0.4063, "num_input_tokens_seen": 69030768, "step": 72265 }, { "epoch": 5.895260624847051, "grad_norm": 1.3114069700241089, "learning_rate": 2.1562338660423755e-05, "loss": 0.4881, "num_input_tokens_seen": 69035424, "step": 72270 }, { "epoch": 5.89566848845746, "grad_norm": 3.466855049133301, "learning_rate": 2.155881322389934e-05, "loss": 0.3342, "num_input_tokens_seen": 69040496, "step": 72275 }, { "epoch": 5.896076352067869, "grad_norm": 26.332462310791016, "learning_rate": 2.1555287857126315e-05, "loss": 0.3934, "num_input_tokens_seen": 69044768, "step": 72280 }, { "epoch": 5.896484215678277, "grad_norm": 3.57951283454895, "learning_rate": 2.1551762560176134e-05, "loss": 0.315, "num_input_tokens_seen": 69049360, "step": 72285 }, { "epoch": 5.896892079288686, "grad_norm": 25.261926651000977, "learning_rate": 2.154823733312027e-05, "loss": 0.3925, "num_input_tokens_seen": 69054736, "step": 72290 }, { "epoch": 5.897299942899094, "grad_norm": 0.993087887763977, "learning_rate": 2.1544712176030167e-05, "loss": 0.3597, "num_input_tokens_seen": 69060304, "step": 72295 }, { "epoch": 5.897707806509503, "grad_norm": 1.0149179697036743, "learning_rate": 2.154118708897728e-05, "loss": 0.3253, "num_input_tokens_seen": 69065584, "step": 72300 }, { "epoch": 5.898115670119912, "grad_norm": 11.635835647583008, "learning_rate": 2.1537662072033062e-05, "loss": 0.4657, "num_input_tokens_seen": 69070704, "step": 72305 }, { "epoch": 5.89852353373032, "grad_norm": 0.6549049019813538, "learning_rate": 2.153413712526896e-05, "loss": 0.2958, "num_input_tokens_seen": 69076192, "step": 72310 }, { "epoch": 5.898931397340729, "grad_norm": 5.670068740844727, "learning_rate": 2.1530612248756426e-05, "loss": 0.298, "num_input_tokens_seen": 69081392, "step": 72315 }, { "epoch": 5.8993392609511375, "grad_norm": 1.5602015256881714, "learning_rate": 2.1527087442566912e-05, "loss": 0.3427, "num_input_tokens_seen": 69086032, "step": 72320 }, { "epoch": 5.8997471245615465, "grad_norm": 9.392509460449219, "learning_rate": 2.1523562706771858e-05, "loss": 0.3268, "num_input_tokens_seen": 69090336, "step": 72325 }, { "epoch": 5.9001549881719555, "grad_norm": 4.0114593505859375, "learning_rate": 2.1520038041442698e-05, "loss": 0.405, "num_input_tokens_seen": 69095296, "step": 72330 }, { "epoch": 5.900562851782364, "grad_norm": 16.808908462524414, "learning_rate": 2.15165134466509e-05, "loss": 0.2272, "num_input_tokens_seen": 69099488, "step": 72335 }, { "epoch": 5.900970715392773, "grad_norm": 11.161725997924805, "learning_rate": 2.15129889224679e-05, "loss": 0.3306, "num_input_tokens_seen": 69104000, "step": 72340 }, { "epoch": 5.901378579003182, "grad_norm": 33.373111724853516, "learning_rate": 2.1509464468965124e-05, "loss": 0.3485, "num_input_tokens_seen": 69108576, "step": 72345 }, { "epoch": 5.90178644261359, "grad_norm": 3.4194228649139404, "learning_rate": 2.150594008621401e-05, "loss": 0.4472, "num_input_tokens_seen": 69113216, "step": 72350 }, { "epoch": 5.902194306223999, "grad_norm": 30.979137420654297, "learning_rate": 2.150241577428602e-05, "loss": 0.3415, "num_input_tokens_seen": 69117472, "step": 72355 }, { "epoch": 5.902602169834408, "grad_norm": 4.011172771453857, "learning_rate": 2.149889153325258e-05, "loss": 0.4476, "num_input_tokens_seen": 69122704, "step": 72360 }, { "epoch": 5.903010033444816, "grad_norm": 1.7581870555877686, "learning_rate": 2.1495367363185116e-05, "loss": 0.3339, "num_input_tokens_seen": 69127808, "step": 72365 }, { "epoch": 5.903417897055225, "grad_norm": 8.569321632385254, "learning_rate": 2.1491843264155066e-05, "loss": 0.2019, "num_input_tokens_seen": 69132208, "step": 72370 }, { "epoch": 5.903825760665633, "grad_norm": 7.843234539031982, "learning_rate": 2.1488319236233854e-05, "loss": 0.2546, "num_input_tokens_seen": 69137776, "step": 72375 }, { "epoch": 5.904233624276042, "grad_norm": 2.1978063583374023, "learning_rate": 2.1484795279492932e-05, "loss": 0.3319, "num_input_tokens_seen": 69143024, "step": 72380 }, { "epoch": 5.904641487886451, "grad_norm": 6.7291646003723145, "learning_rate": 2.148127139400371e-05, "loss": 0.2788, "num_input_tokens_seen": 69147744, "step": 72385 }, { "epoch": 5.905049351496859, "grad_norm": 3.7285549640655518, "learning_rate": 2.1477747579837625e-05, "loss": 0.358, "num_input_tokens_seen": 69152176, "step": 72390 }, { "epoch": 5.905457215107268, "grad_norm": 35.493499755859375, "learning_rate": 2.1474223837066088e-05, "loss": 0.2982, "num_input_tokens_seen": 69156368, "step": 72395 }, { "epoch": 5.905865078717677, "grad_norm": 8.76686954498291, "learning_rate": 2.1470700165760547e-05, "loss": 0.2777, "num_input_tokens_seen": 69161392, "step": 72400 }, { "epoch": 5.906272942328085, "grad_norm": 3.0770809650421143, "learning_rate": 2.146717656599241e-05, "loss": 0.2742, "num_input_tokens_seen": 69166224, "step": 72405 }, { "epoch": 5.906680805938494, "grad_norm": 20.632009506225586, "learning_rate": 2.146365303783311e-05, "loss": 0.3483, "num_input_tokens_seen": 69170384, "step": 72410 }, { "epoch": 5.907088669548903, "grad_norm": 5.578988075256348, "learning_rate": 2.1460129581354054e-05, "loss": 0.4297, "num_input_tokens_seen": 69175392, "step": 72415 }, { "epoch": 5.9074965331593114, "grad_norm": 3.6861860752105713, "learning_rate": 2.1456606196626658e-05, "loss": 0.3725, "num_input_tokens_seen": 69180288, "step": 72420 }, { "epoch": 5.9079043967697205, "grad_norm": 0.6120628118515015, "learning_rate": 2.145308288372236e-05, "loss": 0.3211, "num_input_tokens_seen": 69183824, "step": 72425 }, { "epoch": 5.908312260380129, "grad_norm": 3.6533291339874268, "learning_rate": 2.1449559642712564e-05, "loss": 0.3494, "num_input_tokens_seen": 69188272, "step": 72430 }, { "epoch": 5.908720123990538, "grad_norm": 16.29569435119629, "learning_rate": 2.1446036473668687e-05, "loss": 0.2648, "num_input_tokens_seen": 69193104, "step": 72435 }, { "epoch": 5.909127987600947, "grad_norm": 7.454306602478027, "learning_rate": 2.1442513376662134e-05, "loss": 0.4313, "num_input_tokens_seen": 69197872, "step": 72440 }, { "epoch": 5.909535851211355, "grad_norm": 26.215436935424805, "learning_rate": 2.1438990351764328e-05, "loss": 0.3843, "num_input_tokens_seen": 69202800, "step": 72445 }, { "epoch": 5.909943714821764, "grad_norm": 33.9411735534668, "learning_rate": 2.1435467399046676e-05, "loss": 0.3103, "num_input_tokens_seen": 69207632, "step": 72450 }, { "epoch": 5.910351578432172, "grad_norm": 21.840877532958984, "learning_rate": 2.1431944518580585e-05, "loss": 0.3285, "num_input_tokens_seen": 69212128, "step": 72455 }, { "epoch": 5.910759442042581, "grad_norm": 23.760957717895508, "learning_rate": 2.1428421710437464e-05, "loss": 0.3587, "num_input_tokens_seen": 69217120, "step": 72460 }, { "epoch": 5.91116730565299, "grad_norm": 1.6261032819747925, "learning_rate": 2.14248989746887e-05, "loss": 0.5144, "num_input_tokens_seen": 69221792, "step": 72465 }, { "epoch": 5.911575169263398, "grad_norm": 6.297496318817139, "learning_rate": 2.1421376311405733e-05, "loss": 0.2766, "num_input_tokens_seen": 69225968, "step": 72470 }, { "epoch": 5.911983032873807, "grad_norm": 11.2618989944458, "learning_rate": 2.1417853720659943e-05, "loss": 0.3118, "num_input_tokens_seen": 69230448, "step": 72475 }, { "epoch": 5.912390896484216, "grad_norm": 23.555334091186523, "learning_rate": 2.1414331202522736e-05, "loss": 0.3178, "num_input_tokens_seen": 69235776, "step": 72480 }, { "epoch": 5.912798760094624, "grad_norm": 17.820039749145508, "learning_rate": 2.1410808757065502e-05, "loss": 0.3717, "num_input_tokens_seen": 69241120, "step": 72485 }, { "epoch": 5.913206623705033, "grad_norm": 6.693694114685059, "learning_rate": 2.140728638435966e-05, "loss": 0.3575, "num_input_tokens_seen": 69245632, "step": 72490 }, { "epoch": 5.913614487315442, "grad_norm": 41.62202453613281, "learning_rate": 2.14037640844766e-05, "loss": 0.4595, "num_input_tokens_seen": 69250432, "step": 72495 }, { "epoch": 5.91402235092585, "grad_norm": 11.863061904907227, "learning_rate": 2.140024185748771e-05, "loss": 0.2465, "num_input_tokens_seen": 69254672, "step": 72500 }, { "epoch": 5.914430214536259, "grad_norm": 1.1379029750823975, "learning_rate": 2.1396719703464386e-05, "loss": 0.2988, "num_input_tokens_seen": 69258976, "step": 72505 }, { "epoch": 5.914838078146667, "grad_norm": 0.8805387020111084, "learning_rate": 2.1393197622478024e-05, "loss": 0.3797, "num_input_tokens_seen": 69263472, "step": 72510 }, { "epoch": 5.915245941757076, "grad_norm": 2.376067638397217, "learning_rate": 2.1389675614600013e-05, "loss": 0.3329, "num_input_tokens_seen": 69268752, "step": 72515 }, { "epoch": 5.915653805367485, "grad_norm": 1.9456431865692139, "learning_rate": 2.1386153679901742e-05, "loss": 0.4204, "num_input_tokens_seen": 69274112, "step": 72520 }, { "epoch": 5.9160616689778935, "grad_norm": 0.9199162721633911, "learning_rate": 2.1382631818454607e-05, "loss": 0.3211, "num_input_tokens_seen": 69278928, "step": 72525 }, { "epoch": 5.9164695325883025, "grad_norm": 0.7415782809257507, "learning_rate": 2.137911003032997e-05, "loss": 0.3757, "num_input_tokens_seen": 69284160, "step": 72530 }, { "epoch": 5.916877396198711, "grad_norm": 0.5131043791770935, "learning_rate": 2.1375588315599248e-05, "loss": 0.3813, "num_input_tokens_seen": 69288976, "step": 72535 }, { "epoch": 5.91728525980912, "grad_norm": 1.9369964599609375, "learning_rate": 2.1372066674333814e-05, "loss": 0.3472, "num_input_tokens_seen": 69293072, "step": 72540 }, { "epoch": 5.917693123419529, "grad_norm": 4.555840015411377, "learning_rate": 2.1368545106605043e-05, "loss": 0.3928, "num_input_tokens_seen": 69297184, "step": 72545 }, { "epoch": 5.918100987029937, "grad_norm": 5.133847236633301, "learning_rate": 2.136502361248431e-05, "loss": 0.3132, "num_input_tokens_seen": 69301936, "step": 72550 }, { "epoch": 5.918508850640346, "grad_norm": 11.833481788635254, "learning_rate": 2.1361502192043014e-05, "loss": 0.3303, "num_input_tokens_seen": 69306336, "step": 72555 }, { "epoch": 5.918916714250755, "grad_norm": 7.362631320953369, "learning_rate": 2.1357980845352522e-05, "loss": 0.3939, "num_input_tokens_seen": 69311248, "step": 72560 }, { "epoch": 5.919324577861163, "grad_norm": 4.724076271057129, "learning_rate": 2.135445957248421e-05, "loss": 0.3463, "num_input_tokens_seen": 69315600, "step": 72565 }, { "epoch": 5.919732441471572, "grad_norm": 0.7898191213607788, "learning_rate": 2.1350938373509454e-05, "loss": 0.4068, "num_input_tokens_seen": 69319984, "step": 72570 }, { "epoch": 5.920140305081981, "grad_norm": 2.100802421569824, "learning_rate": 2.1347417248499624e-05, "loss": 0.435, "num_input_tokens_seen": 69324352, "step": 72575 }, { "epoch": 5.920548168692389, "grad_norm": 1.1471389532089233, "learning_rate": 2.1343896197526098e-05, "loss": 0.3513, "num_input_tokens_seen": 69329248, "step": 72580 }, { "epoch": 5.920956032302798, "grad_norm": 14.853614807128906, "learning_rate": 2.1340375220660243e-05, "loss": 0.3617, "num_input_tokens_seen": 69334320, "step": 72585 }, { "epoch": 5.921363895913206, "grad_norm": 3.025972366333008, "learning_rate": 2.133685431797342e-05, "loss": 0.3213, "num_input_tokens_seen": 69339536, "step": 72590 }, { "epoch": 5.921771759523615, "grad_norm": 12.858111381530762, "learning_rate": 2.1333333489537005e-05, "loss": 0.4181, "num_input_tokens_seen": 69344912, "step": 72595 }, { "epoch": 5.922179623134024, "grad_norm": 3.814107894897461, "learning_rate": 2.1329812735422367e-05, "loss": 0.355, "num_input_tokens_seen": 69349808, "step": 72600 }, { "epoch": 5.922587486744432, "grad_norm": 3.8601016998291016, "learning_rate": 2.1326292055700864e-05, "loss": 0.412, "num_input_tokens_seen": 69353792, "step": 72605 }, { "epoch": 5.922995350354841, "grad_norm": 2.6495718955993652, "learning_rate": 2.1322771450443864e-05, "loss": 0.247, "num_input_tokens_seen": 69358608, "step": 72610 }, { "epoch": 5.92340321396525, "grad_norm": 3.2145891189575195, "learning_rate": 2.131925091972272e-05, "loss": 0.4262, "num_input_tokens_seen": 69363712, "step": 72615 }, { "epoch": 5.9238110775756585, "grad_norm": 1.8608086109161377, "learning_rate": 2.1315730463608785e-05, "loss": 0.3142, "num_input_tokens_seen": 69368816, "step": 72620 }, { "epoch": 5.9242189411860675, "grad_norm": 14.780716896057129, "learning_rate": 2.131221008217344e-05, "loss": 0.4631, "num_input_tokens_seen": 69373744, "step": 72625 }, { "epoch": 5.9246268047964765, "grad_norm": 2.428489923477173, "learning_rate": 2.1308689775488026e-05, "loss": 0.3907, "num_input_tokens_seen": 69378768, "step": 72630 }, { "epoch": 5.925034668406885, "grad_norm": 0.9881457686424255, "learning_rate": 2.1305169543623908e-05, "loss": 0.3966, "num_input_tokens_seen": 69383632, "step": 72635 }, { "epoch": 5.925442532017294, "grad_norm": 0.6529442071914673, "learning_rate": 2.1301649386652424e-05, "loss": 0.3742, "num_input_tokens_seen": 69388352, "step": 72640 }, { "epoch": 5.925850395627702, "grad_norm": 14.109801292419434, "learning_rate": 2.1298129304644942e-05, "loss": 0.2605, "num_input_tokens_seen": 69393648, "step": 72645 }, { "epoch": 5.926258259238111, "grad_norm": 31.046825408935547, "learning_rate": 2.12946092976728e-05, "loss": 0.4012, "num_input_tokens_seen": 69398320, "step": 72650 }, { "epoch": 5.92666612284852, "grad_norm": 0.4892124831676483, "learning_rate": 2.1291089365807363e-05, "loss": 0.4444, "num_input_tokens_seen": 69402112, "step": 72655 }, { "epoch": 5.927073986458928, "grad_norm": 6.831915855407715, "learning_rate": 2.1287569509119966e-05, "loss": 0.4415, "num_input_tokens_seen": 69407136, "step": 72660 }, { "epoch": 5.927481850069337, "grad_norm": 1.2590103149414062, "learning_rate": 2.1284049727681948e-05, "loss": 0.3377, "num_input_tokens_seen": 69411968, "step": 72665 }, { "epoch": 5.927889713679745, "grad_norm": 2.848191976547241, "learning_rate": 2.1280530021564672e-05, "loss": 0.3215, "num_input_tokens_seen": 69417472, "step": 72670 }, { "epoch": 5.928297577290154, "grad_norm": 2.3493454456329346, "learning_rate": 2.1277010390839468e-05, "loss": 0.3126, "num_input_tokens_seen": 69422032, "step": 72675 }, { "epoch": 5.928705440900563, "grad_norm": 7.248928070068359, "learning_rate": 2.1273490835577685e-05, "loss": 0.4528, "num_input_tokens_seen": 69426192, "step": 72680 }, { "epoch": 5.929113304510971, "grad_norm": 6.827816009521484, "learning_rate": 2.126997135585065e-05, "loss": 0.4209, "num_input_tokens_seen": 69431392, "step": 72685 }, { "epoch": 5.92952116812138, "grad_norm": 41.03827667236328, "learning_rate": 2.126645195172972e-05, "loss": 0.3699, "num_input_tokens_seen": 69435488, "step": 72690 }, { "epoch": 5.929929031731789, "grad_norm": 14.759161949157715, "learning_rate": 2.126293262328622e-05, "loss": 0.2832, "num_input_tokens_seen": 69439920, "step": 72695 }, { "epoch": 5.930336895342197, "grad_norm": 15.88703441619873, "learning_rate": 2.125941337059149e-05, "loss": 0.3974, "num_input_tokens_seen": 69444640, "step": 72700 }, { "epoch": 5.930744758952606, "grad_norm": 12.369574546813965, "learning_rate": 2.1255894193716863e-05, "loss": 0.4272, "num_input_tokens_seen": 69450016, "step": 72705 }, { "epoch": 5.931152622563015, "grad_norm": 7.023141384124756, "learning_rate": 2.1252375092733657e-05, "loss": 0.2913, "num_input_tokens_seen": 69454000, "step": 72710 }, { "epoch": 5.9315604861734235, "grad_norm": 3.68686842918396, "learning_rate": 2.1248856067713225e-05, "loss": 0.3125, "num_input_tokens_seen": 69459728, "step": 72715 }, { "epoch": 5.9319683497838325, "grad_norm": 4.3627519607543945, "learning_rate": 2.1245337118726886e-05, "loss": 0.286, "num_input_tokens_seen": 69464368, "step": 72720 }, { "epoch": 5.932376213394241, "grad_norm": 1.5610711574554443, "learning_rate": 2.124181824584597e-05, "loss": 0.2945, "num_input_tokens_seen": 69469136, "step": 72725 }, { "epoch": 5.93278407700465, "grad_norm": 10.982627868652344, "learning_rate": 2.1238299449141795e-05, "loss": 0.2437, "num_input_tokens_seen": 69473952, "step": 72730 }, { "epoch": 5.933191940615059, "grad_norm": 4.074221611022949, "learning_rate": 2.1234780728685698e-05, "loss": 0.4948, "num_input_tokens_seen": 69477952, "step": 72735 }, { "epoch": 5.933599804225467, "grad_norm": 10.827704429626465, "learning_rate": 2.1231262084548996e-05, "loss": 0.2993, "num_input_tokens_seen": 69481472, "step": 72740 }, { "epoch": 5.934007667835876, "grad_norm": 6.881661415100098, "learning_rate": 2.1227743516803012e-05, "loss": 0.3009, "num_input_tokens_seen": 69486080, "step": 72745 }, { "epoch": 5.934415531446284, "grad_norm": 8.441665649414062, "learning_rate": 2.122422502551905e-05, "loss": 0.4377, "num_input_tokens_seen": 69491056, "step": 72750 }, { "epoch": 5.934823395056693, "grad_norm": 2.489534616470337, "learning_rate": 2.1220706610768453e-05, "loss": 0.3702, "num_input_tokens_seen": 69495360, "step": 72755 }, { "epoch": 5.935231258667102, "grad_norm": 0.643926203250885, "learning_rate": 2.121718827262253e-05, "loss": 0.2605, "num_input_tokens_seen": 69499328, "step": 72760 }, { "epoch": 5.935639122277511, "grad_norm": 1.9525014162063599, "learning_rate": 2.1213670011152593e-05, "loss": 0.4075, "num_input_tokens_seen": 69503872, "step": 72765 }, { "epoch": 5.936046985887919, "grad_norm": 6.815155982971191, "learning_rate": 2.1210151826429953e-05, "loss": 0.3226, "num_input_tokens_seen": 69509024, "step": 72770 }, { "epoch": 5.936454849498328, "grad_norm": 9.930185317993164, "learning_rate": 2.1206633718525914e-05, "loss": 0.3917, "num_input_tokens_seen": 69514160, "step": 72775 }, { "epoch": 5.936862713108736, "grad_norm": 14.568578720092773, "learning_rate": 2.120311568751181e-05, "loss": 0.5573, "num_input_tokens_seen": 69519136, "step": 72780 }, { "epoch": 5.937270576719145, "grad_norm": 2.139777898788452, "learning_rate": 2.1199597733458936e-05, "loss": 0.3706, "num_input_tokens_seen": 69523904, "step": 72785 }, { "epoch": 5.937678440329554, "grad_norm": 5.3283257484436035, "learning_rate": 2.1196079856438606e-05, "loss": 0.3478, "num_input_tokens_seen": 69528256, "step": 72790 }, { "epoch": 5.938086303939962, "grad_norm": 4.4136247634887695, "learning_rate": 2.1192562056522113e-05, "loss": 0.3834, "num_input_tokens_seen": 69533232, "step": 72795 }, { "epoch": 5.938494167550371, "grad_norm": 1.8452105522155762, "learning_rate": 2.1189044333780773e-05, "loss": 0.3476, "num_input_tokens_seen": 69538288, "step": 72800 }, { "epoch": 5.938902031160779, "grad_norm": 19.239282608032227, "learning_rate": 2.1185526688285885e-05, "loss": 0.5333, "num_input_tokens_seen": 69543136, "step": 72805 }, { "epoch": 5.939309894771188, "grad_norm": 1.7950993776321411, "learning_rate": 2.118200912010875e-05, "loss": 0.2724, "num_input_tokens_seen": 69547968, "step": 72810 }, { "epoch": 5.939717758381597, "grad_norm": 19.43308448791504, "learning_rate": 2.117849162932067e-05, "loss": 0.4244, "num_input_tokens_seen": 69551984, "step": 72815 }, { "epoch": 5.9401256219920056, "grad_norm": 17.12456703186035, "learning_rate": 2.117497421599293e-05, "loss": 0.2813, "num_input_tokens_seen": 69556240, "step": 72820 }, { "epoch": 5.940533485602415, "grad_norm": 2.0192203521728516, "learning_rate": 2.1171456880196844e-05, "loss": 0.2404, "num_input_tokens_seen": 69561248, "step": 72825 }, { "epoch": 5.940941349212824, "grad_norm": 7.601144313812256, "learning_rate": 2.1167939622003704e-05, "loss": 0.4063, "num_input_tokens_seen": 69566208, "step": 72830 }, { "epoch": 5.941349212823232, "grad_norm": 23.409069061279297, "learning_rate": 2.11644224414848e-05, "loss": 0.39, "num_input_tokens_seen": 69570464, "step": 72835 }, { "epoch": 5.941757076433641, "grad_norm": 8.191149711608887, "learning_rate": 2.116090533871141e-05, "loss": 0.3875, "num_input_tokens_seen": 69576064, "step": 72840 }, { "epoch": 5.94216494004405, "grad_norm": 14.125885963439941, "learning_rate": 2.115738831375485e-05, "loss": 0.3451, "num_input_tokens_seen": 69580784, "step": 72845 }, { "epoch": 5.942572803654458, "grad_norm": 63.1419792175293, "learning_rate": 2.1153871366686395e-05, "loss": 0.4522, "num_input_tokens_seen": 69584336, "step": 72850 }, { "epoch": 5.942980667264867, "grad_norm": 38.74258804321289, "learning_rate": 2.1150354497577336e-05, "loss": 0.4368, "num_input_tokens_seen": 69587824, "step": 72855 }, { "epoch": 5.943388530875275, "grad_norm": 3.6531195640563965, "learning_rate": 2.114683770649895e-05, "loss": 0.343, "num_input_tokens_seen": 69591936, "step": 72860 }, { "epoch": 5.943796394485684, "grad_norm": 0.590758740901947, "learning_rate": 2.1143320993522524e-05, "loss": 0.266, "num_input_tokens_seen": 69596096, "step": 72865 }, { "epoch": 5.944204258096093, "grad_norm": 2.61820125579834, "learning_rate": 2.113980435871935e-05, "loss": 0.2778, "num_input_tokens_seen": 69600768, "step": 72870 }, { "epoch": 5.944612121706501, "grad_norm": 14.065918922424316, "learning_rate": 2.1136287802160697e-05, "loss": 0.2292, "num_input_tokens_seen": 69605264, "step": 72875 }, { "epoch": 5.94501998531691, "grad_norm": 22.908493041992188, "learning_rate": 2.1132771323917853e-05, "loss": 0.3311, "num_input_tokens_seen": 69610496, "step": 72880 }, { "epoch": 5.945427848927318, "grad_norm": 4.3263936042785645, "learning_rate": 2.1129254924062075e-05, "loss": 0.3114, "num_input_tokens_seen": 69615584, "step": 72885 }, { "epoch": 5.945835712537727, "grad_norm": 1.4368635416030884, "learning_rate": 2.1125738602664668e-05, "loss": 0.2809, "num_input_tokens_seen": 69620624, "step": 72890 }, { "epoch": 5.946243576148136, "grad_norm": 16.575422286987305, "learning_rate": 2.1122222359796895e-05, "loss": 0.3074, "num_input_tokens_seen": 69625968, "step": 72895 }, { "epoch": 5.946651439758544, "grad_norm": 0.46091926097869873, "learning_rate": 2.111870619553003e-05, "loss": 0.3181, "num_input_tokens_seen": 69630240, "step": 72900 }, { "epoch": 5.947059303368953, "grad_norm": 28.063018798828125, "learning_rate": 2.1115190109935322e-05, "loss": 0.4892, "num_input_tokens_seen": 69634416, "step": 72905 }, { "epoch": 5.947467166979362, "grad_norm": 17.89823341369629, "learning_rate": 2.1111674103084074e-05, "loss": 0.3107, "num_input_tokens_seen": 69639296, "step": 72910 }, { "epoch": 5.9478750305897705, "grad_norm": 0.6350995302200317, "learning_rate": 2.110815817504754e-05, "loss": 0.3655, "num_input_tokens_seen": 69643968, "step": 72915 }, { "epoch": 5.9482828942001795, "grad_norm": 12.576860427856445, "learning_rate": 2.110464232589699e-05, "loss": 0.4033, "num_input_tokens_seen": 69648000, "step": 72920 }, { "epoch": 5.9486907578105885, "grad_norm": 5.067251205444336, "learning_rate": 2.110112655570368e-05, "loss": 0.3581, "num_input_tokens_seen": 69653024, "step": 72925 }, { "epoch": 5.949098621420997, "grad_norm": 4.826562404632568, "learning_rate": 2.1097610864538875e-05, "loss": 0.3044, "num_input_tokens_seen": 69657584, "step": 72930 }, { "epoch": 5.949506485031406, "grad_norm": 1.4154329299926758, "learning_rate": 2.1094095252473845e-05, "loss": 0.368, "num_input_tokens_seen": 69661776, "step": 72935 }, { "epoch": 5.949914348641814, "grad_norm": 2.6956238746643066, "learning_rate": 2.1090579719579842e-05, "loss": 0.2748, "num_input_tokens_seen": 69666528, "step": 72940 }, { "epoch": 5.950322212252223, "grad_norm": 10.767218589782715, "learning_rate": 2.108706426592813e-05, "loss": 0.3207, "num_input_tokens_seen": 69671392, "step": 72945 }, { "epoch": 5.950730075862632, "grad_norm": 7.496333599090576, "learning_rate": 2.108354889158995e-05, "loss": 0.2704, "num_input_tokens_seen": 69676464, "step": 72950 }, { "epoch": 5.95113793947304, "grad_norm": 35.99626159667969, "learning_rate": 2.108003359663658e-05, "loss": 0.364, "num_input_tokens_seen": 69680544, "step": 72955 }, { "epoch": 5.951545803083449, "grad_norm": 1.4887878894805908, "learning_rate": 2.1076518381139263e-05, "loss": 0.3731, "num_input_tokens_seen": 69686048, "step": 72960 }, { "epoch": 5.951953666693858, "grad_norm": 0.46432778239250183, "learning_rate": 2.1073003245169253e-05, "loss": 0.2609, "num_input_tokens_seen": 69690384, "step": 72965 }, { "epoch": 5.952361530304266, "grad_norm": 0.5303273797035217, "learning_rate": 2.10694881887978e-05, "loss": 0.3189, "num_input_tokens_seen": 69695168, "step": 72970 }, { "epoch": 5.952769393914675, "grad_norm": 35.90885925292969, "learning_rate": 2.1065973212096136e-05, "loss": 0.3462, "num_input_tokens_seen": 69700192, "step": 72975 }, { "epoch": 5.953177257525084, "grad_norm": 13.966192245483398, "learning_rate": 2.1062458315135537e-05, "loss": 0.3405, "num_input_tokens_seen": 69703808, "step": 72980 }, { "epoch": 5.953585121135492, "grad_norm": 0.555401086807251, "learning_rate": 2.105894349798723e-05, "loss": 0.2767, "num_input_tokens_seen": 69709376, "step": 72985 }, { "epoch": 5.953992984745901, "grad_norm": 1.101958990097046, "learning_rate": 2.105542876072247e-05, "loss": 0.4104, "num_input_tokens_seen": 69713680, "step": 72990 }, { "epoch": 5.954400848356309, "grad_norm": 0.39597293734550476, "learning_rate": 2.1051914103412483e-05, "loss": 0.3783, "num_input_tokens_seen": 69717728, "step": 72995 }, { "epoch": 5.954808711966718, "grad_norm": 24.32233428955078, "learning_rate": 2.1048399526128532e-05, "loss": 0.3935, "num_input_tokens_seen": 69722736, "step": 73000 }, { "epoch": 5.955216575577127, "grad_norm": 17.09638214111328, "learning_rate": 2.1044885028941836e-05, "loss": 0.3706, "num_input_tokens_seen": 69727280, "step": 73005 }, { "epoch": 5.9556244391875355, "grad_norm": 17.943052291870117, "learning_rate": 2.1041370611923645e-05, "loss": 0.3684, "num_input_tokens_seen": 69731824, "step": 73010 }, { "epoch": 5.9560323027979445, "grad_norm": 8.431947708129883, "learning_rate": 2.1037856275145185e-05, "loss": 0.2403, "num_input_tokens_seen": 69736640, "step": 73015 }, { "epoch": 5.956440166408353, "grad_norm": 0.7637823224067688, "learning_rate": 2.1034342018677684e-05, "loss": 0.2798, "num_input_tokens_seen": 69740272, "step": 73020 }, { "epoch": 5.956848030018762, "grad_norm": 1.6449915170669556, "learning_rate": 2.10308278425924e-05, "loss": 0.337, "num_input_tokens_seen": 69744800, "step": 73025 }, { "epoch": 5.957255893629171, "grad_norm": 0.46072298288345337, "learning_rate": 2.1027313746960548e-05, "loss": 0.3733, "num_input_tokens_seen": 69750160, "step": 73030 }, { "epoch": 5.957663757239579, "grad_norm": 0.8400983810424805, "learning_rate": 2.1023799731853354e-05, "loss": 0.3256, "num_input_tokens_seen": 69754896, "step": 73035 }, { "epoch": 5.958071620849988, "grad_norm": 0.6485100388526917, "learning_rate": 2.1020285797342042e-05, "loss": 0.3195, "num_input_tokens_seen": 69758640, "step": 73040 }, { "epoch": 5.958479484460397, "grad_norm": 1.3745381832122803, "learning_rate": 2.101677194349786e-05, "loss": 0.3238, "num_input_tokens_seen": 69763040, "step": 73045 }, { "epoch": 5.958887348070805, "grad_norm": 0.4571286141872406, "learning_rate": 2.1013258170392013e-05, "loss": 0.3296, "num_input_tokens_seen": 69768112, "step": 73050 }, { "epoch": 5.959295211681214, "grad_norm": 0.7685815691947937, "learning_rate": 2.100974447809573e-05, "loss": 0.3447, "num_input_tokens_seen": 69772800, "step": 73055 }, { "epoch": 5.959703075291623, "grad_norm": 0.8914644122123718, "learning_rate": 2.1006230866680233e-05, "loss": 0.3226, "num_input_tokens_seen": 69777744, "step": 73060 }, { "epoch": 5.960110938902031, "grad_norm": 0.924333393573761, "learning_rate": 2.1002717336216733e-05, "loss": 0.2744, "num_input_tokens_seen": 69783552, "step": 73065 }, { "epoch": 5.96051880251244, "grad_norm": 10.630563735961914, "learning_rate": 2.0999203886776457e-05, "loss": 0.4152, "num_input_tokens_seen": 69788832, "step": 73070 }, { "epoch": 5.960926666122848, "grad_norm": 3.258639335632324, "learning_rate": 2.099569051843062e-05, "loss": 0.2722, "num_input_tokens_seen": 69793088, "step": 73075 }, { "epoch": 5.961334529733257, "grad_norm": 12.07492446899414, "learning_rate": 2.0992177231250436e-05, "loss": 0.282, "num_input_tokens_seen": 69798192, "step": 73080 }, { "epoch": 5.961742393343666, "grad_norm": 7.963562488555908, "learning_rate": 2.0988664025307104e-05, "loss": 0.2495, "num_input_tokens_seen": 69802720, "step": 73085 }, { "epoch": 5.962150256954074, "grad_norm": 10.240880966186523, "learning_rate": 2.098515090067186e-05, "loss": 0.324, "num_input_tokens_seen": 69806736, "step": 73090 }, { "epoch": 5.962558120564483, "grad_norm": 5.6208577156066895, "learning_rate": 2.0981637857415898e-05, "loss": 0.4329, "num_input_tokens_seen": 69811040, "step": 73095 }, { "epoch": 5.962965984174891, "grad_norm": 21.76434898376465, "learning_rate": 2.0978124895610433e-05, "loss": 0.3018, "num_input_tokens_seen": 69816544, "step": 73100 }, { "epoch": 5.9633738477853, "grad_norm": 6.7280192375183105, "learning_rate": 2.0974612015326655e-05, "loss": 0.3452, "num_input_tokens_seen": 69821344, "step": 73105 }, { "epoch": 5.9637817113957095, "grad_norm": 6.376031398773193, "learning_rate": 2.0971099216635792e-05, "loss": 0.3121, "num_input_tokens_seen": 69824848, "step": 73110 }, { "epoch": 5.964189575006118, "grad_norm": 1.502440094947815, "learning_rate": 2.0967586499609035e-05, "loss": 0.4936, "num_input_tokens_seen": 69829424, "step": 73115 }, { "epoch": 5.964597438616527, "grad_norm": 10.040867805480957, "learning_rate": 2.0964073864317584e-05, "loss": 0.3259, "num_input_tokens_seen": 69833888, "step": 73120 }, { "epoch": 5.965005302226936, "grad_norm": 2.5399186611175537, "learning_rate": 2.0960561310832645e-05, "loss": 0.3828, "num_input_tokens_seen": 69838288, "step": 73125 }, { "epoch": 5.965413165837344, "grad_norm": 2.904696226119995, "learning_rate": 2.0957048839225408e-05, "loss": 0.3024, "num_input_tokens_seen": 69843056, "step": 73130 }, { "epoch": 5.965821029447753, "grad_norm": 13.708247184753418, "learning_rate": 2.0953536449567074e-05, "loss": 0.2815, "num_input_tokens_seen": 69848368, "step": 73135 }, { "epoch": 5.966228893058162, "grad_norm": 2.875789165496826, "learning_rate": 2.0950024141928837e-05, "loss": 0.4205, "num_input_tokens_seen": 69854208, "step": 73140 }, { "epoch": 5.96663675666857, "grad_norm": 4.839745044708252, "learning_rate": 2.0946511916381893e-05, "loss": 0.3085, "num_input_tokens_seen": 69858640, "step": 73145 }, { "epoch": 5.967044620278979, "grad_norm": 4.907371997833252, "learning_rate": 2.0942999772997416e-05, "loss": 0.3429, "num_input_tokens_seen": 69863936, "step": 73150 }, { "epoch": 5.967452483889387, "grad_norm": 44.415977478027344, "learning_rate": 2.093948771184662e-05, "loss": 0.3224, "num_input_tokens_seen": 69868256, "step": 73155 }, { "epoch": 5.967860347499796, "grad_norm": 0.6128016114234924, "learning_rate": 2.0935975733000685e-05, "loss": 0.4244, "num_input_tokens_seen": 69872928, "step": 73160 }, { "epoch": 5.968268211110205, "grad_norm": 3.473151445388794, "learning_rate": 2.0932463836530794e-05, "loss": 0.3818, "num_input_tokens_seen": 69877488, "step": 73165 }, { "epoch": 5.968676074720613, "grad_norm": 2.4563205242156982, "learning_rate": 2.0928952022508133e-05, "loss": 0.3327, "num_input_tokens_seen": 69882624, "step": 73170 }, { "epoch": 5.969083938331022, "grad_norm": 10.533909797668457, "learning_rate": 2.0925440291003872e-05, "loss": 0.3073, "num_input_tokens_seen": 69886544, "step": 73175 }, { "epoch": 5.969491801941431, "grad_norm": 0.8497768640518188, "learning_rate": 2.0921928642089216e-05, "loss": 0.2199, "num_input_tokens_seen": 69890976, "step": 73180 }, { "epoch": 5.969899665551839, "grad_norm": 10.667272567749023, "learning_rate": 2.0918417075835337e-05, "loss": 0.2843, "num_input_tokens_seen": 69896144, "step": 73185 }, { "epoch": 5.970307529162248, "grad_norm": 0.6040636897087097, "learning_rate": 2.091490559231341e-05, "loss": 0.3027, "num_input_tokens_seen": 69901504, "step": 73190 }, { "epoch": 5.970715392772657, "grad_norm": 2.5421526432037354, "learning_rate": 2.09113941915946e-05, "loss": 0.3799, "num_input_tokens_seen": 69906048, "step": 73195 }, { "epoch": 5.971123256383065, "grad_norm": 0.8093016743659973, "learning_rate": 2.0907882873750102e-05, "loss": 0.3482, "num_input_tokens_seen": 69911104, "step": 73200 }, { "epoch": 5.971531119993474, "grad_norm": 5.714128017425537, "learning_rate": 2.0904371638851077e-05, "loss": 0.2705, "num_input_tokens_seen": 69915856, "step": 73205 }, { "epoch": 5.9719389836038825, "grad_norm": 1.7936819791793823, "learning_rate": 2.09008604869687e-05, "loss": 0.3689, "num_input_tokens_seen": 69920544, "step": 73210 }, { "epoch": 5.9723468472142915, "grad_norm": 2.6121175289154053, "learning_rate": 2.0897349418174137e-05, "loss": 0.3745, "num_input_tokens_seen": 69925024, "step": 73215 }, { "epoch": 5.9727547108247006, "grad_norm": 1.4602892398834229, "learning_rate": 2.0893838432538547e-05, "loss": 0.3279, "num_input_tokens_seen": 69929936, "step": 73220 }, { "epoch": 5.973162574435109, "grad_norm": 0.49439501762390137, "learning_rate": 2.089032753013312e-05, "loss": 0.2591, "num_input_tokens_seen": 69934464, "step": 73225 }, { "epoch": 5.973570438045518, "grad_norm": 1.4428579807281494, "learning_rate": 2.088681671102901e-05, "loss": 0.3724, "num_input_tokens_seen": 69939568, "step": 73230 }, { "epoch": 5.973978301655926, "grad_norm": 4.5642266273498535, "learning_rate": 2.0883305975297375e-05, "loss": 0.3619, "num_input_tokens_seen": 69944288, "step": 73235 }, { "epoch": 5.974386165266335, "grad_norm": 1.3789318799972534, "learning_rate": 2.0879795323009366e-05, "loss": 0.3136, "num_input_tokens_seen": 69948544, "step": 73240 }, { "epoch": 5.974794028876744, "grad_norm": 3.422595977783203, "learning_rate": 2.0876284754236166e-05, "loss": 0.3717, "num_input_tokens_seen": 69952880, "step": 73245 }, { "epoch": 5.975201892487152, "grad_norm": 0.9025893211364746, "learning_rate": 2.0872774269048923e-05, "loss": 0.3049, "num_input_tokens_seen": 69958512, "step": 73250 }, { "epoch": 5.975609756097561, "grad_norm": 16.727142333984375, "learning_rate": 2.0869263867518794e-05, "loss": 0.4788, "num_input_tokens_seen": 69963168, "step": 73255 }, { "epoch": 5.97601761970797, "grad_norm": 3.4613282680511475, "learning_rate": 2.0865753549716926e-05, "loss": 0.334, "num_input_tokens_seen": 69967872, "step": 73260 }, { "epoch": 5.976425483318378, "grad_norm": 3.854712963104248, "learning_rate": 2.0862243315714474e-05, "loss": 0.2817, "num_input_tokens_seen": 69973456, "step": 73265 }, { "epoch": 5.976833346928787, "grad_norm": 5.611807823181152, "learning_rate": 2.08587331655826e-05, "loss": 0.2725, "num_input_tokens_seen": 69977872, "step": 73270 }, { "epoch": 5.977241210539196, "grad_norm": 1.7292304039001465, "learning_rate": 2.085522309939244e-05, "loss": 0.292, "num_input_tokens_seen": 69983040, "step": 73275 }, { "epoch": 5.977649074149604, "grad_norm": 2.953563690185547, "learning_rate": 2.0851713117215144e-05, "loss": 0.4004, "num_input_tokens_seen": 69988624, "step": 73280 }, { "epoch": 5.978056937760013, "grad_norm": 7.4294939041137695, "learning_rate": 2.0848203219121863e-05, "loss": 0.4482, "num_input_tokens_seen": 69992496, "step": 73285 }, { "epoch": 5.978464801370421, "grad_norm": 0.9868807792663574, "learning_rate": 2.084469340518374e-05, "loss": 0.3113, "num_input_tokens_seen": 69997008, "step": 73290 }, { "epoch": 5.97887266498083, "grad_norm": 4.307457447052002, "learning_rate": 2.0841183675471916e-05, "loss": 0.3791, "num_input_tokens_seen": 70002000, "step": 73295 }, { "epoch": 5.979280528591239, "grad_norm": 0.5539158582687378, "learning_rate": 2.0837674030057534e-05, "loss": 0.2944, "num_input_tokens_seen": 70006992, "step": 73300 }, { "epoch": 5.9796883922016475, "grad_norm": 6.013190269470215, "learning_rate": 2.0834164469011717e-05, "loss": 0.2604, "num_input_tokens_seen": 70012112, "step": 73305 }, { "epoch": 5.9800962558120565, "grad_norm": 10.063776016235352, "learning_rate": 2.083065499240563e-05, "loss": 0.3504, "num_input_tokens_seen": 70016720, "step": 73310 }, { "epoch": 5.9805041194224655, "grad_norm": 1.8900532722473145, "learning_rate": 2.0827145600310392e-05, "loss": 0.3234, "num_input_tokens_seen": 70022240, "step": 73315 }, { "epoch": 5.980911983032874, "grad_norm": 0.7184766530990601, "learning_rate": 2.0823636292797142e-05, "loss": 0.316, "num_input_tokens_seen": 70027152, "step": 73320 }, { "epoch": 5.981319846643283, "grad_norm": 6.164658546447754, "learning_rate": 2.0820127069937008e-05, "loss": 0.4249, "num_input_tokens_seen": 70031760, "step": 73325 }, { "epoch": 5.981727710253692, "grad_norm": 0.49423274397850037, "learning_rate": 2.0816617931801118e-05, "loss": 0.3113, "num_input_tokens_seen": 70036288, "step": 73330 }, { "epoch": 5.9821355738641, "grad_norm": 0.9446577429771423, "learning_rate": 2.0813108878460608e-05, "loss": 0.3721, "num_input_tokens_seen": 70040000, "step": 73335 }, { "epoch": 5.982543437474509, "grad_norm": 0.68702232837677, "learning_rate": 2.08095999099866e-05, "loss": 0.3306, "num_input_tokens_seen": 70045440, "step": 73340 }, { "epoch": 5.982951301084917, "grad_norm": 1.5101407766342163, "learning_rate": 2.0806091026450223e-05, "loss": 0.3974, "num_input_tokens_seen": 70050608, "step": 73345 }, { "epoch": 5.983359164695326, "grad_norm": 0.8738137483596802, "learning_rate": 2.0802582227922597e-05, "loss": 0.2863, "num_input_tokens_seen": 70054736, "step": 73350 }, { "epoch": 5.983767028305735, "grad_norm": 0.5184047222137451, "learning_rate": 2.0799073514474844e-05, "loss": 0.3873, "num_input_tokens_seen": 70059200, "step": 73355 }, { "epoch": 5.984174891916143, "grad_norm": 0.4078463912010193, "learning_rate": 2.0795564886178093e-05, "loss": 0.3135, "num_input_tokens_seen": 70064192, "step": 73360 }, { "epoch": 5.984582755526552, "grad_norm": 1.2563010454177856, "learning_rate": 2.079205634310345e-05, "loss": 0.3576, "num_input_tokens_seen": 70069008, "step": 73365 }, { "epoch": 5.98499061913696, "grad_norm": 0.6057195663452148, "learning_rate": 2.0788547885322043e-05, "loss": 0.322, "num_input_tokens_seen": 70073952, "step": 73370 }, { "epoch": 5.985398482747369, "grad_norm": 0.7886597514152527, "learning_rate": 2.0785039512904967e-05, "loss": 0.3464, "num_input_tokens_seen": 70078864, "step": 73375 }, { "epoch": 5.985806346357778, "grad_norm": 3.047255516052246, "learning_rate": 2.0781531225923357e-05, "loss": 0.3509, "num_input_tokens_seen": 70083520, "step": 73380 }, { "epoch": 5.986214209968186, "grad_norm": 3.2186601161956787, "learning_rate": 2.0778023024448325e-05, "loss": 0.4044, "num_input_tokens_seen": 70089168, "step": 73385 }, { "epoch": 5.986622073578595, "grad_norm": 0.7967421412467957, "learning_rate": 2.0774514908550962e-05, "loss": 0.4015, "num_input_tokens_seen": 70093824, "step": 73390 }, { "epoch": 5.987029937189004, "grad_norm": 0.9332965016365051, "learning_rate": 2.0771006878302386e-05, "loss": 0.2705, "num_input_tokens_seen": 70098688, "step": 73395 }, { "epoch": 5.9874378007994125, "grad_norm": 1.2913711071014404, "learning_rate": 2.0767498933773704e-05, "loss": 0.2887, "num_input_tokens_seen": 70103936, "step": 73400 }, { "epoch": 5.9878456644098215, "grad_norm": 2.946615219116211, "learning_rate": 2.0763991075036025e-05, "loss": 0.3206, "num_input_tokens_seen": 70108768, "step": 73405 }, { "epoch": 5.9882535280202305, "grad_norm": 6.095251083374023, "learning_rate": 2.0760483302160443e-05, "loss": 0.3721, "num_input_tokens_seen": 70113856, "step": 73410 }, { "epoch": 5.988661391630639, "grad_norm": 1.1435538530349731, "learning_rate": 2.0756975615218068e-05, "loss": 0.3433, "num_input_tokens_seen": 70118512, "step": 73415 }, { "epoch": 5.989069255241048, "grad_norm": 3.477419853210449, "learning_rate": 2.0753468014279986e-05, "loss": 0.2916, "num_input_tokens_seen": 70123248, "step": 73420 }, { "epoch": 5.989477118851456, "grad_norm": 7.046275615692139, "learning_rate": 2.074996049941731e-05, "loss": 0.3222, "num_input_tokens_seen": 70127456, "step": 73425 }, { "epoch": 5.989884982461865, "grad_norm": 0.6351450085639954, "learning_rate": 2.074645307070113e-05, "loss": 0.3466, "num_input_tokens_seen": 70131888, "step": 73430 }, { "epoch": 5.990292846072274, "grad_norm": 4.042038917541504, "learning_rate": 2.0742945728202537e-05, "loss": 0.3392, "num_input_tokens_seen": 70136096, "step": 73435 }, { "epoch": 5.990700709682682, "grad_norm": 0.5232340693473816, "learning_rate": 2.0739438471992614e-05, "loss": 0.3044, "num_input_tokens_seen": 70140864, "step": 73440 }, { "epoch": 5.991108573293091, "grad_norm": 1.474003791809082, "learning_rate": 2.0735931302142474e-05, "loss": 0.2844, "num_input_tokens_seen": 70145616, "step": 73445 }, { "epoch": 5.991516436903499, "grad_norm": 10.598433494567871, "learning_rate": 2.0732424218723196e-05, "loss": 0.3375, "num_input_tokens_seen": 70151024, "step": 73450 }, { "epoch": 5.991924300513908, "grad_norm": 1.8441473245620728, "learning_rate": 2.0728917221805864e-05, "loss": 0.4358, "num_input_tokens_seen": 70155920, "step": 73455 }, { "epoch": 5.992332164124317, "grad_norm": 5.16037130355835, "learning_rate": 2.0725410311461564e-05, "loss": 0.3858, "num_input_tokens_seen": 70160992, "step": 73460 }, { "epoch": 5.992740027734725, "grad_norm": 4.6443891525268555, "learning_rate": 2.0721903487761372e-05, "loss": 0.313, "num_input_tokens_seen": 70165200, "step": 73465 }, { "epoch": 5.993147891345134, "grad_norm": 3.3716704845428467, "learning_rate": 2.0718396750776388e-05, "loss": 0.3061, "num_input_tokens_seen": 70170496, "step": 73470 }, { "epoch": 5.993555754955543, "grad_norm": 1.2186347246170044, "learning_rate": 2.071489010057768e-05, "loss": 0.3019, "num_input_tokens_seen": 70175936, "step": 73475 }, { "epoch": 5.993963618565951, "grad_norm": 6.11584997177124, "learning_rate": 2.0711383537236335e-05, "loss": 0.4012, "num_input_tokens_seen": 70179984, "step": 73480 }, { "epoch": 5.99437148217636, "grad_norm": 0.6266757845878601, "learning_rate": 2.0707877060823412e-05, "loss": 0.235, "num_input_tokens_seen": 70185168, "step": 73485 }, { "epoch": 5.994779345786769, "grad_norm": 0.6766954064369202, "learning_rate": 2.0704370671410006e-05, "loss": 0.3136, "num_input_tokens_seen": 70188528, "step": 73490 }, { "epoch": 5.995187209397177, "grad_norm": 2.1070501804351807, "learning_rate": 2.0700864369067178e-05, "loss": 0.337, "num_input_tokens_seen": 70193424, "step": 73495 }, { "epoch": 5.995595073007586, "grad_norm": 3.100717067718506, "learning_rate": 2.069735815386601e-05, "loss": 0.3377, "num_input_tokens_seen": 70197872, "step": 73500 }, { "epoch": 5.9960029366179945, "grad_norm": 12.83285903930664, "learning_rate": 2.0693852025877548e-05, "loss": 0.3777, "num_input_tokens_seen": 70203472, "step": 73505 }, { "epoch": 5.996410800228404, "grad_norm": 10.91220760345459, "learning_rate": 2.069034598517288e-05, "loss": 0.3367, "num_input_tokens_seen": 70208112, "step": 73510 }, { "epoch": 5.996818663838813, "grad_norm": 3.4900972843170166, "learning_rate": 2.0686840031823078e-05, "loss": 0.2552, "num_input_tokens_seen": 70213184, "step": 73515 }, { "epoch": 5.997226527449221, "grad_norm": 2.225904941558838, "learning_rate": 2.068333416589919e-05, "loss": 0.2962, "num_input_tokens_seen": 70218320, "step": 73520 }, { "epoch": 5.99763439105963, "grad_norm": 1.3731986284255981, "learning_rate": 2.067982838747229e-05, "loss": 0.4057, "num_input_tokens_seen": 70222560, "step": 73525 }, { "epoch": 5.998042254670039, "grad_norm": 19.13941764831543, "learning_rate": 2.0676322696613415e-05, "loss": 0.3446, "num_input_tokens_seen": 70227056, "step": 73530 }, { "epoch": 5.998450118280447, "grad_norm": 0.7393238544464111, "learning_rate": 2.0672817093393652e-05, "loss": 0.2653, "num_input_tokens_seen": 70231328, "step": 73535 }, { "epoch": 5.998857981890856, "grad_norm": 1.6956707239151, "learning_rate": 2.0669311577884053e-05, "loss": 0.3115, "num_input_tokens_seen": 70235472, "step": 73540 }, { "epoch": 5.999265845501265, "grad_norm": 7.65179967880249, "learning_rate": 2.0665806150155666e-05, "loss": 0.5331, "num_input_tokens_seen": 70239824, "step": 73545 }, { "epoch": 5.999673709111673, "grad_norm": 0.7489588856697083, "learning_rate": 2.0662300810279544e-05, "loss": 0.3496, "num_input_tokens_seen": 70244560, "step": 73550 }, { "epoch": 6.000081572722082, "grad_norm": 28.931943893432617, "learning_rate": 2.0658795558326743e-05, "loss": 0.315, "num_input_tokens_seen": 70249408, "step": 73555 }, { "epoch": 6.00048943633249, "grad_norm": 7.593747138977051, "learning_rate": 2.0655290394368313e-05, "loss": 0.2416, "num_input_tokens_seen": 70253968, "step": 73560 }, { "epoch": 6.00048943633249, "eval_loss": 0.3535979092121124, "eval_runtime": 570.8878, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.388, "num_input_tokens_seen": 70253968, "step": 73560 }, { "epoch": 6.000897299942899, "grad_norm": 3.5460124015808105, "learning_rate": 2.06517853184753e-05, "loss": 0.3801, "num_input_tokens_seen": 70259184, "step": 73565 }, { "epoch": 6.001305163553308, "grad_norm": 7.968684673309326, "learning_rate": 2.064828033071875e-05, "loss": 0.2501, "num_input_tokens_seen": 70264704, "step": 73570 }, { "epoch": 6.001713027163716, "grad_norm": 0.49453768134117126, "learning_rate": 2.06447754311697e-05, "loss": 0.4002, "num_input_tokens_seen": 70269840, "step": 73575 }, { "epoch": 6.002120890774125, "grad_norm": 42.63667297363281, "learning_rate": 2.0641270619899212e-05, "loss": 0.3833, "num_input_tokens_seen": 70275504, "step": 73580 }, { "epoch": 6.002528754384533, "grad_norm": 2.72735857963562, "learning_rate": 2.0637765896978315e-05, "loss": 0.2392, "num_input_tokens_seen": 70280448, "step": 73585 }, { "epoch": 6.002936617994942, "grad_norm": 3.904386520385742, "learning_rate": 2.063426126247805e-05, "loss": 0.2678, "num_input_tokens_seen": 70285248, "step": 73590 }, { "epoch": 6.003344481605351, "grad_norm": 0.9603795409202576, "learning_rate": 2.0630756716469444e-05, "loss": 0.2164, "num_input_tokens_seen": 70290112, "step": 73595 }, { "epoch": 6.0037523452157595, "grad_norm": 4.066997528076172, "learning_rate": 2.062725225902355e-05, "loss": 0.3164, "num_input_tokens_seen": 70295184, "step": 73600 }, { "epoch": 6.0041602088261685, "grad_norm": 7.0743818283081055, "learning_rate": 2.06237478902114e-05, "loss": 0.2476, "num_input_tokens_seen": 70299232, "step": 73605 }, { "epoch": 6.0045680724365775, "grad_norm": 1.168165683746338, "learning_rate": 2.062024361010402e-05, "loss": 0.3235, "num_input_tokens_seen": 70304176, "step": 73610 }, { "epoch": 6.004975936046986, "grad_norm": 15.36939525604248, "learning_rate": 2.0616739418772434e-05, "loss": 0.3513, "num_input_tokens_seen": 70308384, "step": 73615 }, { "epoch": 6.005383799657395, "grad_norm": 20.83623695373535, "learning_rate": 2.061323531628768e-05, "loss": 0.4097, "num_input_tokens_seen": 70313216, "step": 73620 }, { "epoch": 6.005791663267804, "grad_norm": 18.697900772094727, "learning_rate": 2.0609731302720785e-05, "loss": 0.411, "num_input_tokens_seen": 70317760, "step": 73625 }, { "epoch": 6.006199526878212, "grad_norm": 32.73760986328125, "learning_rate": 2.060622737814277e-05, "loss": 0.3303, "num_input_tokens_seen": 70321552, "step": 73630 }, { "epoch": 6.006607390488621, "grad_norm": 192.20005798339844, "learning_rate": 2.060272354262466e-05, "loss": 0.5698, "num_input_tokens_seen": 70326064, "step": 73635 }, { "epoch": 6.007015254099029, "grad_norm": 2.323608160018921, "learning_rate": 2.0599219796237462e-05, "loss": 0.4111, "num_input_tokens_seen": 70331296, "step": 73640 }, { "epoch": 6.007423117709438, "grad_norm": 59.68412399291992, "learning_rate": 2.0595716139052222e-05, "loss": 0.2647, "num_input_tokens_seen": 70335952, "step": 73645 }, { "epoch": 6.007830981319847, "grad_norm": 27.023889541625977, "learning_rate": 2.0592212571139946e-05, "loss": 0.3256, "num_input_tokens_seen": 70340736, "step": 73650 }, { "epoch": 6.008238844930255, "grad_norm": 16.006877899169922, "learning_rate": 2.0588709092571644e-05, "loss": 0.2811, "num_input_tokens_seen": 70345488, "step": 73655 }, { "epoch": 6.008646708540664, "grad_norm": 3.206613063812256, "learning_rate": 2.0585205703418326e-05, "loss": 0.2897, "num_input_tokens_seen": 70349568, "step": 73660 }, { "epoch": 6.009054572151073, "grad_norm": 14.780713081359863, "learning_rate": 2.0581702403751023e-05, "loss": 0.5642, "num_input_tokens_seen": 70353520, "step": 73665 }, { "epoch": 6.009462435761481, "grad_norm": 2.0444157123565674, "learning_rate": 2.0578199193640734e-05, "loss": 0.3004, "num_input_tokens_seen": 70358320, "step": 73670 }, { "epoch": 6.00987029937189, "grad_norm": 21.540945053100586, "learning_rate": 2.057469607315847e-05, "loss": 0.4659, "num_input_tokens_seen": 70363104, "step": 73675 }, { "epoch": 6.010278162982298, "grad_norm": 2.928684711456299, "learning_rate": 2.0571193042375237e-05, "loss": 0.4034, "num_input_tokens_seen": 70367424, "step": 73680 }, { "epoch": 6.010686026592707, "grad_norm": 3.9894001483917236, "learning_rate": 2.056769010136203e-05, "loss": 0.343, "num_input_tokens_seen": 70371840, "step": 73685 }, { "epoch": 6.011093890203116, "grad_norm": 4.227901458740234, "learning_rate": 2.0564187250189866e-05, "loss": 0.4367, "num_input_tokens_seen": 70376528, "step": 73690 }, { "epoch": 6.0115017538135245, "grad_norm": 5.601004600524902, "learning_rate": 2.0560684488929746e-05, "loss": 0.3915, "num_input_tokens_seen": 70381856, "step": 73695 }, { "epoch": 6.0119096174239335, "grad_norm": 0.7859182357788086, "learning_rate": 2.0557181817652662e-05, "loss": 0.3972, "num_input_tokens_seen": 70386912, "step": 73700 }, { "epoch": 6.0123174810343425, "grad_norm": 0.537560224533081, "learning_rate": 2.0553679236429608e-05, "loss": 0.325, "num_input_tokens_seen": 70391904, "step": 73705 }, { "epoch": 6.012725344644751, "grad_norm": 0.5262407660484314, "learning_rate": 2.055017674533159e-05, "loss": 0.3006, "num_input_tokens_seen": 70396464, "step": 73710 }, { "epoch": 6.01313320825516, "grad_norm": 3.8573343753814697, "learning_rate": 2.0546674344429607e-05, "loss": 0.3364, "num_input_tokens_seen": 70402128, "step": 73715 }, { "epoch": 6.013541071865568, "grad_norm": 0.619616687297821, "learning_rate": 2.054317203379464e-05, "loss": 0.3887, "num_input_tokens_seen": 70407360, "step": 73720 }, { "epoch": 6.013948935475977, "grad_norm": 23.451608657836914, "learning_rate": 2.0539669813497677e-05, "loss": 0.3496, "num_input_tokens_seen": 70412448, "step": 73725 }, { "epoch": 6.014356799086386, "grad_norm": 12.770055770874023, "learning_rate": 2.0536167683609705e-05, "loss": 0.3184, "num_input_tokens_seen": 70418352, "step": 73730 }, { "epoch": 6.014764662696794, "grad_norm": 13.199166297912598, "learning_rate": 2.0532665644201725e-05, "loss": 0.4312, "num_input_tokens_seen": 70423376, "step": 73735 }, { "epoch": 6.015172526307203, "grad_norm": 13.104281425476074, "learning_rate": 2.052916369534472e-05, "loss": 0.3654, "num_input_tokens_seen": 70428256, "step": 73740 }, { "epoch": 6.015580389917612, "grad_norm": 0.6122602224349976, "learning_rate": 2.0525661837109664e-05, "loss": 0.3281, "num_input_tokens_seen": 70433904, "step": 73745 }, { "epoch": 6.01598825352802, "grad_norm": 1.342496395111084, "learning_rate": 2.0522160069567536e-05, "loss": 0.2755, "num_input_tokens_seen": 70438912, "step": 73750 }, { "epoch": 6.016396117138429, "grad_norm": 31.83832550048828, "learning_rate": 2.0518658392789326e-05, "loss": 0.4484, "num_input_tokens_seen": 70443584, "step": 73755 }, { "epoch": 6.016803980748837, "grad_norm": 0.8294753432273865, "learning_rate": 2.0515156806846004e-05, "loss": 0.2686, "num_input_tokens_seen": 70449360, "step": 73760 }, { "epoch": 6.017211844359246, "grad_norm": 8.559009552001953, "learning_rate": 2.051165531180855e-05, "loss": 0.2983, "num_input_tokens_seen": 70454832, "step": 73765 }, { "epoch": 6.017619707969655, "grad_norm": 2.2781102657318115, "learning_rate": 2.0508153907747936e-05, "loss": 0.4447, "num_input_tokens_seen": 70459024, "step": 73770 }, { "epoch": 6.018027571580063, "grad_norm": 1.1840944290161133, "learning_rate": 2.0504652594735123e-05, "loss": 0.4099, "num_input_tokens_seen": 70462704, "step": 73775 }, { "epoch": 6.018435435190472, "grad_norm": 21.067882537841797, "learning_rate": 2.05011513728411e-05, "loss": 0.2896, "num_input_tokens_seen": 70468016, "step": 73780 }, { "epoch": 6.018843298800881, "grad_norm": 50.10951232910156, "learning_rate": 2.0497650242136828e-05, "loss": 0.4353, "num_input_tokens_seen": 70472096, "step": 73785 }, { "epoch": 6.019251162411289, "grad_norm": 1.9811952114105225, "learning_rate": 2.049414920269327e-05, "loss": 0.3022, "num_input_tokens_seen": 70476992, "step": 73790 }, { "epoch": 6.0196590260216984, "grad_norm": 0.6416458487510681, "learning_rate": 2.0490648254581388e-05, "loss": 0.4742, "num_input_tokens_seen": 70481200, "step": 73795 }, { "epoch": 6.0200668896321075, "grad_norm": 0.4118368625640869, "learning_rate": 2.048714739787216e-05, "loss": 0.2539, "num_input_tokens_seen": 70485520, "step": 73800 }, { "epoch": 6.020474753242516, "grad_norm": 0.4943223297595978, "learning_rate": 2.0483646632636532e-05, "loss": 0.2653, "num_input_tokens_seen": 70489984, "step": 73805 }, { "epoch": 6.020882616852925, "grad_norm": 3.230011463165283, "learning_rate": 2.048014595894547e-05, "loss": 0.3637, "num_input_tokens_seen": 70494704, "step": 73810 }, { "epoch": 6.021290480463333, "grad_norm": 22.818349838256836, "learning_rate": 2.0476645376869928e-05, "loss": 0.3046, "num_input_tokens_seen": 70499696, "step": 73815 }, { "epoch": 6.021698344073742, "grad_norm": 2.6727185249328613, "learning_rate": 2.0473144886480856e-05, "loss": 0.2531, "num_input_tokens_seen": 70503840, "step": 73820 }, { "epoch": 6.022106207684151, "grad_norm": 1.6643928289413452, "learning_rate": 2.046964448784922e-05, "loss": 0.3472, "num_input_tokens_seen": 70506928, "step": 73825 }, { "epoch": 6.022514071294559, "grad_norm": 2.637280225753784, "learning_rate": 2.0466144181045968e-05, "loss": 0.3632, "num_input_tokens_seen": 70511504, "step": 73830 }, { "epoch": 6.022921934904968, "grad_norm": 5.131108283996582, "learning_rate": 2.0462643966142047e-05, "loss": 0.3818, "num_input_tokens_seen": 70516384, "step": 73835 }, { "epoch": 6.023329798515377, "grad_norm": 9.549484252929688, "learning_rate": 2.045914384320839e-05, "loss": 0.3314, "num_input_tokens_seen": 70521632, "step": 73840 }, { "epoch": 6.023737662125785, "grad_norm": 0.397460013628006, "learning_rate": 2.045564381231597e-05, "loss": 0.3945, "num_input_tokens_seen": 70525984, "step": 73845 }, { "epoch": 6.024145525736194, "grad_norm": 0.6850369572639465, "learning_rate": 2.045214387353572e-05, "loss": 0.319, "num_input_tokens_seen": 70530560, "step": 73850 }, { "epoch": 6.024553389346602, "grad_norm": 1.3172342777252197, "learning_rate": 2.0448644026938578e-05, "loss": 0.2574, "num_input_tokens_seen": 70535728, "step": 73855 }, { "epoch": 6.024961252957011, "grad_norm": 3.165733575820923, "learning_rate": 2.044514427259548e-05, "loss": 0.3024, "num_input_tokens_seen": 70540768, "step": 73860 }, { "epoch": 6.02536911656742, "grad_norm": 0.6605291962623596, "learning_rate": 2.0441644610577382e-05, "loss": 0.3232, "num_input_tokens_seen": 70545536, "step": 73865 }, { "epoch": 6.025776980177828, "grad_norm": 1.958950161933899, "learning_rate": 2.043814504095521e-05, "loss": 0.3953, "num_input_tokens_seen": 70549712, "step": 73870 }, { "epoch": 6.026184843788237, "grad_norm": 4.0902581214904785, "learning_rate": 2.04346455637999e-05, "loss": 0.3175, "num_input_tokens_seen": 70554912, "step": 73875 }, { "epoch": 6.026592707398646, "grad_norm": 0.4566192626953125, "learning_rate": 2.0431146179182384e-05, "loss": 0.3228, "num_input_tokens_seen": 70560256, "step": 73880 }, { "epoch": 6.027000571009054, "grad_norm": 28.22125816345215, "learning_rate": 2.042764688717359e-05, "loss": 0.3945, "num_input_tokens_seen": 70564704, "step": 73885 }, { "epoch": 6.027408434619463, "grad_norm": 6.628612995147705, "learning_rate": 2.0424147687844453e-05, "loss": 0.3291, "num_input_tokens_seen": 70569184, "step": 73890 }, { "epoch": 6.0278162982298715, "grad_norm": 3.9889650344848633, "learning_rate": 2.04206485812659e-05, "loss": 0.4227, "num_input_tokens_seen": 70574016, "step": 73895 }, { "epoch": 6.0282241618402805, "grad_norm": 1.0864622592926025, "learning_rate": 2.0417149567508855e-05, "loss": 0.348, "num_input_tokens_seen": 70579088, "step": 73900 }, { "epoch": 6.0286320254506895, "grad_norm": 8.376537322998047, "learning_rate": 2.0413650646644228e-05, "loss": 0.2678, "num_input_tokens_seen": 70583552, "step": 73905 }, { "epoch": 6.029039889061098, "grad_norm": 10.50445556640625, "learning_rate": 2.0410151818742968e-05, "loss": 0.3828, "num_input_tokens_seen": 70588480, "step": 73910 }, { "epoch": 6.029447752671507, "grad_norm": 2.029127836227417, "learning_rate": 2.0406653083875978e-05, "loss": 0.2499, "num_input_tokens_seen": 70594112, "step": 73915 }, { "epoch": 6.029855616281916, "grad_norm": 4.806776523590088, "learning_rate": 2.040315444211418e-05, "loss": 0.3233, "num_input_tokens_seen": 70599104, "step": 73920 }, { "epoch": 6.030263479892324, "grad_norm": 5.268260478973389, "learning_rate": 2.0399655893528484e-05, "loss": 0.4874, "num_input_tokens_seen": 70603984, "step": 73925 }, { "epoch": 6.030671343502733, "grad_norm": 10.635860443115234, "learning_rate": 2.03961574381898e-05, "loss": 0.3161, "num_input_tokens_seen": 70608240, "step": 73930 }, { "epoch": 6.031079207113141, "grad_norm": 0.3589438796043396, "learning_rate": 2.0392659076169058e-05, "loss": 0.1837, "num_input_tokens_seen": 70612800, "step": 73935 }, { "epoch": 6.03148707072355, "grad_norm": 3.3874714374542236, "learning_rate": 2.038916080753716e-05, "loss": 0.2818, "num_input_tokens_seen": 70617424, "step": 73940 }, { "epoch": 6.031894934333959, "grad_norm": 18.934043884277344, "learning_rate": 2.0385662632365013e-05, "loss": 0.3218, "num_input_tokens_seen": 70622400, "step": 73945 }, { "epoch": 6.032302797944367, "grad_norm": 5.638821601867676, "learning_rate": 2.0382164550723517e-05, "loss": 0.2845, "num_input_tokens_seen": 70626496, "step": 73950 }, { "epoch": 6.032710661554776, "grad_norm": 0.7613235712051392, "learning_rate": 2.037866656268359e-05, "loss": 0.3991, "num_input_tokens_seen": 70631648, "step": 73955 }, { "epoch": 6.033118525165185, "grad_norm": 11.633077621459961, "learning_rate": 2.0375168668316128e-05, "loss": 0.3942, "num_input_tokens_seen": 70637056, "step": 73960 }, { "epoch": 6.033526388775593, "grad_norm": 9.456483840942383, "learning_rate": 2.0371670867692035e-05, "loss": 0.4299, "num_input_tokens_seen": 70642064, "step": 73965 }, { "epoch": 6.033934252386002, "grad_norm": 0.9553301334381104, "learning_rate": 2.03681731608822e-05, "loss": 0.224, "num_input_tokens_seen": 70647040, "step": 73970 }, { "epoch": 6.034342115996411, "grad_norm": 3.6678478717803955, "learning_rate": 2.036467554795752e-05, "loss": 0.294, "num_input_tokens_seen": 70651424, "step": 73975 }, { "epoch": 6.034749979606819, "grad_norm": 4.003353595733643, "learning_rate": 2.0361178028988907e-05, "loss": 0.2684, "num_input_tokens_seen": 70656544, "step": 73980 }, { "epoch": 6.035157843217228, "grad_norm": 2.173154830932617, "learning_rate": 2.0357680604047247e-05, "loss": 0.3061, "num_input_tokens_seen": 70661408, "step": 73985 }, { "epoch": 6.0355657068276365, "grad_norm": 16.914039611816406, "learning_rate": 2.0354183273203426e-05, "loss": 0.452, "num_input_tokens_seen": 70666208, "step": 73990 }, { "epoch": 6.0359735704380455, "grad_norm": 14.953970909118652, "learning_rate": 2.0350686036528323e-05, "loss": 0.4633, "num_input_tokens_seen": 70671072, "step": 73995 }, { "epoch": 6.0363814340484545, "grad_norm": 3.0724422931671143, "learning_rate": 2.0347188894092852e-05, "loss": 0.4084, "num_input_tokens_seen": 70675584, "step": 74000 }, { "epoch": 6.036789297658863, "grad_norm": 8.059374809265137, "learning_rate": 2.0343691845967886e-05, "loss": 0.4289, "num_input_tokens_seen": 70678832, "step": 74005 }, { "epoch": 6.037197161269272, "grad_norm": 1.011636734008789, "learning_rate": 2.0340194892224306e-05, "loss": 0.2865, "num_input_tokens_seen": 70682944, "step": 74010 }, { "epoch": 6.037605024879681, "grad_norm": 6.353992462158203, "learning_rate": 2.0336698032932992e-05, "loss": 0.3971, "num_input_tokens_seen": 70687856, "step": 74015 }, { "epoch": 6.038012888490089, "grad_norm": 0.9766897559165955, "learning_rate": 2.0333201268164827e-05, "loss": 0.3806, "num_input_tokens_seen": 70693360, "step": 74020 }, { "epoch": 6.038420752100498, "grad_norm": 11.449682235717773, "learning_rate": 2.0329704597990696e-05, "loss": 0.4333, "num_input_tokens_seen": 70698272, "step": 74025 }, { "epoch": 6.038828615710906, "grad_norm": 4.300530433654785, "learning_rate": 2.0326208022481465e-05, "loss": 0.3631, "num_input_tokens_seen": 70703248, "step": 74030 }, { "epoch": 6.039236479321315, "grad_norm": 32.16551971435547, "learning_rate": 2.0322711541708007e-05, "loss": 0.3149, "num_input_tokens_seen": 70708464, "step": 74035 }, { "epoch": 6.039644342931724, "grad_norm": 1.9997069835662842, "learning_rate": 2.0319215155741195e-05, "loss": 0.2889, "num_input_tokens_seen": 70714016, "step": 74040 }, { "epoch": 6.040052206542132, "grad_norm": 1.3954390287399292, "learning_rate": 2.0315718864651912e-05, "loss": 0.3084, "num_input_tokens_seen": 70718544, "step": 74045 }, { "epoch": 6.040460070152541, "grad_norm": 7.1230854988098145, "learning_rate": 2.0312222668511015e-05, "loss": 0.3965, "num_input_tokens_seen": 70724416, "step": 74050 }, { "epoch": 6.04086793376295, "grad_norm": 27.000953674316406, "learning_rate": 2.0308726567389376e-05, "loss": 0.4503, "num_input_tokens_seen": 70729728, "step": 74055 }, { "epoch": 6.041275797373358, "grad_norm": 0.5716023445129395, "learning_rate": 2.0305230561357844e-05, "loss": 0.3588, "num_input_tokens_seen": 70734880, "step": 74060 }, { "epoch": 6.041683660983767, "grad_norm": 7.915818214416504, "learning_rate": 2.0301734650487302e-05, "loss": 0.2609, "num_input_tokens_seen": 70739360, "step": 74065 }, { "epoch": 6.042091524594175, "grad_norm": 4.726524829864502, "learning_rate": 2.0298238834848604e-05, "loss": 0.3133, "num_input_tokens_seen": 70744320, "step": 74070 }, { "epoch": 6.042499388204584, "grad_norm": 0.31274649500846863, "learning_rate": 2.029474311451261e-05, "loss": 0.3404, "num_input_tokens_seen": 70748048, "step": 74075 }, { "epoch": 6.042907251814993, "grad_norm": 10.34501838684082, "learning_rate": 2.0291247489550168e-05, "loss": 0.357, "num_input_tokens_seen": 70753248, "step": 74080 }, { "epoch": 6.0433151154254015, "grad_norm": 5.265839099884033, "learning_rate": 2.0287751960032135e-05, "loss": 0.2199, "num_input_tokens_seen": 70758352, "step": 74085 }, { "epoch": 6.0437229790358105, "grad_norm": 1.8335821628570557, "learning_rate": 2.0284256526029373e-05, "loss": 0.2994, "num_input_tokens_seen": 70763568, "step": 74090 }, { "epoch": 6.0441308426462195, "grad_norm": 7.878942966461182, "learning_rate": 2.0280761187612723e-05, "loss": 0.4448, "num_input_tokens_seen": 70768784, "step": 74095 }, { "epoch": 6.044538706256628, "grad_norm": 1.4074119329452515, "learning_rate": 2.0277265944853043e-05, "loss": 0.2484, "num_input_tokens_seen": 70772496, "step": 74100 }, { "epoch": 6.044946569867037, "grad_norm": 10.259159088134766, "learning_rate": 2.027377079782117e-05, "loss": 0.294, "num_input_tokens_seen": 70777632, "step": 74105 }, { "epoch": 6.045354433477445, "grad_norm": 17.528663635253906, "learning_rate": 2.027027574658796e-05, "loss": 0.4584, "num_input_tokens_seen": 70783440, "step": 74110 }, { "epoch": 6.045762297087854, "grad_norm": 8.20647144317627, "learning_rate": 2.0266780791224253e-05, "loss": 0.2684, "num_input_tokens_seen": 70788272, "step": 74115 }, { "epoch": 6.046170160698263, "grad_norm": 15.167014122009277, "learning_rate": 2.0263285931800886e-05, "loss": 0.5005, "num_input_tokens_seen": 70793408, "step": 74120 }, { "epoch": 6.046578024308671, "grad_norm": 17.18865394592285, "learning_rate": 2.0259791168388697e-05, "loss": 0.3984, "num_input_tokens_seen": 70798320, "step": 74125 }, { "epoch": 6.04698588791908, "grad_norm": 6.174278736114502, "learning_rate": 2.0256296501058522e-05, "loss": 0.3622, "num_input_tokens_seen": 70803056, "step": 74130 }, { "epoch": 6.047393751529489, "grad_norm": 0.6633146405220032, "learning_rate": 2.025280192988121e-05, "loss": 0.38, "num_input_tokens_seen": 70807936, "step": 74135 }, { "epoch": 6.047801615139897, "grad_norm": 9.883051872253418, "learning_rate": 2.0249307454927584e-05, "loss": 0.3448, "num_input_tokens_seen": 70812272, "step": 74140 }, { "epoch": 6.048209478750306, "grad_norm": 3.81135630607605, "learning_rate": 2.0245813076268477e-05, "loss": 0.5562, "num_input_tokens_seen": 70817776, "step": 74145 }, { "epoch": 6.048617342360714, "grad_norm": 11.931070327758789, "learning_rate": 2.0242318793974716e-05, "loss": 0.36, "num_input_tokens_seen": 70822400, "step": 74150 }, { "epoch": 6.049025205971123, "grad_norm": 1.1559065580368042, "learning_rate": 2.023882460811713e-05, "loss": 0.263, "num_input_tokens_seen": 70827696, "step": 74155 }, { "epoch": 6.049433069581532, "grad_norm": 8.38426685333252, "learning_rate": 2.0235330518766553e-05, "loss": 0.3243, "num_input_tokens_seen": 70833024, "step": 74160 }, { "epoch": 6.04984093319194, "grad_norm": 14.447859764099121, "learning_rate": 2.02318365259938e-05, "loss": 0.2391, "num_input_tokens_seen": 70838048, "step": 74165 }, { "epoch": 6.050248796802349, "grad_norm": 6.681663513183594, "learning_rate": 2.0228342629869694e-05, "loss": 0.5525, "num_input_tokens_seen": 70842768, "step": 74170 }, { "epoch": 6.050656660412758, "grad_norm": 21.450355529785156, "learning_rate": 2.0224848830465052e-05, "loss": 0.355, "num_input_tokens_seen": 70847712, "step": 74175 }, { "epoch": 6.051064524023166, "grad_norm": 0.6277954578399658, "learning_rate": 2.02213551278507e-05, "loss": 0.2802, "num_input_tokens_seen": 70853472, "step": 74180 }, { "epoch": 6.051472387633575, "grad_norm": 24.9710750579834, "learning_rate": 2.0217861522097446e-05, "loss": 0.4007, "num_input_tokens_seen": 70858256, "step": 74185 }, { "epoch": 6.051880251243984, "grad_norm": 1.2934234142303467, "learning_rate": 2.021436801327611e-05, "loss": 0.4304, "num_input_tokens_seen": 70862032, "step": 74190 }, { "epoch": 6.0522881148543926, "grad_norm": 4.065269947052002, "learning_rate": 2.0210874601457488e-05, "loss": 0.2301, "num_input_tokens_seen": 70866800, "step": 74195 }, { "epoch": 6.052695978464802, "grad_norm": 19.605642318725586, "learning_rate": 2.0207381286712416e-05, "loss": 0.3549, "num_input_tokens_seen": 70871136, "step": 74200 }, { "epoch": 6.05310384207521, "grad_norm": 2.1553032398223877, "learning_rate": 2.0203888069111687e-05, "loss": 0.4136, "num_input_tokens_seen": 70875312, "step": 74205 }, { "epoch": 6.053511705685619, "grad_norm": 14.364420890808105, "learning_rate": 2.0200394948726108e-05, "loss": 0.3269, "num_input_tokens_seen": 70879776, "step": 74210 }, { "epoch": 6.053919569296028, "grad_norm": 1.5164554119110107, "learning_rate": 2.0196901925626477e-05, "loss": 0.2895, "num_input_tokens_seen": 70884528, "step": 74215 }, { "epoch": 6.054327432906436, "grad_norm": 1.2926357984542847, "learning_rate": 2.019340899988361e-05, "loss": 0.2942, "num_input_tokens_seen": 70889408, "step": 74220 }, { "epoch": 6.054735296516845, "grad_norm": 8.202908515930176, "learning_rate": 2.01899161715683e-05, "loss": 0.427, "num_input_tokens_seen": 70893504, "step": 74225 }, { "epoch": 6.055143160127254, "grad_norm": 8.51290225982666, "learning_rate": 2.0186423440751348e-05, "loss": 0.2821, "num_input_tokens_seen": 70897840, "step": 74230 }, { "epoch": 6.055551023737662, "grad_norm": 3.4051690101623535, "learning_rate": 2.0182930807503548e-05, "loss": 0.3029, "num_input_tokens_seen": 70903008, "step": 74235 }, { "epoch": 6.055958887348071, "grad_norm": 15.595207214355469, "learning_rate": 2.0179438271895685e-05, "loss": 0.4437, "num_input_tokens_seen": 70906800, "step": 74240 }, { "epoch": 6.056366750958479, "grad_norm": 0.8121752142906189, "learning_rate": 2.0175945833998568e-05, "loss": 0.2469, "num_input_tokens_seen": 70912416, "step": 74245 }, { "epoch": 6.056774614568888, "grad_norm": 25.244895935058594, "learning_rate": 2.0172453493882975e-05, "loss": 0.4396, "num_input_tokens_seen": 70916864, "step": 74250 }, { "epoch": 6.057182478179297, "grad_norm": 12.157061576843262, "learning_rate": 2.01689612516197e-05, "loss": 0.3978, "num_input_tokens_seen": 70921840, "step": 74255 }, { "epoch": 6.057590341789705, "grad_norm": 14.526399612426758, "learning_rate": 2.0165469107279522e-05, "loss": 0.388, "num_input_tokens_seen": 70926560, "step": 74260 }, { "epoch": 6.057998205400114, "grad_norm": 1.354546308517456, "learning_rate": 2.0161977060933234e-05, "loss": 0.1933, "num_input_tokens_seen": 70931312, "step": 74265 }, { "epoch": 6.058406069010523, "grad_norm": 19.586734771728516, "learning_rate": 2.015848511265162e-05, "loss": 0.2199, "num_input_tokens_seen": 70937200, "step": 74270 }, { "epoch": 6.058813932620931, "grad_norm": 1.4464356899261475, "learning_rate": 2.0154993262505455e-05, "loss": 0.2522, "num_input_tokens_seen": 70942512, "step": 74275 }, { "epoch": 6.05922179623134, "grad_norm": 16.555217742919922, "learning_rate": 2.015150151056552e-05, "loss": 0.3057, "num_input_tokens_seen": 70947936, "step": 74280 }, { "epoch": 6.0596296598417485, "grad_norm": 0.5416258573532104, "learning_rate": 2.014800985690257e-05, "loss": 0.2498, "num_input_tokens_seen": 70952896, "step": 74285 }, { "epoch": 6.0600375234521575, "grad_norm": 0.9067997932434082, "learning_rate": 2.014451830158742e-05, "loss": 0.2649, "num_input_tokens_seen": 70958048, "step": 74290 }, { "epoch": 6.0604453870625665, "grad_norm": 0.4497453272342682, "learning_rate": 2.0141026844690813e-05, "loss": 0.3945, "num_input_tokens_seen": 70962768, "step": 74295 }, { "epoch": 6.060853250672975, "grad_norm": 2.017615556716919, "learning_rate": 2.013753548628353e-05, "loss": 0.3184, "num_input_tokens_seen": 70967216, "step": 74300 }, { "epoch": 6.061261114283384, "grad_norm": 8.945694923400879, "learning_rate": 2.013404422643633e-05, "loss": 0.4905, "num_input_tokens_seen": 70971440, "step": 74305 }, { "epoch": 6.061668977893793, "grad_norm": 7.457324981689453, "learning_rate": 2.0130553065219993e-05, "loss": 0.2447, "num_input_tokens_seen": 70977008, "step": 74310 }, { "epoch": 6.062076841504201, "grad_norm": 3.46563982963562, "learning_rate": 2.0127062002705273e-05, "loss": 0.2128, "num_input_tokens_seen": 70981808, "step": 74315 }, { "epoch": 6.06248470511461, "grad_norm": 12.261258125305176, "learning_rate": 2.012357103896294e-05, "loss": 0.3517, "num_input_tokens_seen": 70986672, "step": 74320 }, { "epoch": 6.062892568725019, "grad_norm": 1.2776776552200317, "learning_rate": 2.0120080174063745e-05, "loss": 0.3364, "num_input_tokens_seen": 70991840, "step": 74325 }, { "epoch": 6.063300432335427, "grad_norm": 3.105280876159668, "learning_rate": 2.0116589408078444e-05, "loss": 0.3099, "num_input_tokens_seen": 70996304, "step": 74330 }, { "epoch": 6.063708295945836, "grad_norm": 5.089895248413086, "learning_rate": 2.0113098741077813e-05, "loss": 0.3113, "num_input_tokens_seen": 71001264, "step": 74335 }, { "epoch": 6.064116159556244, "grad_norm": 0.6766757965087891, "learning_rate": 2.010960817313259e-05, "loss": 0.4709, "num_input_tokens_seen": 71005504, "step": 74340 }, { "epoch": 6.064524023166653, "grad_norm": 6.8059234619140625, "learning_rate": 2.010611770431353e-05, "loss": 0.4703, "num_input_tokens_seen": 71009632, "step": 74345 }, { "epoch": 6.064931886777062, "grad_norm": 5.464607238769531, "learning_rate": 2.0102627334691378e-05, "loss": 0.3252, "num_input_tokens_seen": 71014368, "step": 74350 }, { "epoch": 6.06533975038747, "grad_norm": 4.071897983551025, "learning_rate": 2.00991370643369e-05, "loss": 0.2282, "num_input_tokens_seen": 71018656, "step": 74355 }, { "epoch": 6.065747613997879, "grad_norm": 38.74658203125, "learning_rate": 2.0095646893320828e-05, "loss": 0.3063, "num_input_tokens_seen": 71023168, "step": 74360 }, { "epoch": 6.066155477608288, "grad_norm": 12.366142272949219, "learning_rate": 2.0092156821713913e-05, "loss": 0.2809, "num_input_tokens_seen": 71027776, "step": 74365 }, { "epoch": 6.066563341218696, "grad_norm": 13.881500244140625, "learning_rate": 2.008866684958689e-05, "loss": 0.2828, "num_input_tokens_seen": 71032048, "step": 74370 }, { "epoch": 6.066971204829105, "grad_norm": 2.03001070022583, "learning_rate": 2.0085176977010502e-05, "loss": 0.2577, "num_input_tokens_seen": 71036032, "step": 74375 }, { "epoch": 6.0673790684395135, "grad_norm": 11.002718925476074, "learning_rate": 2.0081687204055488e-05, "loss": 0.4436, "num_input_tokens_seen": 71040640, "step": 74380 }, { "epoch": 6.0677869320499225, "grad_norm": 4.187834739685059, "learning_rate": 2.007819753079259e-05, "loss": 0.3474, "num_input_tokens_seen": 71045184, "step": 74385 }, { "epoch": 6.0681947956603315, "grad_norm": 0.5158711671829224, "learning_rate": 2.0074707957292532e-05, "loss": 0.33, "num_input_tokens_seen": 71049168, "step": 74390 }, { "epoch": 6.06860265927074, "grad_norm": 0.5019939541816711, "learning_rate": 2.0071218483626042e-05, "loss": 0.2729, "num_input_tokens_seen": 71054624, "step": 74395 }, { "epoch": 6.069010522881149, "grad_norm": 0.43446430563926697, "learning_rate": 2.006772910986387e-05, "loss": 0.2855, "num_input_tokens_seen": 71058608, "step": 74400 }, { "epoch": 6.069418386491558, "grad_norm": 2.00508189201355, "learning_rate": 2.0064239836076732e-05, "loss": 0.4195, "num_input_tokens_seen": 71063072, "step": 74405 }, { "epoch": 6.069826250101966, "grad_norm": 1.8305659294128418, "learning_rate": 2.0060750662335356e-05, "loss": 0.2967, "num_input_tokens_seen": 71067600, "step": 74410 }, { "epoch": 6.070234113712375, "grad_norm": 1.4803847074508667, "learning_rate": 2.0057261588710455e-05, "loss": 0.3134, "num_input_tokens_seen": 71072704, "step": 74415 }, { "epoch": 6.070641977322783, "grad_norm": 26.744098663330078, "learning_rate": 2.0053772615272774e-05, "loss": 0.4778, "num_input_tokens_seen": 71077472, "step": 74420 }, { "epoch": 6.071049840933192, "grad_norm": 0.3897123634815216, "learning_rate": 2.005028374209302e-05, "loss": 0.2515, "num_input_tokens_seen": 71081616, "step": 74425 }, { "epoch": 6.071457704543601, "grad_norm": 9.883944511413574, "learning_rate": 2.0046794969241905e-05, "loss": 0.3491, "num_input_tokens_seen": 71086384, "step": 74430 }, { "epoch": 6.071865568154009, "grad_norm": 23.920732498168945, "learning_rate": 2.0043306296790158e-05, "loss": 0.3583, "num_input_tokens_seen": 71091328, "step": 74435 }, { "epoch": 6.072273431764418, "grad_norm": 10.916316032409668, "learning_rate": 2.0039817724808475e-05, "loss": 0.2095, "num_input_tokens_seen": 71095648, "step": 74440 }, { "epoch": 6.072681295374827, "grad_norm": 0.4771394431591034, "learning_rate": 2.0036329253367588e-05, "loss": 0.4091, "num_input_tokens_seen": 71100528, "step": 74445 }, { "epoch": 6.073089158985235, "grad_norm": 1.7895619869232178, "learning_rate": 2.00328408825382e-05, "loss": 0.2561, "num_input_tokens_seen": 71105072, "step": 74450 }, { "epoch": 6.073497022595644, "grad_norm": 13.662606239318848, "learning_rate": 2.002935261239101e-05, "loss": 0.3111, "num_input_tokens_seen": 71109888, "step": 74455 }, { "epoch": 6.073904886206052, "grad_norm": 15.828828811645508, "learning_rate": 2.002586444299673e-05, "loss": 0.4055, "num_input_tokens_seen": 71114480, "step": 74460 }, { "epoch": 6.074312749816461, "grad_norm": 2.6631054878234863, "learning_rate": 2.0022376374426067e-05, "loss": 0.2744, "num_input_tokens_seen": 71118384, "step": 74465 }, { "epoch": 6.07472061342687, "grad_norm": 4.156918048858643, "learning_rate": 2.001888840674972e-05, "loss": 0.4386, "num_input_tokens_seen": 71122992, "step": 74470 }, { "epoch": 6.075128477037278, "grad_norm": 2.114189386367798, "learning_rate": 2.001540054003839e-05, "loss": 0.3383, "num_input_tokens_seen": 71126912, "step": 74475 }, { "epoch": 6.075536340647687, "grad_norm": 8.272933959960938, "learning_rate": 2.0011912774362772e-05, "loss": 0.2894, "num_input_tokens_seen": 71132048, "step": 74480 }, { "epoch": 6.0759442042580964, "grad_norm": 2.6577534675598145, "learning_rate": 2.000842510979355e-05, "loss": 0.4673, "num_input_tokens_seen": 71136096, "step": 74485 }, { "epoch": 6.076352067868505, "grad_norm": 1.0687509775161743, "learning_rate": 2.000493754640145e-05, "loss": 0.4357, "num_input_tokens_seen": 71140720, "step": 74490 }, { "epoch": 6.076759931478914, "grad_norm": 2.095959186553955, "learning_rate": 2.0001450084257133e-05, "loss": 0.277, "num_input_tokens_seen": 71146192, "step": 74495 }, { "epoch": 6.077167795089322, "grad_norm": 10.845212936401367, "learning_rate": 1.99979627234313e-05, "loss": 0.3968, "num_input_tokens_seen": 71150880, "step": 74500 }, { "epoch": 6.077575658699731, "grad_norm": 0.7519707679748535, "learning_rate": 1.9994475463994635e-05, "loss": 0.2937, "num_input_tokens_seen": 71155600, "step": 74505 }, { "epoch": 6.07798352231014, "grad_norm": 9.41215705871582, "learning_rate": 1.999098830601783e-05, "loss": 0.2457, "num_input_tokens_seen": 71160368, "step": 74510 }, { "epoch": 6.078391385920548, "grad_norm": 1.3568414449691772, "learning_rate": 1.9987501249571567e-05, "loss": 0.2442, "num_input_tokens_seen": 71165488, "step": 74515 }, { "epoch": 6.078799249530957, "grad_norm": 0.5352786183357239, "learning_rate": 1.9984014294726515e-05, "loss": 0.2275, "num_input_tokens_seen": 71170208, "step": 74520 }, { "epoch": 6.079207113141366, "grad_norm": 7.4872355461120605, "learning_rate": 1.998052744155337e-05, "loss": 0.3807, "num_input_tokens_seen": 71174864, "step": 74525 }, { "epoch": 6.079614976751774, "grad_norm": 1.4640741348266602, "learning_rate": 1.9977040690122785e-05, "loss": 0.4421, "num_input_tokens_seen": 71180112, "step": 74530 }, { "epoch": 6.080022840362183, "grad_norm": 5.8757734298706055, "learning_rate": 1.9973554040505467e-05, "loss": 0.4317, "num_input_tokens_seen": 71185408, "step": 74535 }, { "epoch": 6.080430703972592, "grad_norm": 1.258427381515503, "learning_rate": 1.997006749277207e-05, "loss": 0.4676, "num_input_tokens_seen": 71189824, "step": 74540 }, { "epoch": 6.080838567583, "grad_norm": 0.40818750858306885, "learning_rate": 1.9966581046993267e-05, "loss": 0.2685, "num_input_tokens_seen": 71194848, "step": 74545 }, { "epoch": 6.081246431193409, "grad_norm": 3.554307699203491, "learning_rate": 1.9963094703239715e-05, "loss": 0.4001, "num_input_tokens_seen": 71199632, "step": 74550 }, { "epoch": 6.081654294803817, "grad_norm": 11.50914192199707, "learning_rate": 1.9959608461582103e-05, "loss": 0.3285, "num_input_tokens_seen": 71204272, "step": 74555 }, { "epoch": 6.082062158414226, "grad_norm": 1.1099830865859985, "learning_rate": 1.9956122322091092e-05, "loss": 0.3025, "num_input_tokens_seen": 71209344, "step": 74560 }, { "epoch": 6.082470022024635, "grad_norm": 0.7037743926048279, "learning_rate": 1.9952636284837334e-05, "loss": 0.3413, "num_input_tokens_seen": 71213744, "step": 74565 }, { "epoch": 6.082877885635043, "grad_norm": 0.8326513767242432, "learning_rate": 1.9949150349891494e-05, "loss": 0.3135, "num_input_tokens_seen": 71218528, "step": 74570 }, { "epoch": 6.083285749245452, "grad_norm": 2.492666244506836, "learning_rate": 1.9945664517324227e-05, "loss": 0.375, "num_input_tokens_seen": 71223696, "step": 74575 }, { "epoch": 6.083693612855861, "grad_norm": 12.842310905456543, "learning_rate": 1.9942178787206194e-05, "loss": 0.2746, "num_input_tokens_seen": 71227808, "step": 74580 }, { "epoch": 6.0841014764662695, "grad_norm": 2.3580567836761475, "learning_rate": 1.9938693159608048e-05, "loss": 0.3007, "num_input_tokens_seen": 71232704, "step": 74585 }, { "epoch": 6.0845093400766785, "grad_norm": 12.643844604492188, "learning_rate": 1.9935207634600446e-05, "loss": 0.2254, "num_input_tokens_seen": 71237520, "step": 74590 }, { "epoch": 6.084917203687087, "grad_norm": 1.8753166198730469, "learning_rate": 1.9931722212254016e-05, "loss": 0.2468, "num_input_tokens_seen": 71242336, "step": 74595 }, { "epoch": 6.085325067297496, "grad_norm": 0.8008317351341248, "learning_rate": 1.9928236892639436e-05, "loss": 0.3362, "num_input_tokens_seen": 71246704, "step": 74600 }, { "epoch": 6.085732930907905, "grad_norm": 1.493508219718933, "learning_rate": 1.992475167582734e-05, "loss": 0.2929, "num_input_tokens_seen": 71251616, "step": 74605 }, { "epoch": 6.086140794518313, "grad_norm": 2.737631320953369, "learning_rate": 1.992126656188837e-05, "loss": 0.2067, "num_input_tokens_seen": 71256832, "step": 74610 }, { "epoch": 6.086548658128722, "grad_norm": 12.207746505737305, "learning_rate": 1.9917781550893162e-05, "loss": 0.3982, "num_input_tokens_seen": 71262240, "step": 74615 }, { "epoch": 6.086956521739131, "grad_norm": 4.645144939422607, "learning_rate": 1.991429664291237e-05, "loss": 0.3867, "num_input_tokens_seen": 71267040, "step": 74620 }, { "epoch": 6.087364385349539, "grad_norm": 2.8834428787231445, "learning_rate": 1.9910811838016623e-05, "loss": 0.3196, "num_input_tokens_seen": 71271616, "step": 74625 }, { "epoch": 6.087772248959948, "grad_norm": 0.9889647364616394, "learning_rate": 1.9907327136276558e-05, "loss": 0.3948, "num_input_tokens_seen": 71276944, "step": 74630 }, { "epoch": 6.088180112570356, "grad_norm": 2.089381694793701, "learning_rate": 1.9903842537762806e-05, "loss": 0.3584, "num_input_tokens_seen": 71282224, "step": 74635 }, { "epoch": 6.088587976180765, "grad_norm": 0.41843846440315247, "learning_rate": 1.9900358042546e-05, "loss": 0.305, "num_input_tokens_seen": 71286944, "step": 74640 }, { "epoch": 6.088995839791174, "grad_norm": 1.0890865325927734, "learning_rate": 1.989687365069677e-05, "loss": 0.3544, "num_input_tokens_seen": 71290704, "step": 74645 }, { "epoch": 6.089403703401582, "grad_norm": 13.269217491149902, "learning_rate": 1.9893389362285748e-05, "loss": 0.3953, "num_input_tokens_seen": 71296224, "step": 74650 }, { "epoch": 6.089811567011991, "grad_norm": 1.3763597011566162, "learning_rate": 1.988990517738355e-05, "loss": 0.2901, "num_input_tokens_seen": 71300976, "step": 74655 }, { "epoch": 6.0902194306224, "grad_norm": 6.588308334350586, "learning_rate": 1.988642109606079e-05, "loss": 0.3787, "num_input_tokens_seen": 71305376, "step": 74660 }, { "epoch": 6.090627294232808, "grad_norm": 5.261174201965332, "learning_rate": 1.9882937118388115e-05, "loss": 0.2319, "num_input_tokens_seen": 71310208, "step": 74665 }, { "epoch": 6.091035157843217, "grad_norm": 2.3718559741973877, "learning_rate": 1.987945324443613e-05, "loss": 0.3645, "num_input_tokens_seen": 71314592, "step": 74670 }, { "epoch": 6.0914430214536255, "grad_norm": 1.9373763799667358, "learning_rate": 1.987596947427545e-05, "loss": 0.3501, "num_input_tokens_seen": 71318624, "step": 74675 }, { "epoch": 6.0918508850640345, "grad_norm": 1.4555615186691284, "learning_rate": 1.9872485807976697e-05, "loss": 0.2452, "num_input_tokens_seen": 71322800, "step": 74680 }, { "epoch": 6.0922587486744435, "grad_norm": 3.5295352935791016, "learning_rate": 1.986900224561046e-05, "loss": 0.3474, "num_input_tokens_seen": 71327600, "step": 74685 }, { "epoch": 6.092666612284852, "grad_norm": 1.369177222251892, "learning_rate": 1.9865518787247384e-05, "loss": 0.3308, "num_input_tokens_seen": 71332256, "step": 74690 }, { "epoch": 6.093074475895261, "grad_norm": 1.437812328338623, "learning_rate": 1.9862035432958058e-05, "loss": 0.3819, "num_input_tokens_seen": 71337824, "step": 74695 }, { "epoch": 6.09348233950567, "grad_norm": 15.895723342895508, "learning_rate": 1.985855218281309e-05, "loss": 0.3703, "num_input_tokens_seen": 71342768, "step": 74700 }, { "epoch": 6.093890203116078, "grad_norm": 5.035501956939697, "learning_rate": 1.9855069036883078e-05, "loss": 0.362, "num_input_tokens_seen": 71347536, "step": 74705 }, { "epoch": 6.094298066726487, "grad_norm": 7.145944595336914, "learning_rate": 1.9851585995238638e-05, "loss": 0.6407, "num_input_tokens_seen": 71352144, "step": 74710 }, { "epoch": 6.094705930336895, "grad_norm": 7.672191143035889, "learning_rate": 1.984810305795036e-05, "loss": 0.2176, "num_input_tokens_seen": 71356976, "step": 74715 }, { "epoch": 6.095113793947304, "grad_norm": 14.215845108032227, "learning_rate": 1.9844620225088846e-05, "loss": 0.5553, "num_input_tokens_seen": 71362608, "step": 74720 }, { "epoch": 6.095521657557713, "grad_norm": 18.67205238342285, "learning_rate": 1.9841137496724688e-05, "loss": 0.3987, "num_input_tokens_seen": 71367008, "step": 74725 }, { "epoch": 6.095929521168121, "grad_norm": 1.800737977027893, "learning_rate": 1.9837654872928468e-05, "loss": 0.3911, "num_input_tokens_seen": 71371808, "step": 74730 }, { "epoch": 6.09633738477853, "grad_norm": 1.8295108079910278, "learning_rate": 1.9834172353770806e-05, "loss": 0.3188, "num_input_tokens_seen": 71375920, "step": 74735 }, { "epoch": 6.096745248388939, "grad_norm": 15.340209007263184, "learning_rate": 1.9830689939322272e-05, "loss": 0.5035, "num_input_tokens_seen": 71381200, "step": 74740 }, { "epoch": 6.097153111999347, "grad_norm": 8.672356605529785, "learning_rate": 1.982720762965346e-05, "loss": 0.3376, "num_input_tokens_seen": 71386112, "step": 74745 }, { "epoch": 6.097560975609756, "grad_norm": 12.075240135192871, "learning_rate": 1.982372542483493e-05, "loss": 0.4132, "num_input_tokens_seen": 71391136, "step": 74750 }, { "epoch": 6.097968839220165, "grad_norm": 0.6134263277053833, "learning_rate": 1.982024332493731e-05, "loss": 0.3467, "num_input_tokens_seen": 71396672, "step": 74755 }, { "epoch": 6.098376702830573, "grad_norm": 11.623266220092773, "learning_rate": 1.981676133003115e-05, "loss": 0.2782, "num_input_tokens_seen": 71401600, "step": 74760 }, { "epoch": 6.098784566440982, "grad_norm": 14.924110412597656, "learning_rate": 1.981327944018704e-05, "loss": 0.2521, "num_input_tokens_seen": 71405984, "step": 74765 }, { "epoch": 6.0991924300513904, "grad_norm": 14.3942232131958, "learning_rate": 1.980979765547554e-05, "loss": 0.4104, "num_input_tokens_seen": 71410496, "step": 74770 }, { "epoch": 6.0996002936617995, "grad_norm": 1.0576837062835693, "learning_rate": 1.9806315975967243e-05, "loss": 0.4041, "num_input_tokens_seen": 71415696, "step": 74775 }, { "epoch": 6.1000081572722085, "grad_norm": 8.993386268615723, "learning_rate": 1.9802834401732717e-05, "loss": 0.4318, "num_input_tokens_seen": 71420144, "step": 74780 }, { "epoch": 6.100416020882617, "grad_norm": 4.832536220550537, "learning_rate": 1.9799352932842526e-05, "loss": 0.3911, "num_input_tokens_seen": 71424880, "step": 74785 }, { "epoch": 6.100823884493026, "grad_norm": 2.1312334537506104, "learning_rate": 1.9795871569367238e-05, "loss": 0.3182, "num_input_tokens_seen": 71429616, "step": 74790 }, { "epoch": 6.101231748103435, "grad_norm": 6.953587532043457, "learning_rate": 1.9792390311377422e-05, "loss": 0.3392, "num_input_tokens_seen": 71434336, "step": 74795 }, { "epoch": 6.101639611713843, "grad_norm": 3.5461761951446533, "learning_rate": 1.9788909158943646e-05, "loss": 0.3214, "num_input_tokens_seen": 71439312, "step": 74800 }, { "epoch": 6.102047475324252, "grad_norm": 0.534605085849762, "learning_rate": 1.9785428112136468e-05, "loss": 0.2733, "num_input_tokens_seen": 71443184, "step": 74805 }, { "epoch": 6.10245533893466, "grad_norm": 33.265689849853516, "learning_rate": 1.9781947171026445e-05, "loss": 0.3661, "num_input_tokens_seen": 71448352, "step": 74810 }, { "epoch": 6.102863202545069, "grad_norm": 10.377870559692383, "learning_rate": 1.9778466335684125e-05, "loss": 0.3163, "num_input_tokens_seen": 71453424, "step": 74815 }, { "epoch": 6.103271066155478, "grad_norm": 0.9267171025276184, "learning_rate": 1.977498560618009e-05, "loss": 0.2681, "num_input_tokens_seen": 71458704, "step": 74820 }, { "epoch": 6.103678929765886, "grad_norm": 0.4148310720920563, "learning_rate": 1.977150498258487e-05, "loss": 0.4117, "num_input_tokens_seen": 71463472, "step": 74825 }, { "epoch": 6.104086793376295, "grad_norm": 15.363329887390137, "learning_rate": 1.9768024464969023e-05, "loss": 0.3557, "num_input_tokens_seen": 71467824, "step": 74830 }, { "epoch": 6.104494656986704, "grad_norm": 0.7835709452629089, "learning_rate": 1.9764544053403097e-05, "loss": 0.403, "num_input_tokens_seen": 71471984, "step": 74835 }, { "epoch": 6.104902520597112, "grad_norm": 1.5328328609466553, "learning_rate": 1.9761063747957633e-05, "loss": 0.3813, "num_input_tokens_seen": 71476416, "step": 74840 }, { "epoch": 6.105310384207521, "grad_norm": 8.140619277954102, "learning_rate": 1.975758354870319e-05, "loss": 0.2717, "num_input_tokens_seen": 71481216, "step": 74845 }, { "epoch": 6.105718247817929, "grad_norm": 1.2670732736587524, "learning_rate": 1.975410345571029e-05, "loss": 0.3918, "num_input_tokens_seen": 71485536, "step": 74850 }, { "epoch": 6.106126111428338, "grad_norm": 28.742298126220703, "learning_rate": 1.9750623469049492e-05, "loss": 0.3386, "num_input_tokens_seen": 71490048, "step": 74855 }, { "epoch": 6.106533975038747, "grad_norm": 1.487657904624939, "learning_rate": 1.974714358879132e-05, "loss": 0.3112, "num_input_tokens_seen": 71495136, "step": 74860 }, { "epoch": 6.106941838649155, "grad_norm": 2.8838024139404297, "learning_rate": 1.974366381500632e-05, "loss": 0.3222, "num_input_tokens_seen": 71499568, "step": 74865 }, { "epoch": 6.107349702259564, "grad_norm": 1.0648274421691895, "learning_rate": 1.9740184147765024e-05, "loss": 0.2723, "num_input_tokens_seen": 71504752, "step": 74870 }, { "epoch": 6.107757565869973, "grad_norm": 2.253117799758911, "learning_rate": 1.9736704587137955e-05, "loss": 0.2713, "num_input_tokens_seen": 71509232, "step": 74875 }, { "epoch": 6.1081654294803815, "grad_norm": 0.7857000231742859, "learning_rate": 1.973322513319565e-05, "loss": 0.3872, "num_input_tokens_seen": 71513248, "step": 74880 }, { "epoch": 6.108573293090791, "grad_norm": 0.6527801752090454, "learning_rate": 1.9729745786008623e-05, "loss": 0.3321, "num_input_tokens_seen": 71517472, "step": 74885 }, { "epoch": 6.1089811567012, "grad_norm": 0.6789596676826477, "learning_rate": 1.972626654564742e-05, "loss": 0.2326, "num_input_tokens_seen": 71521648, "step": 74890 }, { "epoch": 6.109389020311608, "grad_norm": 1.4416860342025757, "learning_rate": 1.9722787412182554e-05, "loss": 0.3871, "num_input_tokens_seen": 71526176, "step": 74895 }, { "epoch": 6.109796883922017, "grad_norm": 6.557798385620117, "learning_rate": 1.9719308385684545e-05, "loss": 0.2996, "num_input_tokens_seen": 71530512, "step": 74900 }, { "epoch": 6.110204747532425, "grad_norm": 1.04709792137146, "learning_rate": 1.9715829466223907e-05, "loss": 0.4313, "num_input_tokens_seen": 71535792, "step": 74905 }, { "epoch": 6.110612611142834, "grad_norm": 13.258705139160156, "learning_rate": 1.9712350653871157e-05, "loss": 0.4582, "num_input_tokens_seen": 71540880, "step": 74910 }, { "epoch": 6.111020474753243, "grad_norm": 5.6568379402160645, "learning_rate": 1.970887194869682e-05, "loss": 0.4923, "num_input_tokens_seen": 71546112, "step": 74915 }, { "epoch": 6.111428338363651, "grad_norm": 12.864385604858398, "learning_rate": 1.9705393350771397e-05, "loss": 0.2833, "num_input_tokens_seen": 71550256, "step": 74920 }, { "epoch": 6.11183620197406, "grad_norm": 39.55954360961914, "learning_rate": 1.9701914860165404e-05, "loss": 0.3053, "num_input_tokens_seen": 71555216, "step": 74925 }, { "epoch": 6.112244065584469, "grad_norm": 0.4434169828891754, "learning_rate": 1.9698436476949337e-05, "loss": 0.3389, "num_input_tokens_seen": 71559616, "step": 74930 }, { "epoch": 6.112651929194877, "grad_norm": 2.372204303741455, "learning_rate": 1.969495820119372e-05, "loss": 0.3004, "num_input_tokens_seen": 71563920, "step": 74935 }, { "epoch": 6.113059792805286, "grad_norm": 0.9171723127365112, "learning_rate": 1.9691480032969044e-05, "loss": 0.4089, "num_input_tokens_seen": 71568608, "step": 74940 }, { "epoch": 6.113467656415694, "grad_norm": 1.2099123001098633, "learning_rate": 1.9688001972345808e-05, "loss": 0.4306, "num_input_tokens_seen": 71572784, "step": 74945 }, { "epoch": 6.113875520026103, "grad_norm": 1.7267169952392578, "learning_rate": 1.9684524019394513e-05, "loss": 0.3104, "num_input_tokens_seen": 71577824, "step": 74950 }, { "epoch": 6.114283383636512, "grad_norm": 0.6938654780387878, "learning_rate": 1.968104617418566e-05, "loss": 0.3297, "num_input_tokens_seen": 71583488, "step": 74955 }, { "epoch": 6.11469124724692, "grad_norm": 6.977174758911133, "learning_rate": 1.9677568436789746e-05, "loss": 0.3347, "num_input_tokens_seen": 71588416, "step": 74960 }, { "epoch": 6.115099110857329, "grad_norm": 0.5665169954299927, "learning_rate": 1.9674090807277254e-05, "loss": 0.2769, "num_input_tokens_seen": 71593680, "step": 74965 }, { "epoch": 6.115506974467738, "grad_norm": 2.561478853225708, "learning_rate": 1.967061328571867e-05, "loss": 0.3555, "num_input_tokens_seen": 71598768, "step": 74970 }, { "epoch": 6.1159148380781465, "grad_norm": 3.891547679901123, "learning_rate": 1.96671358721845e-05, "loss": 0.2159, "num_input_tokens_seen": 71603344, "step": 74975 }, { "epoch": 6.1163227016885555, "grad_norm": 12.382122993469238, "learning_rate": 1.966365856674522e-05, "loss": 0.4265, "num_input_tokens_seen": 71607440, "step": 74980 }, { "epoch": 6.116730565298964, "grad_norm": 10.879037857055664, "learning_rate": 1.9660181369471313e-05, "loss": 0.2756, "num_input_tokens_seen": 71612640, "step": 74985 }, { "epoch": 6.117138428909373, "grad_norm": 0.5751940608024597, "learning_rate": 1.965670428043326e-05, "loss": 0.362, "num_input_tokens_seen": 71617584, "step": 74990 }, { "epoch": 6.117546292519782, "grad_norm": 8.231086730957031, "learning_rate": 1.9653227299701538e-05, "loss": 0.3557, "num_input_tokens_seen": 71622464, "step": 74995 }, { "epoch": 6.11795415613019, "grad_norm": 2.6787285804748535, "learning_rate": 1.964975042734663e-05, "loss": 0.2753, "num_input_tokens_seen": 71627552, "step": 75000 }, { "epoch": 6.118362019740599, "grad_norm": 2.4634461402893066, "learning_rate": 1.9646273663439008e-05, "loss": 0.2675, "num_input_tokens_seen": 71631664, "step": 75005 }, { "epoch": 6.118769883351008, "grad_norm": 5.110599994659424, "learning_rate": 1.9642797008049144e-05, "loss": 0.4403, "num_input_tokens_seen": 71637344, "step": 75010 }, { "epoch": 6.119177746961416, "grad_norm": 19.275976181030273, "learning_rate": 1.9639320461247497e-05, "loss": 0.2219, "num_input_tokens_seen": 71641680, "step": 75015 }, { "epoch": 6.119585610571825, "grad_norm": 2.2604727745056152, "learning_rate": 1.9635844023104558e-05, "loss": 0.2617, "num_input_tokens_seen": 71646288, "step": 75020 }, { "epoch": 6.119993474182233, "grad_norm": 20.99610710144043, "learning_rate": 1.963236769369078e-05, "loss": 0.4321, "num_input_tokens_seen": 71651200, "step": 75025 }, { "epoch": 6.120401337792642, "grad_norm": 1.020912766456604, "learning_rate": 1.962889147307663e-05, "loss": 0.3872, "num_input_tokens_seen": 71655376, "step": 75030 }, { "epoch": 6.120809201403051, "grad_norm": 1.9988583326339722, "learning_rate": 1.9625415361332564e-05, "loss": 0.3519, "num_input_tokens_seen": 71660768, "step": 75035 }, { "epoch": 6.121217065013459, "grad_norm": 0.6131340265274048, "learning_rate": 1.9621939358529036e-05, "loss": 0.346, "num_input_tokens_seen": 71666368, "step": 75040 }, { "epoch": 6.121624928623868, "grad_norm": 11.235954284667969, "learning_rate": 1.9618463464736523e-05, "loss": 0.3148, "num_input_tokens_seen": 71671040, "step": 75045 }, { "epoch": 6.122032792234277, "grad_norm": 3.960045337677002, "learning_rate": 1.961498768002547e-05, "loss": 0.2004, "num_input_tokens_seen": 71676336, "step": 75050 }, { "epoch": 6.122440655844685, "grad_norm": 0.7335863709449768, "learning_rate": 1.9611512004466326e-05, "loss": 0.3503, "num_input_tokens_seen": 71681520, "step": 75055 }, { "epoch": 6.122848519455094, "grad_norm": 1.1608392000198364, "learning_rate": 1.960803643812954e-05, "loss": 0.3913, "num_input_tokens_seen": 71686496, "step": 75060 }, { "epoch": 6.1232563830655025, "grad_norm": 5.214164733886719, "learning_rate": 1.9604560981085567e-05, "loss": 0.2981, "num_input_tokens_seen": 71692048, "step": 75065 }, { "epoch": 6.1236642466759115, "grad_norm": 31.5277156829834, "learning_rate": 1.9601085633404855e-05, "loss": 0.4503, "num_input_tokens_seen": 71696560, "step": 75070 }, { "epoch": 6.1240721102863205, "grad_norm": 11.850850105285645, "learning_rate": 1.959761039515784e-05, "loss": 0.5129, "num_input_tokens_seen": 71701024, "step": 75075 }, { "epoch": 6.124479973896729, "grad_norm": 0.9677951335906982, "learning_rate": 1.9594135266414967e-05, "loss": 0.3699, "num_input_tokens_seen": 71705744, "step": 75080 }, { "epoch": 6.124887837507138, "grad_norm": 1.4918068647384644, "learning_rate": 1.9590660247246663e-05, "loss": 0.291, "num_input_tokens_seen": 71710848, "step": 75085 }, { "epoch": 6.125295701117547, "grad_norm": 7.9552998542785645, "learning_rate": 1.958718533772339e-05, "loss": 0.4532, "num_input_tokens_seen": 71715392, "step": 75090 }, { "epoch": 6.125703564727955, "grad_norm": 5.869540214538574, "learning_rate": 1.958371053791557e-05, "loss": 0.3232, "num_input_tokens_seen": 71720096, "step": 75095 }, { "epoch": 6.126111428338364, "grad_norm": 3.0801868438720703, "learning_rate": 1.9580235847893636e-05, "loss": 0.411, "num_input_tokens_seen": 71725760, "step": 75100 }, { "epoch": 6.126519291948773, "grad_norm": 11.398187637329102, "learning_rate": 1.9576761267728008e-05, "loss": 0.1933, "num_input_tokens_seen": 71730464, "step": 75105 }, { "epoch": 6.126927155559181, "grad_norm": 5.222429275512695, "learning_rate": 1.9573286797489136e-05, "loss": 0.484, "num_input_tokens_seen": 71734992, "step": 75110 }, { "epoch": 6.12733501916959, "grad_norm": 7.041354656219482, "learning_rate": 1.956981243724743e-05, "loss": 0.2515, "num_input_tokens_seen": 71739680, "step": 75115 }, { "epoch": 6.127742882779998, "grad_norm": 0.5070523023605347, "learning_rate": 1.9566338187073325e-05, "loss": 0.2673, "num_input_tokens_seen": 71744592, "step": 75120 }, { "epoch": 6.128150746390407, "grad_norm": 25.38505744934082, "learning_rate": 1.9562864047037234e-05, "loss": 0.27, "num_input_tokens_seen": 71750736, "step": 75125 }, { "epoch": 6.128558610000816, "grad_norm": 0.3792489171028137, "learning_rate": 1.955939001720957e-05, "loss": 0.4284, "num_input_tokens_seen": 71756448, "step": 75130 }, { "epoch": 6.128966473611224, "grad_norm": 3.8981235027313232, "learning_rate": 1.9555916097660765e-05, "loss": 0.293, "num_input_tokens_seen": 71760848, "step": 75135 }, { "epoch": 6.129374337221633, "grad_norm": 1.5885258913040161, "learning_rate": 1.955244228846123e-05, "loss": 0.342, "num_input_tokens_seen": 71765264, "step": 75140 }, { "epoch": 6.129782200832042, "grad_norm": 24.037822723388672, "learning_rate": 1.954896858968137e-05, "loss": 0.4429, "num_input_tokens_seen": 71770064, "step": 75145 }, { "epoch": 6.13019006444245, "grad_norm": 2.5677645206451416, "learning_rate": 1.9545495001391595e-05, "loss": 0.4406, "num_input_tokens_seen": 71774736, "step": 75150 }, { "epoch": 6.130597928052859, "grad_norm": 0.5596252083778381, "learning_rate": 1.9542021523662325e-05, "loss": 0.3684, "num_input_tokens_seen": 71779232, "step": 75155 }, { "epoch": 6.131005791663267, "grad_norm": 1.17715585231781, "learning_rate": 1.953854815656396e-05, "loss": 0.2838, "num_input_tokens_seen": 71784016, "step": 75160 }, { "epoch": 6.131413655273676, "grad_norm": 0.8426063060760498, "learning_rate": 1.9535074900166905e-05, "loss": 0.4912, "num_input_tokens_seen": 71788624, "step": 75165 }, { "epoch": 6.131821518884085, "grad_norm": 16.591630935668945, "learning_rate": 1.953160175454155e-05, "loss": 0.3153, "num_input_tokens_seen": 71793808, "step": 75170 }, { "epoch": 6.132229382494494, "grad_norm": 1.0718320608139038, "learning_rate": 1.9528128719758312e-05, "loss": 0.3322, "num_input_tokens_seen": 71798080, "step": 75175 }, { "epoch": 6.132637246104903, "grad_norm": 5.456929683685303, "learning_rate": 1.952465579588758e-05, "loss": 0.3508, "num_input_tokens_seen": 71802224, "step": 75180 }, { "epoch": 6.133045109715312, "grad_norm": 0.5733194947242737, "learning_rate": 1.952118298299975e-05, "loss": 0.5257, "num_input_tokens_seen": 71807408, "step": 75185 }, { "epoch": 6.13345297332572, "grad_norm": 1.1950727701187134, "learning_rate": 1.9517710281165213e-05, "loss": 0.3375, "num_input_tokens_seen": 71812432, "step": 75190 }, { "epoch": 6.133860836936129, "grad_norm": 2.3614845275878906, "learning_rate": 1.951423769045435e-05, "loss": 0.3195, "num_input_tokens_seen": 71817072, "step": 75195 }, { "epoch": 6.134268700546537, "grad_norm": 41.49474334716797, "learning_rate": 1.9510765210937567e-05, "loss": 0.426, "num_input_tokens_seen": 71822064, "step": 75200 }, { "epoch": 6.134676564156946, "grad_norm": 2.6325535774230957, "learning_rate": 1.950729284268524e-05, "loss": 0.3275, "num_input_tokens_seen": 71825504, "step": 75205 }, { "epoch": 6.135084427767355, "grad_norm": 0.5320361256599426, "learning_rate": 1.9503820585767755e-05, "loss": 0.3355, "num_input_tokens_seen": 71830080, "step": 75210 }, { "epoch": 6.135492291377763, "grad_norm": 0.8526374101638794, "learning_rate": 1.950034844025548e-05, "loss": 0.3536, "num_input_tokens_seen": 71834912, "step": 75215 }, { "epoch": 6.135900154988172, "grad_norm": 1.2090022563934326, "learning_rate": 1.949687640621881e-05, "loss": 0.367, "num_input_tokens_seen": 71839712, "step": 75220 }, { "epoch": 6.136308018598581, "grad_norm": 4.379880428314209, "learning_rate": 1.9493404483728122e-05, "loss": 0.2847, "num_input_tokens_seen": 71845136, "step": 75225 }, { "epoch": 6.136715882208989, "grad_norm": 0.3960166573524475, "learning_rate": 1.9489932672853783e-05, "loss": 0.3313, "num_input_tokens_seen": 71849280, "step": 75230 }, { "epoch": 6.137123745819398, "grad_norm": 44.44678497314453, "learning_rate": 1.948646097366617e-05, "loss": 0.3536, "num_input_tokens_seen": 71853872, "step": 75235 }, { "epoch": 6.137531609429806, "grad_norm": 9.955415725708008, "learning_rate": 1.9482989386235636e-05, "loss": 0.3138, "num_input_tokens_seen": 71858848, "step": 75240 }, { "epoch": 6.137939473040215, "grad_norm": 2.1599433422088623, "learning_rate": 1.9479517910632577e-05, "loss": 0.2887, "num_input_tokens_seen": 71864048, "step": 75245 }, { "epoch": 6.138347336650624, "grad_norm": 0.8176551461219788, "learning_rate": 1.947604654692734e-05, "loss": 0.3037, "num_input_tokens_seen": 71869312, "step": 75250 }, { "epoch": 6.138755200261032, "grad_norm": 3.1769487857818604, "learning_rate": 1.94725752951903e-05, "loss": 0.2918, "num_input_tokens_seen": 71874608, "step": 75255 }, { "epoch": 6.139163063871441, "grad_norm": 0.9668812155723572, "learning_rate": 1.9469104155491797e-05, "loss": 0.3057, "num_input_tokens_seen": 71880064, "step": 75260 }, { "epoch": 6.13957092748185, "grad_norm": 0.44208797812461853, "learning_rate": 1.946563312790221e-05, "loss": 0.3591, "num_input_tokens_seen": 71885504, "step": 75265 }, { "epoch": 6.1399787910922585, "grad_norm": 21.25184440612793, "learning_rate": 1.9462162212491887e-05, "loss": 0.4121, "num_input_tokens_seen": 71891008, "step": 75270 }, { "epoch": 6.1403866547026675, "grad_norm": 2.286813735961914, "learning_rate": 1.945869140933119e-05, "loss": 0.4725, "num_input_tokens_seen": 71896400, "step": 75275 }, { "epoch": 6.140794518313076, "grad_norm": 0.5697145462036133, "learning_rate": 1.9455220718490454e-05, "loss": 0.3558, "num_input_tokens_seen": 71901120, "step": 75280 }, { "epoch": 6.141202381923485, "grad_norm": 2.5437943935394287, "learning_rate": 1.945175014004003e-05, "loss": 0.2883, "num_input_tokens_seen": 71905360, "step": 75285 }, { "epoch": 6.141610245533894, "grad_norm": 0.5461941957473755, "learning_rate": 1.9448279674050284e-05, "loss": 0.4781, "num_input_tokens_seen": 71909792, "step": 75290 }, { "epoch": 6.142018109144302, "grad_norm": 2.2844128608703613, "learning_rate": 1.944480932059155e-05, "loss": 0.4018, "num_input_tokens_seen": 71915104, "step": 75295 }, { "epoch": 6.142425972754711, "grad_norm": 22.553966522216797, "learning_rate": 1.9441339079734172e-05, "loss": 0.498, "num_input_tokens_seen": 71920448, "step": 75300 }, { "epoch": 6.14283383636512, "grad_norm": 6.7554521560668945, "learning_rate": 1.9437868951548476e-05, "loss": 0.421, "num_input_tokens_seen": 71925216, "step": 75305 }, { "epoch": 6.143241699975528, "grad_norm": 16.679080963134766, "learning_rate": 1.9434398936104824e-05, "loss": 0.276, "num_input_tokens_seen": 71929472, "step": 75310 }, { "epoch": 6.143649563585937, "grad_norm": 0.3868936598300934, "learning_rate": 1.9430929033473543e-05, "loss": 0.2298, "num_input_tokens_seen": 71934880, "step": 75315 }, { "epoch": 6.144057427196346, "grad_norm": 3.482811212539673, "learning_rate": 1.9427459243724963e-05, "loss": 0.4011, "num_input_tokens_seen": 71939216, "step": 75320 }, { "epoch": 6.144465290806754, "grad_norm": 3.918120861053467, "learning_rate": 1.942398956692941e-05, "loss": 0.3459, "num_input_tokens_seen": 71944048, "step": 75325 }, { "epoch": 6.144873154417163, "grad_norm": 0.6320328116416931, "learning_rate": 1.9420520003157223e-05, "loss": 0.3509, "num_input_tokens_seen": 71949712, "step": 75330 }, { "epoch": 6.145281018027571, "grad_norm": 1.4011852741241455, "learning_rate": 1.9417050552478728e-05, "loss": 0.4224, "num_input_tokens_seen": 71954976, "step": 75335 }, { "epoch": 6.14568888163798, "grad_norm": 1.383032202720642, "learning_rate": 1.9413581214964244e-05, "loss": 0.3792, "num_input_tokens_seen": 71960336, "step": 75340 }, { "epoch": 6.146096745248389, "grad_norm": 2.4496920108795166, "learning_rate": 1.9410111990684096e-05, "loss": 0.3361, "num_input_tokens_seen": 71964256, "step": 75345 }, { "epoch": 6.146504608858797, "grad_norm": 2.9497716426849365, "learning_rate": 1.940664287970859e-05, "loss": 0.3513, "num_input_tokens_seen": 71969136, "step": 75350 }, { "epoch": 6.146912472469206, "grad_norm": 5.484217643737793, "learning_rate": 1.940317388210807e-05, "loss": 0.4423, "num_input_tokens_seen": 71974224, "step": 75355 }, { "epoch": 6.147320336079615, "grad_norm": 0.561883807182312, "learning_rate": 1.9399704997952833e-05, "loss": 0.3125, "num_input_tokens_seen": 71978944, "step": 75360 }, { "epoch": 6.1477281996900235, "grad_norm": 19.732540130615234, "learning_rate": 1.9396236227313198e-05, "loss": 0.3186, "num_input_tokens_seen": 71984160, "step": 75365 }, { "epoch": 6.1481360633004325, "grad_norm": 0.45019882917404175, "learning_rate": 1.9392767570259465e-05, "loss": 0.2739, "num_input_tokens_seen": 71989040, "step": 75370 }, { "epoch": 6.148543926910841, "grad_norm": 0.9010398387908936, "learning_rate": 1.938929902686196e-05, "loss": 0.3023, "num_input_tokens_seen": 71993984, "step": 75375 }, { "epoch": 6.14895179052125, "grad_norm": 1.2577648162841797, "learning_rate": 1.938583059719098e-05, "loss": 0.2808, "num_input_tokens_seen": 71998128, "step": 75380 }, { "epoch": 6.149359654131659, "grad_norm": 0.4689424932003021, "learning_rate": 1.9382362281316825e-05, "loss": 0.3051, "num_input_tokens_seen": 72003088, "step": 75385 }, { "epoch": 6.149767517742067, "grad_norm": 6.788632869720459, "learning_rate": 1.93788940793098e-05, "loss": 0.3054, "num_input_tokens_seen": 72007792, "step": 75390 }, { "epoch": 6.150175381352476, "grad_norm": 0.4107177257537842, "learning_rate": 1.93754259912402e-05, "loss": 0.3715, "num_input_tokens_seen": 72013344, "step": 75395 }, { "epoch": 6.150583244962885, "grad_norm": 1.203670620918274, "learning_rate": 1.937195801717833e-05, "loss": 0.4053, "num_input_tokens_seen": 72018272, "step": 75400 }, { "epoch": 6.150991108573293, "grad_norm": 8.096907615661621, "learning_rate": 1.936849015719448e-05, "loss": 0.2688, "num_input_tokens_seen": 72022672, "step": 75405 }, { "epoch": 6.151398972183702, "grad_norm": 1.455721378326416, "learning_rate": 1.9365022411358944e-05, "loss": 0.3875, "num_input_tokens_seen": 72028064, "step": 75410 }, { "epoch": 6.15180683579411, "grad_norm": 26.900644302368164, "learning_rate": 1.9361554779741996e-05, "loss": 0.3267, "num_input_tokens_seen": 72032576, "step": 75415 }, { "epoch": 6.152214699404519, "grad_norm": 0.3990944027900696, "learning_rate": 1.9358087262413946e-05, "loss": 0.3547, "num_input_tokens_seen": 72036944, "step": 75420 }, { "epoch": 6.152622563014928, "grad_norm": 3.968459367752075, "learning_rate": 1.935461985944507e-05, "loss": 0.3739, "num_input_tokens_seen": 72042256, "step": 75425 }, { "epoch": 6.153030426625336, "grad_norm": 8.257206916809082, "learning_rate": 1.9351152570905652e-05, "loss": 0.3754, "num_input_tokens_seen": 72047760, "step": 75430 }, { "epoch": 6.153438290235745, "grad_norm": 1.781805157661438, "learning_rate": 1.934768539686597e-05, "loss": 0.3659, "num_input_tokens_seen": 72053136, "step": 75435 }, { "epoch": 6.153846153846154, "grad_norm": 10.155102729797363, "learning_rate": 1.9344218337396296e-05, "loss": 0.2429, "num_input_tokens_seen": 72058208, "step": 75440 }, { "epoch": 6.154254017456562, "grad_norm": 16.180021286010742, "learning_rate": 1.934075139256692e-05, "loss": 0.3992, "num_input_tokens_seen": 72062992, "step": 75445 }, { "epoch": 6.154661881066971, "grad_norm": 15.053305625915527, "learning_rate": 1.9337284562448107e-05, "loss": 0.3924, "num_input_tokens_seen": 72067936, "step": 75450 }, { "epoch": 6.15506974467738, "grad_norm": 1.03713858127594, "learning_rate": 1.9333817847110133e-05, "loss": 0.314, "num_input_tokens_seen": 72072288, "step": 75455 }, { "epoch": 6.1554776082877884, "grad_norm": 4.647224426269531, "learning_rate": 1.9330351246623258e-05, "loss": 0.4036, "num_input_tokens_seen": 72077152, "step": 75460 }, { "epoch": 6.1558854718981975, "grad_norm": 10.911995887756348, "learning_rate": 1.9326884761057757e-05, "loss": 0.1758, "num_input_tokens_seen": 72082528, "step": 75465 }, { "epoch": 6.156293335508606, "grad_norm": 26.569135665893555, "learning_rate": 1.9323418390483894e-05, "loss": 0.4269, "num_input_tokens_seen": 72086400, "step": 75470 }, { "epoch": 6.156701199119015, "grad_norm": 2.09277081489563, "learning_rate": 1.931995213497193e-05, "loss": 0.3296, "num_input_tokens_seen": 72091008, "step": 75475 }, { "epoch": 6.157109062729424, "grad_norm": 3.83231782913208, "learning_rate": 1.931648599459211e-05, "loss": 0.2835, "num_input_tokens_seen": 72095824, "step": 75480 }, { "epoch": 6.157516926339832, "grad_norm": 1.789764404296875, "learning_rate": 1.931301996941471e-05, "loss": 0.3048, "num_input_tokens_seen": 72100288, "step": 75485 }, { "epoch": 6.157924789950241, "grad_norm": 1.2298860549926758, "learning_rate": 1.9309554059509983e-05, "loss": 0.3044, "num_input_tokens_seen": 72105776, "step": 75490 }, { "epoch": 6.15833265356065, "grad_norm": 5.308441638946533, "learning_rate": 1.9306088264948174e-05, "loss": 0.3046, "num_input_tokens_seen": 72110192, "step": 75495 }, { "epoch": 6.158740517171058, "grad_norm": 5.473794460296631, "learning_rate": 1.930262258579954e-05, "loss": 0.3557, "num_input_tokens_seen": 72116064, "step": 75500 }, { "epoch": 6.159148380781467, "grad_norm": 0.7355208396911621, "learning_rate": 1.929915702213431e-05, "loss": 0.3641, "num_input_tokens_seen": 72120480, "step": 75505 }, { "epoch": 6.159556244391875, "grad_norm": 2.7675321102142334, "learning_rate": 1.929569157402276e-05, "loss": 0.2869, "num_input_tokens_seen": 72125136, "step": 75510 }, { "epoch": 6.159964108002284, "grad_norm": 1.1354855298995972, "learning_rate": 1.9292226241535117e-05, "loss": 0.3582, "num_input_tokens_seen": 72130016, "step": 75515 }, { "epoch": 6.160371971612693, "grad_norm": 4.835369110107422, "learning_rate": 1.9288761024741623e-05, "loss": 0.4304, "num_input_tokens_seen": 72134576, "step": 75520 }, { "epoch": 6.160779835223101, "grad_norm": 0.7627756595611572, "learning_rate": 1.928529592371251e-05, "loss": 0.3581, "num_input_tokens_seen": 72139520, "step": 75525 }, { "epoch": 6.16118769883351, "grad_norm": 7.2909722328186035, "learning_rate": 1.9281830938518026e-05, "loss": 0.3981, "num_input_tokens_seen": 72144816, "step": 75530 }, { "epoch": 6.161595562443919, "grad_norm": 38.06251907348633, "learning_rate": 1.9278366069228398e-05, "loss": 0.3504, "num_input_tokens_seen": 72149840, "step": 75535 }, { "epoch": 6.162003426054327, "grad_norm": 18.028705596923828, "learning_rate": 1.9274901315913853e-05, "loss": 0.3089, "num_input_tokens_seen": 72155088, "step": 75540 }, { "epoch": 6.162411289664736, "grad_norm": 29.420352935791016, "learning_rate": 1.9271436678644632e-05, "loss": 0.3852, "num_input_tokens_seen": 72159072, "step": 75545 }, { "epoch": 6.162819153275144, "grad_norm": 2.5810153484344482, "learning_rate": 1.926797215749095e-05, "loss": 0.2269, "num_input_tokens_seen": 72163808, "step": 75550 }, { "epoch": 6.163227016885553, "grad_norm": 0.5394999980926514, "learning_rate": 1.926450775252304e-05, "loss": 0.2855, "num_input_tokens_seen": 72168384, "step": 75555 }, { "epoch": 6.163634880495962, "grad_norm": 0.5775947570800781, "learning_rate": 1.9261043463811124e-05, "loss": 0.2714, "num_input_tokens_seen": 72173328, "step": 75560 }, { "epoch": 6.1640427441063705, "grad_norm": 6.246584415435791, "learning_rate": 1.925757929142542e-05, "loss": 0.2195, "num_input_tokens_seen": 72178608, "step": 75565 }, { "epoch": 6.1644506077167796, "grad_norm": 1.1502103805541992, "learning_rate": 1.9254115235436125e-05, "loss": 0.4178, "num_input_tokens_seen": 72182832, "step": 75570 }, { "epoch": 6.164858471327189, "grad_norm": 0.4463866949081421, "learning_rate": 1.9250651295913492e-05, "loss": 0.3496, "num_input_tokens_seen": 72187104, "step": 75575 }, { "epoch": 6.165266334937597, "grad_norm": 0.4232081174850464, "learning_rate": 1.924718747292771e-05, "loss": 0.2345, "num_input_tokens_seen": 72192208, "step": 75580 }, { "epoch": 6.165674198548006, "grad_norm": 6.403387069702148, "learning_rate": 1.9243723766548994e-05, "loss": 0.2667, "num_input_tokens_seen": 72196384, "step": 75585 }, { "epoch": 6.166082062158414, "grad_norm": 3.3230557441711426, "learning_rate": 1.924026017684755e-05, "loss": 0.4584, "num_input_tokens_seen": 72200608, "step": 75590 }, { "epoch": 6.166489925768823, "grad_norm": 8.984332084655762, "learning_rate": 1.9236796703893582e-05, "loss": 0.3341, "num_input_tokens_seen": 72206224, "step": 75595 }, { "epoch": 6.166897789379232, "grad_norm": 31.27314567565918, "learning_rate": 1.9233333347757292e-05, "loss": 0.5306, "num_input_tokens_seen": 72210544, "step": 75600 }, { "epoch": 6.16730565298964, "grad_norm": 5.9206109046936035, "learning_rate": 1.9229870108508893e-05, "loss": 0.3706, "num_input_tokens_seen": 72215376, "step": 75605 }, { "epoch": 6.167713516600049, "grad_norm": 4.8334150314331055, "learning_rate": 1.9226406986218573e-05, "loss": 0.2992, "num_input_tokens_seen": 72220288, "step": 75610 }, { "epoch": 6.168121380210458, "grad_norm": 45.86674118041992, "learning_rate": 1.9222943980956526e-05, "loss": 0.3233, "num_input_tokens_seen": 72224256, "step": 75615 }, { "epoch": 6.168529243820866, "grad_norm": 11.51533317565918, "learning_rate": 1.9219481092792957e-05, "loss": 0.3325, "num_input_tokens_seen": 72228960, "step": 75620 }, { "epoch": 6.168937107431275, "grad_norm": 2.2159931659698486, "learning_rate": 1.9216018321798045e-05, "loss": 0.3641, "num_input_tokens_seen": 72233248, "step": 75625 }, { "epoch": 6.169344971041683, "grad_norm": 12.107316970825195, "learning_rate": 1.9212555668041987e-05, "loss": 0.654, "num_input_tokens_seen": 72238464, "step": 75630 }, { "epoch": 6.169752834652092, "grad_norm": 2.178675889968872, "learning_rate": 1.9209093131594967e-05, "loss": 0.2892, "num_input_tokens_seen": 72243504, "step": 75635 }, { "epoch": 6.170160698262501, "grad_norm": 1.2797985076904297, "learning_rate": 1.9205630712527158e-05, "loss": 0.3791, "num_input_tokens_seen": 72248544, "step": 75640 }, { "epoch": 6.170568561872909, "grad_norm": 1.759228229522705, "learning_rate": 1.920216841090876e-05, "loss": 0.3491, "num_input_tokens_seen": 72253408, "step": 75645 }, { "epoch": 6.170976425483318, "grad_norm": 31.093984603881836, "learning_rate": 1.9198706226809947e-05, "loss": 0.516, "num_input_tokens_seen": 72258400, "step": 75650 }, { "epoch": 6.171384289093727, "grad_norm": 1.8783427476882935, "learning_rate": 1.9195244160300896e-05, "loss": 0.3002, "num_input_tokens_seen": 72262368, "step": 75655 }, { "epoch": 6.1717921527041355, "grad_norm": 54.888343811035156, "learning_rate": 1.9191782211451766e-05, "loss": 0.3778, "num_input_tokens_seen": 72266800, "step": 75660 }, { "epoch": 6.1722000163145445, "grad_norm": 1.3153743743896484, "learning_rate": 1.9188320380332754e-05, "loss": 0.3213, "num_input_tokens_seen": 72271872, "step": 75665 }, { "epoch": 6.1726078799249535, "grad_norm": 1.5564208030700684, "learning_rate": 1.9184858667014016e-05, "loss": 0.2272, "num_input_tokens_seen": 72276448, "step": 75670 }, { "epoch": 6.173015743535362, "grad_norm": 14.52729606628418, "learning_rate": 1.9181397071565723e-05, "loss": 0.4689, "num_input_tokens_seen": 72281584, "step": 75675 }, { "epoch": 6.173423607145771, "grad_norm": 18.899059295654297, "learning_rate": 1.9177935594058037e-05, "loss": 0.4423, "num_input_tokens_seen": 72286048, "step": 75680 }, { "epoch": 6.173831470756179, "grad_norm": 49.088462829589844, "learning_rate": 1.9174474234561122e-05, "loss": 0.6152, "num_input_tokens_seen": 72290496, "step": 75685 }, { "epoch": 6.174239334366588, "grad_norm": 4.147897720336914, "learning_rate": 1.917101299314514e-05, "loss": 0.32, "num_input_tokens_seen": 72295488, "step": 75690 }, { "epoch": 6.174647197976997, "grad_norm": 4.919432163238525, "learning_rate": 1.916755186988025e-05, "loss": 0.4664, "num_input_tokens_seen": 72300608, "step": 75695 }, { "epoch": 6.175055061587405, "grad_norm": 3.27243709564209, "learning_rate": 1.9164090864836605e-05, "loss": 0.3317, "num_input_tokens_seen": 72304752, "step": 75700 }, { "epoch": 6.175462925197814, "grad_norm": 0.4831583797931671, "learning_rate": 1.9160629978084347e-05, "loss": 0.4585, "num_input_tokens_seen": 72309152, "step": 75705 }, { "epoch": 6.175870788808223, "grad_norm": 1.8449980020523071, "learning_rate": 1.9157169209693647e-05, "loss": 0.3426, "num_input_tokens_seen": 72313200, "step": 75710 }, { "epoch": 6.176278652418631, "grad_norm": 3.5120272636413574, "learning_rate": 1.915370855973465e-05, "loss": 0.2624, "num_input_tokens_seen": 72317936, "step": 75715 }, { "epoch": 6.17668651602904, "grad_norm": 2.420058012008667, "learning_rate": 1.915024802827749e-05, "loss": 0.248, "num_input_tokens_seen": 72323040, "step": 75720 }, { "epoch": 6.177094379639448, "grad_norm": 0.3306371569633484, "learning_rate": 1.9146787615392308e-05, "loss": 0.2981, "num_input_tokens_seen": 72327888, "step": 75725 }, { "epoch": 6.177502243249857, "grad_norm": 23.172760009765625, "learning_rate": 1.914332732114927e-05, "loss": 0.4306, "num_input_tokens_seen": 72332528, "step": 75730 }, { "epoch": 6.177910106860266, "grad_norm": 4.0692620277404785, "learning_rate": 1.9139867145618488e-05, "loss": 0.3091, "num_input_tokens_seen": 72337360, "step": 75735 }, { "epoch": 6.178317970470674, "grad_norm": 2.558384895324707, "learning_rate": 1.913640708887012e-05, "loss": 0.3538, "num_input_tokens_seen": 72342384, "step": 75740 }, { "epoch": 6.178725834081083, "grad_norm": 27.312288284301758, "learning_rate": 1.913294715097428e-05, "loss": 0.4224, "num_input_tokens_seen": 72347264, "step": 75745 }, { "epoch": 6.179133697691492, "grad_norm": 4.548547267913818, "learning_rate": 1.9129487332001104e-05, "loss": 0.3062, "num_input_tokens_seen": 72351520, "step": 75750 }, { "epoch": 6.1795415613019005, "grad_norm": 11.809167861938477, "learning_rate": 1.9126027632020735e-05, "loss": 0.3404, "num_input_tokens_seen": 72357040, "step": 75755 }, { "epoch": 6.1799494249123095, "grad_norm": 1.1943659782409668, "learning_rate": 1.9122568051103288e-05, "loss": 0.3457, "num_input_tokens_seen": 72361328, "step": 75760 }, { "epoch": 6.180357288522718, "grad_norm": 1.6567996740341187, "learning_rate": 1.9119108589318887e-05, "loss": 0.3172, "num_input_tokens_seen": 72366416, "step": 75765 }, { "epoch": 6.180765152133127, "grad_norm": 14.902922630310059, "learning_rate": 1.911564924673765e-05, "loss": 0.2993, "num_input_tokens_seen": 72370704, "step": 75770 }, { "epoch": 6.181173015743536, "grad_norm": 0.6996009945869446, "learning_rate": 1.911219002342971e-05, "loss": 0.3305, "num_input_tokens_seen": 72375424, "step": 75775 }, { "epoch": 6.181580879353944, "grad_norm": 1.157345175743103, "learning_rate": 1.9108730919465175e-05, "loss": 0.2715, "num_input_tokens_seen": 72380640, "step": 75780 }, { "epoch": 6.181988742964353, "grad_norm": 2.7092463970184326, "learning_rate": 1.9105271934914163e-05, "loss": 0.2503, "num_input_tokens_seen": 72385200, "step": 75785 }, { "epoch": 6.182396606574762, "grad_norm": 4.6970930099487305, "learning_rate": 1.9101813069846784e-05, "loss": 0.3067, "num_input_tokens_seen": 72390032, "step": 75790 }, { "epoch": 6.18280447018517, "grad_norm": 0.4725806415081024, "learning_rate": 1.9098354324333135e-05, "loss": 0.2064, "num_input_tokens_seen": 72394992, "step": 75795 }, { "epoch": 6.183212333795579, "grad_norm": 1.4514189958572388, "learning_rate": 1.909489569844335e-05, "loss": 0.2296, "num_input_tokens_seen": 72400320, "step": 75800 }, { "epoch": 6.183620197405988, "grad_norm": 16.599014282226562, "learning_rate": 1.9091437192247517e-05, "loss": 0.2142, "num_input_tokens_seen": 72404976, "step": 75805 }, { "epoch": 6.184028061016396, "grad_norm": 28.885618209838867, "learning_rate": 1.9087978805815742e-05, "loss": 0.3837, "num_input_tokens_seen": 72409808, "step": 75810 }, { "epoch": 6.184435924626805, "grad_norm": 3.2087762355804443, "learning_rate": 1.908452053921812e-05, "loss": 0.3955, "num_input_tokens_seen": 72415008, "step": 75815 }, { "epoch": 6.184843788237213, "grad_norm": 5.412635326385498, "learning_rate": 1.9081062392524756e-05, "loss": 0.3449, "num_input_tokens_seen": 72418928, "step": 75820 }, { "epoch": 6.185251651847622, "grad_norm": 37.16259765625, "learning_rate": 1.9077604365805747e-05, "loss": 0.3153, "num_input_tokens_seen": 72423392, "step": 75825 }, { "epoch": 6.185659515458031, "grad_norm": 25.22427749633789, "learning_rate": 1.9074146459131177e-05, "loss": 0.6666, "num_input_tokens_seen": 72427360, "step": 75830 }, { "epoch": 6.186067379068439, "grad_norm": 1.5482723712921143, "learning_rate": 1.907068867257114e-05, "loss": 0.2077, "num_input_tokens_seen": 72432304, "step": 75835 }, { "epoch": 6.186475242678848, "grad_norm": 1.6178926229476929, "learning_rate": 1.9067231006195718e-05, "loss": 0.4772, "num_input_tokens_seen": 72436704, "step": 75840 }, { "epoch": 6.186883106289256, "grad_norm": 4.685051441192627, "learning_rate": 1.9063773460075007e-05, "loss": 0.4386, "num_input_tokens_seen": 72441328, "step": 75845 }, { "epoch": 6.187290969899665, "grad_norm": 4.846343517303467, "learning_rate": 1.906031603427909e-05, "loss": 0.3927, "num_input_tokens_seen": 72445616, "step": 75850 }, { "epoch": 6.187698833510074, "grad_norm": 7.601210117340088, "learning_rate": 1.905685872887804e-05, "loss": 0.3975, "num_input_tokens_seen": 72450512, "step": 75855 }, { "epoch": 6.188106697120483, "grad_norm": 0.37465569376945496, "learning_rate": 1.9053401543941932e-05, "loss": 0.3253, "num_input_tokens_seen": 72455760, "step": 75860 }, { "epoch": 6.188514560730892, "grad_norm": 0.7887141108512878, "learning_rate": 1.9049944479540855e-05, "loss": 0.1459, "num_input_tokens_seen": 72460048, "step": 75865 }, { "epoch": 6.188922424341301, "grad_norm": 0.4948706030845642, "learning_rate": 1.9046487535744876e-05, "loss": 0.3028, "num_input_tokens_seen": 72464816, "step": 75870 }, { "epoch": 6.189330287951709, "grad_norm": 18.618309020996094, "learning_rate": 1.9043030712624064e-05, "loss": 0.5336, "num_input_tokens_seen": 72469280, "step": 75875 }, { "epoch": 6.189738151562118, "grad_norm": 10.476644515991211, "learning_rate": 1.903957401024849e-05, "loss": 0.2624, "num_input_tokens_seen": 72474544, "step": 75880 }, { "epoch": 6.190146015172527, "grad_norm": 0.5664214491844177, "learning_rate": 1.903611742868821e-05, "loss": 0.3565, "num_input_tokens_seen": 72480112, "step": 75885 }, { "epoch": 6.190553878782935, "grad_norm": 1.7526085376739502, "learning_rate": 1.9032660968013303e-05, "loss": 0.3064, "num_input_tokens_seen": 72484928, "step": 75890 }, { "epoch": 6.190961742393344, "grad_norm": 1.2267297506332397, "learning_rate": 1.902920462829382e-05, "loss": 0.4013, "num_input_tokens_seen": 72489712, "step": 75895 }, { "epoch": 6.191369606003752, "grad_norm": 19.091541290283203, "learning_rate": 1.9025748409599825e-05, "loss": 0.4102, "num_input_tokens_seen": 72494752, "step": 75900 }, { "epoch": 6.191777469614161, "grad_norm": 0.4030444025993347, "learning_rate": 1.902229231200136e-05, "loss": 0.2499, "num_input_tokens_seen": 72499472, "step": 75905 }, { "epoch": 6.19218533322457, "grad_norm": 5.646551132202148, "learning_rate": 1.9018836335568497e-05, "loss": 0.4034, "num_input_tokens_seen": 72503632, "step": 75910 }, { "epoch": 6.192593196834978, "grad_norm": 1.2332581281661987, "learning_rate": 1.9015380480371283e-05, "loss": 0.3194, "num_input_tokens_seen": 72507488, "step": 75915 }, { "epoch": 6.193001060445387, "grad_norm": 10.752317428588867, "learning_rate": 1.9011924746479764e-05, "loss": 0.2765, "num_input_tokens_seen": 72511520, "step": 75920 }, { "epoch": 6.193408924055796, "grad_norm": 0.6253498792648315, "learning_rate": 1.9008469133963974e-05, "loss": 0.4131, "num_input_tokens_seen": 72515584, "step": 75925 }, { "epoch": 6.193816787666204, "grad_norm": 0.8184922337532043, "learning_rate": 1.9005013642893983e-05, "loss": 0.3518, "num_input_tokens_seen": 72519488, "step": 75930 }, { "epoch": 6.194224651276613, "grad_norm": 17.753726959228516, "learning_rate": 1.9001558273339815e-05, "loss": 0.2964, "num_input_tokens_seen": 72523872, "step": 75935 }, { "epoch": 6.194632514887021, "grad_norm": 1.665692925453186, "learning_rate": 1.8998103025371512e-05, "loss": 0.3886, "num_input_tokens_seen": 72528592, "step": 75940 }, { "epoch": 6.19504037849743, "grad_norm": 0.3580281138420105, "learning_rate": 1.8994647899059108e-05, "loss": 0.3033, "num_input_tokens_seen": 72532640, "step": 75945 }, { "epoch": 6.195448242107839, "grad_norm": 8.489018440246582, "learning_rate": 1.8991192894472637e-05, "loss": 0.3023, "num_input_tokens_seen": 72538048, "step": 75950 }, { "epoch": 6.1958561057182475, "grad_norm": 0.6346114873886108, "learning_rate": 1.8987738011682138e-05, "loss": 0.4553, "num_input_tokens_seen": 72543344, "step": 75955 }, { "epoch": 6.1962639693286565, "grad_norm": 2.001155138015747, "learning_rate": 1.8984283250757633e-05, "loss": 0.4107, "num_input_tokens_seen": 72547744, "step": 75960 }, { "epoch": 6.1966718329390655, "grad_norm": 0.46137139201164246, "learning_rate": 1.898082861176915e-05, "loss": 0.3284, "num_input_tokens_seen": 72552400, "step": 75965 }, { "epoch": 6.197079696549474, "grad_norm": 0.41024020314216614, "learning_rate": 1.8977374094786705e-05, "loss": 0.2802, "num_input_tokens_seen": 72557088, "step": 75970 }, { "epoch": 6.197487560159883, "grad_norm": 24.65411949157715, "learning_rate": 1.897391969988034e-05, "loss": 0.3416, "num_input_tokens_seen": 72561872, "step": 75975 }, { "epoch": 6.197895423770291, "grad_norm": 10.084493637084961, "learning_rate": 1.897046542712006e-05, "loss": 0.4018, "num_input_tokens_seen": 72566208, "step": 75980 }, { "epoch": 6.1983032873807, "grad_norm": 8.650223731994629, "learning_rate": 1.896701127657588e-05, "loss": 0.3447, "num_input_tokens_seen": 72571776, "step": 75985 }, { "epoch": 6.198711150991109, "grad_norm": 2.232560873031616, "learning_rate": 1.896355724831782e-05, "loss": 0.1738, "num_input_tokens_seen": 72575792, "step": 75990 }, { "epoch": 6.199119014601517, "grad_norm": 12.006745338439941, "learning_rate": 1.896010334241588e-05, "loss": 0.3847, "num_input_tokens_seen": 72581136, "step": 75995 }, { "epoch": 6.199526878211926, "grad_norm": 9.221879959106445, "learning_rate": 1.895664955894009e-05, "loss": 0.3549, "num_input_tokens_seen": 72585920, "step": 76000 }, { "epoch": 6.199934741822335, "grad_norm": 28.461278915405273, "learning_rate": 1.8953195897960444e-05, "loss": 0.398, "num_input_tokens_seen": 72590656, "step": 76005 }, { "epoch": 6.200342605432743, "grad_norm": 5.279106140136719, "learning_rate": 1.8949742359546947e-05, "loss": 0.3161, "num_input_tokens_seen": 72595488, "step": 76010 }, { "epoch": 6.200750469043152, "grad_norm": 1.21243155002594, "learning_rate": 1.89462889437696e-05, "loss": 0.3336, "num_input_tokens_seen": 72599952, "step": 76015 }, { "epoch": 6.201158332653561, "grad_norm": 0.4659670889377594, "learning_rate": 1.8942835650698408e-05, "loss": 0.399, "num_input_tokens_seen": 72605360, "step": 76020 }, { "epoch": 6.201566196263969, "grad_norm": 0.6907389760017395, "learning_rate": 1.893938248040336e-05, "loss": 0.3581, "num_input_tokens_seen": 72610768, "step": 76025 }, { "epoch": 6.201974059874378, "grad_norm": 13.595375061035156, "learning_rate": 1.893592943295446e-05, "loss": 0.4521, "num_input_tokens_seen": 72615760, "step": 76030 }, { "epoch": 6.202381923484786, "grad_norm": 25.063098907470703, "learning_rate": 1.8932476508421687e-05, "loss": 0.4274, "num_input_tokens_seen": 72621312, "step": 76035 }, { "epoch": 6.202789787095195, "grad_norm": 3.661168336868286, "learning_rate": 1.8929023706875026e-05, "loss": 0.4413, "num_input_tokens_seen": 72626096, "step": 76040 }, { "epoch": 6.203197650705604, "grad_norm": 4.168900489807129, "learning_rate": 1.892557102838449e-05, "loss": 0.4855, "num_input_tokens_seen": 72630976, "step": 76045 }, { "epoch": 6.2036055143160125, "grad_norm": 31.894712448120117, "learning_rate": 1.8922118473020046e-05, "loss": 0.3736, "num_input_tokens_seen": 72635920, "step": 76050 }, { "epoch": 6.2040133779264215, "grad_norm": 7.0954484939575195, "learning_rate": 1.891866604085168e-05, "loss": 0.3897, "num_input_tokens_seen": 72641024, "step": 76055 }, { "epoch": 6.2044212415368305, "grad_norm": 1.5836350917816162, "learning_rate": 1.8915213731949356e-05, "loss": 0.2965, "num_input_tokens_seen": 72645584, "step": 76060 }, { "epoch": 6.204829105147239, "grad_norm": 2.4197282791137695, "learning_rate": 1.8911761546383076e-05, "loss": 0.4285, "num_input_tokens_seen": 72649264, "step": 76065 }, { "epoch": 6.205236968757648, "grad_norm": 17.24532699584961, "learning_rate": 1.89083094842228e-05, "loss": 0.2615, "num_input_tokens_seen": 72653920, "step": 76070 }, { "epoch": 6.205644832368056, "grad_norm": 4.209134578704834, "learning_rate": 1.8904857545538507e-05, "loss": 0.2917, "num_input_tokens_seen": 72658480, "step": 76075 }, { "epoch": 6.206052695978465, "grad_norm": 2.2376317977905273, "learning_rate": 1.890140573040015e-05, "loss": 0.2709, "num_input_tokens_seen": 72663760, "step": 76080 }, { "epoch": 6.206460559588874, "grad_norm": 0.8733569383621216, "learning_rate": 1.889795403887772e-05, "loss": 0.3813, "num_input_tokens_seen": 72668240, "step": 76085 }, { "epoch": 6.206868423199282, "grad_norm": 0.9236204028129578, "learning_rate": 1.889450247104116e-05, "loss": 0.4132, "num_input_tokens_seen": 72672304, "step": 76090 }, { "epoch": 6.207276286809691, "grad_norm": 127.73095703125, "learning_rate": 1.8891051026960448e-05, "loss": 0.6511, "num_input_tokens_seen": 72676960, "step": 76095 }, { "epoch": 6.2076841504201, "grad_norm": 0.8501973748207092, "learning_rate": 1.8887599706705533e-05, "loss": 0.4218, "num_input_tokens_seen": 72681696, "step": 76100 }, { "epoch": 6.208092014030508, "grad_norm": 11.575432777404785, "learning_rate": 1.8884148510346365e-05, "loss": 0.3676, "num_input_tokens_seen": 72686496, "step": 76105 }, { "epoch": 6.208499877640917, "grad_norm": 1.1681967973709106, "learning_rate": 1.8880697437952916e-05, "loss": 0.415, "num_input_tokens_seen": 72691728, "step": 76110 }, { "epoch": 6.208907741251325, "grad_norm": 32.6343994140625, "learning_rate": 1.8877246489595132e-05, "loss": 0.352, "num_input_tokens_seen": 72696976, "step": 76115 }, { "epoch": 6.209315604861734, "grad_norm": 1.5488566160202026, "learning_rate": 1.8873795665342962e-05, "loss": 0.372, "num_input_tokens_seen": 72701424, "step": 76120 }, { "epoch": 6.209723468472143, "grad_norm": 17.097129821777344, "learning_rate": 1.8870344965266334e-05, "loss": 0.5033, "num_input_tokens_seen": 72705440, "step": 76125 }, { "epoch": 6.210131332082551, "grad_norm": 9.325850486755371, "learning_rate": 1.8866894389435223e-05, "loss": 0.4457, "num_input_tokens_seen": 72711040, "step": 76130 }, { "epoch": 6.21053919569296, "grad_norm": 1.4063504934310913, "learning_rate": 1.886344393791956e-05, "loss": 0.4166, "num_input_tokens_seen": 72715280, "step": 76135 }, { "epoch": 6.210947059303369, "grad_norm": 3.5082859992980957, "learning_rate": 1.8859993610789275e-05, "loss": 0.3655, "num_input_tokens_seen": 72719696, "step": 76140 }, { "epoch": 6.211354922913777, "grad_norm": 0.3489958345890045, "learning_rate": 1.8856543408114314e-05, "loss": 0.3068, "num_input_tokens_seen": 72724560, "step": 76145 }, { "epoch": 6.2117627865241865, "grad_norm": 3.3639159202575684, "learning_rate": 1.88530933299646e-05, "loss": 0.2959, "num_input_tokens_seen": 72729712, "step": 76150 }, { "epoch": 6.212170650134595, "grad_norm": 2.618180274963379, "learning_rate": 1.8849643376410082e-05, "loss": 0.4148, "num_input_tokens_seen": 72734224, "step": 76155 }, { "epoch": 6.212578513745004, "grad_norm": 2.224393367767334, "learning_rate": 1.8846193547520674e-05, "loss": 0.3017, "num_input_tokens_seen": 72739264, "step": 76160 }, { "epoch": 6.212986377355413, "grad_norm": 4.742082118988037, "learning_rate": 1.884274384336631e-05, "loss": 0.385, "num_input_tokens_seen": 72744640, "step": 76165 }, { "epoch": 6.213394240965821, "grad_norm": 1.3293852806091309, "learning_rate": 1.8839294264016898e-05, "loss": 0.4494, "num_input_tokens_seen": 72750304, "step": 76170 }, { "epoch": 6.21380210457623, "grad_norm": 1.4287478923797607, "learning_rate": 1.8835844809542392e-05, "loss": 0.3805, "num_input_tokens_seen": 72755184, "step": 76175 }, { "epoch": 6.214209968186639, "grad_norm": 2.0566844940185547, "learning_rate": 1.8832395480012686e-05, "loss": 0.3827, "num_input_tokens_seen": 72760112, "step": 76180 }, { "epoch": 6.214617831797047, "grad_norm": 1.7900868654251099, "learning_rate": 1.8828946275497704e-05, "loss": 0.2402, "num_input_tokens_seen": 72764368, "step": 76185 }, { "epoch": 6.215025695407456, "grad_norm": 4.590693950653076, "learning_rate": 1.882549719606736e-05, "loss": 0.5009, "num_input_tokens_seen": 72769312, "step": 76190 }, { "epoch": 6.215433559017864, "grad_norm": 6.839274883270264, "learning_rate": 1.882204824179155e-05, "loss": 0.2323, "num_input_tokens_seen": 72773648, "step": 76195 }, { "epoch": 6.215841422628273, "grad_norm": 1.4450973272323608, "learning_rate": 1.8818599412740212e-05, "loss": 0.3118, "num_input_tokens_seen": 72778752, "step": 76200 }, { "epoch": 6.216249286238682, "grad_norm": 3.570331335067749, "learning_rate": 1.8815150708983235e-05, "loss": 0.4555, "num_input_tokens_seen": 72783456, "step": 76205 }, { "epoch": 6.21665714984909, "grad_norm": 1.716187596321106, "learning_rate": 1.8811702130590526e-05, "loss": 0.324, "num_input_tokens_seen": 72788144, "step": 76210 }, { "epoch": 6.217065013459499, "grad_norm": 0.7668477892875671, "learning_rate": 1.8808253677631978e-05, "loss": 0.2687, "num_input_tokens_seen": 72793296, "step": 76215 }, { "epoch": 6.217472877069908, "grad_norm": 1.2659674882888794, "learning_rate": 1.8804805350177505e-05, "loss": 0.3464, "num_input_tokens_seen": 72797584, "step": 76220 }, { "epoch": 6.217880740680316, "grad_norm": 5.6301589012146, "learning_rate": 1.8801357148296995e-05, "loss": 0.6278, "num_input_tokens_seen": 72802048, "step": 76225 }, { "epoch": 6.218288604290725, "grad_norm": 6.254997253417969, "learning_rate": 1.8797909072060342e-05, "loss": 0.3485, "num_input_tokens_seen": 72806400, "step": 76230 }, { "epoch": 6.218696467901134, "grad_norm": 1.1933751106262207, "learning_rate": 1.8794461121537428e-05, "loss": 0.2918, "num_input_tokens_seen": 72811408, "step": 76235 }, { "epoch": 6.219104331511542, "grad_norm": 1.3287500143051147, "learning_rate": 1.879101329679815e-05, "loss": 0.5948, "num_input_tokens_seen": 72816288, "step": 76240 }, { "epoch": 6.219512195121951, "grad_norm": 14.809852600097656, "learning_rate": 1.8787565597912403e-05, "loss": 0.373, "num_input_tokens_seen": 72821248, "step": 76245 }, { "epoch": 6.2199200587323595, "grad_norm": 2.9882142543792725, "learning_rate": 1.8784118024950058e-05, "loss": 0.2805, "num_input_tokens_seen": 72825968, "step": 76250 }, { "epoch": 6.2203279223427685, "grad_norm": 4.892888069152832, "learning_rate": 1.8780670577981e-05, "loss": 0.3701, "num_input_tokens_seen": 72830640, "step": 76255 }, { "epoch": 6.2207357859531776, "grad_norm": 1.1608439683914185, "learning_rate": 1.87772232570751e-05, "loss": 0.2752, "num_input_tokens_seen": 72835696, "step": 76260 }, { "epoch": 6.221143649563586, "grad_norm": 2.440037965774536, "learning_rate": 1.8773776062302252e-05, "loss": 0.2054, "num_input_tokens_seen": 72840768, "step": 76265 }, { "epoch": 6.221551513173995, "grad_norm": 1.4873747825622559, "learning_rate": 1.877032899373231e-05, "loss": 0.3865, "num_input_tokens_seen": 72844880, "step": 76270 }, { "epoch": 6.221959376784404, "grad_norm": 26.335874557495117, "learning_rate": 1.876688205143516e-05, "loss": 0.3112, "num_input_tokens_seen": 72849712, "step": 76275 }, { "epoch": 6.222367240394812, "grad_norm": 3.4594876766204834, "learning_rate": 1.876343523548065e-05, "loss": 0.2732, "num_input_tokens_seen": 72854592, "step": 76280 }, { "epoch": 6.222775104005221, "grad_norm": 5.223794460296631, "learning_rate": 1.8759988545938668e-05, "loss": 0.2822, "num_input_tokens_seen": 72859456, "step": 76285 }, { "epoch": 6.223182967615629, "grad_norm": 3.264766216278076, "learning_rate": 1.8756541982879063e-05, "loss": 0.4792, "num_input_tokens_seen": 72864336, "step": 76290 }, { "epoch": 6.223590831226038, "grad_norm": 2.665884256362915, "learning_rate": 1.8753095546371696e-05, "loss": 0.4611, "num_input_tokens_seen": 72869344, "step": 76295 }, { "epoch": 6.223998694836447, "grad_norm": 14.057893753051758, "learning_rate": 1.8749649236486434e-05, "loss": 0.4999, "num_input_tokens_seen": 72873792, "step": 76300 }, { "epoch": 6.224406558446855, "grad_norm": 3.400770664215088, "learning_rate": 1.8746203053293116e-05, "loss": 0.2457, "num_input_tokens_seen": 72878800, "step": 76305 }, { "epoch": 6.224814422057264, "grad_norm": 1.5389879941940308, "learning_rate": 1.8742756996861613e-05, "loss": 0.3598, "num_input_tokens_seen": 72883536, "step": 76310 }, { "epoch": 6.225222285667673, "grad_norm": 0.43312346935272217, "learning_rate": 1.8739311067261767e-05, "loss": 0.2883, "num_input_tokens_seen": 72888128, "step": 76315 }, { "epoch": 6.225630149278081, "grad_norm": 4.84112024307251, "learning_rate": 1.8735865264563425e-05, "loss": 0.3834, "num_input_tokens_seen": 72892688, "step": 76320 }, { "epoch": 6.22603801288849, "grad_norm": 28.608869552612305, "learning_rate": 1.8732419588836423e-05, "loss": 0.3144, "num_input_tokens_seen": 72898048, "step": 76325 }, { "epoch": 6.226445876498898, "grad_norm": 1.8844149112701416, "learning_rate": 1.872897404015062e-05, "loss": 0.3069, "num_input_tokens_seen": 72903248, "step": 76330 }, { "epoch": 6.226853740109307, "grad_norm": 6.855342388153076, "learning_rate": 1.872552861857585e-05, "loss": 0.4022, "num_input_tokens_seen": 72908240, "step": 76335 }, { "epoch": 6.227261603719716, "grad_norm": 0.7654627561569214, "learning_rate": 1.872208332418195e-05, "loss": 0.2771, "num_input_tokens_seen": 72913504, "step": 76340 }, { "epoch": 6.2276694673301245, "grad_norm": 8.442315101623535, "learning_rate": 1.8718638157038754e-05, "loss": 0.2846, "num_input_tokens_seen": 72918352, "step": 76345 }, { "epoch": 6.2280773309405335, "grad_norm": 1.5436731576919556, "learning_rate": 1.8715193117216086e-05, "loss": 0.3267, "num_input_tokens_seen": 72923248, "step": 76350 }, { "epoch": 6.2284851945509425, "grad_norm": 1.067367434501648, "learning_rate": 1.8711748204783785e-05, "loss": 0.2789, "num_input_tokens_seen": 72928784, "step": 76355 }, { "epoch": 6.228893058161351, "grad_norm": 1.397977352142334, "learning_rate": 1.870830341981168e-05, "loss": 0.3597, "num_input_tokens_seen": 72933296, "step": 76360 }, { "epoch": 6.22930092177176, "grad_norm": 5.663522720336914, "learning_rate": 1.870485876236959e-05, "loss": 0.3693, "num_input_tokens_seen": 72938016, "step": 76365 }, { "epoch": 6.229708785382169, "grad_norm": 1.3079737424850464, "learning_rate": 1.8701414232527337e-05, "loss": 0.3335, "num_input_tokens_seen": 72942368, "step": 76370 }, { "epoch": 6.230116648992577, "grad_norm": 5.643996715545654, "learning_rate": 1.8697969830354738e-05, "loss": 0.4132, "num_input_tokens_seen": 72947472, "step": 76375 }, { "epoch": 6.230524512602986, "grad_norm": 1.0922425985336304, "learning_rate": 1.8694525555921622e-05, "loss": 0.3244, "num_input_tokens_seen": 72952640, "step": 76380 }, { "epoch": 6.230932376213394, "grad_norm": 1.2844847440719604, "learning_rate": 1.8691081409297788e-05, "loss": 0.3852, "num_input_tokens_seen": 72957776, "step": 76385 }, { "epoch": 6.231340239823803, "grad_norm": 9.576950073242188, "learning_rate": 1.8687637390553053e-05, "loss": 0.3847, "num_input_tokens_seen": 72962864, "step": 76390 }, { "epoch": 6.231748103434212, "grad_norm": 24.1468563079834, "learning_rate": 1.868419349975722e-05, "loss": 0.4713, "num_input_tokens_seen": 72967408, "step": 76395 }, { "epoch": 6.23215596704462, "grad_norm": 0.8142842650413513, "learning_rate": 1.8680749736980107e-05, "loss": 0.3261, "num_input_tokens_seen": 72971344, "step": 76400 }, { "epoch": 6.232563830655029, "grad_norm": 17.37195587158203, "learning_rate": 1.8677306102291515e-05, "loss": 0.2829, "num_input_tokens_seen": 72976448, "step": 76405 }, { "epoch": 6.232971694265438, "grad_norm": 3.409298896789551, "learning_rate": 1.8673862595761237e-05, "loss": 0.379, "num_input_tokens_seen": 72981280, "step": 76410 }, { "epoch": 6.233379557875846, "grad_norm": 0.6668975353240967, "learning_rate": 1.867041921745907e-05, "loss": 0.3053, "num_input_tokens_seen": 72986592, "step": 76415 }, { "epoch": 6.233787421486255, "grad_norm": 5.934099197387695, "learning_rate": 1.8666975967454823e-05, "loss": 0.3326, "num_input_tokens_seen": 72991936, "step": 76420 }, { "epoch": 6.234195285096663, "grad_norm": 17.979419708251953, "learning_rate": 1.866353284581828e-05, "loss": 0.2961, "num_input_tokens_seen": 72996704, "step": 76425 }, { "epoch": 6.234603148707072, "grad_norm": 0.6902462840080261, "learning_rate": 1.866008985261924e-05, "loss": 0.3501, "num_input_tokens_seen": 73001568, "step": 76430 }, { "epoch": 6.235011012317481, "grad_norm": 6.405952453613281, "learning_rate": 1.8656646987927486e-05, "loss": 0.3317, "num_input_tokens_seen": 73006176, "step": 76435 }, { "epoch": 6.2354188759278895, "grad_norm": 0.6598004698753357, "learning_rate": 1.865320425181279e-05, "loss": 0.4553, "num_input_tokens_seen": 73011056, "step": 76440 }, { "epoch": 6.2358267395382985, "grad_norm": 0.8491405248641968, "learning_rate": 1.8649761644344955e-05, "loss": 0.2847, "num_input_tokens_seen": 73016208, "step": 76445 }, { "epoch": 6.2362346031487075, "grad_norm": 0.5180286169052124, "learning_rate": 1.8646319165593752e-05, "loss": 0.3684, "num_input_tokens_seen": 73021200, "step": 76450 }, { "epoch": 6.236642466759116, "grad_norm": 1.1226942539215088, "learning_rate": 1.8642876815628962e-05, "loss": 0.4888, "num_input_tokens_seen": 73026384, "step": 76455 }, { "epoch": 6.237050330369525, "grad_norm": 8.188790321350098, "learning_rate": 1.8639434594520343e-05, "loss": 0.3059, "num_input_tokens_seen": 73031280, "step": 76460 }, { "epoch": 6.237458193979933, "grad_norm": 1.0509624481201172, "learning_rate": 1.8635992502337694e-05, "loss": 0.2712, "num_input_tokens_seen": 73035744, "step": 76465 }, { "epoch": 6.237866057590342, "grad_norm": 2.0403332710266113, "learning_rate": 1.8632550539150777e-05, "loss": 0.3721, "num_input_tokens_seen": 73040544, "step": 76470 }, { "epoch": 6.238273921200751, "grad_norm": 1.2258433103561401, "learning_rate": 1.8629108705029354e-05, "loss": 0.3287, "num_input_tokens_seen": 73045184, "step": 76475 }, { "epoch": 6.238681784811159, "grad_norm": 6.115640640258789, "learning_rate": 1.862566700004317e-05, "loss": 0.2237, "num_input_tokens_seen": 73049808, "step": 76480 }, { "epoch": 6.239089648421568, "grad_norm": 2.6186575889587402, "learning_rate": 1.862222542426203e-05, "loss": 0.2875, "num_input_tokens_seen": 73054784, "step": 76485 }, { "epoch": 6.239497512031977, "grad_norm": 16.875049591064453, "learning_rate": 1.8618783977755666e-05, "loss": 0.3147, "num_input_tokens_seen": 73059824, "step": 76490 }, { "epoch": 6.239905375642385, "grad_norm": 1.1595265865325928, "learning_rate": 1.861534266059384e-05, "loss": 0.3364, "num_input_tokens_seen": 73064352, "step": 76495 }, { "epoch": 6.240313239252794, "grad_norm": 2.0479609966278076, "learning_rate": 1.8611901472846304e-05, "loss": 0.4382, "num_input_tokens_seen": 73069520, "step": 76500 }, { "epoch": 6.240721102863202, "grad_norm": 2.1149418354034424, "learning_rate": 1.8608460414582807e-05, "loss": 0.3381, "num_input_tokens_seen": 73074080, "step": 76505 }, { "epoch": 6.241128966473611, "grad_norm": 3.414883613586426, "learning_rate": 1.8605019485873105e-05, "loss": 0.3504, "num_input_tokens_seen": 73079456, "step": 76510 }, { "epoch": 6.24153683008402, "grad_norm": 6.1857075691223145, "learning_rate": 1.8601578686786943e-05, "loss": 0.351, "num_input_tokens_seen": 73084608, "step": 76515 }, { "epoch": 6.241944693694428, "grad_norm": 2.462517738342285, "learning_rate": 1.859813801739406e-05, "loss": 0.3275, "num_input_tokens_seen": 73089456, "step": 76520 }, { "epoch": 6.242352557304837, "grad_norm": 1.731310486793518, "learning_rate": 1.859469747776419e-05, "loss": 0.3162, "num_input_tokens_seen": 73093488, "step": 76525 }, { "epoch": 6.242760420915246, "grad_norm": 0.6464357376098633, "learning_rate": 1.8591257067967088e-05, "loss": 0.3124, "num_input_tokens_seen": 73097408, "step": 76530 }, { "epoch": 6.243168284525654, "grad_norm": 11.8245849609375, "learning_rate": 1.8587816788072484e-05, "loss": 0.3706, "num_input_tokens_seen": 73102560, "step": 76535 }, { "epoch": 6.243576148136063, "grad_norm": 0.5451933741569519, "learning_rate": 1.8584376638150108e-05, "loss": 0.381, "num_input_tokens_seen": 73107504, "step": 76540 }, { "epoch": 6.2439840117464716, "grad_norm": 28.13566017150879, "learning_rate": 1.8580936618269696e-05, "loss": 0.3369, "num_input_tokens_seen": 73112352, "step": 76545 }, { "epoch": 6.244391875356881, "grad_norm": 15.968891143798828, "learning_rate": 1.8577496728500954e-05, "loss": 0.2749, "num_input_tokens_seen": 73116656, "step": 76550 }, { "epoch": 6.24479973896729, "grad_norm": 6.64534330368042, "learning_rate": 1.8574056968913638e-05, "loss": 0.3021, "num_input_tokens_seen": 73120720, "step": 76555 }, { "epoch": 6.245207602577698, "grad_norm": 8.88025951385498, "learning_rate": 1.8570617339577458e-05, "loss": 0.465, "num_input_tokens_seen": 73124864, "step": 76560 }, { "epoch": 6.245615466188107, "grad_norm": 4.2465596199035645, "learning_rate": 1.856717784056213e-05, "loss": 0.3069, "num_input_tokens_seen": 73129184, "step": 76565 }, { "epoch": 6.246023329798516, "grad_norm": 5.712649345397949, "learning_rate": 1.856373847193737e-05, "loss": 0.3394, "num_input_tokens_seen": 73134160, "step": 76570 }, { "epoch": 6.246431193408924, "grad_norm": 1.340423583984375, "learning_rate": 1.8560299233772897e-05, "loss": 0.4651, "num_input_tokens_seen": 73139536, "step": 76575 }, { "epoch": 6.246839057019333, "grad_norm": 4.943142414093018, "learning_rate": 1.855686012613843e-05, "loss": 0.1837, "num_input_tokens_seen": 73144528, "step": 76580 }, { "epoch": 6.247246920629742, "grad_norm": 0.7481539249420166, "learning_rate": 1.8553421149103665e-05, "loss": 0.3114, "num_input_tokens_seen": 73149792, "step": 76585 }, { "epoch": 6.24765478424015, "grad_norm": 34.50926208496094, "learning_rate": 1.8549982302738318e-05, "loss": 0.4035, "num_input_tokens_seen": 73154944, "step": 76590 }, { "epoch": 6.248062647850559, "grad_norm": 0.6826930642127991, "learning_rate": 1.8546543587112075e-05, "loss": 0.2565, "num_input_tokens_seen": 73160256, "step": 76595 }, { "epoch": 6.248470511460967, "grad_norm": 3.389587640762329, "learning_rate": 1.8543105002294666e-05, "loss": 0.3224, "num_input_tokens_seen": 73164896, "step": 76600 }, { "epoch": 6.248878375071376, "grad_norm": 3.2957651615142822, "learning_rate": 1.8539666548355766e-05, "loss": 0.3401, "num_input_tokens_seen": 73170512, "step": 76605 }, { "epoch": 6.249286238681785, "grad_norm": 1.3004640340805054, "learning_rate": 1.8536228225365086e-05, "loss": 0.4436, "num_input_tokens_seen": 73175408, "step": 76610 }, { "epoch": 6.249694102292193, "grad_norm": 16.01030921936035, "learning_rate": 1.85327900333923e-05, "loss": 0.471, "num_input_tokens_seen": 73180048, "step": 76615 }, { "epoch": 6.250101965902602, "grad_norm": 1.302858591079712, "learning_rate": 1.8529351972507128e-05, "loss": 0.3177, "num_input_tokens_seen": 73184432, "step": 76620 }, { "epoch": 6.250509829513011, "grad_norm": 1.497917890548706, "learning_rate": 1.852591404277924e-05, "loss": 0.2567, "num_input_tokens_seen": 73188864, "step": 76625 }, { "epoch": 6.250917693123419, "grad_norm": 3.857039213180542, "learning_rate": 1.852247624427832e-05, "loss": 0.3788, "num_input_tokens_seen": 73194064, "step": 76630 }, { "epoch": 6.251325556733828, "grad_norm": 4.704765319824219, "learning_rate": 1.851903857707405e-05, "loss": 0.2308, "num_input_tokens_seen": 73199584, "step": 76635 }, { "epoch": 6.2517334203442365, "grad_norm": 9.683005332946777, "learning_rate": 1.851560104123612e-05, "loss": 0.57, "num_input_tokens_seen": 73204320, "step": 76640 }, { "epoch": 6.2521412839546455, "grad_norm": 3.953812599182129, "learning_rate": 1.85121636368342e-05, "loss": 0.317, "num_input_tokens_seen": 73209168, "step": 76645 }, { "epoch": 6.2525491475650545, "grad_norm": 5.606827259063721, "learning_rate": 1.850872636393797e-05, "loss": 0.391, "num_input_tokens_seen": 73214192, "step": 76650 }, { "epoch": 6.252957011175463, "grad_norm": 4.085766315460205, "learning_rate": 1.8505289222617095e-05, "loss": 0.2567, "num_input_tokens_seen": 73218832, "step": 76655 }, { "epoch": 6.253364874785872, "grad_norm": 2.2584872245788574, "learning_rate": 1.850185221294124e-05, "loss": 0.4248, "num_input_tokens_seen": 73222800, "step": 76660 }, { "epoch": 6.253772738396281, "grad_norm": 1.5142451524734497, "learning_rate": 1.8498415334980087e-05, "loss": 0.5673, "num_input_tokens_seen": 73227216, "step": 76665 }, { "epoch": 6.254180602006689, "grad_norm": 1.8842988014221191, "learning_rate": 1.8494978588803293e-05, "loss": 0.4016, "num_input_tokens_seen": 73231840, "step": 76670 }, { "epoch": 6.254588465617098, "grad_norm": 1.1193161010742188, "learning_rate": 1.849154197448052e-05, "loss": 0.2829, "num_input_tokens_seen": 73237760, "step": 76675 }, { "epoch": 6.254996329227506, "grad_norm": 11.500587463378906, "learning_rate": 1.8488105492081416e-05, "loss": 0.3044, "num_input_tokens_seen": 73243168, "step": 76680 }, { "epoch": 6.255404192837915, "grad_norm": 6.813935279846191, "learning_rate": 1.8484669141675654e-05, "loss": 0.4035, "num_input_tokens_seen": 73248560, "step": 76685 }, { "epoch": 6.255812056448324, "grad_norm": 37.21426773071289, "learning_rate": 1.8481232923332885e-05, "loss": 0.2813, "num_input_tokens_seen": 73252944, "step": 76690 }, { "epoch": 6.256219920058732, "grad_norm": 27.150524139404297, "learning_rate": 1.847779683712275e-05, "loss": 0.4758, "num_input_tokens_seen": 73257712, "step": 76695 }, { "epoch": 6.256627783669141, "grad_norm": 2.507761001586914, "learning_rate": 1.8474360883114902e-05, "loss": 0.3069, "num_input_tokens_seen": 73262832, "step": 76700 }, { "epoch": 6.25703564727955, "grad_norm": 2.9471590518951416, "learning_rate": 1.8470925061378984e-05, "loss": 0.4301, "num_input_tokens_seen": 73267648, "step": 76705 }, { "epoch": 6.257443510889958, "grad_norm": 10.347329139709473, "learning_rate": 1.846748937198464e-05, "loss": 0.3151, "num_input_tokens_seen": 73272288, "step": 76710 }, { "epoch": 6.257851374500367, "grad_norm": 35.958213806152344, "learning_rate": 1.8464053815001515e-05, "loss": 0.5422, "num_input_tokens_seen": 73277648, "step": 76715 }, { "epoch": 6.258259238110776, "grad_norm": 0.7829374670982361, "learning_rate": 1.846061839049924e-05, "loss": 0.2928, "num_input_tokens_seen": 73282560, "step": 76720 }, { "epoch": 6.258667101721184, "grad_norm": 2.2584025859832764, "learning_rate": 1.8457183098547442e-05, "loss": 0.3373, "num_input_tokens_seen": 73287952, "step": 76725 }, { "epoch": 6.259074965331593, "grad_norm": 8.577962875366211, "learning_rate": 1.8453747939215778e-05, "loss": 0.3629, "num_input_tokens_seen": 73292016, "step": 76730 }, { "epoch": 6.2594828289420015, "grad_norm": 12.035452842712402, "learning_rate": 1.8450312912573854e-05, "loss": 0.3148, "num_input_tokens_seen": 73297472, "step": 76735 }, { "epoch": 6.2598906925524105, "grad_norm": 0.9686073660850525, "learning_rate": 1.8446878018691304e-05, "loss": 0.3454, "num_input_tokens_seen": 73302800, "step": 76740 }, { "epoch": 6.2602985561628195, "grad_norm": 1.5723317861557007, "learning_rate": 1.8443443257637757e-05, "loss": 0.2557, "num_input_tokens_seen": 73307584, "step": 76745 }, { "epoch": 6.260706419773228, "grad_norm": 2.0826478004455566, "learning_rate": 1.8440008629482816e-05, "loss": 0.3378, "num_input_tokens_seen": 73312304, "step": 76750 }, { "epoch": 6.261114283383637, "grad_norm": 1.6487364768981934, "learning_rate": 1.8436574134296126e-05, "loss": 0.2748, "num_input_tokens_seen": 73316528, "step": 76755 }, { "epoch": 6.261522146994045, "grad_norm": 27.50298500061035, "learning_rate": 1.8433139772147284e-05, "loss": 0.5667, "num_input_tokens_seen": 73321488, "step": 76760 }, { "epoch": 6.261930010604454, "grad_norm": 26.426856994628906, "learning_rate": 1.8429705543105912e-05, "loss": 0.3819, "num_input_tokens_seen": 73327184, "step": 76765 }, { "epoch": 6.262337874214863, "grad_norm": 1.3546862602233887, "learning_rate": 1.842627144724161e-05, "loss": 0.2368, "num_input_tokens_seen": 73332224, "step": 76770 }, { "epoch": 6.262745737825271, "grad_norm": 8.322549819946289, "learning_rate": 1.8422837484624e-05, "loss": 0.2855, "num_input_tokens_seen": 73336656, "step": 76775 }, { "epoch": 6.26315360143568, "grad_norm": 7.387156963348389, "learning_rate": 1.8419403655322676e-05, "loss": 0.33, "num_input_tokens_seen": 73341712, "step": 76780 }, { "epoch": 6.263561465046089, "grad_norm": 1.1904077529907227, "learning_rate": 1.8415969959407246e-05, "loss": 0.3533, "num_input_tokens_seen": 73346576, "step": 76785 }, { "epoch": 6.263969328656497, "grad_norm": 2.509909152984619, "learning_rate": 1.8412536396947305e-05, "loss": 0.3493, "num_input_tokens_seen": 73352320, "step": 76790 }, { "epoch": 6.264377192266906, "grad_norm": 1.9302268028259277, "learning_rate": 1.8409102968012442e-05, "loss": 0.4402, "num_input_tokens_seen": 73357248, "step": 76795 }, { "epoch": 6.264785055877315, "grad_norm": 1.8906917572021484, "learning_rate": 1.8405669672672273e-05, "loss": 0.2379, "num_input_tokens_seen": 73361712, "step": 76800 }, { "epoch": 6.265192919487723, "grad_norm": 6.44372034072876, "learning_rate": 1.840223651099638e-05, "loss": 0.4164, "num_input_tokens_seen": 73365392, "step": 76805 }, { "epoch": 6.265600783098132, "grad_norm": 4.747570991516113, "learning_rate": 1.8398803483054344e-05, "loss": 0.2458, "num_input_tokens_seen": 73369600, "step": 76810 }, { "epoch": 6.26600864670854, "grad_norm": 0.4019930064678192, "learning_rate": 1.839537058891575e-05, "loss": 0.2517, "num_input_tokens_seen": 73374784, "step": 76815 }, { "epoch": 6.266416510318949, "grad_norm": 3.7733919620513916, "learning_rate": 1.8391937828650195e-05, "loss": 0.3766, "num_input_tokens_seen": 73379072, "step": 76820 }, { "epoch": 6.266824373929358, "grad_norm": 4.547204971313477, "learning_rate": 1.8388505202327256e-05, "loss": 0.2616, "num_input_tokens_seen": 73384576, "step": 76825 }, { "epoch": 6.267232237539766, "grad_norm": 44.43955612182617, "learning_rate": 1.8385072710016506e-05, "loss": 0.4673, "num_input_tokens_seen": 73389376, "step": 76830 }, { "epoch": 6.2676401011501754, "grad_norm": 0.5777702331542969, "learning_rate": 1.8381640351787514e-05, "loss": 0.4121, "num_input_tokens_seen": 73394096, "step": 76835 }, { "epoch": 6.2680479647605845, "grad_norm": 2.864438533782959, "learning_rate": 1.8378208127709866e-05, "loss": 0.2715, "num_input_tokens_seen": 73399184, "step": 76840 }, { "epoch": 6.268455828370993, "grad_norm": 3.3891923427581787, "learning_rate": 1.8374776037853127e-05, "loss": 0.3114, "num_input_tokens_seen": 73403808, "step": 76845 }, { "epoch": 6.268863691981402, "grad_norm": 1.2376426458358765, "learning_rate": 1.837134408228686e-05, "loss": 0.3283, "num_input_tokens_seen": 73407680, "step": 76850 }, { "epoch": 6.26927155559181, "grad_norm": 16.37047576904297, "learning_rate": 1.8367912261080634e-05, "loss": 0.3401, "num_input_tokens_seen": 73412624, "step": 76855 }, { "epoch": 6.269679419202219, "grad_norm": 23.957683563232422, "learning_rate": 1.8364480574303993e-05, "loss": 0.4121, "num_input_tokens_seen": 73418032, "step": 76860 }, { "epoch": 6.270087282812628, "grad_norm": 25.566646575927734, "learning_rate": 1.8361049022026526e-05, "loss": 0.34, "num_input_tokens_seen": 73423408, "step": 76865 }, { "epoch": 6.270495146423036, "grad_norm": 3.20041823387146, "learning_rate": 1.8357617604317776e-05, "loss": 0.4558, "num_input_tokens_seen": 73427904, "step": 76870 }, { "epoch": 6.270903010033445, "grad_norm": 1.6150633096694946, "learning_rate": 1.835418632124729e-05, "loss": 0.2851, "num_input_tokens_seen": 73432768, "step": 76875 }, { "epoch": 6.271310873643854, "grad_norm": 1.1197068691253662, "learning_rate": 1.835075517288462e-05, "loss": 0.3347, "num_input_tokens_seen": 73437616, "step": 76880 }, { "epoch": 6.271718737254262, "grad_norm": 2.692675828933716, "learning_rate": 1.834732415929932e-05, "loss": 0.2575, "num_input_tokens_seen": 73442320, "step": 76885 }, { "epoch": 6.272126600864671, "grad_norm": 2.4274816513061523, "learning_rate": 1.8343893280560937e-05, "loss": 0.4066, "num_input_tokens_seen": 73446944, "step": 76890 }, { "epoch": 6.272534464475079, "grad_norm": 1.6840147972106934, "learning_rate": 1.8340462536739006e-05, "loss": 0.313, "num_input_tokens_seen": 73452384, "step": 76895 }, { "epoch": 6.272942328085488, "grad_norm": 2.1602063179016113, "learning_rate": 1.833703192790307e-05, "loss": 0.1955, "num_input_tokens_seen": 73457024, "step": 76900 }, { "epoch": 6.273350191695897, "grad_norm": 0.43021437525749207, "learning_rate": 1.8333601454122664e-05, "loss": 0.3984, "num_input_tokens_seen": 73460992, "step": 76905 }, { "epoch": 6.273758055306305, "grad_norm": 0.6007034778594971, "learning_rate": 1.8330171115467325e-05, "loss": 0.2682, "num_input_tokens_seen": 73465728, "step": 76910 }, { "epoch": 6.274165918916714, "grad_norm": 0.5027748942375183, "learning_rate": 1.8326740912006586e-05, "loss": 0.3859, "num_input_tokens_seen": 73470352, "step": 76915 }, { "epoch": 6.274573782527123, "grad_norm": 8.679080963134766, "learning_rate": 1.832331084380997e-05, "loss": 0.3925, "num_input_tokens_seen": 73474464, "step": 76920 }, { "epoch": 6.274981646137531, "grad_norm": 86.01029968261719, "learning_rate": 1.8319880910946997e-05, "loss": 0.4876, "num_input_tokens_seen": 73478704, "step": 76925 }, { "epoch": 6.27538950974794, "grad_norm": 0.28357866406440735, "learning_rate": 1.8316451113487208e-05, "loss": 0.3229, "num_input_tokens_seen": 73482960, "step": 76930 }, { "epoch": 6.275797373358349, "grad_norm": 16.624073028564453, "learning_rate": 1.8313021451500118e-05, "loss": 0.3108, "num_input_tokens_seen": 73488080, "step": 76935 }, { "epoch": 6.2762052369687575, "grad_norm": 0.9859725832939148, "learning_rate": 1.8309591925055243e-05, "loss": 0.3092, "num_input_tokens_seen": 73493152, "step": 76940 }, { "epoch": 6.2766131005791665, "grad_norm": 8.684216499328613, "learning_rate": 1.830616253422209e-05, "loss": 0.3694, "num_input_tokens_seen": 73497600, "step": 76945 }, { "epoch": 6.277020964189575, "grad_norm": 12.29192066192627, "learning_rate": 1.8302733279070174e-05, "loss": 0.2884, "num_input_tokens_seen": 73501504, "step": 76950 }, { "epoch": 6.277428827799984, "grad_norm": 1.8533639907836914, "learning_rate": 1.8299304159669018e-05, "loss": 0.2615, "num_input_tokens_seen": 73505968, "step": 76955 }, { "epoch": 6.277836691410393, "grad_norm": 3.0243091583251953, "learning_rate": 1.8295875176088118e-05, "loss": 0.3115, "num_input_tokens_seen": 73511328, "step": 76960 }, { "epoch": 6.278244555020801, "grad_norm": 8.854353904724121, "learning_rate": 1.8292446328396978e-05, "loss": 0.4003, "num_input_tokens_seen": 73515648, "step": 76965 }, { "epoch": 6.27865241863121, "grad_norm": 1.2961252927780151, "learning_rate": 1.8289017616665095e-05, "loss": 0.3156, "num_input_tokens_seen": 73520688, "step": 76970 }, { "epoch": 6.279060282241618, "grad_norm": 1.5446839332580566, "learning_rate": 1.828558904096198e-05, "loss": 0.3659, "num_input_tokens_seen": 73525728, "step": 76975 }, { "epoch": 6.279468145852027, "grad_norm": 0.5552583932876587, "learning_rate": 1.8282160601357126e-05, "loss": 0.2639, "num_input_tokens_seen": 73530016, "step": 76980 }, { "epoch": 6.279876009462436, "grad_norm": 20.21912384033203, "learning_rate": 1.8278732297920016e-05, "loss": 0.3222, "num_input_tokens_seen": 73535328, "step": 76985 }, { "epoch": 6.280283873072844, "grad_norm": 16.974308013916016, "learning_rate": 1.8275304130720144e-05, "loss": 0.3559, "num_input_tokens_seen": 73540128, "step": 76990 }, { "epoch": 6.280691736683253, "grad_norm": 10.479327201843262, "learning_rate": 1.8271876099827e-05, "loss": 0.412, "num_input_tokens_seen": 73545760, "step": 76995 }, { "epoch": 6.281099600293662, "grad_norm": 0.6441667675971985, "learning_rate": 1.8268448205310074e-05, "loss": 0.254, "num_input_tokens_seen": 73550032, "step": 77000 }, { "epoch": 6.28150746390407, "grad_norm": 0.6409634351730347, "learning_rate": 1.826502044723884e-05, "loss": 0.2648, "num_input_tokens_seen": 73555232, "step": 77005 }, { "epoch": 6.281915327514479, "grad_norm": 3.409062385559082, "learning_rate": 1.8261592825682783e-05, "loss": 0.3777, "num_input_tokens_seen": 73560048, "step": 77010 }, { "epoch": 6.282323191124888, "grad_norm": 14.89600658416748, "learning_rate": 1.8258165340711365e-05, "loss": 0.2697, "num_input_tokens_seen": 73564320, "step": 77015 }, { "epoch": 6.282731054735296, "grad_norm": 5.082791805267334, "learning_rate": 1.8254737992394082e-05, "loss": 0.3301, "num_input_tokens_seen": 73569392, "step": 77020 }, { "epoch": 6.283138918345705, "grad_norm": 25.656770706176758, "learning_rate": 1.8251310780800392e-05, "loss": 0.358, "num_input_tokens_seen": 73574224, "step": 77025 }, { "epoch": 6.2835467819561135, "grad_norm": 1.257750391960144, "learning_rate": 1.8247883705999765e-05, "loss": 0.3593, "num_input_tokens_seen": 73578832, "step": 77030 }, { "epoch": 6.2839546455665225, "grad_norm": 0.6070613861083984, "learning_rate": 1.8244456768061658e-05, "loss": 0.2642, "num_input_tokens_seen": 73582848, "step": 77035 }, { "epoch": 6.2843625091769315, "grad_norm": 1.3031667470932007, "learning_rate": 1.824102996705555e-05, "loss": 0.3768, "num_input_tokens_seen": 73587840, "step": 77040 }, { "epoch": 6.28477037278734, "grad_norm": 0.5401557683944702, "learning_rate": 1.8237603303050893e-05, "loss": 0.2531, "num_input_tokens_seen": 73591568, "step": 77045 }, { "epoch": 6.285178236397749, "grad_norm": 9.437376022338867, "learning_rate": 1.8234176776117136e-05, "loss": 0.4701, "num_input_tokens_seen": 73596576, "step": 77050 }, { "epoch": 6.285586100008158, "grad_norm": 0.8806806206703186, "learning_rate": 1.8230750386323746e-05, "loss": 0.3538, "num_input_tokens_seen": 73601088, "step": 77055 }, { "epoch": 6.285993963618566, "grad_norm": 2.0220277309417725, "learning_rate": 1.8227324133740166e-05, "loss": 0.4868, "num_input_tokens_seen": 73605696, "step": 77060 }, { "epoch": 6.286401827228975, "grad_norm": 0.6661459803581238, "learning_rate": 1.8223898018435853e-05, "loss": 0.2515, "num_input_tokens_seen": 73609920, "step": 77065 }, { "epoch": 6.286809690839383, "grad_norm": 0.31302541494369507, "learning_rate": 1.8220472040480246e-05, "loss": 0.3297, "num_input_tokens_seen": 73614288, "step": 77070 }, { "epoch": 6.287217554449792, "grad_norm": 25.817859649658203, "learning_rate": 1.8217046199942796e-05, "loss": 0.3331, "num_input_tokens_seen": 73619008, "step": 77075 }, { "epoch": 6.287625418060201, "grad_norm": 2.2121479511260986, "learning_rate": 1.8213620496892918e-05, "loss": 0.2661, "num_input_tokens_seen": 73624112, "step": 77080 }, { "epoch": 6.288033281670609, "grad_norm": 1.3897428512573242, "learning_rate": 1.8210194931400086e-05, "loss": 0.4046, "num_input_tokens_seen": 73629264, "step": 77085 }, { "epoch": 6.288441145281018, "grad_norm": 2.350292205810547, "learning_rate": 1.8206769503533716e-05, "loss": 0.2828, "num_input_tokens_seen": 73634160, "step": 77090 }, { "epoch": 6.288849008891427, "grad_norm": 1.4963679313659668, "learning_rate": 1.8203344213363238e-05, "loss": 0.4201, "num_input_tokens_seen": 73638736, "step": 77095 }, { "epoch": 6.289256872501835, "grad_norm": 38.23546600341797, "learning_rate": 1.819991906095809e-05, "loss": 0.3829, "num_input_tokens_seen": 73643088, "step": 77100 }, { "epoch": 6.289664736112244, "grad_norm": 2.7285163402557373, "learning_rate": 1.8196494046387683e-05, "loss": 0.3542, "num_input_tokens_seen": 73648656, "step": 77105 }, { "epoch": 6.290072599722652, "grad_norm": 0.6109917163848877, "learning_rate": 1.8193069169721454e-05, "loss": 0.2824, "num_input_tokens_seen": 73653616, "step": 77110 }, { "epoch": 6.290480463333061, "grad_norm": 2.204486846923828, "learning_rate": 1.8189644431028822e-05, "loss": 0.2822, "num_input_tokens_seen": 73658144, "step": 77115 }, { "epoch": 6.29088832694347, "grad_norm": 11.99041748046875, "learning_rate": 1.818621983037921e-05, "loss": 0.2827, "num_input_tokens_seen": 73663344, "step": 77120 }, { "epoch": 6.2912961905538785, "grad_norm": 33.82164001464844, "learning_rate": 1.8182795367842016e-05, "loss": 0.4582, "num_input_tokens_seen": 73668560, "step": 77125 }, { "epoch": 6.2917040541642875, "grad_norm": 0.8343924880027771, "learning_rate": 1.817937104348667e-05, "loss": 0.3292, "num_input_tokens_seen": 73673248, "step": 77130 }, { "epoch": 6.2921119177746965, "grad_norm": 21.98107147216797, "learning_rate": 1.8175946857382574e-05, "loss": 0.4356, "num_input_tokens_seen": 73678064, "step": 77135 }, { "epoch": 6.292519781385105, "grad_norm": 0.5090906620025635, "learning_rate": 1.8172522809599135e-05, "loss": 0.3067, "num_input_tokens_seen": 73682416, "step": 77140 }, { "epoch": 6.292927644995514, "grad_norm": 16.404922485351562, "learning_rate": 1.8169098900205762e-05, "loss": 0.5573, "num_input_tokens_seen": 73687296, "step": 77145 }, { "epoch": 6.293335508605923, "grad_norm": 0.4647042751312256, "learning_rate": 1.8165675129271837e-05, "loss": 0.2583, "num_input_tokens_seen": 73691568, "step": 77150 }, { "epoch": 6.293743372216331, "grad_norm": 2.136997699737549, "learning_rate": 1.8162251496866782e-05, "loss": 0.2718, "num_input_tokens_seen": 73696496, "step": 77155 }, { "epoch": 6.29415123582674, "grad_norm": 1.0396255254745483, "learning_rate": 1.8158828003059984e-05, "loss": 0.3298, "num_input_tokens_seen": 73700080, "step": 77160 }, { "epoch": 6.294559099437148, "grad_norm": 4.422112941741943, "learning_rate": 1.8155404647920838e-05, "loss": 0.2847, "num_input_tokens_seen": 73705568, "step": 77165 }, { "epoch": 6.294966963047557, "grad_norm": 0.988427996635437, "learning_rate": 1.8151981431518722e-05, "loss": 0.453, "num_input_tokens_seen": 73710320, "step": 77170 }, { "epoch": 6.295374826657966, "grad_norm": 3.0043132305145264, "learning_rate": 1.814855835392304e-05, "loss": 0.3779, "num_input_tokens_seen": 73715056, "step": 77175 }, { "epoch": 6.295782690268374, "grad_norm": 13.695394515991211, "learning_rate": 1.814513541520317e-05, "loss": 0.4373, "num_input_tokens_seen": 73720896, "step": 77180 }, { "epoch": 6.296190553878783, "grad_norm": 10.757589340209961, "learning_rate": 1.814171261542849e-05, "loss": 0.3555, "num_input_tokens_seen": 73727328, "step": 77185 }, { "epoch": 6.296598417489192, "grad_norm": 4.0637526512146, "learning_rate": 1.813828995466838e-05, "loss": 0.2963, "num_input_tokens_seen": 73732080, "step": 77190 }, { "epoch": 6.2970062810996, "grad_norm": 0.6549537181854248, "learning_rate": 1.813486743299222e-05, "loss": 0.4258, "num_input_tokens_seen": 73736848, "step": 77195 }, { "epoch": 6.297414144710009, "grad_norm": 35.707733154296875, "learning_rate": 1.813144505046938e-05, "loss": 0.2393, "num_input_tokens_seen": 73741088, "step": 77200 }, { "epoch": 6.297822008320417, "grad_norm": 0.6955711841583252, "learning_rate": 1.8128022807169232e-05, "loss": 0.2978, "num_input_tokens_seen": 73745536, "step": 77205 }, { "epoch": 6.298229871930826, "grad_norm": 6.790191650390625, "learning_rate": 1.8124600703161138e-05, "loss": 0.2973, "num_input_tokens_seen": 73750752, "step": 77210 }, { "epoch": 6.298637735541235, "grad_norm": 1.9242098331451416, "learning_rate": 1.812117873851446e-05, "loss": 0.4756, "num_input_tokens_seen": 73755760, "step": 77215 }, { "epoch": 6.299045599151643, "grad_norm": 12.397647857666016, "learning_rate": 1.8117756913298572e-05, "loss": 0.384, "num_input_tokens_seen": 73760496, "step": 77220 }, { "epoch": 6.299453462762052, "grad_norm": 16.208850860595703, "learning_rate": 1.8114335227582828e-05, "loss": 0.2676, "num_input_tokens_seen": 73765840, "step": 77225 }, { "epoch": 6.299861326372461, "grad_norm": 1.7817797660827637, "learning_rate": 1.8110913681436585e-05, "loss": 0.561, "num_input_tokens_seen": 73770960, "step": 77230 }, { "epoch": 6.3002691899828696, "grad_norm": 3.7276744842529297, "learning_rate": 1.8107492274929182e-05, "loss": 0.3032, "num_input_tokens_seen": 73775568, "step": 77235 }, { "epoch": 6.300677053593279, "grad_norm": 8.499481201171875, "learning_rate": 1.8104071008129994e-05, "loss": 0.2978, "num_input_tokens_seen": 73780032, "step": 77240 }, { "epoch": 6.301084917203687, "grad_norm": 4.5965256690979, "learning_rate": 1.8100649881108356e-05, "loss": 0.3225, "num_input_tokens_seen": 73784624, "step": 77245 }, { "epoch": 6.301492780814096, "grad_norm": 1.447798252105713, "learning_rate": 1.8097228893933612e-05, "loss": 0.2413, "num_input_tokens_seen": 73789216, "step": 77250 }, { "epoch": 6.301900644424505, "grad_norm": 9.396846771240234, "learning_rate": 1.8093808046675105e-05, "loss": 0.3272, "num_input_tokens_seen": 73794416, "step": 77255 }, { "epoch": 6.302308508034913, "grad_norm": 2.9996120929718018, "learning_rate": 1.809038733940217e-05, "loss": 0.4312, "num_input_tokens_seen": 73799312, "step": 77260 }, { "epoch": 6.302716371645322, "grad_norm": 47.43699264526367, "learning_rate": 1.8086966772184154e-05, "loss": 0.3732, "num_input_tokens_seen": 73803440, "step": 77265 }, { "epoch": 6.303124235255731, "grad_norm": 0.9395434856414795, "learning_rate": 1.8083546345090384e-05, "loss": 0.2569, "num_input_tokens_seen": 73808224, "step": 77270 }, { "epoch": 6.303532098866139, "grad_norm": 1.9654157161712646, "learning_rate": 1.808012605819019e-05, "loss": 0.2648, "num_input_tokens_seen": 73812816, "step": 77275 }, { "epoch": 6.303939962476548, "grad_norm": 1.4487262964248657, "learning_rate": 1.8076705911552886e-05, "loss": 0.4128, "num_input_tokens_seen": 73818064, "step": 77280 }, { "epoch": 6.304347826086957, "grad_norm": 0.6558910012245178, "learning_rate": 1.807328590524783e-05, "loss": 0.283, "num_input_tokens_seen": 73821376, "step": 77285 }, { "epoch": 6.304755689697365, "grad_norm": 2.0175955295562744, "learning_rate": 1.8069866039344316e-05, "loss": 0.2145, "num_input_tokens_seen": 73826256, "step": 77290 }, { "epoch": 6.305163553307774, "grad_norm": 3.1217989921569824, "learning_rate": 1.8066446313911677e-05, "loss": 0.5771, "num_input_tokens_seen": 73831632, "step": 77295 }, { "epoch": 6.305571416918182, "grad_norm": 4.084808349609375, "learning_rate": 1.8063026729019226e-05, "loss": 0.349, "num_input_tokens_seen": 73835888, "step": 77300 }, { "epoch": 6.305979280528591, "grad_norm": 4.082308769226074, "learning_rate": 1.8059607284736263e-05, "loss": 0.6122, "num_input_tokens_seen": 73840560, "step": 77305 }, { "epoch": 6.306387144139, "grad_norm": 42.138648986816406, "learning_rate": 1.8056187981132123e-05, "loss": 0.2857, "num_input_tokens_seen": 73845088, "step": 77310 }, { "epoch": 6.306795007749408, "grad_norm": 4.53717565536499, "learning_rate": 1.80527688182761e-05, "loss": 0.3525, "num_input_tokens_seen": 73849472, "step": 77315 }, { "epoch": 6.307202871359817, "grad_norm": 0.8959092497825623, "learning_rate": 1.8049349796237504e-05, "loss": 0.3769, "num_input_tokens_seen": 73853696, "step": 77320 }, { "epoch": 6.3076107349702255, "grad_norm": 0.44133296608924866, "learning_rate": 1.804593091508563e-05, "loss": 0.2994, "num_input_tokens_seen": 73859376, "step": 77325 }, { "epoch": 6.3080185985806345, "grad_norm": 0.7432082891464233, "learning_rate": 1.804251217488978e-05, "loss": 0.3405, "num_input_tokens_seen": 73863408, "step": 77330 }, { "epoch": 6.3084264621910435, "grad_norm": 0.30785802006721497, "learning_rate": 1.8039093575719257e-05, "loss": 0.3421, "num_input_tokens_seen": 73867792, "step": 77335 }, { "epoch": 6.308834325801452, "grad_norm": 0.5496312379837036, "learning_rate": 1.8035675117643348e-05, "loss": 0.2515, "num_input_tokens_seen": 73872496, "step": 77340 }, { "epoch": 6.309242189411861, "grad_norm": 23.72325325012207, "learning_rate": 1.8032256800731344e-05, "loss": 0.3563, "num_input_tokens_seen": 73878080, "step": 77345 }, { "epoch": 6.30965005302227, "grad_norm": 0.4650271236896515, "learning_rate": 1.8028838625052527e-05, "loss": 0.2786, "num_input_tokens_seen": 73882832, "step": 77350 }, { "epoch": 6.310057916632678, "grad_norm": 4.164438247680664, "learning_rate": 1.8025420590676196e-05, "loss": 0.443, "num_input_tokens_seen": 73887664, "step": 77355 }, { "epoch": 6.310465780243087, "grad_norm": 4.72676944732666, "learning_rate": 1.8022002697671626e-05, "loss": 0.3667, "num_input_tokens_seen": 73891824, "step": 77360 }, { "epoch": 6.310873643853496, "grad_norm": 3.487877130508423, "learning_rate": 1.80185849461081e-05, "loss": 0.5095, "num_input_tokens_seen": 73895920, "step": 77365 }, { "epoch": 6.311281507463904, "grad_norm": 0.5179075002670288, "learning_rate": 1.8015167336054877e-05, "loss": 0.2921, "num_input_tokens_seen": 73901168, "step": 77370 }, { "epoch": 6.311689371074313, "grad_norm": 1.3265068531036377, "learning_rate": 1.8011749867581255e-05, "loss": 0.3433, "num_input_tokens_seen": 73905600, "step": 77375 }, { "epoch": 6.312097234684721, "grad_norm": 6.03257417678833, "learning_rate": 1.8008332540756492e-05, "loss": 0.2904, "num_input_tokens_seen": 73911120, "step": 77380 }, { "epoch": 6.31250509829513, "grad_norm": 0.45222875475883484, "learning_rate": 1.8004915355649858e-05, "loss": 0.3289, "num_input_tokens_seen": 73915232, "step": 77385 }, { "epoch": 6.312912961905539, "grad_norm": 2.151299238204956, "learning_rate": 1.8001498312330614e-05, "loss": 0.3246, "num_input_tokens_seen": 73920768, "step": 77390 }, { "epoch": 6.313320825515947, "grad_norm": 1.052507758140564, "learning_rate": 1.799808141086803e-05, "loss": 0.3548, "num_input_tokens_seen": 73925424, "step": 77395 }, { "epoch": 6.313728689126356, "grad_norm": 0.43308863043785095, "learning_rate": 1.7994664651331362e-05, "loss": 0.2912, "num_input_tokens_seen": 73930080, "step": 77400 }, { "epoch": 6.314136552736765, "grad_norm": 0.46175920963287354, "learning_rate": 1.799124803378986e-05, "loss": 0.378, "num_input_tokens_seen": 73934400, "step": 77405 }, { "epoch": 6.314544416347173, "grad_norm": 4.185465335845947, "learning_rate": 1.7987831558312788e-05, "loss": 0.3305, "num_input_tokens_seen": 73938256, "step": 77410 }, { "epoch": 6.314952279957582, "grad_norm": 1.2950637340545654, "learning_rate": 1.7984415224969377e-05, "loss": 0.372, "num_input_tokens_seen": 73941248, "step": 77415 }, { "epoch": 6.3153601435679905, "grad_norm": 1.2446420192718506, "learning_rate": 1.79809990338289e-05, "loss": 0.3504, "num_input_tokens_seen": 73946064, "step": 77420 }, { "epoch": 6.3157680071783995, "grad_norm": 1.6900830268859863, "learning_rate": 1.7977582984960587e-05, "loss": 0.3497, "num_input_tokens_seen": 73951056, "step": 77425 }, { "epoch": 6.3161758707888085, "grad_norm": 7.533816814422607, "learning_rate": 1.7974167078433683e-05, "loss": 0.2799, "num_input_tokens_seen": 73955568, "step": 77430 }, { "epoch": 6.316583734399217, "grad_norm": 1.960839867591858, "learning_rate": 1.797075131431742e-05, "loss": 0.2257, "num_input_tokens_seen": 73959648, "step": 77435 }, { "epoch": 6.316991598009626, "grad_norm": 17.206777572631836, "learning_rate": 1.796733569268105e-05, "loss": 0.4115, "num_input_tokens_seen": 73964384, "step": 77440 }, { "epoch": 6.317399461620035, "grad_norm": 3.27494215965271, "learning_rate": 1.79639202135938e-05, "loss": 0.3021, "num_input_tokens_seen": 73969360, "step": 77445 }, { "epoch": 6.317807325230443, "grad_norm": 0.8688982725143433, "learning_rate": 1.796050487712489e-05, "loss": 0.3559, "num_input_tokens_seen": 73974160, "step": 77450 }, { "epoch": 6.318215188840852, "grad_norm": 4.757335186004639, "learning_rate": 1.795708968334356e-05, "loss": 0.2531, "num_input_tokens_seen": 73979088, "step": 77455 }, { "epoch": 6.31862305245126, "grad_norm": 46.32008743286133, "learning_rate": 1.7953674632319022e-05, "loss": 0.3229, "num_input_tokens_seen": 73983824, "step": 77460 }, { "epoch": 6.319030916061669, "grad_norm": 8.50662612915039, "learning_rate": 1.7950259724120512e-05, "loss": 0.4827, "num_input_tokens_seen": 73988592, "step": 77465 }, { "epoch": 6.319438779672078, "grad_norm": 0.5468900799751282, "learning_rate": 1.7946844958817243e-05, "loss": 0.3972, "num_input_tokens_seen": 73993568, "step": 77470 }, { "epoch": 6.319846643282486, "grad_norm": 11.792555809020996, "learning_rate": 1.7943430336478428e-05, "loss": 0.3895, "num_input_tokens_seen": 73998448, "step": 77475 }, { "epoch": 6.320254506892895, "grad_norm": 15.921340942382812, "learning_rate": 1.794001585717327e-05, "loss": 0.2626, "num_input_tokens_seen": 74003216, "step": 77480 }, { "epoch": 6.320662370503304, "grad_norm": 0.4994043707847595, "learning_rate": 1.7936601520971002e-05, "loss": 0.2712, "num_input_tokens_seen": 74007792, "step": 77485 }, { "epoch": 6.321070234113712, "grad_norm": 13.631969451904297, "learning_rate": 1.793318732794082e-05, "loss": 0.4703, "num_input_tokens_seen": 74012528, "step": 77490 }, { "epoch": 6.321478097724121, "grad_norm": 0.7858084440231323, "learning_rate": 1.792977327815193e-05, "loss": 0.2024, "num_input_tokens_seen": 74016400, "step": 77495 }, { "epoch": 6.32188596133453, "grad_norm": 8.560807228088379, "learning_rate": 1.792635937167353e-05, "loss": 0.6435, "num_input_tokens_seen": 74021216, "step": 77500 }, { "epoch": 6.322293824944938, "grad_norm": 24.925945281982422, "learning_rate": 1.792294560857481e-05, "loss": 0.4001, "num_input_tokens_seen": 74025712, "step": 77505 }, { "epoch": 6.322701688555347, "grad_norm": 17.46969985961914, "learning_rate": 1.791953198892498e-05, "loss": 0.3487, "num_input_tokens_seen": 74030752, "step": 77510 }, { "epoch": 6.323109552165755, "grad_norm": 0.9810513257980347, "learning_rate": 1.7916118512793233e-05, "loss": 0.2399, "num_input_tokens_seen": 74035616, "step": 77515 }, { "epoch": 6.323517415776164, "grad_norm": 8.163458824157715, "learning_rate": 1.7912705180248756e-05, "loss": 0.3895, "num_input_tokens_seen": 74039376, "step": 77520 }, { "epoch": 6.3239252793865735, "grad_norm": 9.977571487426758, "learning_rate": 1.7909291991360724e-05, "loss": 0.4645, "num_input_tokens_seen": 74043808, "step": 77525 }, { "epoch": 6.324333142996982, "grad_norm": 1.4599100351333618, "learning_rate": 1.7905878946198337e-05, "loss": 0.318, "num_input_tokens_seen": 74048496, "step": 77530 }, { "epoch": 6.324741006607391, "grad_norm": 29.5716495513916, "learning_rate": 1.7902466044830768e-05, "loss": 0.5346, "num_input_tokens_seen": 74052752, "step": 77535 }, { "epoch": 6.325148870217799, "grad_norm": 0.5128431916236877, "learning_rate": 1.78990532873272e-05, "loss": 0.3363, "num_input_tokens_seen": 74057664, "step": 77540 }, { "epoch": 6.325556733828208, "grad_norm": 3.588440418243408, "learning_rate": 1.78956406737568e-05, "loss": 0.2822, "num_input_tokens_seen": 74062336, "step": 77545 }, { "epoch": 6.325964597438617, "grad_norm": 0.8268582224845886, "learning_rate": 1.7892228204188735e-05, "loss": 0.3738, "num_input_tokens_seen": 74067168, "step": 77550 }, { "epoch": 6.326372461049025, "grad_norm": 20.243183135986328, "learning_rate": 1.7888815878692194e-05, "loss": 0.3507, "num_input_tokens_seen": 74072096, "step": 77555 }, { "epoch": 6.326780324659434, "grad_norm": 0.4703957140445709, "learning_rate": 1.7885403697336338e-05, "loss": 0.3307, "num_input_tokens_seen": 74076976, "step": 77560 }, { "epoch": 6.327188188269843, "grad_norm": 3.5148258209228516, "learning_rate": 1.788199166019032e-05, "loss": 0.3545, "num_input_tokens_seen": 74082112, "step": 77565 }, { "epoch": 6.327596051880251, "grad_norm": 27.32256507873535, "learning_rate": 1.78785797673233e-05, "loss": 0.4326, "num_input_tokens_seen": 74087008, "step": 77570 }, { "epoch": 6.32800391549066, "grad_norm": 1.2855043411254883, "learning_rate": 1.787516801880445e-05, "loss": 0.3087, "num_input_tokens_seen": 74091120, "step": 77575 }, { "epoch": 6.328411779101069, "grad_norm": 7.135586738586426, "learning_rate": 1.787175641470292e-05, "loss": 0.3882, "num_input_tokens_seen": 74095904, "step": 77580 }, { "epoch": 6.328819642711477, "grad_norm": 6.141177654266357, "learning_rate": 1.7868344955087856e-05, "loss": 0.3318, "num_input_tokens_seen": 74100992, "step": 77585 }, { "epoch": 6.329227506321886, "grad_norm": 1.0001986026763916, "learning_rate": 1.7864933640028404e-05, "loss": 0.2197, "num_input_tokens_seen": 74105136, "step": 77590 }, { "epoch": 6.329635369932294, "grad_norm": 1.636494517326355, "learning_rate": 1.7861522469593723e-05, "loss": 0.2591, "num_input_tokens_seen": 74109680, "step": 77595 }, { "epoch": 6.330043233542703, "grad_norm": 0.8750458359718323, "learning_rate": 1.7858111443852946e-05, "loss": 0.4093, "num_input_tokens_seen": 74114608, "step": 77600 }, { "epoch": 6.330451097153112, "grad_norm": 3.3578617572784424, "learning_rate": 1.7854700562875217e-05, "loss": 0.2997, "num_input_tokens_seen": 74119536, "step": 77605 }, { "epoch": 6.33085896076352, "grad_norm": 0.44279947876930237, "learning_rate": 1.7851289826729665e-05, "loss": 0.5944, "num_input_tokens_seen": 74123600, "step": 77610 }, { "epoch": 6.331266824373929, "grad_norm": 20.13684844970703, "learning_rate": 1.784787923548543e-05, "loss": 0.4183, "num_input_tokens_seen": 74128816, "step": 77615 }, { "epoch": 6.331674687984338, "grad_norm": 12.077390670776367, "learning_rate": 1.784446878921165e-05, "loss": 0.3321, "num_input_tokens_seen": 74133920, "step": 77620 }, { "epoch": 6.3320825515947465, "grad_norm": 3.322039842605591, "learning_rate": 1.7841058487977446e-05, "loss": 0.383, "num_input_tokens_seen": 74138400, "step": 77625 }, { "epoch": 6.3324904152051555, "grad_norm": 5.151928901672363, "learning_rate": 1.7837648331851942e-05, "loss": 0.5, "num_input_tokens_seen": 74142784, "step": 77630 }, { "epoch": 6.332898278815564, "grad_norm": 0.4611760079860687, "learning_rate": 1.7834238320904256e-05, "loss": 0.5681, "num_input_tokens_seen": 74147712, "step": 77635 }, { "epoch": 6.333306142425973, "grad_norm": 6.2981672286987305, "learning_rate": 1.7830828455203524e-05, "loss": 0.3632, "num_input_tokens_seen": 74152864, "step": 77640 }, { "epoch": 6.333714006036382, "grad_norm": 1.1864919662475586, "learning_rate": 1.7827418734818853e-05, "loss": 0.2907, "num_input_tokens_seen": 74157984, "step": 77645 }, { "epoch": 6.33412186964679, "grad_norm": 3.2538273334503174, "learning_rate": 1.7824009159819354e-05, "loss": 0.3142, "num_input_tokens_seen": 74162944, "step": 77650 }, { "epoch": 6.334529733257199, "grad_norm": 37.74242401123047, "learning_rate": 1.7820599730274145e-05, "loss": 0.4079, "num_input_tokens_seen": 74168464, "step": 77655 }, { "epoch": 6.334937596867608, "grad_norm": 2.3387579917907715, "learning_rate": 1.7817190446252317e-05, "loss": 0.3458, "num_input_tokens_seen": 74173792, "step": 77660 }, { "epoch": 6.335345460478016, "grad_norm": 8.855677604675293, "learning_rate": 1.7813781307822998e-05, "loss": 0.3373, "num_input_tokens_seen": 74179072, "step": 77665 }, { "epoch": 6.335753324088425, "grad_norm": 0.6943948864936829, "learning_rate": 1.7810372315055275e-05, "loss": 0.4194, "num_input_tokens_seen": 74183520, "step": 77670 }, { "epoch": 6.336161187698833, "grad_norm": 1.633152723312378, "learning_rate": 1.780696346801825e-05, "loss": 0.2807, "num_input_tokens_seen": 74188688, "step": 77675 }, { "epoch": 6.336569051309242, "grad_norm": 15.65218734741211, "learning_rate": 1.780355476678101e-05, "loss": 0.2009, "num_input_tokens_seen": 74194656, "step": 77680 }, { "epoch": 6.336976914919651, "grad_norm": 1.8015793561935425, "learning_rate": 1.780014621141267e-05, "loss": 0.4385, "num_input_tokens_seen": 74199888, "step": 77685 }, { "epoch": 6.337384778530059, "grad_norm": 2.071377992630005, "learning_rate": 1.7796737801982304e-05, "loss": 0.2207, "num_input_tokens_seen": 74205664, "step": 77690 }, { "epoch": 6.337792642140468, "grad_norm": 9.283409118652344, "learning_rate": 1.7793329538559e-05, "loss": 0.3864, "num_input_tokens_seen": 74210256, "step": 77695 }, { "epoch": 6.338200505750877, "grad_norm": 36.677555084228516, "learning_rate": 1.7789921421211847e-05, "loss": 0.3712, "num_input_tokens_seen": 74215088, "step": 77700 }, { "epoch": 6.338608369361285, "grad_norm": 5.197506427764893, "learning_rate": 1.778651345000991e-05, "loss": 0.4075, "num_input_tokens_seen": 74220544, "step": 77705 }, { "epoch": 6.339016232971694, "grad_norm": 1.8618440628051758, "learning_rate": 1.778310562502229e-05, "loss": 0.3299, "num_input_tokens_seen": 74225440, "step": 77710 }, { "epoch": 6.339424096582103, "grad_norm": 2.0953686237335205, "learning_rate": 1.7779697946318057e-05, "loss": 0.3965, "num_input_tokens_seen": 74230720, "step": 77715 }, { "epoch": 6.3398319601925115, "grad_norm": 0.43423327803611755, "learning_rate": 1.7776290413966273e-05, "loss": 0.3675, "num_input_tokens_seen": 74236048, "step": 77720 }, { "epoch": 6.3402398238029205, "grad_norm": 43.847408294677734, "learning_rate": 1.7772883028036012e-05, "loss": 0.2717, "num_input_tokens_seen": 74241280, "step": 77725 }, { "epoch": 6.340647687413329, "grad_norm": 1.4989774227142334, "learning_rate": 1.7769475788596346e-05, "loss": 0.4884, "num_input_tokens_seen": 74246096, "step": 77730 }, { "epoch": 6.341055551023738, "grad_norm": 0.582356870174408, "learning_rate": 1.776606869571633e-05, "loss": 0.3323, "num_input_tokens_seen": 74251040, "step": 77735 }, { "epoch": 6.341463414634147, "grad_norm": 0.6261385083198547, "learning_rate": 1.7762661749465026e-05, "loss": 0.4592, "num_input_tokens_seen": 74254912, "step": 77740 }, { "epoch": 6.341871278244555, "grad_norm": 0.8378211855888367, "learning_rate": 1.7759254949911498e-05, "loss": 0.3577, "num_input_tokens_seen": 74260368, "step": 77745 }, { "epoch": 6.342279141854964, "grad_norm": 4.78707218170166, "learning_rate": 1.7755848297124787e-05, "loss": 0.4148, "num_input_tokens_seen": 74263472, "step": 77750 }, { "epoch": 6.342687005465373, "grad_norm": 36.53596496582031, "learning_rate": 1.775244179117396e-05, "loss": 0.3081, "num_input_tokens_seen": 74268176, "step": 77755 }, { "epoch": 6.343094869075781, "grad_norm": 19.676395416259766, "learning_rate": 1.774903543212806e-05, "loss": 0.3486, "num_input_tokens_seen": 74273056, "step": 77760 }, { "epoch": 6.34350273268619, "grad_norm": 4.092952251434326, "learning_rate": 1.7745629220056125e-05, "loss": 0.2553, "num_input_tokens_seen": 74277792, "step": 77765 }, { "epoch": 6.343910596296598, "grad_norm": 18.394420623779297, "learning_rate": 1.7742223155027196e-05, "loss": 0.4315, "num_input_tokens_seen": 74283328, "step": 77770 }, { "epoch": 6.344318459907007, "grad_norm": 26.16469383239746, "learning_rate": 1.773881723711033e-05, "loss": 0.4675, "num_input_tokens_seen": 74287600, "step": 77775 }, { "epoch": 6.344726323517416, "grad_norm": 1.5905334949493408, "learning_rate": 1.773541146637455e-05, "loss": 0.2299, "num_input_tokens_seen": 74293120, "step": 77780 }, { "epoch": 6.345134187127824, "grad_norm": 3.6122546195983887, "learning_rate": 1.77320058428889e-05, "loss": 0.2876, "num_input_tokens_seen": 74298112, "step": 77785 }, { "epoch": 6.345542050738233, "grad_norm": 6.9714813232421875, "learning_rate": 1.772860036672239e-05, "loss": 0.5491, "num_input_tokens_seen": 74303040, "step": 77790 }, { "epoch": 6.345949914348642, "grad_norm": 20.668928146362305, "learning_rate": 1.7725195037944075e-05, "loss": 0.3582, "num_input_tokens_seen": 74307904, "step": 77795 }, { "epoch": 6.34635777795905, "grad_norm": 5.069266319274902, "learning_rate": 1.7721789856622956e-05, "loss": 0.2238, "num_input_tokens_seen": 74312848, "step": 77800 }, { "epoch": 6.346765641569459, "grad_norm": 2.7032740116119385, "learning_rate": 1.7718384822828065e-05, "loss": 0.3093, "num_input_tokens_seen": 74317488, "step": 77805 }, { "epoch": 6.3471735051798674, "grad_norm": 0.8315030336380005, "learning_rate": 1.7714979936628425e-05, "loss": 0.2192, "num_input_tokens_seen": 74322688, "step": 77810 }, { "epoch": 6.3475813687902765, "grad_norm": 0.4463860094547272, "learning_rate": 1.7711575198093037e-05, "loss": 0.3431, "num_input_tokens_seen": 74327536, "step": 77815 }, { "epoch": 6.3479892324006855, "grad_norm": 0.5031648874282837, "learning_rate": 1.770817060729093e-05, "loss": 0.3405, "num_input_tokens_seen": 74331792, "step": 77820 }, { "epoch": 6.348397096011094, "grad_norm": 1.0760266780853271, "learning_rate": 1.7704766164291107e-05, "loss": 0.2982, "num_input_tokens_seen": 74336160, "step": 77825 }, { "epoch": 6.348804959621503, "grad_norm": 6.215380668640137, "learning_rate": 1.7701361869162574e-05, "loss": 0.3167, "num_input_tokens_seen": 74340672, "step": 77830 }, { "epoch": 6.349212823231912, "grad_norm": 14.417418479919434, "learning_rate": 1.7697957721974324e-05, "loss": 0.2994, "num_input_tokens_seen": 74345632, "step": 77835 }, { "epoch": 6.34962068684232, "grad_norm": 1.2230459451675415, "learning_rate": 1.7694553722795375e-05, "loss": 0.2769, "num_input_tokens_seen": 74349856, "step": 77840 }, { "epoch": 6.350028550452729, "grad_norm": 1.9169355630874634, "learning_rate": 1.769114987169472e-05, "loss": 0.3764, "num_input_tokens_seen": 74354848, "step": 77845 }, { "epoch": 6.350436414063138, "grad_norm": 0.9018290042877197, "learning_rate": 1.7687746168741353e-05, "loss": 0.2525, "num_input_tokens_seen": 74359504, "step": 77850 }, { "epoch": 6.350844277673546, "grad_norm": 18.852169036865234, "learning_rate": 1.768434261400426e-05, "loss": 0.2948, "num_input_tokens_seen": 74364528, "step": 77855 }, { "epoch": 6.351252141283955, "grad_norm": 1.593651294708252, "learning_rate": 1.7680939207552432e-05, "loss": 0.2079, "num_input_tokens_seen": 74370160, "step": 77860 }, { "epoch": 6.351660004894363, "grad_norm": 1.8623623847961426, "learning_rate": 1.7677535949454854e-05, "loss": 0.3761, "num_input_tokens_seen": 74374928, "step": 77865 }, { "epoch": 6.352067868504772, "grad_norm": 11.497443199157715, "learning_rate": 1.7674132839780517e-05, "loss": 0.2424, "num_input_tokens_seen": 74380384, "step": 77870 }, { "epoch": 6.352475732115181, "grad_norm": 2.7088069915771484, "learning_rate": 1.7670729878598395e-05, "loss": 0.2655, "num_input_tokens_seen": 74385264, "step": 77875 }, { "epoch": 6.352883595725589, "grad_norm": 0.6920720934867859, "learning_rate": 1.766732706597746e-05, "loss": 0.4273, "num_input_tokens_seen": 74390096, "step": 77880 }, { "epoch": 6.353291459335998, "grad_norm": 0.3510592579841614, "learning_rate": 1.7663924401986693e-05, "loss": 0.3743, "num_input_tokens_seen": 74395072, "step": 77885 }, { "epoch": 6.353699322946406, "grad_norm": 2.7383766174316406, "learning_rate": 1.7660521886695064e-05, "loss": 0.471, "num_input_tokens_seen": 74399696, "step": 77890 }, { "epoch": 6.354107186556815, "grad_norm": 0.7659575939178467, "learning_rate": 1.7657119520171535e-05, "loss": 0.3555, "num_input_tokens_seen": 74403792, "step": 77895 }, { "epoch": 6.354515050167224, "grad_norm": 32.21754837036133, "learning_rate": 1.765371730248507e-05, "loss": 0.5079, "num_input_tokens_seen": 74408064, "step": 77900 }, { "epoch": 6.354922913777632, "grad_norm": 3.6805129051208496, "learning_rate": 1.7650315233704627e-05, "loss": 0.3391, "num_input_tokens_seen": 74412064, "step": 77905 }, { "epoch": 6.355330777388041, "grad_norm": 1.1670300960540771, "learning_rate": 1.7646913313899183e-05, "loss": 0.4255, "num_input_tokens_seen": 74415888, "step": 77910 }, { "epoch": 6.35573864099845, "grad_norm": 0.44728079438209534, "learning_rate": 1.7643511543137677e-05, "loss": 0.2872, "num_input_tokens_seen": 74421056, "step": 77915 }, { "epoch": 6.3561465046088585, "grad_norm": 7.623142242431641, "learning_rate": 1.7640109921489066e-05, "loss": 0.2985, "num_input_tokens_seen": 74425856, "step": 77920 }, { "epoch": 6.356554368219268, "grad_norm": 1.2084378004074097, "learning_rate": 1.7636708449022294e-05, "loss": 0.2878, "num_input_tokens_seen": 74430784, "step": 77925 }, { "epoch": 6.356962231829677, "grad_norm": 4.040890693664551, "learning_rate": 1.763330712580632e-05, "loss": 0.2774, "num_input_tokens_seen": 74436512, "step": 77930 }, { "epoch": 6.357370095440085, "grad_norm": 0.9594492316246033, "learning_rate": 1.7629905951910082e-05, "loss": 0.3542, "num_input_tokens_seen": 74441280, "step": 77935 }, { "epoch": 6.357777959050494, "grad_norm": 0.5374400019645691, "learning_rate": 1.762650492740252e-05, "loss": 0.3681, "num_input_tokens_seen": 74445552, "step": 77940 }, { "epoch": 6.358185822660902, "grad_norm": 12.300715446472168, "learning_rate": 1.762310405235256e-05, "loss": 0.5283, "num_input_tokens_seen": 74450304, "step": 77945 }, { "epoch": 6.358593686271311, "grad_norm": 21.87124252319336, "learning_rate": 1.7619703326829155e-05, "loss": 0.4806, "num_input_tokens_seen": 74453840, "step": 77950 }, { "epoch": 6.35900154988172, "grad_norm": 0.5625987648963928, "learning_rate": 1.7616302750901225e-05, "loss": 0.3884, "num_input_tokens_seen": 74458016, "step": 77955 }, { "epoch": 6.359409413492128, "grad_norm": 3.18361759185791, "learning_rate": 1.7612902324637704e-05, "loss": 0.3602, "num_input_tokens_seen": 74462912, "step": 77960 }, { "epoch": 6.359817277102537, "grad_norm": 0.7206804156303406, "learning_rate": 1.760950204810751e-05, "loss": 0.3023, "num_input_tokens_seen": 74468224, "step": 77965 }, { "epoch": 6.360225140712946, "grad_norm": 3.791167736053467, "learning_rate": 1.760610192137956e-05, "loss": 0.3797, "num_input_tokens_seen": 74472976, "step": 77970 }, { "epoch": 6.360633004323354, "grad_norm": 2.098179817199707, "learning_rate": 1.7602701944522797e-05, "loss": 0.3811, "num_input_tokens_seen": 74478288, "step": 77975 }, { "epoch": 6.361040867933763, "grad_norm": 6.2900285720825195, "learning_rate": 1.7599302117606115e-05, "loss": 0.36, "num_input_tokens_seen": 74483328, "step": 77980 }, { "epoch": 6.361448731544171, "grad_norm": 1.0136840343475342, "learning_rate": 1.759590244069844e-05, "loss": 0.2983, "num_input_tokens_seen": 74487392, "step": 77985 }, { "epoch": 6.36185659515458, "grad_norm": 7.6281023025512695, "learning_rate": 1.7592502913868665e-05, "loss": 0.2859, "num_input_tokens_seen": 74491152, "step": 77990 }, { "epoch": 6.362264458764989, "grad_norm": 3.1959033012390137, "learning_rate": 1.7589103537185712e-05, "loss": 0.2962, "num_input_tokens_seen": 74495984, "step": 77995 }, { "epoch": 6.362672322375397, "grad_norm": 7.328712463378906, "learning_rate": 1.758570431071849e-05, "loss": 0.3304, "num_input_tokens_seen": 74501216, "step": 78000 }, { "epoch": 6.363080185985806, "grad_norm": 12.843743324279785, "learning_rate": 1.758230523453589e-05, "loss": 0.3556, "num_input_tokens_seen": 74506080, "step": 78005 }, { "epoch": 6.363488049596215, "grad_norm": 4.601917743682861, "learning_rate": 1.7578906308706804e-05, "loss": 0.3174, "num_input_tokens_seen": 74509952, "step": 78010 }, { "epoch": 6.3638959132066235, "grad_norm": 0.7423628568649292, "learning_rate": 1.7575507533300133e-05, "loss": 0.4445, "num_input_tokens_seen": 74514928, "step": 78015 }, { "epoch": 6.3643037768170325, "grad_norm": 22.886140823364258, "learning_rate": 1.7572108908384776e-05, "loss": 0.2025, "num_input_tokens_seen": 74520064, "step": 78020 }, { "epoch": 6.364711640427441, "grad_norm": 0.6126304268836975, "learning_rate": 1.7568710434029617e-05, "loss": 0.3662, "num_input_tokens_seen": 74524864, "step": 78025 }, { "epoch": 6.36511950403785, "grad_norm": 2.43512225151062, "learning_rate": 1.7565312110303536e-05, "loss": 0.2535, "num_input_tokens_seen": 74530240, "step": 78030 }, { "epoch": 6.365527367648259, "grad_norm": 0.9783927798271179, "learning_rate": 1.7561913937275408e-05, "loss": 0.246, "num_input_tokens_seen": 74534448, "step": 78035 }, { "epoch": 6.365935231258667, "grad_norm": 8.971721649169922, "learning_rate": 1.7558515915014137e-05, "loss": 0.3061, "num_input_tokens_seen": 74538912, "step": 78040 }, { "epoch": 6.366343094869076, "grad_norm": 6.901186943054199, "learning_rate": 1.7555118043588586e-05, "loss": 0.265, "num_input_tokens_seen": 74543136, "step": 78045 }, { "epoch": 6.366750958479485, "grad_norm": 52.40521240234375, "learning_rate": 1.7551720323067627e-05, "loss": 0.5369, "num_input_tokens_seen": 74548144, "step": 78050 }, { "epoch": 6.367158822089893, "grad_norm": 20.90369415283203, "learning_rate": 1.7548322753520137e-05, "loss": 0.5327, "num_input_tokens_seen": 74552384, "step": 78055 }, { "epoch": 6.367566685700302, "grad_norm": 9.370342254638672, "learning_rate": 1.754492533501496e-05, "loss": 0.5053, "num_input_tokens_seen": 74557760, "step": 78060 }, { "epoch": 6.367974549310711, "grad_norm": 1.9572409391403198, "learning_rate": 1.7541528067620993e-05, "loss": 0.2693, "num_input_tokens_seen": 74562640, "step": 78065 }, { "epoch": 6.368382412921119, "grad_norm": 1.4914720058441162, "learning_rate": 1.7538130951407084e-05, "loss": 0.3774, "num_input_tokens_seen": 74567168, "step": 78070 }, { "epoch": 6.368790276531528, "grad_norm": 42.45073699951172, "learning_rate": 1.7534733986442085e-05, "loss": 0.3029, "num_input_tokens_seen": 74571152, "step": 78075 }, { "epoch": 6.369198140141936, "grad_norm": 26.27924156188965, "learning_rate": 1.753133717279486e-05, "loss": 0.5588, "num_input_tokens_seen": 74576416, "step": 78080 }, { "epoch": 6.369606003752345, "grad_norm": 43.87086868286133, "learning_rate": 1.7527940510534253e-05, "loss": 0.3963, "num_input_tokens_seen": 74580576, "step": 78085 }, { "epoch": 6.370013867362754, "grad_norm": 0.42995598912239075, "learning_rate": 1.752454399972912e-05, "loss": 0.2674, "num_input_tokens_seen": 74584000, "step": 78090 }, { "epoch": 6.370421730973162, "grad_norm": 4.5106282234191895, "learning_rate": 1.7521147640448306e-05, "loss": 0.4061, "num_input_tokens_seen": 74589104, "step": 78095 }, { "epoch": 6.370829594583571, "grad_norm": 4.0674920082092285, "learning_rate": 1.751775143276065e-05, "loss": 0.6233, "num_input_tokens_seen": 74594080, "step": 78100 }, { "epoch": 6.3712374581939795, "grad_norm": 4.649265766143799, "learning_rate": 1.7514355376734983e-05, "loss": 0.4487, "num_input_tokens_seen": 74598608, "step": 78105 }, { "epoch": 6.3716453218043885, "grad_norm": 19.882843017578125, "learning_rate": 1.7510959472440158e-05, "loss": 0.4064, "num_input_tokens_seen": 74603504, "step": 78110 }, { "epoch": 6.3720531854147975, "grad_norm": 7.833295822143555, "learning_rate": 1.750756371994501e-05, "loss": 0.3094, "num_input_tokens_seen": 74608944, "step": 78115 }, { "epoch": 6.372461049025206, "grad_norm": 7.211825370788574, "learning_rate": 1.7504168119318353e-05, "loss": 0.3227, "num_input_tokens_seen": 74614320, "step": 78120 }, { "epoch": 6.372868912635615, "grad_norm": 12.38700008392334, "learning_rate": 1.750077267062902e-05, "loss": 0.3371, "num_input_tokens_seen": 74618752, "step": 78125 }, { "epoch": 6.373276776246024, "grad_norm": 2.4650094509124756, "learning_rate": 1.7497377373945842e-05, "loss": 0.4146, "num_input_tokens_seen": 74622736, "step": 78130 }, { "epoch": 6.373684639856432, "grad_norm": 23.88141632080078, "learning_rate": 1.749398222933764e-05, "loss": 0.3145, "num_input_tokens_seen": 74628096, "step": 78135 }, { "epoch": 6.374092503466841, "grad_norm": 1.2130829095840454, "learning_rate": 1.749058723687323e-05, "loss": 0.4812, "num_input_tokens_seen": 74632368, "step": 78140 }, { "epoch": 6.37450036707725, "grad_norm": 1.201430320739746, "learning_rate": 1.7487192396621418e-05, "loss": 0.3413, "num_input_tokens_seen": 74637104, "step": 78145 }, { "epoch": 6.374908230687658, "grad_norm": 1.7351353168487549, "learning_rate": 1.748379770865103e-05, "loss": 0.3318, "num_input_tokens_seen": 74642224, "step": 78150 }, { "epoch": 6.375316094298067, "grad_norm": 1.186662197113037, "learning_rate": 1.748040317303087e-05, "loss": 0.2408, "num_input_tokens_seen": 74647024, "step": 78155 }, { "epoch": 6.375723957908475, "grad_norm": 24.359371185302734, "learning_rate": 1.747700878982974e-05, "loss": 0.3141, "num_input_tokens_seen": 74650944, "step": 78160 }, { "epoch": 6.376131821518884, "grad_norm": 1.6875776052474976, "learning_rate": 1.7473614559116447e-05, "loss": 0.2614, "num_input_tokens_seen": 74655840, "step": 78165 }, { "epoch": 6.376539685129293, "grad_norm": 2.017219305038452, "learning_rate": 1.7470220480959777e-05, "loss": 0.416, "num_input_tokens_seen": 74660432, "step": 78170 }, { "epoch": 6.376947548739701, "grad_norm": 20.594900131225586, "learning_rate": 1.746682655542855e-05, "loss": 0.2474, "num_input_tokens_seen": 74665792, "step": 78175 }, { "epoch": 6.37735541235011, "grad_norm": 9.11214542388916, "learning_rate": 1.7463432782591545e-05, "loss": 0.4515, "num_input_tokens_seen": 74669600, "step": 78180 }, { "epoch": 6.377763275960519, "grad_norm": 8.560897827148438, "learning_rate": 1.7460039162517558e-05, "loss": 0.3306, "num_input_tokens_seen": 74674224, "step": 78185 }, { "epoch": 6.378171139570927, "grad_norm": 2.0040905475616455, "learning_rate": 1.745664569527536e-05, "loss": 0.2909, "num_input_tokens_seen": 74679136, "step": 78190 }, { "epoch": 6.378579003181336, "grad_norm": 37.71174240112305, "learning_rate": 1.7453252380933762e-05, "loss": 0.3811, "num_input_tokens_seen": 74682688, "step": 78195 }, { "epoch": 6.378986866791744, "grad_norm": 0.4933719038963318, "learning_rate": 1.7449859219561533e-05, "loss": 0.361, "num_input_tokens_seen": 74687072, "step": 78200 }, { "epoch": 6.379394730402153, "grad_norm": 18.93940544128418, "learning_rate": 1.7446466211227445e-05, "loss": 0.3218, "num_input_tokens_seen": 74692640, "step": 78205 }, { "epoch": 6.3798025940125624, "grad_norm": 0.8740208745002747, "learning_rate": 1.744307335600028e-05, "loss": 0.3656, "num_input_tokens_seen": 74696464, "step": 78210 }, { "epoch": 6.380210457622971, "grad_norm": 5.798488140106201, "learning_rate": 1.74396806539488e-05, "loss": 0.5372, "num_input_tokens_seen": 74701120, "step": 78215 }, { "epoch": 6.38061832123338, "grad_norm": 3.5407588481903076, "learning_rate": 1.7436288105141785e-05, "loss": 0.277, "num_input_tokens_seen": 74705568, "step": 78220 }, { "epoch": 6.381026184843789, "grad_norm": 1.6959240436553955, "learning_rate": 1.7432895709647995e-05, "loss": 0.27, "num_input_tokens_seen": 74710544, "step": 78225 }, { "epoch": 6.381434048454197, "grad_norm": 1.9659759998321533, "learning_rate": 1.7429503467536194e-05, "loss": 0.3813, "num_input_tokens_seen": 74714880, "step": 78230 }, { "epoch": 6.381841912064606, "grad_norm": 18.084949493408203, "learning_rate": 1.7426111378875136e-05, "loss": 0.2778, "num_input_tokens_seen": 74720256, "step": 78235 }, { "epoch": 6.382249775675014, "grad_norm": 0.4558471143245697, "learning_rate": 1.742271944373359e-05, "loss": 0.3117, "num_input_tokens_seen": 74724704, "step": 78240 }, { "epoch": 6.382657639285423, "grad_norm": 1.0662052631378174, "learning_rate": 1.7419327662180297e-05, "loss": 0.3024, "num_input_tokens_seen": 74729824, "step": 78245 }, { "epoch": 6.383065502895832, "grad_norm": 17.17252540588379, "learning_rate": 1.7415936034284013e-05, "loss": 0.3146, "num_input_tokens_seen": 74735360, "step": 78250 }, { "epoch": 6.38347336650624, "grad_norm": 8.684052467346191, "learning_rate": 1.7412544560113486e-05, "loss": 0.2505, "num_input_tokens_seen": 74739872, "step": 78255 }, { "epoch": 6.383881230116649, "grad_norm": 0.45120003819465637, "learning_rate": 1.740915323973744e-05, "loss": 0.2277, "num_input_tokens_seen": 74745216, "step": 78260 }, { "epoch": 6.384289093727058, "grad_norm": 9.447810173034668, "learning_rate": 1.7405762073224648e-05, "loss": 0.3565, "num_input_tokens_seen": 74750432, "step": 78265 }, { "epoch": 6.384696957337466, "grad_norm": 0.5577570796012878, "learning_rate": 1.740237106064383e-05, "loss": 0.2359, "num_input_tokens_seen": 74755808, "step": 78270 }, { "epoch": 6.385104820947875, "grad_norm": 36.508392333984375, "learning_rate": 1.7398980202063724e-05, "loss": 0.4709, "num_input_tokens_seen": 74760864, "step": 78275 }, { "epoch": 6.385512684558284, "grad_norm": 6.650335788726807, "learning_rate": 1.7395589497553052e-05, "loss": 0.3007, "num_input_tokens_seen": 74765984, "step": 78280 }, { "epoch": 6.385920548168692, "grad_norm": 44.64055633544922, "learning_rate": 1.7392198947180553e-05, "loss": 0.3094, "num_input_tokens_seen": 74770752, "step": 78285 }, { "epoch": 6.386328411779101, "grad_norm": 4.824552536010742, "learning_rate": 1.7388808551014953e-05, "loss": 0.2454, "num_input_tokens_seen": 74775152, "step": 78290 }, { "epoch": 6.386736275389509, "grad_norm": 0.5003341436386108, "learning_rate": 1.738541830912497e-05, "loss": 0.3725, "num_input_tokens_seen": 74780032, "step": 78295 }, { "epoch": 6.387144138999918, "grad_norm": 2.0534913539886475, "learning_rate": 1.738202822157932e-05, "loss": 0.448, "num_input_tokens_seen": 74785088, "step": 78300 }, { "epoch": 6.387552002610327, "grad_norm": 11.328142166137695, "learning_rate": 1.7378638288446706e-05, "loss": 0.3358, "num_input_tokens_seen": 74789936, "step": 78305 }, { "epoch": 6.3879598662207355, "grad_norm": 2.5998268127441406, "learning_rate": 1.737524850979587e-05, "loss": 0.2682, "num_input_tokens_seen": 74793984, "step": 78310 }, { "epoch": 6.3883677298311445, "grad_norm": 1.279440999031067, "learning_rate": 1.737185888569551e-05, "loss": 0.4076, "num_input_tokens_seen": 74799040, "step": 78315 }, { "epoch": 6.3887755934415535, "grad_norm": 11.211871147155762, "learning_rate": 1.7368469416214325e-05, "loss": 0.2717, "num_input_tokens_seen": 74804064, "step": 78320 }, { "epoch": 6.389183457051962, "grad_norm": 4.09740686416626, "learning_rate": 1.736508010142101e-05, "loss": 0.364, "num_input_tokens_seen": 74808688, "step": 78325 }, { "epoch": 6.389591320662371, "grad_norm": 0.5578561425209045, "learning_rate": 1.7361690941384292e-05, "loss": 0.3102, "num_input_tokens_seen": 74814096, "step": 78330 }, { "epoch": 6.389999184272779, "grad_norm": 1.5818817615509033, "learning_rate": 1.7358301936172854e-05, "loss": 0.2986, "num_input_tokens_seen": 74818880, "step": 78335 }, { "epoch": 6.390407047883188, "grad_norm": 3.450357437133789, "learning_rate": 1.7354913085855386e-05, "loss": 0.2885, "num_input_tokens_seen": 74823344, "step": 78340 }, { "epoch": 6.390814911493597, "grad_norm": 2.135145425796509, "learning_rate": 1.7351524390500573e-05, "loss": 0.4956, "num_input_tokens_seen": 74828128, "step": 78345 }, { "epoch": 6.391222775104005, "grad_norm": 0.8195651769638062, "learning_rate": 1.7348135850177116e-05, "loss": 0.3545, "num_input_tokens_seen": 74832848, "step": 78350 }, { "epoch": 6.391630638714414, "grad_norm": 29.291362762451172, "learning_rate": 1.7344747464953697e-05, "loss": 0.4288, "num_input_tokens_seen": 74837552, "step": 78355 }, { "epoch": 6.392038502324823, "grad_norm": 8.541547775268555, "learning_rate": 1.7341359234898995e-05, "loss": 0.3551, "num_input_tokens_seen": 74842384, "step": 78360 }, { "epoch": 6.392446365935231, "grad_norm": 1.8994736671447754, "learning_rate": 1.7337971160081685e-05, "loss": 0.2817, "num_input_tokens_seen": 74847664, "step": 78365 }, { "epoch": 6.39285422954564, "grad_norm": 0.26939305663108826, "learning_rate": 1.7334583240570434e-05, "loss": 0.4743, "num_input_tokens_seen": 74852656, "step": 78370 }, { "epoch": 6.393262093156048, "grad_norm": 1.525168776512146, "learning_rate": 1.7331195476433933e-05, "loss": 0.3798, "num_input_tokens_seen": 74856096, "step": 78375 }, { "epoch": 6.393669956766457, "grad_norm": 3.8497889041900635, "learning_rate": 1.7327807867740842e-05, "loss": 0.3057, "num_input_tokens_seen": 74860784, "step": 78380 }, { "epoch": 6.394077820376866, "grad_norm": 2.44948410987854, "learning_rate": 1.7324420414559823e-05, "loss": 0.3221, "num_input_tokens_seen": 74865696, "step": 78385 }, { "epoch": 6.394485683987274, "grad_norm": 2.535395383834839, "learning_rate": 1.7321033116959532e-05, "loss": 0.4488, "num_input_tokens_seen": 74869840, "step": 78390 }, { "epoch": 6.394893547597683, "grad_norm": 2.1117405891418457, "learning_rate": 1.7317645975008646e-05, "loss": 0.4644, "num_input_tokens_seen": 74874784, "step": 78395 }, { "epoch": 6.395301411208092, "grad_norm": 5.674305438995361, "learning_rate": 1.7314258988775813e-05, "loss": 0.2955, "num_input_tokens_seen": 74879152, "step": 78400 }, { "epoch": 6.3957092748185005, "grad_norm": 1.0968986749649048, "learning_rate": 1.7310872158329683e-05, "loss": 0.3578, "num_input_tokens_seen": 74883760, "step": 78405 }, { "epoch": 6.3961171384289095, "grad_norm": 2.7428338527679443, "learning_rate": 1.7307485483738912e-05, "loss": 0.3837, "num_input_tokens_seen": 74888912, "step": 78410 }, { "epoch": 6.3965250020393185, "grad_norm": 1.7341440916061401, "learning_rate": 1.730409896507213e-05, "loss": 0.3159, "num_input_tokens_seen": 74894144, "step": 78415 }, { "epoch": 6.396932865649727, "grad_norm": 0.67878657579422, "learning_rate": 1.7300712602398e-05, "loss": 0.2501, "num_input_tokens_seen": 74899040, "step": 78420 }, { "epoch": 6.397340729260136, "grad_norm": 1.6562525033950806, "learning_rate": 1.7297326395785153e-05, "loss": 0.3089, "num_input_tokens_seen": 74903904, "step": 78425 }, { "epoch": 6.397748592870544, "grad_norm": 0.9533101320266724, "learning_rate": 1.7293940345302227e-05, "loss": 0.3809, "num_input_tokens_seen": 74908544, "step": 78430 }, { "epoch": 6.398156456480953, "grad_norm": 1.2442084550857544, "learning_rate": 1.7290554451017845e-05, "loss": 0.4224, "num_input_tokens_seen": 74913664, "step": 78435 }, { "epoch": 6.398564320091362, "grad_norm": 24.80173683166504, "learning_rate": 1.728716871300066e-05, "loss": 0.3017, "num_input_tokens_seen": 74917200, "step": 78440 }, { "epoch": 6.39897218370177, "grad_norm": 1.668542504310608, "learning_rate": 1.7283783131319288e-05, "loss": 0.4256, "num_input_tokens_seen": 74922240, "step": 78445 }, { "epoch": 6.399380047312179, "grad_norm": 0.3458337187767029, "learning_rate": 1.728039770604235e-05, "loss": 0.3298, "num_input_tokens_seen": 74926384, "step": 78450 }, { "epoch": 6.399787910922587, "grad_norm": 6.249047756195068, "learning_rate": 1.7277012437238475e-05, "loss": 0.321, "num_input_tokens_seen": 74930480, "step": 78455 }, { "epoch": 6.400195774532996, "grad_norm": 1.3503297567367554, "learning_rate": 1.7273627324976263e-05, "loss": 0.3069, "num_input_tokens_seen": 74934544, "step": 78460 }, { "epoch": 6.400603638143405, "grad_norm": 1.6335794925689697, "learning_rate": 1.7270242369324354e-05, "loss": 0.3512, "num_input_tokens_seen": 74938896, "step": 78465 }, { "epoch": 6.401011501753813, "grad_norm": 5.366985321044922, "learning_rate": 1.7266857570351347e-05, "loss": 0.3193, "num_input_tokens_seen": 74943232, "step": 78470 }, { "epoch": 6.401419365364222, "grad_norm": 17.618167877197266, "learning_rate": 1.7263472928125847e-05, "loss": 0.2707, "num_input_tokens_seen": 74948560, "step": 78475 }, { "epoch": 6.401827228974631, "grad_norm": 12.097180366516113, "learning_rate": 1.7260088442716462e-05, "loss": 0.4007, "num_input_tokens_seen": 74953200, "step": 78480 }, { "epoch": 6.402235092585039, "grad_norm": 0.7459297776222229, "learning_rate": 1.7256704114191803e-05, "loss": 0.3033, "num_input_tokens_seen": 74957376, "step": 78485 }, { "epoch": 6.402642956195448, "grad_norm": 0.8777877688407898, "learning_rate": 1.7253319942620454e-05, "loss": 0.2811, "num_input_tokens_seen": 74962208, "step": 78490 }, { "epoch": 6.403050819805857, "grad_norm": 0.632436215877533, "learning_rate": 1.724993592807102e-05, "loss": 0.3357, "num_input_tokens_seen": 74967168, "step": 78495 }, { "epoch": 6.4034586834162655, "grad_norm": 43.00162887573242, "learning_rate": 1.724655207061209e-05, "loss": 0.3223, "num_input_tokens_seen": 74972016, "step": 78500 }, { "epoch": 6.4038665470266745, "grad_norm": 5.257361888885498, "learning_rate": 1.7243168370312258e-05, "loss": 0.1871, "num_input_tokens_seen": 74977328, "step": 78505 }, { "epoch": 6.404274410637083, "grad_norm": 6.260488033294678, "learning_rate": 1.7239784827240108e-05, "loss": 0.2634, "num_input_tokens_seen": 74981664, "step": 78510 }, { "epoch": 6.404682274247492, "grad_norm": 13.5966215133667, "learning_rate": 1.723640144146422e-05, "loss": 0.6867, "num_input_tokens_seen": 74986688, "step": 78515 }, { "epoch": 6.405090137857901, "grad_norm": 0.9523594975471497, "learning_rate": 1.7233018213053176e-05, "loss": 0.3634, "num_input_tokens_seen": 74990912, "step": 78520 }, { "epoch": 6.405498001468309, "grad_norm": 17.788358688354492, "learning_rate": 1.7229635142075544e-05, "loss": 0.2746, "num_input_tokens_seen": 74996000, "step": 78525 }, { "epoch": 6.405905865078718, "grad_norm": 1.052095651626587, "learning_rate": 1.7226252228599915e-05, "loss": 0.3813, "num_input_tokens_seen": 75000416, "step": 78530 }, { "epoch": 6.406313728689127, "grad_norm": 1.9486204385757446, "learning_rate": 1.7222869472694853e-05, "loss": 0.4009, "num_input_tokens_seen": 75005504, "step": 78535 }, { "epoch": 6.406721592299535, "grad_norm": 0.24184875190258026, "learning_rate": 1.721948687442892e-05, "loss": 0.342, "num_input_tokens_seen": 75009216, "step": 78540 }, { "epoch": 6.407129455909944, "grad_norm": 11.044535636901855, "learning_rate": 1.721610443387068e-05, "loss": 0.2539, "num_input_tokens_seen": 75013136, "step": 78545 }, { "epoch": 6.407537319520352, "grad_norm": 6.274246692657471, "learning_rate": 1.7212722151088694e-05, "loss": 0.3004, "num_input_tokens_seen": 75017520, "step": 78550 }, { "epoch": 6.407945183130761, "grad_norm": 8.279170989990234, "learning_rate": 1.7209340026151523e-05, "loss": 0.3016, "num_input_tokens_seen": 75022944, "step": 78555 }, { "epoch": 6.40835304674117, "grad_norm": 4.9082231521606445, "learning_rate": 1.720595805912772e-05, "loss": 0.4062, "num_input_tokens_seen": 75026544, "step": 78560 }, { "epoch": 6.408760910351578, "grad_norm": 7.847657203674316, "learning_rate": 1.7202576250085832e-05, "loss": 0.3233, "num_input_tokens_seen": 75030736, "step": 78565 }, { "epoch": 6.409168773961987, "grad_norm": 2.8608803749084473, "learning_rate": 1.719919459909441e-05, "loss": 0.4028, "num_input_tokens_seen": 75035552, "step": 78570 }, { "epoch": 6.409576637572396, "grad_norm": 1.9911166429519653, "learning_rate": 1.7195813106222002e-05, "loss": 0.3791, "num_input_tokens_seen": 75040384, "step": 78575 }, { "epoch": 6.409984501182804, "grad_norm": 13.378098487854004, "learning_rate": 1.7192431771537148e-05, "loss": 0.4323, "num_input_tokens_seen": 75044656, "step": 78580 }, { "epoch": 6.410392364793213, "grad_norm": 1.4049298763275146, "learning_rate": 1.7189050595108382e-05, "loss": 0.3401, "num_input_tokens_seen": 75049920, "step": 78585 }, { "epoch": 6.410800228403621, "grad_norm": 12.988341331481934, "learning_rate": 1.7185669577004233e-05, "loss": 0.3528, "num_input_tokens_seen": 75054256, "step": 78590 }, { "epoch": 6.41120809201403, "grad_norm": 0.27075833082199097, "learning_rate": 1.7182288717293253e-05, "loss": 0.3566, "num_input_tokens_seen": 75058928, "step": 78595 }, { "epoch": 6.411615955624439, "grad_norm": 3.415518045425415, "learning_rate": 1.7178908016043954e-05, "loss": 0.3803, "num_input_tokens_seen": 75063664, "step": 78600 }, { "epoch": 6.4120238192348475, "grad_norm": 16.98764991760254, "learning_rate": 1.717552747332487e-05, "loss": 0.3826, "num_input_tokens_seen": 75067808, "step": 78605 }, { "epoch": 6.4124316828452566, "grad_norm": 3.8133082389831543, "learning_rate": 1.7172147089204516e-05, "loss": 0.2806, "num_input_tokens_seen": 75072832, "step": 78610 }, { "epoch": 6.412839546455666, "grad_norm": 5.369810581207275, "learning_rate": 1.7168766863751407e-05, "loss": 0.3131, "num_input_tokens_seen": 75078272, "step": 78615 }, { "epoch": 6.413247410066074, "grad_norm": 11.297779083251953, "learning_rate": 1.7165386797034075e-05, "loss": 0.3664, "num_input_tokens_seen": 75082896, "step": 78620 }, { "epoch": 6.413655273676483, "grad_norm": 1.223520040512085, "learning_rate": 1.7162006889121024e-05, "loss": 0.2682, "num_input_tokens_seen": 75088128, "step": 78625 }, { "epoch": 6.414063137286892, "grad_norm": 5.7965264320373535, "learning_rate": 1.7158627140080763e-05, "loss": 0.315, "num_input_tokens_seen": 75093456, "step": 78630 }, { "epoch": 6.4144710008973, "grad_norm": 8.187618255615234, "learning_rate": 1.715524754998179e-05, "loss": 0.4936, "num_input_tokens_seen": 75098576, "step": 78635 }, { "epoch": 6.414878864507709, "grad_norm": 1.8509387969970703, "learning_rate": 1.7151868118892624e-05, "loss": 0.3997, "num_input_tokens_seen": 75103600, "step": 78640 }, { "epoch": 6.415286728118117, "grad_norm": 9.949861526489258, "learning_rate": 1.7148488846881756e-05, "loss": 0.4001, "num_input_tokens_seen": 75108272, "step": 78645 }, { "epoch": 6.415694591728526, "grad_norm": 0.7393771409988403, "learning_rate": 1.7145109734017682e-05, "loss": 0.3569, "num_input_tokens_seen": 75113664, "step": 78650 }, { "epoch": 6.416102455338935, "grad_norm": 1.4636874198913574, "learning_rate": 1.7141730780368897e-05, "loss": 0.4173, "num_input_tokens_seen": 75118176, "step": 78655 }, { "epoch": 6.416510318949343, "grad_norm": 0.8477358222007751, "learning_rate": 1.713835198600388e-05, "loss": 0.3567, "num_input_tokens_seen": 75123040, "step": 78660 }, { "epoch": 6.416918182559752, "grad_norm": 6.574224472045898, "learning_rate": 1.7134973350991135e-05, "loss": 0.2768, "num_input_tokens_seen": 75127936, "step": 78665 }, { "epoch": 6.417326046170161, "grad_norm": 6.894374847412109, "learning_rate": 1.713159487539914e-05, "loss": 0.3009, "num_input_tokens_seen": 75132816, "step": 78670 }, { "epoch": 6.417733909780569, "grad_norm": 43.36325454711914, "learning_rate": 1.7128216559296374e-05, "loss": 0.628, "num_input_tokens_seen": 75138352, "step": 78675 }, { "epoch": 6.418141773390978, "grad_norm": 2.2340359687805176, "learning_rate": 1.7124838402751305e-05, "loss": 0.3047, "num_input_tokens_seen": 75142528, "step": 78680 }, { "epoch": 6.418549637001386, "grad_norm": 17.920635223388672, "learning_rate": 1.7121460405832417e-05, "loss": 0.3906, "num_input_tokens_seen": 75147632, "step": 78685 }, { "epoch": 6.418957500611795, "grad_norm": 1.468410611152649, "learning_rate": 1.7118082568608185e-05, "loss": 0.2796, "num_input_tokens_seen": 75152816, "step": 78690 }, { "epoch": 6.419365364222204, "grad_norm": 10.635007858276367, "learning_rate": 1.7114704891147072e-05, "loss": 0.2274, "num_input_tokens_seen": 75157568, "step": 78695 }, { "epoch": 6.4197732278326125, "grad_norm": 2.0882627964019775, "learning_rate": 1.7111327373517532e-05, "loss": 0.2985, "num_input_tokens_seen": 75162176, "step": 78700 }, { "epoch": 6.4201810914430215, "grad_norm": 1.7932761907577515, "learning_rate": 1.7107950015788036e-05, "loss": 0.2979, "num_input_tokens_seen": 75166704, "step": 78705 }, { "epoch": 6.4205889550534305, "grad_norm": 0.5097122192382812, "learning_rate": 1.7104572818027048e-05, "loss": 0.2369, "num_input_tokens_seen": 75170736, "step": 78710 }, { "epoch": 6.420996818663839, "grad_norm": 2.7578330039978027, "learning_rate": 1.710119578030301e-05, "loss": 0.4392, "num_input_tokens_seen": 75175040, "step": 78715 }, { "epoch": 6.421404682274248, "grad_norm": 0.471734881401062, "learning_rate": 1.7097818902684377e-05, "loss": 0.2402, "num_input_tokens_seen": 75181136, "step": 78720 }, { "epoch": 6.421812545884656, "grad_norm": 0.6838247179985046, "learning_rate": 1.7094442185239585e-05, "loss": 0.2827, "num_input_tokens_seen": 75186176, "step": 78725 }, { "epoch": 6.422220409495065, "grad_norm": 42.48569107055664, "learning_rate": 1.70910656280371e-05, "loss": 0.5834, "num_input_tokens_seen": 75191264, "step": 78730 }, { "epoch": 6.422628273105474, "grad_norm": 25.026519775390625, "learning_rate": 1.708768923114536e-05, "loss": 0.4831, "num_input_tokens_seen": 75196592, "step": 78735 }, { "epoch": 6.423036136715882, "grad_norm": 4.108726978302002, "learning_rate": 1.7084312994632793e-05, "loss": 0.3832, "num_input_tokens_seen": 75201744, "step": 78740 }, { "epoch": 6.423444000326291, "grad_norm": 38.76900100708008, "learning_rate": 1.7080936918567827e-05, "loss": 0.3379, "num_input_tokens_seen": 75206320, "step": 78745 }, { "epoch": 6.4238518639367, "grad_norm": 5.3195719718933105, "learning_rate": 1.7077561003018917e-05, "loss": 0.2525, "num_input_tokens_seen": 75211056, "step": 78750 }, { "epoch": 6.424259727547108, "grad_norm": 2.25829815864563, "learning_rate": 1.707418524805448e-05, "loss": 0.2427, "num_input_tokens_seen": 75216240, "step": 78755 }, { "epoch": 6.424667591157517, "grad_norm": 3.623089075088501, "learning_rate": 1.7070809653742936e-05, "loss": 0.2822, "num_input_tokens_seen": 75221376, "step": 78760 }, { "epoch": 6.425075454767926, "grad_norm": 1.1179509162902832, "learning_rate": 1.706743422015271e-05, "loss": 0.4908, "num_input_tokens_seen": 75226272, "step": 78765 }, { "epoch": 6.425483318378334, "grad_norm": 26.33646583557129, "learning_rate": 1.7064058947352223e-05, "loss": 0.4497, "num_input_tokens_seen": 75230816, "step": 78770 }, { "epoch": 6.425891181988743, "grad_norm": 2.2780001163482666, "learning_rate": 1.706068383540989e-05, "loss": 0.4336, "num_input_tokens_seen": 75234944, "step": 78775 }, { "epoch": 6.426299045599151, "grad_norm": 17.93001937866211, "learning_rate": 1.7057308884394126e-05, "loss": 0.2719, "num_input_tokens_seen": 75240304, "step": 78780 }, { "epoch": 6.42670690920956, "grad_norm": 0.5626107454299927, "learning_rate": 1.705393409437333e-05, "loss": 0.3179, "num_input_tokens_seen": 75244864, "step": 78785 }, { "epoch": 6.427114772819969, "grad_norm": 1.7976024150848389, "learning_rate": 1.7050559465415904e-05, "loss": 0.4923, "num_input_tokens_seen": 75249840, "step": 78790 }, { "epoch": 6.4275226364303775, "grad_norm": 0.5716491341590881, "learning_rate": 1.7047184997590277e-05, "loss": 0.3335, "num_input_tokens_seen": 75254592, "step": 78795 }, { "epoch": 6.4279305000407865, "grad_norm": 0.6459952592849731, "learning_rate": 1.7043810690964825e-05, "loss": 0.3039, "num_input_tokens_seen": 75259424, "step": 78800 }, { "epoch": 6.428338363651195, "grad_norm": 0.9763088226318359, "learning_rate": 1.704043654560795e-05, "loss": 0.2482, "num_input_tokens_seen": 75264336, "step": 78805 }, { "epoch": 6.428746227261604, "grad_norm": 31.827713012695312, "learning_rate": 1.703706256158804e-05, "loss": 0.3871, "num_input_tokens_seen": 75269248, "step": 78810 }, { "epoch": 6.429154090872013, "grad_norm": 3.3378138542175293, "learning_rate": 1.7033688738973484e-05, "loss": 0.3239, "num_input_tokens_seen": 75273936, "step": 78815 }, { "epoch": 6.429561954482421, "grad_norm": 0.6592775583267212, "learning_rate": 1.703031507783268e-05, "loss": 0.3, "num_input_tokens_seen": 75279104, "step": 78820 }, { "epoch": 6.42996981809283, "grad_norm": 14.490179061889648, "learning_rate": 1.7026941578234004e-05, "loss": 0.3087, "num_input_tokens_seen": 75283632, "step": 78825 }, { "epoch": 6.430377681703239, "grad_norm": 0.5654232501983643, "learning_rate": 1.7023568240245835e-05, "loss": 0.2672, "num_input_tokens_seen": 75287840, "step": 78830 }, { "epoch": 6.430785545313647, "grad_norm": 1.8380037546157837, "learning_rate": 1.702019506393654e-05, "loss": 0.3643, "num_input_tokens_seen": 75292240, "step": 78835 }, { "epoch": 6.431193408924056, "grad_norm": 23.259111404418945, "learning_rate": 1.701682204937451e-05, "loss": 0.3674, "num_input_tokens_seen": 75297680, "step": 78840 }, { "epoch": 6.431601272534465, "grad_norm": 0.3236004412174225, "learning_rate": 1.70134491966281e-05, "loss": 0.4073, "num_input_tokens_seen": 75302432, "step": 78845 }, { "epoch": 6.432009136144873, "grad_norm": 2.5816538333892822, "learning_rate": 1.7010076505765683e-05, "loss": 0.2041, "num_input_tokens_seen": 75306736, "step": 78850 }, { "epoch": 6.432416999755282, "grad_norm": 19.22930335998535, "learning_rate": 1.700670397685562e-05, "loss": 0.4338, "num_input_tokens_seen": 75311136, "step": 78855 }, { "epoch": 6.43282486336569, "grad_norm": 0.5792607665061951, "learning_rate": 1.7003331609966262e-05, "loss": 0.3214, "num_input_tokens_seen": 75314624, "step": 78860 }, { "epoch": 6.433232726976099, "grad_norm": 7.834054470062256, "learning_rate": 1.6999959405165984e-05, "loss": 0.253, "num_input_tokens_seen": 75318912, "step": 78865 }, { "epoch": 6.433640590586508, "grad_norm": 1.1058731079101562, "learning_rate": 1.699658736252313e-05, "loss": 0.3647, "num_input_tokens_seen": 75323456, "step": 78870 }, { "epoch": 6.434048454196916, "grad_norm": 2.675652265548706, "learning_rate": 1.6993215482106044e-05, "loss": 0.4212, "num_input_tokens_seen": 75327280, "step": 78875 }, { "epoch": 6.434456317807325, "grad_norm": 8.64486026763916, "learning_rate": 1.6989843763983073e-05, "loss": 0.482, "num_input_tokens_seen": 75331776, "step": 78880 }, { "epoch": 6.434864181417734, "grad_norm": 3.8441905975341797, "learning_rate": 1.6986472208222576e-05, "loss": 0.2906, "num_input_tokens_seen": 75336288, "step": 78885 }, { "epoch": 6.435272045028142, "grad_norm": 12.16580581665039, "learning_rate": 1.698310081489288e-05, "loss": 0.3945, "num_input_tokens_seen": 75340144, "step": 78890 }, { "epoch": 6.435679908638551, "grad_norm": 1.6636855602264404, "learning_rate": 1.6979729584062325e-05, "loss": 0.3576, "num_input_tokens_seen": 75344096, "step": 78895 }, { "epoch": 6.43608777224896, "grad_norm": 3.0449979305267334, "learning_rate": 1.6976358515799233e-05, "loss": 0.3084, "num_input_tokens_seen": 75349072, "step": 78900 }, { "epoch": 6.436495635859369, "grad_norm": 0.6024202704429626, "learning_rate": 1.6972987610171957e-05, "loss": 0.2094, "num_input_tokens_seen": 75354848, "step": 78905 }, { "epoch": 6.436903499469778, "grad_norm": 4.247289180755615, "learning_rate": 1.696961686724881e-05, "loss": 0.4475, "num_input_tokens_seen": 75360368, "step": 78910 }, { "epoch": 6.437311363080186, "grad_norm": 0.710122287273407, "learning_rate": 1.6966246287098114e-05, "loss": 0.3399, "num_input_tokens_seen": 75366224, "step": 78915 }, { "epoch": 6.437719226690595, "grad_norm": 1.1736539602279663, "learning_rate": 1.696287586978819e-05, "loss": 0.2842, "num_input_tokens_seen": 75370560, "step": 78920 }, { "epoch": 6.438127090301004, "grad_norm": 1.6179966926574707, "learning_rate": 1.695950561538735e-05, "loss": 0.3961, "num_input_tokens_seen": 75374960, "step": 78925 }, { "epoch": 6.438534953911412, "grad_norm": 1.0227081775665283, "learning_rate": 1.6956135523963926e-05, "loss": 0.352, "num_input_tokens_seen": 75379808, "step": 78930 }, { "epoch": 6.438942817521821, "grad_norm": 1.8407220840454102, "learning_rate": 1.6952765595586215e-05, "loss": 0.2885, "num_input_tokens_seen": 75384016, "step": 78935 }, { "epoch": 6.439350681132229, "grad_norm": 0.47420191764831543, "learning_rate": 1.694939583032253e-05, "loss": 0.3966, "num_input_tokens_seen": 75387936, "step": 78940 }, { "epoch": 6.439758544742638, "grad_norm": 0.34028834104537964, "learning_rate": 1.6946026228241154e-05, "loss": 0.2495, "num_input_tokens_seen": 75392688, "step": 78945 }, { "epoch": 6.440166408353047, "grad_norm": 4.198913097381592, "learning_rate": 1.6942656789410417e-05, "loss": 0.3318, "num_input_tokens_seen": 75397696, "step": 78950 }, { "epoch": 6.440574271963455, "grad_norm": 9.381169319152832, "learning_rate": 1.6939287513898603e-05, "loss": 0.281, "num_input_tokens_seen": 75401936, "step": 78955 }, { "epoch": 6.440982135573864, "grad_norm": 2.5012195110321045, "learning_rate": 1.6935918401774003e-05, "loss": 0.3425, "num_input_tokens_seen": 75406016, "step": 78960 }, { "epoch": 6.441389999184273, "grad_norm": 1.7958214282989502, "learning_rate": 1.6932549453104915e-05, "loss": 0.3529, "num_input_tokens_seen": 75410048, "step": 78965 }, { "epoch": 6.441797862794681, "grad_norm": 15.850189208984375, "learning_rate": 1.6929180667959617e-05, "loss": 0.4786, "num_input_tokens_seen": 75414560, "step": 78970 }, { "epoch": 6.44220572640509, "grad_norm": 0.6757884621620178, "learning_rate": 1.6925812046406397e-05, "loss": 0.3705, "num_input_tokens_seen": 75419200, "step": 78975 }, { "epoch": 6.442613590015499, "grad_norm": 2.4105076789855957, "learning_rate": 1.6922443588513542e-05, "loss": 0.2938, "num_input_tokens_seen": 75423872, "step": 78980 }, { "epoch": 6.443021453625907, "grad_norm": 0.17899705469608307, "learning_rate": 1.691907529434932e-05, "loss": 0.3386, "num_input_tokens_seen": 75428240, "step": 78985 }, { "epoch": 6.443429317236316, "grad_norm": 11.691046714782715, "learning_rate": 1.6915707163981998e-05, "loss": 0.379, "num_input_tokens_seen": 75433520, "step": 78990 }, { "epoch": 6.4438371808467245, "grad_norm": 3.3764901161193848, "learning_rate": 1.6912339197479864e-05, "loss": 0.2649, "num_input_tokens_seen": 75438848, "step": 78995 }, { "epoch": 6.4442450444571335, "grad_norm": 6.086275100708008, "learning_rate": 1.6908971394911183e-05, "loss": 0.3314, "num_input_tokens_seen": 75444384, "step": 79000 }, { "epoch": 6.4446529080675425, "grad_norm": 63.540992736816406, "learning_rate": 1.690560375634421e-05, "loss": 0.3355, "num_input_tokens_seen": 75449664, "step": 79005 }, { "epoch": 6.445060771677951, "grad_norm": 0.70046067237854, "learning_rate": 1.6902236281847212e-05, "loss": 0.302, "num_input_tokens_seen": 75453072, "step": 79010 }, { "epoch": 6.44546863528836, "grad_norm": 62.98414611816406, "learning_rate": 1.689886897148843e-05, "loss": 0.4227, "num_input_tokens_seen": 75457904, "step": 79015 }, { "epoch": 6.445876498898768, "grad_norm": 0.3574032485485077, "learning_rate": 1.6895501825336148e-05, "loss": 0.3129, "num_input_tokens_seen": 75463168, "step": 79020 }, { "epoch": 6.446284362509177, "grad_norm": 0.6329953670501709, "learning_rate": 1.6892134843458594e-05, "loss": 0.2798, "num_input_tokens_seen": 75468144, "step": 79025 }, { "epoch": 6.446692226119586, "grad_norm": 3.4910407066345215, "learning_rate": 1.6888768025924022e-05, "loss": 0.3351, "num_input_tokens_seen": 75472352, "step": 79030 }, { "epoch": 6.447100089729994, "grad_norm": 4.826144695281982, "learning_rate": 1.6885401372800673e-05, "loss": 0.3555, "num_input_tokens_seen": 75477408, "step": 79035 }, { "epoch": 6.447507953340403, "grad_norm": 5.3111724853515625, "learning_rate": 1.6882034884156794e-05, "loss": 0.3633, "num_input_tokens_seen": 75482304, "step": 79040 }, { "epoch": 6.447915816950812, "grad_norm": 0.4404143989086151, "learning_rate": 1.6878668560060618e-05, "loss": 0.2841, "num_input_tokens_seen": 75487632, "step": 79045 }, { "epoch": 6.44832368056122, "grad_norm": 1.253220558166504, "learning_rate": 1.687530240058038e-05, "loss": 0.3771, "num_input_tokens_seen": 75491984, "step": 79050 }, { "epoch": 6.448731544171629, "grad_norm": 2.0595743656158447, "learning_rate": 1.68719364057843e-05, "loss": 0.5276, "num_input_tokens_seen": 75496848, "step": 79055 }, { "epoch": 6.449139407782038, "grad_norm": 0.8061413764953613, "learning_rate": 1.686857057574062e-05, "loss": 0.3751, "num_input_tokens_seen": 75501648, "step": 79060 }, { "epoch": 6.449547271392446, "grad_norm": 0.5773754715919495, "learning_rate": 1.686520491051757e-05, "loss": 0.4469, "num_input_tokens_seen": 75506192, "step": 79065 }, { "epoch": 6.449955135002855, "grad_norm": 19.573226928710938, "learning_rate": 1.686183941018335e-05, "loss": 0.2598, "num_input_tokens_seen": 75509872, "step": 79070 }, { "epoch": 6.450362998613263, "grad_norm": 1.5997449159622192, "learning_rate": 1.685847407480619e-05, "loss": 0.3176, "num_input_tokens_seen": 75513856, "step": 79075 }, { "epoch": 6.450770862223672, "grad_norm": 2.0723612308502197, "learning_rate": 1.6855108904454292e-05, "loss": 0.3639, "num_input_tokens_seen": 75518304, "step": 79080 }, { "epoch": 6.451178725834081, "grad_norm": 4.824612140655518, "learning_rate": 1.6851743899195887e-05, "loss": 0.3405, "num_input_tokens_seen": 75523520, "step": 79085 }, { "epoch": 6.4515865894444895, "grad_norm": 0.4760049879550934, "learning_rate": 1.6848379059099168e-05, "loss": 0.3777, "num_input_tokens_seen": 75528288, "step": 79090 }, { "epoch": 6.4519944530548985, "grad_norm": 0.9086481928825378, "learning_rate": 1.6845014384232343e-05, "loss": 0.3306, "num_input_tokens_seen": 75532880, "step": 79095 }, { "epoch": 6.4524023166653075, "grad_norm": 0.8322434425354004, "learning_rate": 1.6841649874663608e-05, "loss": 0.3404, "num_input_tokens_seen": 75536528, "step": 79100 }, { "epoch": 6.452810180275716, "grad_norm": 5.134636878967285, "learning_rate": 1.6838285530461166e-05, "loss": 0.3385, "num_input_tokens_seen": 75540992, "step": 79105 }, { "epoch": 6.453218043886125, "grad_norm": 7.566709995269775, "learning_rate": 1.6834921351693207e-05, "loss": 0.3236, "num_input_tokens_seen": 75546224, "step": 79110 }, { "epoch": 6.453625907496533, "grad_norm": 1.0610945224761963, "learning_rate": 1.6831557338427924e-05, "loss": 0.4875, "num_input_tokens_seen": 75550080, "step": 79115 }, { "epoch": 6.454033771106942, "grad_norm": 2.7621572017669678, "learning_rate": 1.6828193490733502e-05, "loss": 0.3577, "num_input_tokens_seen": 75555248, "step": 79120 }, { "epoch": 6.454441634717351, "grad_norm": 3.444575309753418, "learning_rate": 1.6824829808678118e-05, "loss": 0.3153, "num_input_tokens_seen": 75560336, "step": 79125 }, { "epoch": 6.454849498327759, "grad_norm": 5.6432929039001465, "learning_rate": 1.6821466292329968e-05, "loss": 0.1526, "num_input_tokens_seen": 75565424, "step": 79130 }, { "epoch": 6.455257361938168, "grad_norm": 18.260433197021484, "learning_rate": 1.6818102941757217e-05, "loss": 0.5859, "num_input_tokens_seen": 75570416, "step": 79135 }, { "epoch": 6.455665225548577, "grad_norm": 1.8232401609420776, "learning_rate": 1.681473975702805e-05, "loss": 0.3287, "num_input_tokens_seen": 75574912, "step": 79140 }, { "epoch": 6.456073089158985, "grad_norm": 2.329808235168457, "learning_rate": 1.6811376738210617e-05, "loss": 0.2861, "num_input_tokens_seen": 75580368, "step": 79145 }, { "epoch": 6.456480952769394, "grad_norm": 2.0935635566711426, "learning_rate": 1.680801388537311e-05, "loss": 0.3023, "num_input_tokens_seen": 75585376, "step": 79150 }, { "epoch": 6.456888816379802, "grad_norm": 2.4420738220214844, "learning_rate": 1.6804651198583677e-05, "loss": 0.3112, "num_input_tokens_seen": 75590032, "step": 79155 }, { "epoch": 6.457296679990211, "grad_norm": 0.461026132106781, "learning_rate": 1.680128867791048e-05, "loss": 0.347, "num_input_tokens_seen": 75594736, "step": 79160 }, { "epoch": 6.45770454360062, "grad_norm": 3.2493667602539062, "learning_rate": 1.6797926323421684e-05, "loss": 0.3518, "num_input_tokens_seen": 75599312, "step": 79165 }, { "epoch": 6.458112407211028, "grad_norm": 5.013213157653809, "learning_rate": 1.6794564135185427e-05, "loss": 0.3548, "num_input_tokens_seen": 75604608, "step": 79170 }, { "epoch": 6.458520270821437, "grad_norm": 37.44981384277344, "learning_rate": 1.6791202113269875e-05, "loss": 0.3957, "num_input_tokens_seen": 75608640, "step": 79175 }, { "epoch": 6.458928134431846, "grad_norm": 28.872499465942383, "learning_rate": 1.678784025774317e-05, "loss": 0.3209, "num_input_tokens_seen": 75612864, "step": 79180 }, { "epoch": 6.4593359980422544, "grad_norm": 0.6791254878044128, "learning_rate": 1.6784478568673446e-05, "loss": 0.2883, "num_input_tokens_seen": 75617872, "step": 79185 }, { "epoch": 6.4597438616526635, "grad_norm": 0.4771309196949005, "learning_rate": 1.6781117046128846e-05, "loss": 0.257, "num_input_tokens_seen": 75623232, "step": 79190 }, { "epoch": 6.4601517252630725, "grad_norm": 0.4802190065383911, "learning_rate": 1.677775569017752e-05, "loss": 0.2807, "num_input_tokens_seen": 75628048, "step": 79195 }, { "epoch": 6.460559588873481, "grad_norm": 0.6388002038002014, "learning_rate": 1.6774394500887594e-05, "loss": 0.2346, "num_input_tokens_seen": 75632928, "step": 79200 }, { "epoch": 6.46096745248389, "grad_norm": 0.3405604958534241, "learning_rate": 1.6771033478327196e-05, "loss": 0.2484, "num_input_tokens_seen": 75638608, "step": 79205 }, { "epoch": 6.461375316094298, "grad_norm": 0.4723309278488159, "learning_rate": 1.676767262256445e-05, "loss": 0.5011, "num_input_tokens_seen": 75642896, "step": 79210 }, { "epoch": 6.461783179704707, "grad_norm": 2.294320583343506, "learning_rate": 1.6764311933667472e-05, "loss": 0.4455, "num_input_tokens_seen": 75647728, "step": 79215 }, { "epoch": 6.462191043315116, "grad_norm": 0.6415231227874756, "learning_rate": 1.6760951411704406e-05, "loss": 0.3568, "num_input_tokens_seen": 75652624, "step": 79220 }, { "epoch": 6.462598906925524, "grad_norm": 27.275062561035156, "learning_rate": 1.675759105674335e-05, "loss": 0.4857, "num_input_tokens_seen": 75658448, "step": 79225 }, { "epoch": 6.463006770535933, "grad_norm": 2.629664659500122, "learning_rate": 1.6754230868852423e-05, "loss": 0.2948, "num_input_tokens_seen": 75663520, "step": 79230 }, { "epoch": 6.463414634146342, "grad_norm": 9.119033813476562, "learning_rate": 1.6750870848099725e-05, "loss": 0.2547, "num_input_tokens_seen": 75668192, "step": 79235 }, { "epoch": 6.46382249775675, "grad_norm": 28.374540328979492, "learning_rate": 1.6747510994553377e-05, "loss": 0.3437, "num_input_tokens_seen": 75673696, "step": 79240 }, { "epoch": 6.464230361367159, "grad_norm": 0.5241997241973877, "learning_rate": 1.674415130828147e-05, "loss": 0.2594, "num_input_tokens_seen": 75678656, "step": 79245 }, { "epoch": 6.464638224977567, "grad_norm": 24.286306381225586, "learning_rate": 1.6740791789352107e-05, "loss": 0.3371, "num_input_tokens_seen": 75682592, "step": 79250 }, { "epoch": 6.465046088587976, "grad_norm": 29.231557846069336, "learning_rate": 1.6737432437833385e-05, "loss": 0.3897, "num_input_tokens_seen": 75686912, "step": 79255 }, { "epoch": 6.465453952198385, "grad_norm": 14.155519485473633, "learning_rate": 1.6734073253793398e-05, "loss": 0.3756, "num_input_tokens_seen": 75691648, "step": 79260 }, { "epoch": 6.465861815808793, "grad_norm": 0.3777655363082886, "learning_rate": 1.6730714237300238e-05, "loss": 0.3958, "num_input_tokens_seen": 75696048, "step": 79265 }, { "epoch": 6.466269679419202, "grad_norm": 2.0644187927246094, "learning_rate": 1.6727355388421983e-05, "loss": 0.5276, "num_input_tokens_seen": 75700912, "step": 79270 }, { "epoch": 6.466677543029611, "grad_norm": 0.9138599634170532, "learning_rate": 1.672399670722672e-05, "loss": 0.3159, "num_input_tokens_seen": 75704720, "step": 79275 }, { "epoch": 6.467085406640019, "grad_norm": 45.830223083496094, "learning_rate": 1.6720638193782518e-05, "loss": 0.3817, "num_input_tokens_seen": 75709424, "step": 79280 }, { "epoch": 6.467493270250428, "grad_norm": 22.329483032226562, "learning_rate": 1.6717279848157474e-05, "loss": 0.41, "num_input_tokens_seen": 75714496, "step": 79285 }, { "epoch": 6.4679011338608365, "grad_norm": 1.010379433631897, "learning_rate": 1.6713921670419642e-05, "loss": 0.3663, "num_input_tokens_seen": 75719472, "step": 79290 }, { "epoch": 6.4683089974712455, "grad_norm": 1.8987500667572021, "learning_rate": 1.67105636606371e-05, "loss": 0.4031, "num_input_tokens_seen": 75724048, "step": 79295 }, { "epoch": 6.468716861081655, "grad_norm": 29.10236358642578, "learning_rate": 1.670720581887791e-05, "loss": 0.3787, "num_input_tokens_seen": 75729040, "step": 79300 }, { "epoch": 6.469124724692063, "grad_norm": 5.959085941314697, "learning_rate": 1.6703848145210133e-05, "loss": 0.3822, "num_input_tokens_seen": 75733088, "step": 79305 }, { "epoch": 6.469532588302472, "grad_norm": 2.1463921070098877, "learning_rate": 1.6700490639701823e-05, "loss": 0.2615, "num_input_tokens_seen": 75737136, "step": 79310 }, { "epoch": 6.469940451912881, "grad_norm": 10.509121894836426, "learning_rate": 1.669713330242105e-05, "loss": 0.3091, "num_input_tokens_seen": 75742096, "step": 79315 }, { "epoch": 6.470348315523289, "grad_norm": 0.5071737766265869, "learning_rate": 1.6693776133435855e-05, "loss": 0.3012, "num_input_tokens_seen": 75746592, "step": 79320 }, { "epoch": 6.470756179133698, "grad_norm": 48.90304183959961, "learning_rate": 1.6690419132814282e-05, "loss": 0.3483, "num_input_tokens_seen": 75750864, "step": 79325 }, { "epoch": 6.471164042744107, "grad_norm": 0.7746685743331909, "learning_rate": 1.668706230062439e-05, "loss": 0.2789, "num_input_tokens_seen": 75755024, "step": 79330 }, { "epoch": 6.471571906354515, "grad_norm": 0.4843612611293793, "learning_rate": 1.6683705636934212e-05, "loss": 0.3111, "num_input_tokens_seen": 75759936, "step": 79335 }, { "epoch": 6.471979769964924, "grad_norm": 4.7711100578308105, "learning_rate": 1.668034914181179e-05, "loss": 0.36, "num_input_tokens_seen": 75764752, "step": 79340 }, { "epoch": 6.472387633575332, "grad_norm": 61.53120040893555, "learning_rate": 1.6676992815325137e-05, "loss": 0.4069, "num_input_tokens_seen": 75770112, "step": 79345 }, { "epoch": 6.472795497185741, "grad_norm": 0.8991381525993347, "learning_rate": 1.667363665754232e-05, "loss": 0.4326, "num_input_tokens_seen": 75775040, "step": 79350 }, { "epoch": 6.47320336079615, "grad_norm": 15.20377254486084, "learning_rate": 1.667028066853135e-05, "loss": 0.3362, "num_input_tokens_seen": 75779920, "step": 79355 }, { "epoch": 6.473611224406558, "grad_norm": 2.464207410812378, "learning_rate": 1.6666924848360248e-05, "loss": 0.4627, "num_input_tokens_seen": 75783760, "step": 79360 }, { "epoch": 6.474019088016967, "grad_norm": 0.39896026253700256, "learning_rate": 1.666356919709704e-05, "loss": 0.2538, "num_input_tokens_seen": 75788896, "step": 79365 }, { "epoch": 6.474426951627375, "grad_norm": 0.7698916792869568, "learning_rate": 1.666021371480973e-05, "loss": 0.2628, "num_input_tokens_seen": 75793568, "step": 79370 }, { "epoch": 6.474834815237784, "grad_norm": 3.736717939376831, "learning_rate": 1.665685840156636e-05, "loss": 0.307, "num_input_tokens_seen": 75798432, "step": 79375 }, { "epoch": 6.475242678848193, "grad_norm": 4.343702793121338, "learning_rate": 1.665350325743492e-05, "loss": 0.4298, "num_input_tokens_seen": 75803328, "step": 79380 }, { "epoch": 6.4756505424586015, "grad_norm": 0.4169931709766388, "learning_rate": 1.6650148282483424e-05, "loss": 0.3558, "num_input_tokens_seen": 75807408, "step": 79385 }, { "epoch": 6.4760584060690105, "grad_norm": 4.804437160491943, "learning_rate": 1.6646793476779866e-05, "loss": 0.2644, "num_input_tokens_seen": 75812000, "step": 79390 }, { "epoch": 6.4764662696794195, "grad_norm": 4.350985527038574, "learning_rate": 1.664343884039226e-05, "loss": 0.4113, "num_input_tokens_seen": 75817088, "step": 79395 }, { "epoch": 6.476874133289828, "grad_norm": 0.4150984287261963, "learning_rate": 1.6640084373388603e-05, "loss": 0.2758, "num_input_tokens_seen": 75822272, "step": 79400 }, { "epoch": 6.477281996900237, "grad_norm": 1.8215198516845703, "learning_rate": 1.663673007583688e-05, "loss": 0.3025, "num_input_tokens_seen": 75826624, "step": 79405 }, { "epoch": 6.477689860510646, "grad_norm": 6.1892409324646, "learning_rate": 1.6633375947805084e-05, "loss": 0.3308, "num_input_tokens_seen": 75831248, "step": 79410 }, { "epoch": 6.478097724121054, "grad_norm": 11.073156356811523, "learning_rate": 1.6630021989361194e-05, "loss": 0.3448, "num_input_tokens_seen": 75836064, "step": 79415 }, { "epoch": 6.478505587731463, "grad_norm": 0.913193941116333, "learning_rate": 1.662666820057321e-05, "loss": 0.2869, "num_input_tokens_seen": 75840208, "step": 79420 }, { "epoch": 6.478913451341871, "grad_norm": 0.9966797232627869, "learning_rate": 1.662331458150911e-05, "loss": 0.4028, "num_input_tokens_seen": 75844512, "step": 79425 }, { "epoch": 6.47932131495228, "grad_norm": 11.846738815307617, "learning_rate": 1.6619961132236856e-05, "loss": 0.3333, "num_input_tokens_seen": 75849840, "step": 79430 }, { "epoch": 6.479729178562689, "grad_norm": 0.8999060392379761, "learning_rate": 1.6616607852824422e-05, "loss": 0.4057, "num_input_tokens_seen": 75854992, "step": 79435 }, { "epoch": 6.480137042173097, "grad_norm": 0.7521892786026001, "learning_rate": 1.6613254743339794e-05, "loss": 0.2769, "num_input_tokens_seen": 75860032, "step": 79440 }, { "epoch": 6.480544905783506, "grad_norm": 9.566250801086426, "learning_rate": 1.6609901803850927e-05, "loss": 0.3245, "num_input_tokens_seen": 75865440, "step": 79445 }, { "epoch": 6.480952769393915, "grad_norm": 0.7091485261917114, "learning_rate": 1.660654903442579e-05, "loss": 0.1914, "num_input_tokens_seen": 75870304, "step": 79450 }, { "epoch": 6.481360633004323, "grad_norm": 8.517657279968262, "learning_rate": 1.6603196435132327e-05, "loss": 0.2577, "num_input_tokens_seen": 75874800, "step": 79455 }, { "epoch": 6.481768496614732, "grad_norm": 1.5765626430511475, "learning_rate": 1.659984400603851e-05, "loss": 0.2476, "num_input_tokens_seen": 75880272, "step": 79460 }, { "epoch": 6.48217636022514, "grad_norm": 1.072472333908081, "learning_rate": 1.6596491747212288e-05, "loss": 0.4608, "num_input_tokens_seen": 75885248, "step": 79465 }, { "epoch": 6.482584223835549, "grad_norm": 0.4094809293746948, "learning_rate": 1.6593139658721608e-05, "loss": 0.1934, "num_input_tokens_seen": 75889616, "step": 79470 }, { "epoch": 6.482992087445958, "grad_norm": 10.202848434448242, "learning_rate": 1.6589787740634414e-05, "loss": 0.2569, "num_input_tokens_seen": 75894768, "step": 79475 }, { "epoch": 6.4833999510563665, "grad_norm": 64.82949829101562, "learning_rate": 1.6586435993018635e-05, "loss": 0.4548, "num_input_tokens_seen": 75900432, "step": 79480 }, { "epoch": 6.4838078146667755, "grad_norm": 20.742063522338867, "learning_rate": 1.658308441594224e-05, "loss": 0.3554, "num_input_tokens_seen": 75905472, "step": 79485 }, { "epoch": 6.4842156782771845, "grad_norm": 0.7455267906188965, "learning_rate": 1.6579733009473142e-05, "loss": 0.3253, "num_input_tokens_seen": 75909520, "step": 79490 }, { "epoch": 6.484623541887593, "grad_norm": 1.8373136520385742, "learning_rate": 1.657638177367928e-05, "loss": 0.4815, "num_input_tokens_seen": 75913856, "step": 79495 }, { "epoch": 6.485031405498002, "grad_norm": 5.461819171905518, "learning_rate": 1.657303070862857e-05, "loss": 0.3287, "num_input_tokens_seen": 75918256, "step": 79500 }, { "epoch": 6.48543926910841, "grad_norm": 51.465049743652344, "learning_rate": 1.6569679814388956e-05, "loss": 0.4078, "num_input_tokens_seen": 75923568, "step": 79505 }, { "epoch": 6.485847132718819, "grad_norm": 1.5589975118637085, "learning_rate": 1.656632909102835e-05, "loss": 0.2921, "num_input_tokens_seen": 75927440, "step": 79510 }, { "epoch": 6.486254996329228, "grad_norm": 7.393187999725342, "learning_rate": 1.6562978538614672e-05, "loss": 0.2436, "num_input_tokens_seen": 75932384, "step": 79515 }, { "epoch": 6.486662859939636, "grad_norm": 1.0678632259368896, "learning_rate": 1.655962815721583e-05, "loss": 0.3727, "num_input_tokens_seen": 75937344, "step": 79520 }, { "epoch": 6.487070723550045, "grad_norm": 4.365388870239258, "learning_rate": 1.6556277946899737e-05, "loss": 0.3968, "num_input_tokens_seen": 75941104, "step": 79525 }, { "epoch": 6.487478587160454, "grad_norm": 0.5284837484359741, "learning_rate": 1.6552927907734306e-05, "loss": 0.6333, "num_input_tokens_seen": 75945920, "step": 79530 }, { "epoch": 6.487886450770862, "grad_norm": 30.572452545166016, "learning_rate": 1.6549578039787436e-05, "loss": 0.7844, "num_input_tokens_seen": 75950336, "step": 79535 }, { "epoch": 6.488294314381271, "grad_norm": 7.223043441772461, "learning_rate": 1.654622834312703e-05, "loss": 0.2243, "num_input_tokens_seen": 75955216, "step": 79540 }, { "epoch": 6.48870217799168, "grad_norm": 1.5806509256362915, "learning_rate": 1.6542878817820973e-05, "loss": 0.7342, "num_input_tokens_seen": 75960512, "step": 79545 }, { "epoch": 6.489110041602088, "grad_norm": 1.9087358713150024, "learning_rate": 1.6539529463937176e-05, "loss": 0.4363, "num_input_tokens_seen": 75965744, "step": 79550 }, { "epoch": 6.489517905212497, "grad_norm": 0.5551214218139648, "learning_rate": 1.6536180281543524e-05, "loss": 0.5506, "num_input_tokens_seen": 75970816, "step": 79555 }, { "epoch": 6.489925768822905, "grad_norm": 0.8991904258728027, "learning_rate": 1.6532831270707898e-05, "loss": 0.2987, "num_input_tokens_seen": 75975536, "step": 79560 }, { "epoch": 6.490333632433314, "grad_norm": 6.632327556610107, "learning_rate": 1.652948243149819e-05, "loss": 0.4001, "num_input_tokens_seen": 75978752, "step": 79565 }, { "epoch": 6.490741496043723, "grad_norm": 2.5342893600463867, "learning_rate": 1.652613376398226e-05, "loss": 0.5723, "num_input_tokens_seen": 75983120, "step": 79570 }, { "epoch": 6.491149359654131, "grad_norm": 3.6243233680725098, "learning_rate": 1.652278526822801e-05, "loss": 0.2258, "num_input_tokens_seen": 75987712, "step": 79575 }, { "epoch": 6.49155722326454, "grad_norm": 2.702765464782715, "learning_rate": 1.65194369443033e-05, "loss": 0.3495, "num_input_tokens_seen": 75992160, "step": 79580 }, { "epoch": 6.4919650868749486, "grad_norm": 4.029116630554199, "learning_rate": 1.6516088792275997e-05, "loss": 0.2499, "num_input_tokens_seen": 75996992, "step": 79585 }, { "epoch": 6.492372950485358, "grad_norm": 0.7387926578521729, "learning_rate": 1.651274081221397e-05, "loss": 0.4373, "num_input_tokens_seen": 76001760, "step": 79590 }, { "epoch": 6.492780814095767, "grad_norm": 1.338066816329956, "learning_rate": 1.6509393004185074e-05, "loss": 0.5096, "num_input_tokens_seen": 76007632, "step": 79595 }, { "epoch": 6.493188677706175, "grad_norm": 4.695964813232422, "learning_rate": 1.6506045368257183e-05, "loss": 0.337, "num_input_tokens_seen": 76012032, "step": 79600 }, { "epoch": 6.493596541316584, "grad_norm": 3.187337636947632, "learning_rate": 1.6502697904498145e-05, "loss": 0.4353, "num_input_tokens_seen": 76017472, "step": 79605 }, { "epoch": 6.494004404926993, "grad_norm": 7.4428534507751465, "learning_rate": 1.6499350612975795e-05, "loss": 0.3951, "num_input_tokens_seen": 76022048, "step": 79610 }, { "epoch": 6.494412268537401, "grad_norm": 1.5099668502807617, "learning_rate": 1.6496003493758006e-05, "loss": 0.2834, "num_input_tokens_seen": 76027184, "step": 79615 }, { "epoch": 6.49482013214781, "grad_norm": 6.7671074867248535, "learning_rate": 1.649265654691261e-05, "loss": 0.4372, "num_input_tokens_seen": 76032512, "step": 79620 }, { "epoch": 6.495227995758219, "grad_norm": 1.4074550867080688, "learning_rate": 1.6489309772507454e-05, "loss": 0.3042, "num_input_tokens_seen": 76037536, "step": 79625 }, { "epoch": 6.495635859368627, "grad_norm": 16.478458404541016, "learning_rate": 1.6485963170610376e-05, "loss": 0.4441, "num_input_tokens_seen": 76042112, "step": 79630 }, { "epoch": 6.496043722979036, "grad_norm": 7.36150598526001, "learning_rate": 1.6482616741289192e-05, "loss": 0.4499, "num_input_tokens_seen": 76046816, "step": 79635 }, { "epoch": 6.496451586589444, "grad_norm": 0.6765109896659851, "learning_rate": 1.647927048461176e-05, "loss": 0.3391, "num_input_tokens_seen": 76051568, "step": 79640 }, { "epoch": 6.496859450199853, "grad_norm": 6.234273910522461, "learning_rate": 1.647592440064589e-05, "loss": 0.2177, "num_input_tokens_seen": 76056320, "step": 79645 }, { "epoch": 6.497267313810262, "grad_norm": 39.02452850341797, "learning_rate": 1.6472578489459417e-05, "loss": 0.5451, "num_input_tokens_seen": 76060848, "step": 79650 }, { "epoch": 6.49767517742067, "grad_norm": 7.017441749572754, "learning_rate": 1.6469232751120146e-05, "loss": 0.3654, "num_input_tokens_seen": 76064384, "step": 79655 }, { "epoch": 6.498083041031079, "grad_norm": 1.9855635166168213, "learning_rate": 1.6465887185695906e-05, "loss": 0.4086, "num_input_tokens_seen": 76068176, "step": 79660 }, { "epoch": 6.498490904641488, "grad_norm": 21.161867141723633, "learning_rate": 1.646254179325451e-05, "loss": 0.6013, "num_input_tokens_seen": 76072176, "step": 79665 }, { "epoch": 6.498898768251896, "grad_norm": 2.0848803520202637, "learning_rate": 1.645919657386376e-05, "loss": 0.6582, "num_input_tokens_seen": 76077216, "step": 79670 }, { "epoch": 6.499306631862305, "grad_norm": 0.7325525879859924, "learning_rate": 1.645585152759147e-05, "loss": 0.3886, "num_input_tokens_seen": 76082912, "step": 79675 }, { "epoch": 6.4997144954727135, "grad_norm": 37.2886848449707, "learning_rate": 1.6452506654505426e-05, "loss": 0.6754, "num_input_tokens_seen": 76088192, "step": 79680 }, { "epoch": 6.5001223590831225, "grad_norm": 14.465478897094727, "learning_rate": 1.6449161954673452e-05, "loss": 0.3885, "num_input_tokens_seen": 76093600, "step": 79685 }, { "epoch": 6.5005302226935315, "grad_norm": 1.4464350938796997, "learning_rate": 1.6445817428163328e-05, "loss": 0.4941, "num_input_tokens_seen": 76098208, "step": 79690 }, { "epoch": 6.5005302226935315, "eval_loss": 0.35939252376556396, "eval_runtime": 570.9866, "eval_samples_per_second": 4.772, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 76098208, "step": 79690 }, { "epoch": 6.50093808630394, "grad_norm": 1.64451265335083, "learning_rate": 1.6442473075042853e-05, "loss": 0.3151, "num_input_tokens_seen": 76103056, "step": 79695 }, { "epoch": 6.501345949914349, "grad_norm": 0.6139971017837524, "learning_rate": 1.6439128895379802e-05, "loss": 0.3044, "num_input_tokens_seen": 76107920, "step": 79700 }, { "epoch": 6.501753813524758, "grad_norm": 0.5129449367523193, "learning_rate": 1.643578488924198e-05, "loss": 0.5243, "num_input_tokens_seen": 76112864, "step": 79705 }, { "epoch": 6.502161677135166, "grad_norm": 17.819866180419922, "learning_rate": 1.643244105669716e-05, "loss": 0.4517, "num_input_tokens_seen": 76117168, "step": 79710 }, { "epoch": 6.502569540745575, "grad_norm": 17.99888801574707, "learning_rate": 1.642909739781312e-05, "loss": 0.3719, "num_input_tokens_seen": 76122320, "step": 79715 }, { "epoch": 6.502977404355983, "grad_norm": 0.9152053594589233, "learning_rate": 1.642575391265763e-05, "loss": 0.3175, "num_input_tokens_seen": 76127360, "step": 79720 }, { "epoch": 6.503385267966392, "grad_norm": 1.031288504600525, "learning_rate": 1.6422410601298465e-05, "loss": 0.3493, "num_input_tokens_seen": 76132016, "step": 79725 }, { "epoch": 6.503793131576801, "grad_norm": 1.022382140159607, "learning_rate": 1.6419067463803393e-05, "loss": 0.3402, "num_input_tokens_seen": 76136544, "step": 79730 }, { "epoch": 6.504200995187209, "grad_norm": 12.043181419372559, "learning_rate": 1.641572450024018e-05, "loss": 0.4413, "num_input_tokens_seen": 76141664, "step": 79735 }, { "epoch": 6.504608858797618, "grad_norm": 2.331315517425537, "learning_rate": 1.6412381710676584e-05, "loss": 0.3693, "num_input_tokens_seen": 76145920, "step": 79740 }, { "epoch": 6.505016722408027, "grad_norm": 14.298135757446289, "learning_rate": 1.6409039095180352e-05, "loss": 0.306, "num_input_tokens_seen": 76150960, "step": 79745 }, { "epoch": 6.505424586018435, "grad_norm": 1.3388099670410156, "learning_rate": 1.6405696653819254e-05, "loss": 0.4532, "num_input_tokens_seen": 76156016, "step": 79750 }, { "epoch": 6.505832449628844, "grad_norm": 1.3873295783996582, "learning_rate": 1.6402354386661035e-05, "loss": 0.3359, "num_input_tokens_seen": 76161584, "step": 79755 }, { "epoch": 6.506240313239253, "grad_norm": 28.581317901611328, "learning_rate": 1.6399012293773435e-05, "loss": 0.2829, "num_input_tokens_seen": 76166752, "step": 79760 }, { "epoch": 6.506648176849661, "grad_norm": 6.764204502105713, "learning_rate": 1.6395670375224206e-05, "loss": 0.458, "num_input_tokens_seen": 76171456, "step": 79765 }, { "epoch": 6.50705604046007, "grad_norm": 3.5436794757843018, "learning_rate": 1.6392328631081073e-05, "loss": 0.3621, "num_input_tokens_seen": 76175648, "step": 79770 }, { "epoch": 6.5074639040704785, "grad_norm": 14.49715518951416, "learning_rate": 1.6388987061411786e-05, "loss": 0.3349, "num_input_tokens_seen": 76179792, "step": 79775 }, { "epoch": 6.5078717676808875, "grad_norm": 0.7915396690368652, "learning_rate": 1.6385645666284076e-05, "loss": 0.3392, "num_input_tokens_seen": 76184576, "step": 79780 }, { "epoch": 6.5082796312912965, "grad_norm": 68.19509887695312, "learning_rate": 1.6382304445765666e-05, "loss": 0.4451, "num_input_tokens_seen": 76189072, "step": 79785 }, { "epoch": 6.508687494901705, "grad_norm": 2.3418335914611816, "learning_rate": 1.637896339992428e-05, "loss": 0.2722, "num_input_tokens_seen": 76194656, "step": 79790 }, { "epoch": 6.509095358512114, "grad_norm": 25.2262020111084, "learning_rate": 1.6375622528827646e-05, "loss": 0.3123, "num_input_tokens_seen": 76199504, "step": 79795 }, { "epoch": 6.509503222122522, "grad_norm": 8.579683303833008, "learning_rate": 1.637228183254348e-05, "loss": 0.3453, "num_input_tokens_seen": 76203872, "step": 79800 }, { "epoch": 6.509911085732931, "grad_norm": 3.7233469486236572, "learning_rate": 1.6368941311139495e-05, "loss": 0.2658, "num_input_tokens_seen": 76208384, "step": 79805 }, { "epoch": 6.51031894934334, "grad_norm": 0.42190107703208923, "learning_rate": 1.636560096468339e-05, "loss": 0.3561, "num_input_tokens_seen": 76212832, "step": 79810 }, { "epoch": 6.510726812953748, "grad_norm": 0.3790804445743561, "learning_rate": 1.6362260793242894e-05, "loss": 0.272, "num_input_tokens_seen": 76217888, "step": 79815 }, { "epoch": 6.511134676564157, "grad_norm": 0.4457681477069855, "learning_rate": 1.6358920796885705e-05, "loss": 0.2827, "num_input_tokens_seen": 76221824, "step": 79820 }, { "epoch": 6.511542540174566, "grad_norm": 0.6100639700889587, "learning_rate": 1.635558097567952e-05, "loss": 0.2882, "num_input_tokens_seen": 76226464, "step": 79825 }, { "epoch": 6.511950403784974, "grad_norm": 0.38623788952827454, "learning_rate": 1.635224132969203e-05, "loss": 0.2968, "num_input_tokens_seen": 76231264, "step": 79830 }, { "epoch": 6.512358267395383, "grad_norm": 1.2891826629638672, "learning_rate": 1.634890185899093e-05, "loss": 0.3353, "num_input_tokens_seen": 76235680, "step": 79835 }, { "epoch": 6.512766131005792, "grad_norm": 11.208295822143555, "learning_rate": 1.6345562563643917e-05, "loss": 0.3144, "num_input_tokens_seen": 76239936, "step": 79840 }, { "epoch": 6.5131739946162, "grad_norm": 0.3742932081222534, "learning_rate": 1.6342223443718677e-05, "loss": 0.2523, "num_input_tokens_seen": 76244416, "step": 79845 }, { "epoch": 6.513581858226609, "grad_norm": 43.35251235961914, "learning_rate": 1.6338884499282886e-05, "loss": 0.4234, "num_input_tokens_seen": 76249408, "step": 79850 }, { "epoch": 6.513989721837017, "grad_norm": 1.5338099002838135, "learning_rate": 1.633554573040422e-05, "loss": 0.4601, "num_input_tokens_seen": 76253936, "step": 79855 }, { "epoch": 6.514397585447426, "grad_norm": 11.171422004699707, "learning_rate": 1.6332207137150374e-05, "loss": 0.3474, "num_input_tokens_seen": 76258672, "step": 79860 }, { "epoch": 6.514805449057835, "grad_norm": 8.697175979614258, "learning_rate": 1.6328868719588998e-05, "loss": 0.3952, "num_input_tokens_seen": 76263168, "step": 79865 }, { "epoch": 6.515213312668243, "grad_norm": 17.31875991821289, "learning_rate": 1.6325530477787772e-05, "loss": 0.4326, "num_input_tokens_seen": 76267632, "step": 79870 }, { "epoch": 6.5156211762786524, "grad_norm": 29.846477508544922, "learning_rate": 1.6322192411814357e-05, "loss": 0.3892, "num_input_tokens_seen": 76272352, "step": 79875 }, { "epoch": 6.5160290398890615, "grad_norm": 0.8898743391036987, "learning_rate": 1.6318854521736403e-05, "loss": 0.2727, "num_input_tokens_seen": 76277472, "step": 79880 }, { "epoch": 6.51643690349947, "grad_norm": 0.7240332365036011, "learning_rate": 1.631551680762159e-05, "loss": 0.2726, "num_input_tokens_seen": 76282528, "step": 79885 }, { "epoch": 6.516844767109879, "grad_norm": 0.27538296580314636, "learning_rate": 1.6312179269537558e-05, "loss": 0.3939, "num_input_tokens_seen": 76287136, "step": 79890 }, { "epoch": 6.517252630720288, "grad_norm": 2.7415218353271484, "learning_rate": 1.6308841907551965e-05, "loss": 0.2892, "num_input_tokens_seen": 76291984, "step": 79895 }, { "epoch": 6.517660494330696, "grad_norm": 0.9007331132888794, "learning_rate": 1.630550472173244e-05, "loss": 0.3732, "num_input_tokens_seen": 76296448, "step": 79900 }, { "epoch": 6.518068357941105, "grad_norm": 0.5003803968429565, "learning_rate": 1.6302167712146655e-05, "loss": 0.1328, "num_input_tokens_seen": 76301488, "step": 79905 }, { "epoch": 6.518476221551513, "grad_norm": 2.915361166000366, "learning_rate": 1.629883087886223e-05, "loss": 0.4163, "num_input_tokens_seen": 76306432, "step": 79910 }, { "epoch": 6.518884085161922, "grad_norm": 0.4229426980018616, "learning_rate": 1.6295494221946807e-05, "loss": 0.3352, "num_input_tokens_seen": 76311328, "step": 79915 }, { "epoch": 6.519291948772331, "grad_norm": 1.4250597953796387, "learning_rate": 1.6292157741468016e-05, "loss": 0.3965, "num_input_tokens_seen": 76316160, "step": 79920 }, { "epoch": 6.519699812382739, "grad_norm": 1.3128674030303955, "learning_rate": 1.6288821437493486e-05, "loss": 0.2984, "num_input_tokens_seen": 76320704, "step": 79925 }, { "epoch": 6.520107675993148, "grad_norm": 0.5124990940093994, "learning_rate": 1.628548531009084e-05, "loss": 0.4429, "num_input_tokens_seen": 76326000, "step": 79930 }, { "epoch": 6.520515539603556, "grad_norm": 25.478534698486328, "learning_rate": 1.6282149359327716e-05, "loss": 0.2324, "num_input_tokens_seen": 76330288, "step": 79935 }, { "epoch": 6.520923403213965, "grad_norm": 25.862308502197266, "learning_rate": 1.6278813585271706e-05, "loss": 0.3176, "num_input_tokens_seen": 76333824, "step": 79940 }, { "epoch": 6.521331266824374, "grad_norm": 1.721506953239441, "learning_rate": 1.627547798799044e-05, "loss": 0.2686, "num_input_tokens_seen": 76337728, "step": 79945 }, { "epoch": 6.521739130434782, "grad_norm": 2.6966612339019775, "learning_rate": 1.6272142567551534e-05, "loss": 0.3371, "num_input_tokens_seen": 76342464, "step": 79950 }, { "epoch": 6.522146994045191, "grad_norm": 29.591819763183594, "learning_rate": 1.6268807324022593e-05, "loss": 0.2966, "num_input_tokens_seen": 76346816, "step": 79955 }, { "epoch": 6.5225548576556, "grad_norm": 0.39938950538635254, "learning_rate": 1.6265472257471215e-05, "loss": 0.391, "num_input_tokens_seen": 76352256, "step": 79960 }, { "epoch": 6.522962721266008, "grad_norm": 0.43605780601501465, "learning_rate": 1.6262137367965003e-05, "loss": 0.3223, "num_input_tokens_seen": 76357280, "step": 79965 }, { "epoch": 6.523370584876417, "grad_norm": 9.055547714233398, "learning_rate": 1.625880265557154e-05, "loss": 0.451, "num_input_tokens_seen": 76362352, "step": 79970 }, { "epoch": 6.523778448486826, "grad_norm": 1.0084292888641357, "learning_rate": 1.6255468120358448e-05, "loss": 0.3003, "num_input_tokens_seen": 76367088, "step": 79975 }, { "epoch": 6.5241863120972345, "grad_norm": 0.2900093197822571, "learning_rate": 1.62521337623933e-05, "loss": 0.346, "num_input_tokens_seen": 76371888, "step": 79980 }, { "epoch": 6.5245941757076436, "grad_norm": 2.3350577354431152, "learning_rate": 1.624879958174368e-05, "loss": 0.4695, "num_input_tokens_seen": 76377232, "step": 79985 }, { "epoch": 6.525002039318052, "grad_norm": 2.5141825675964355, "learning_rate": 1.6245465578477173e-05, "loss": 0.3208, "num_input_tokens_seen": 76381616, "step": 79990 }, { "epoch": 6.525409902928461, "grad_norm": 34.519168853759766, "learning_rate": 1.624213175266136e-05, "loss": 0.3428, "num_input_tokens_seen": 76386768, "step": 79995 }, { "epoch": 6.52581776653887, "grad_norm": 32.34785842895508, "learning_rate": 1.6238798104363813e-05, "loss": 0.3514, "num_input_tokens_seen": 76392656, "step": 80000 }, { "epoch": 6.526225630149278, "grad_norm": 9.72482967376709, "learning_rate": 1.623546463365211e-05, "loss": 0.323, "num_input_tokens_seen": 76397696, "step": 80005 }, { "epoch": 6.526633493759687, "grad_norm": 3.858943223953247, "learning_rate": 1.623213134059381e-05, "loss": 0.2836, "num_input_tokens_seen": 76403104, "step": 80010 }, { "epoch": 6.527041357370095, "grad_norm": 2.8028807640075684, "learning_rate": 1.6228798225256487e-05, "loss": 0.4118, "num_input_tokens_seen": 76406992, "step": 80015 }, { "epoch": 6.527449220980504, "grad_norm": 10.867321968078613, "learning_rate": 1.6225465287707697e-05, "loss": 0.3586, "num_input_tokens_seen": 76412336, "step": 80020 }, { "epoch": 6.527857084590913, "grad_norm": 15.535052299499512, "learning_rate": 1.6222132528014995e-05, "loss": 0.3654, "num_input_tokens_seen": 76417536, "step": 80025 }, { "epoch": 6.528264948201322, "grad_norm": 0.8809248208999634, "learning_rate": 1.621879994624594e-05, "loss": 0.3046, "num_input_tokens_seen": 76422000, "step": 80030 }, { "epoch": 6.52867281181173, "grad_norm": 0.773426353931427, "learning_rate": 1.6215467542468066e-05, "loss": 0.3069, "num_input_tokens_seen": 76427472, "step": 80035 }, { "epoch": 6.529080675422139, "grad_norm": 1.1509298086166382, "learning_rate": 1.6212135316748942e-05, "loss": 0.2554, "num_input_tokens_seen": 76432240, "step": 80040 }, { "epoch": 6.529488539032547, "grad_norm": 13.771764755249023, "learning_rate": 1.6208803269156104e-05, "loss": 0.3617, "num_input_tokens_seen": 76436032, "step": 80045 }, { "epoch": 6.529896402642956, "grad_norm": 0.49187231063842773, "learning_rate": 1.6205471399757084e-05, "loss": 0.2515, "num_input_tokens_seen": 76439888, "step": 80050 }, { "epoch": 6.530304266253365, "grad_norm": 42.064842224121094, "learning_rate": 1.6202139708619417e-05, "loss": 0.3242, "num_input_tokens_seen": 76444576, "step": 80055 }, { "epoch": 6.530712129863773, "grad_norm": 1.889499545097351, "learning_rate": 1.619880819581065e-05, "loss": 0.3707, "num_input_tokens_seen": 76449808, "step": 80060 }, { "epoch": 6.531119993474182, "grad_norm": 0.3614882826805115, "learning_rate": 1.619547686139829e-05, "loss": 0.3174, "num_input_tokens_seen": 76454400, "step": 80065 }, { "epoch": 6.5315278570845905, "grad_norm": 0.8612968325614929, "learning_rate": 1.619214570544988e-05, "loss": 0.3515, "num_input_tokens_seen": 76459312, "step": 80070 }, { "epoch": 6.5319357206949995, "grad_norm": 1.608249545097351, "learning_rate": 1.6188814728032938e-05, "loss": 0.2891, "num_input_tokens_seen": 76464272, "step": 80075 }, { "epoch": 6.5323435843054085, "grad_norm": 0.6849706172943115, "learning_rate": 1.6185483929214963e-05, "loss": 0.2714, "num_input_tokens_seen": 76469008, "step": 80080 }, { "epoch": 6.532751447915817, "grad_norm": 0.3902210295200348, "learning_rate": 1.61821533090635e-05, "loss": 0.341, "num_input_tokens_seen": 76474176, "step": 80085 }, { "epoch": 6.533159311526226, "grad_norm": 1.2269819974899292, "learning_rate": 1.617882286764603e-05, "loss": 0.3262, "num_input_tokens_seen": 76479248, "step": 80090 }, { "epoch": 6.533567175136635, "grad_norm": 0.4613976776599884, "learning_rate": 1.617549260503008e-05, "loss": 0.3889, "num_input_tokens_seen": 76484240, "step": 80095 }, { "epoch": 6.533975038747043, "grad_norm": 0.5832318663597107, "learning_rate": 1.617216252128313e-05, "loss": 0.2364, "num_input_tokens_seen": 76489488, "step": 80100 }, { "epoch": 6.534382902357452, "grad_norm": 0.1942254900932312, "learning_rate": 1.616883261647271e-05, "loss": 0.3262, "num_input_tokens_seen": 76494672, "step": 80105 }, { "epoch": 6.534790765967861, "grad_norm": 0.48670315742492676, "learning_rate": 1.61655028906663e-05, "loss": 0.3304, "num_input_tokens_seen": 76499264, "step": 80110 }, { "epoch": 6.535198629578269, "grad_norm": 2.247020959854126, "learning_rate": 1.616217334393139e-05, "loss": 0.3446, "num_input_tokens_seen": 76504048, "step": 80115 }, { "epoch": 6.535606493188678, "grad_norm": 1.3475563526153564, "learning_rate": 1.615884397633547e-05, "loss": 0.2754, "num_input_tokens_seen": 76508368, "step": 80120 }, { "epoch": 6.536014356799086, "grad_norm": 0.638361394405365, "learning_rate": 1.615551478794602e-05, "loss": 0.364, "num_input_tokens_seen": 76513296, "step": 80125 }, { "epoch": 6.536422220409495, "grad_norm": 0.41988813877105713, "learning_rate": 1.6152185778830532e-05, "loss": 0.3211, "num_input_tokens_seen": 76518704, "step": 80130 }, { "epoch": 6.536830084019904, "grad_norm": 4.645815372467041, "learning_rate": 1.614885694905648e-05, "loss": 0.2996, "num_input_tokens_seen": 76523584, "step": 80135 }, { "epoch": 6.537237947630312, "grad_norm": 0.5096820592880249, "learning_rate": 1.6145528298691335e-05, "loss": 0.3193, "num_input_tokens_seen": 76528256, "step": 80140 }, { "epoch": 6.537645811240721, "grad_norm": 1.5690131187438965, "learning_rate": 1.6142199827802572e-05, "loss": 0.3382, "num_input_tokens_seen": 76532912, "step": 80145 }, { "epoch": 6.538053674851129, "grad_norm": 8.320243835449219, "learning_rate": 1.613887153645765e-05, "loss": 0.3476, "num_input_tokens_seen": 76537792, "step": 80150 }, { "epoch": 6.538461538461538, "grad_norm": 0.4147116541862488, "learning_rate": 1.6135543424724044e-05, "loss": 0.3154, "num_input_tokens_seen": 76542688, "step": 80155 }, { "epoch": 6.538869402071947, "grad_norm": 31.907894134521484, "learning_rate": 1.61322154926692e-05, "loss": 0.4756, "num_input_tokens_seen": 76547104, "step": 80160 }, { "epoch": 6.5392772656823555, "grad_norm": 2.034062623977661, "learning_rate": 1.612888774036058e-05, "loss": 0.3178, "num_input_tokens_seen": 76552144, "step": 80165 }, { "epoch": 6.5396851292927645, "grad_norm": 0.4775031507015228, "learning_rate": 1.6125560167865634e-05, "loss": 0.3468, "num_input_tokens_seen": 76557424, "step": 80170 }, { "epoch": 6.5400929929031735, "grad_norm": 1.563733696937561, "learning_rate": 1.6122232775251815e-05, "loss": 0.3446, "num_input_tokens_seen": 76562048, "step": 80175 }, { "epoch": 6.540500856513582, "grad_norm": 44.929141998291016, "learning_rate": 1.6118905562586567e-05, "loss": 0.4731, "num_input_tokens_seen": 76567056, "step": 80180 }, { "epoch": 6.540908720123991, "grad_norm": 0.7039458751678467, "learning_rate": 1.611557852993733e-05, "loss": 0.3873, "num_input_tokens_seen": 76570912, "step": 80185 }, { "epoch": 6.5413165837344, "grad_norm": 0.7238860726356506, "learning_rate": 1.6112251677371527e-05, "loss": 0.2886, "num_input_tokens_seen": 76575936, "step": 80190 }, { "epoch": 6.541724447344808, "grad_norm": 30.764930725097656, "learning_rate": 1.6108925004956623e-05, "loss": 0.2824, "num_input_tokens_seen": 76581248, "step": 80195 }, { "epoch": 6.542132310955217, "grad_norm": 13.972363471984863, "learning_rate": 1.6105598512760026e-05, "loss": 0.3032, "num_input_tokens_seen": 76585280, "step": 80200 }, { "epoch": 6.542540174565625, "grad_norm": 1.2009223699569702, "learning_rate": 1.6102272200849167e-05, "loss": 0.2792, "num_input_tokens_seen": 76589968, "step": 80205 }, { "epoch": 6.542948038176034, "grad_norm": 2.1939570903778076, "learning_rate": 1.6098946069291468e-05, "loss": 0.2442, "num_input_tokens_seen": 76594544, "step": 80210 }, { "epoch": 6.543355901786443, "grad_norm": 8.346623420715332, "learning_rate": 1.6095620118154354e-05, "loss": 0.3136, "num_input_tokens_seen": 76600352, "step": 80215 }, { "epoch": 6.543763765396851, "grad_norm": 0.915327787399292, "learning_rate": 1.6092294347505237e-05, "loss": 0.2893, "num_input_tokens_seen": 76605200, "step": 80220 }, { "epoch": 6.54417162900726, "grad_norm": 50.58669662475586, "learning_rate": 1.6088968757411526e-05, "loss": 0.4133, "num_input_tokens_seen": 76610256, "step": 80225 }, { "epoch": 6.544579492617669, "grad_norm": 24.866962432861328, "learning_rate": 1.608564334794063e-05, "loss": 0.3237, "num_input_tokens_seen": 76614672, "step": 80230 }, { "epoch": 6.544987356228077, "grad_norm": 1.7256594896316528, "learning_rate": 1.608231811915995e-05, "loss": 0.3906, "num_input_tokens_seen": 76620320, "step": 80235 }, { "epoch": 6.545395219838486, "grad_norm": 95.47642517089844, "learning_rate": 1.6078993071136904e-05, "loss": 0.3811, "num_input_tokens_seen": 76624736, "step": 80240 }, { "epoch": 6.545803083448895, "grad_norm": 0.45921748876571655, "learning_rate": 1.6075668203938874e-05, "loss": 0.3007, "num_input_tokens_seen": 76629136, "step": 80245 }, { "epoch": 6.546210947059303, "grad_norm": 1.9931490421295166, "learning_rate": 1.607234351763326e-05, "loss": 0.4439, "num_input_tokens_seen": 76633136, "step": 80250 }, { "epoch": 6.546618810669712, "grad_norm": 2.049318790435791, "learning_rate": 1.6069019012287433e-05, "loss": 0.2693, "num_input_tokens_seen": 76637040, "step": 80255 }, { "epoch": 6.54702667428012, "grad_norm": 0.36707553267478943, "learning_rate": 1.6065694687968814e-05, "loss": 0.2736, "num_input_tokens_seen": 76641792, "step": 80260 }, { "epoch": 6.547434537890529, "grad_norm": 0.32694584131240845, "learning_rate": 1.6062370544744763e-05, "loss": 0.3085, "num_input_tokens_seen": 76647312, "step": 80265 }, { "epoch": 6.547842401500938, "grad_norm": 1.2909694910049438, "learning_rate": 1.6059046582682664e-05, "loss": 0.2816, "num_input_tokens_seen": 76651936, "step": 80270 }, { "epoch": 6.548250265111347, "grad_norm": 0.5414707064628601, "learning_rate": 1.605572280184989e-05, "loss": 0.3489, "num_input_tokens_seen": 76657632, "step": 80275 }, { "epoch": 6.548658128721756, "grad_norm": 0.25571101903915405, "learning_rate": 1.605239920231381e-05, "loss": 0.3666, "num_input_tokens_seen": 76662496, "step": 80280 }, { "epoch": 6.549065992332164, "grad_norm": 8.116660118103027, "learning_rate": 1.6049075784141798e-05, "loss": 0.4807, "num_input_tokens_seen": 76667552, "step": 80285 }, { "epoch": 6.549473855942573, "grad_norm": 0.40430134534835815, "learning_rate": 1.6045752547401222e-05, "loss": 0.2894, "num_input_tokens_seen": 76671424, "step": 80290 }, { "epoch": 6.549881719552982, "grad_norm": 0.27243027091026306, "learning_rate": 1.6042429492159434e-05, "loss": 0.319, "num_input_tokens_seen": 76675584, "step": 80295 }, { "epoch": 6.55028958316339, "grad_norm": 0.9645888209342957, "learning_rate": 1.6039106618483783e-05, "loss": 0.3762, "num_input_tokens_seen": 76679872, "step": 80300 }, { "epoch": 6.550697446773799, "grad_norm": 0.6928995251655579, "learning_rate": 1.6035783926441646e-05, "loss": 0.3004, "num_input_tokens_seen": 76684032, "step": 80305 }, { "epoch": 6.551105310384208, "grad_norm": 57.1018180847168, "learning_rate": 1.6032461416100357e-05, "loss": 0.383, "num_input_tokens_seen": 76689152, "step": 80310 }, { "epoch": 6.551513173994616, "grad_norm": 0.7087828516960144, "learning_rate": 1.6029139087527267e-05, "loss": 0.4021, "num_input_tokens_seen": 76693968, "step": 80315 }, { "epoch": 6.551921037605025, "grad_norm": 0.4846532642841339, "learning_rate": 1.602581694078972e-05, "loss": 0.3212, "num_input_tokens_seen": 76698608, "step": 80320 }, { "epoch": 6.552328901215434, "grad_norm": 0.770698070526123, "learning_rate": 1.602249497595503e-05, "loss": 0.3763, "num_input_tokens_seen": 76704080, "step": 80325 }, { "epoch": 6.552736764825842, "grad_norm": 0.9031305313110352, "learning_rate": 1.6019173193090566e-05, "loss": 0.3365, "num_input_tokens_seen": 76708400, "step": 80330 }, { "epoch": 6.553144628436251, "grad_norm": 0.19427943229675293, "learning_rate": 1.6015851592263647e-05, "loss": 0.3313, "num_input_tokens_seen": 76713760, "step": 80335 }, { "epoch": 6.553552492046659, "grad_norm": 0.6158091425895691, "learning_rate": 1.60125301735416e-05, "loss": 0.2885, "num_input_tokens_seen": 76718368, "step": 80340 }, { "epoch": 6.553960355657068, "grad_norm": 0.4567420482635498, "learning_rate": 1.6009208936991737e-05, "loss": 0.3411, "num_input_tokens_seen": 76723904, "step": 80345 }, { "epoch": 6.554368219267477, "grad_norm": 0.7927287220954895, "learning_rate": 1.6005887882681396e-05, "loss": 0.3341, "num_input_tokens_seen": 76728512, "step": 80350 }, { "epoch": 6.554776082877885, "grad_norm": 0.8264813423156738, "learning_rate": 1.600256701067789e-05, "loss": 0.3485, "num_input_tokens_seen": 76733328, "step": 80355 }, { "epoch": 6.555183946488294, "grad_norm": 3.081509828567505, "learning_rate": 1.5999246321048523e-05, "loss": 0.334, "num_input_tokens_seen": 76738112, "step": 80360 }, { "epoch": 6.5555918100987025, "grad_norm": 0.9396960139274597, "learning_rate": 1.5995925813860595e-05, "loss": 0.359, "num_input_tokens_seen": 76742896, "step": 80365 }, { "epoch": 6.5559996737091115, "grad_norm": 3.09183669090271, "learning_rate": 1.5992605489181443e-05, "loss": 0.3286, "num_input_tokens_seen": 76746928, "step": 80370 }, { "epoch": 6.5564075373195205, "grad_norm": 0.5999053120613098, "learning_rate": 1.5989285347078345e-05, "loss": 0.3144, "num_input_tokens_seen": 76751792, "step": 80375 }, { "epoch": 6.556815400929929, "grad_norm": 1.4663095474243164, "learning_rate": 1.5985965387618607e-05, "loss": 0.2759, "num_input_tokens_seen": 76757456, "step": 80380 }, { "epoch": 6.557223264540338, "grad_norm": 0.32832950353622437, "learning_rate": 1.5982645610869518e-05, "loss": 0.2581, "num_input_tokens_seen": 76762512, "step": 80385 }, { "epoch": 6.557631128150747, "grad_norm": 0.6562579870223999, "learning_rate": 1.597932601689836e-05, "loss": 0.3381, "num_input_tokens_seen": 76766336, "step": 80390 }, { "epoch": 6.558038991761155, "grad_norm": 4.080273151397705, "learning_rate": 1.597600660577244e-05, "loss": 0.4216, "num_input_tokens_seen": 76771296, "step": 80395 }, { "epoch": 6.558446855371564, "grad_norm": 0.37470296025276184, "learning_rate": 1.5972687377559036e-05, "loss": 0.2818, "num_input_tokens_seen": 76777200, "step": 80400 }, { "epoch": 6.558854718981973, "grad_norm": 1.248002529144287, "learning_rate": 1.5969368332325417e-05, "loss": 0.2992, "num_input_tokens_seen": 76781952, "step": 80405 }, { "epoch": 6.559262582592381, "grad_norm": 1.2029486894607544, "learning_rate": 1.5966049470138868e-05, "loss": 0.2771, "num_input_tokens_seen": 76786688, "step": 80410 }, { "epoch": 6.55967044620279, "grad_norm": 3.459216356277466, "learning_rate": 1.5962730791066656e-05, "loss": 0.2698, "num_input_tokens_seen": 76791664, "step": 80415 }, { "epoch": 6.560078309813198, "grad_norm": 0.611774206161499, "learning_rate": 1.5959412295176052e-05, "loss": 0.3827, "num_input_tokens_seen": 76796384, "step": 80420 }, { "epoch": 6.560486173423607, "grad_norm": 0.540084183216095, "learning_rate": 1.595609398253432e-05, "loss": 0.3145, "num_input_tokens_seen": 76801104, "step": 80425 }, { "epoch": 6.560894037034016, "grad_norm": 1.109574556350708, "learning_rate": 1.5952775853208718e-05, "loss": 0.3396, "num_input_tokens_seen": 76805968, "step": 80430 }, { "epoch": 6.561301900644424, "grad_norm": 5.01609468460083, "learning_rate": 1.5949457907266496e-05, "loss": 0.4348, "num_input_tokens_seen": 76809584, "step": 80435 }, { "epoch": 6.561709764254833, "grad_norm": 4.537898540496826, "learning_rate": 1.5946140144774928e-05, "loss": 0.3103, "num_input_tokens_seen": 76814080, "step": 80440 }, { "epoch": 6.562117627865242, "grad_norm": 1.0112285614013672, "learning_rate": 1.594282256580125e-05, "loss": 0.3426, "num_input_tokens_seen": 76818704, "step": 80445 }, { "epoch": 6.56252549147565, "grad_norm": 61.92629623413086, "learning_rate": 1.593950517041271e-05, "loss": 0.3486, "num_input_tokens_seen": 76823968, "step": 80450 }, { "epoch": 6.562933355086059, "grad_norm": 0.51070237159729, "learning_rate": 1.593618795867654e-05, "loss": 0.292, "num_input_tokens_seen": 76829008, "step": 80455 }, { "epoch": 6.563341218696468, "grad_norm": 9.346694946289062, "learning_rate": 1.5932870930659998e-05, "loss": 0.2926, "num_input_tokens_seen": 76833584, "step": 80460 }, { "epoch": 6.5637490823068765, "grad_norm": 1.38307785987854, "learning_rate": 1.5929554086430315e-05, "loss": 0.3244, "num_input_tokens_seen": 76837776, "step": 80465 }, { "epoch": 6.5641569459172855, "grad_norm": 1.519832730293274, "learning_rate": 1.5926237426054714e-05, "loss": 0.4023, "num_input_tokens_seen": 76842368, "step": 80470 }, { "epoch": 6.564564809527694, "grad_norm": 20.936311721801758, "learning_rate": 1.592292094960042e-05, "loss": 0.3794, "num_input_tokens_seen": 76846912, "step": 80475 }, { "epoch": 6.564972673138103, "grad_norm": 0.5861212611198425, "learning_rate": 1.591960465713466e-05, "loss": 0.2832, "num_input_tokens_seen": 76851056, "step": 80480 }, { "epoch": 6.565380536748512, "grad_norm": 0.5831378698348999, "learning_rate": 1.591628854872466e-05, "loss": 0.3206, "num_input_tokens_seen": 76856112, "step": 80485 }, { "epoch": 6.56578840035892, "grad_norm": 3.7452728748321533, "learning_rate": 1.5912972624437627e-05, "loss": 0.3691, "num_input_tokens_seen": 76861712, "step": 80490 }, { "epoch": 6.566196263969329, "grad_norm": 0.624955415725708, "learning_rate": 1.5909656884340785e-05, "loss": 0.4644, "num_input_tokens_seen": 76866240, "step": 80495 }, { "epoch": 6.566604127579737, "grad_norm": 1.8316694498062134, "learning_rate": 1.590634132850132e-05, "loss": 0.366, "num_input_tokens_seen": 76871184, "step": 80500 }, { "epoch": 6.567011991190146, "grad_norm": 0.41484779119491577, "learning_rate": 1.5903025956986465e-05, "loss": 0.3272, "num_input_tokens_seen": 76876288, "step": 80505 }, { "epoch": 6.567419854800555, "grad_norm": 2.911898612976074, "learning_rate": 1.5899710769863408e-05, "loss": 0.3409, "num_input_tokens_seen": 76881904, "step": 80510 }, { "epoch": 6.567827718410963, "grad_norm": 1.1779240369796753, "learning_rate": 1.5896395767199342e-05, "loss": 0.3074, "num_input_tokens_seen": 76886768, "step": 80515 }, { "epoch": 6.568235582021372, "grad_norm": 1.8518720865249634, "learning_rate": 1.589308094906147e-05, "loss": 0.2251, "num_input_tokens_seen": 76890768, "step": 80520 }, { "epoch": 6.568643445631781, "grad_norm": 3.2392797470092773, "learning_rate": 1.588976631551697e-05, "loss": 0.2554, "num_input_tokens_seen": 76895616, "step": 80525 }, { "epoch": 6.569051309242189, "grad_norm": 3.2610676288604736, "learning_rate": 1.588645186663304e-05, "loss": 0.4199, "num_input_tokens_seen": 76900448, "step": 80530 }, { "epoch": 6.569459172852598, "grad_norm": 25.280426025390625, "learning_rate": 1.588313760247686e-05, "loss": 0.2898, "num_input_tokens_seen": 76904928, "step": 80535 }, { "epoch": 6.569867036463007, "grad_norm": 1.795593500137329, "learning_rate": 1.5879823523115605e-05, "loss": 0.3355, "num_input_tokens_seen": 76910352, "step": 80540 }, { "epoch": 6.570274900073415, "grad_norm": 14.786322593688965, "learning_rate": 1.587650962861645e-05, "loss": 0.2655, "num_input_tokens_seen": 76915536, "step": 80545 }, { "epoch": 6.570682763683824, "grad_norm": 12.849618911743164, "learning_rate": 1.5873195919046572e-05, "loss": 0.3869, "num_input_tokens_seen": 76920448, "step": 80550 }, { "epoch": 6.571090627294232, "grad_norm": 0.42531007528305054, "learning_rate": 1.5869882394473133e-05, "loss": 0.3101, "num_input_tokens_seen": 76925216, "step": 80555 }, { "epoch": 6.571498490904641, "grad_norm": 5.899432182312012, "learning_rate": 1.58665690549633e-05, "loss": 0.3237, "num_input_tokens_seen": 76929728, "step": 80560 }, { "epoch": 6.5719063545150505, "grad_norm": 0.7267736792564392, "learning_rate": 1.586325590058422e-05, "loss": 0.3014, "num_input_tokens_seen": 76935104, "step": 80565 }, { "epoch": 6.572314218125459, "grad_norm": 8.741801261901855, "learning_rate": 1.5859942931403072e-05, "loss": 0.3767, "num_input_tokens_seen": 76939872, "step": 80570 }, { "epoch": 6.572722081735868, "grad_norm": 1.1068257093429565, "learning_rate": 1.5856630147486995e-05, "loss": 0.2766, "num_input_tokens_seen": 76944608, "step": 80575 }, { "epoch": 6.573129945346277, "grad_norm": 0.6142252087593079, "learning_rate": 1.5853317548903143e-05, "loss": 0.3502, "num_input_tokens_seen": 76949472, "step": 80580 }, { "epoch": 6.573537808956685, "grad_norm": 1.3372923135757446, "learning_rate": 1.5850005135718655e-05, "loss": 0.3319, "num_input_tokens_seen": 76954432, "step": 80585 }, { "epoch": 6.573945672567094, "grad_norm": 0.8380147814750671, "learning_rate": 1.5846692908000664e-05, "loss": 0.3924, "num_input_tokens_seen": 76959360, "step": 80590 }, { "epoch": 6.574353536177503, "grad_norm": 1.2030445337295532, "learning_rate": 1.5843380865816333e-05, "loss": 0.2878, "num_input_tokens_seen": 76964992, "step": 80595 }, { "epoch": 6.574761399787911, "grad_norm": 0.2684621512889862, "learning_rate": 1.5840069009232774e-05, "loss": 0.3464, "num_input_tokens_seen": 76970032, "step": 80600 }, { "epoch": 6.57516926339832, "grad_norm": 0.23484158515930176, "learning_rate": 1.583675733831713e-05, "loss": 0.3445, "num_input_tokens_seen": 76974640, "step": 80605 }, { "epoch": 6.575577127008728, "grad_norm": 0.3105012774467468, "learning_rate": 1.5833445853136514e-05, "loss": 0.3444, "num_input_tokens_seen": 76979520, "step": 80610 }, { "epoch": 6.575984990619137, "grad_norm": 18.25493812561035, "learning_rate": 1.5830134553758058e-05, "loss": 0.3407, "num_input_tokens_seen": 76983808, "step": 80615 }, { "epoch": 6.576392854229546, "grad_norm": 1.341051459312439, "learning_rate": 1.582682344024888e-05, "loss": 0.2902, "num_input_tokens_seen": 76988912, "step": 80620 }, { "epoch": 6.576800717839954, "grad_norm": 0.39579880237579346, "learning_rate": 1.582351251267609e-05, "loss": 0.3003, "num_input_tokens_seen": 76993520, "step": 80625 }, { "epoch": 6.577208581450363, "grad_norm": 1.878667950630188, "learning_rate": 1.582020177110681e-05, "loss": 0.3486, "num_input_tokens_seen": 76998064, "step": 80630 }, { "epoch": 6.577616445060771, "grad_norm": 4.5812177658081055, "learning_rate": 1.581689121560812e-05, "loss": 0.3432, "num_input_tokens_seen": 77002176, "step": 80635 }, { "epoch": 6.57802430867118, "grad_norm": 12.526558876037598, "learning_rate": 1.581358084624716e-05, "loss": 0.3474, "num_input_tokens_seen": 77007568, "step": 80640 }, { "epoch": 6.578432172281589, "grad_norm": 2.407893419265747, "learning_rate": 1.581027066309101e-05, "loss": 0.3474, "num_input_tokens_seen": 77012544, "step": 80645 }, { "epoch": 6.578840035891997, "grad_norm": 0.5293928980827332, "learning_rate": 1.5806960666206768e-05, "loss": 0.3577, "num_input_tokens_seen": 77017136, "step": 80650 }, { "epoch": 6.579247899502406, "grad_norm": 0.4904971420764923, "learning_rate": 1.580365085566152e-05, "loss": 0.3276, "num_input_tokens_seen": 77022160, "step": 80655 }, { "epoch": 6.579655763112815, "grad_norm": 8.562478065490723, "learning_rate": 1.5800341231522366e-05, "loss": 0.2735, "num_input_tokens_seen": 77026704, "step": 80660 }, { "epoch": 6.5800636267232235, "grad_norm": 26.645692825317383, "learning_rate": 1.579703179385639e-05, "loss": 0.3576, "num_input_tokens_seen": 77031632, "step": 80665 }, { "epoch": 6.5804714903336325, "grad_norm": 0.345598042011261, "learning_rate": 1.579372254273067e-05, "loss": 0.2623, "num_input_tokens_seen": 77035760, "step": 80670 }, { "epoch": 6.5808793539440416, "grad_norm": 0.6786817312240601, "learning_rate": 1.579041347821228e-05, "loss": 0.3296, "num_input_tokens_seen": 77040624, "step": 80675 }, { "epoch": 6.58128721755445, "grad_norm": 0.9366660714149475, "learning_rate": 1.5787104600368287e-05, "loss": 0.2463, "num_input_tokens_seen": 77045616, "step": 80680 }, { "epoch": 6.581695081164859, "grad_norm": 0.7410618662834167, "learning_rate": 1.5783795909265782e-05, "loss": 0.2975, "num_input_tokens_seen": 77049408, "step": 80685 }, { "epoch": 6.582102944775267, "grad_norm": 0.2858095169067383, "learning_rate": 1.5780487404971812e-05, "loss": 0.4355, "num_input_tokens_seen": 77054016, "step": 80690 }, { "epoch": 6.582510808385676, "grad_norm": 0.41157427430152893, "learning_rate": 1.577717908755344e-05, "loss": 0.2632, "num_input_tokens_seen": 77058160, "step": 80695 }, { "epoch": 6.582918671996085, "grad_norm": 0.4222677946090698, "learning_rate": 1.577387095707773e-05, "loss": 0.317, "num_input_tokens_seen": 77062816, "step": 80700 }, { "epoch": 6.583326535606493, "grad_norm": 3.1998684406280518, "learning_rate": 1.5770563013611738e-05, "loss": 0.2937, "num_input_tokens_seen": 77068000, "step": 80705 }, { "epoch": 6.583734399216902, "grad_norm": 1.151865839958191, "learning_rate": 1.5767255257222514e-05, "loss": 0.3521, "num_input_tokens_seen": 77072672, "step": 80710 }, { "epoch": 6.58414226282731, "grad_norm": 16.330888748168945, "learning_rate": 1.5763947687977106e-05, "loss": 0.578, "num_input_tokens_seen": 77077664, "step": 80715 }, { "epoch": 6.584550126437719, "grad_norm": 5.884249687194824, "learning_rate": 1.5760640305942546e-05, "loss": 0.2985, "num_input_tokens_seen": 77082272, "step": 80720 }, { "epoch": 6.584957990048128, "grad_norm": 20.738283157348633, "learning_rate": 1.5757333111185877e-05, "loss": 0.3512, "num_input_tokens_seen": 77086544, "step": 80725 }, { "epoch": 6.585365853658536, "grad_norm": 0.36542120575904846, "learning_rate": 1.5754026103774146e-05, "loss": 0.23, "num_input_tokens_seen": 77091648, "step": 80730 }, { "epoch": 6.585773717268945, "grad_norm": 6.209537029266357, "learning_rate": 1.5750719283774377e-05, "loss": 0.3778, "num_input_tokens_seen": 77096224, "step": 80735 }, { "epoch": 6.586181580879354, "grad_norm": 0.4147365987300873, "learning_rate": 1.57474126512536e-05, "loss": 0.2956, "num_input_tokens_seen": 77100800, "step": 80740 }, { "epoch": 6.586589444489762, "grad_norm": 0.34619057178497314, "learning_rate": 1.5744106206278832e-05, "loss": 0.2969, "num_input_tokens_seen": 77105760, "step": 80745 }, { "epoch": 6.586997308100171, "grad_norm": 1.4779698848724365, "learning_rate": 1.57407999489171e-05, "loss": 0.3439, "num_input_tokens_seen": 77110192, "step": 80750 }, { "epoch": 6.58740517171058, "grad_norm": 12.995516777038574, "learning_rate": 1.5737493879235416e-05, "loss": 0.3687, "num_input_tokens_seen": 77114864, "step": 80755 }, { "epoch": 6.5878130353209885, "grad_norm": 0.3405574858188629, "learning_rate": 1.5734187997300797e-05, "loss": 0.3457, "num_input_tokens_seen": 77118704, "step": 80760 }, { "epoch": 6.5882208989313975, "grad_norm": 0.5422849655151367, "learning_rate": 1.573088230318025e-05, "loss": 0.3195, "num_input_tokens_seen": 77122544, "step": 80765 }, { "epoch": 6.588628762541806, "grad_norm": 5.769426345825195, "learning_rate": 1.5727576796940783e-05, "loss": 0.3164, "num_input_tokens_seen": 77127056, "step": 80770 }, { "epoch": 6.589036626152215, "grad_norm": 26.154293060302734, "learning_rate": 1.5724271478649393e-05, "loss": 0.4772, "num_input_tokens_seen": 77131792, "step": 80775 }, { "epoch": 6.589444489762624, "grad_norm": 1.2287887334823608, "learning_rate": 1.572096634837308e-05, "loss": 0.3597, "num_input_tokens_seen": 77137392, "step": 80780 }, { "epoch": 6.589852353373032, "grad_norm": 76.2239761352539, "learning_rate": 1.5717661406178834e-05, "loss": 0.3575, "num_input_tokens_seen": 77141920, "step": 80785 }, { "epoch": 6.590260216983441, "grad_norm": 15.774341583251953, "learning_rate": 1.5714356652133637e-05, "loss": 0.2358, "num_input_tokens_seen": 77146976, "step": 80790 }, { "epoch": 6.59066808059385, "grad_norm": 1.0573936700820923, "learning_rate": 1.5711052086304497e-05, "loss": 0.2802, "num_input_tokens_seen": 77152256, "step": 80795 }, { "epoch": 6.591075944204258, "grad_norm": 2.5523641109466553, "learning_rate": 1.5707747708758387e-05, "loss": 0.2822, "num_input_tokens_seen": 77156720, "step": 80800 }, { "epoch": 6.591483807814667, "grad_norm": 4.837218284606934, "learning_rate": 1.5704443519562277e-05, "loss": 0.3155, "num_input_tokens_seen": 77161968, "step": 80805 }, { "epoch": 6.591891671425076, "grad_norm": 9.683487892150879, "learning_rate": 1.570113951878314e-05, "loss": 0.3394, "num_input_tokens_seen": 77167056, "step": 80810 }, { "epoch": 6.592299535035484, "grad_norm": 3.61567759513855, "learning_rate": 1.5697835706487962e-05, "loss": 0.339, "num_input_tokens_seen": 77171904, "step": 80815 }, { "epoch": 6.592707398645893, "grad_norm": 4.016345024108887, "learning_rate": 1.5694532082743702e-05, "loss": 0.2608, "num_input_tokens_seen": 77176784, "step": 80820 }, { "epoch": 6.593115262256301, "grad_norm": 8.559175491333008, "learning_rate": 1.5691228647617322e-05, "loss": 0.3586, "num_input_tokens_seen": 77181648, "step": 80825 }, { "epoch": 6.59352312586671, "grad_norm": 4.365661144256592, "learning_rate": 1.568792540117578e-05, "loss": 0.2247, "num_input_tokens_seen": 77186496, "step": 80830 }, { "epoch": 6.593930989477119, "grad_norm": 1.6395900249481201, "learning_rate": 1.5684622343486032e-05, "loss": 0.3973, "num_input_tokens_seen": 77190336, "step": 80835 }, { "epoch": 6.594338853087527, "grad_norm": 4.919414520263672, "learning_rate": 1.5681319474615038e-05, "loss": 0.3372, "num_input_tokens_seen": 77195072, "step": 80840 }, { "epoch": 6.594746716697936, "grad_norm": 1.9090133905410767, "learning_rate": 1.567801679462973e-05, "loss": 0.3194, "num_input_tokens_seen": 77200304, "step": 80845 }, { "epoch": 6.5951545803083444, "grad_norm": 0.4017036557197571, "learning_rate": 1.5674714303597065e-05, "loss": 0.2655, "num_input_tokens_seen": 77205200, "step": 80850 }, { "epoch": 6.5955624439187535, "grad_norm": 0.4134777784347534, "learning_rate": 1.5671412001583972e-05, "loss": 0.2873, "num_input_tokens_seen": 77209872, "step": 80855 }, { "epoch": 6.5959703075291625, "grad_norm": 0.5917320251464844, "learning_rate": 1.56681098886574e-05, "loss": 0.339, "num_input_tokens_seen": 77214272, "step": 80860 }, { "epoch": 6.596378171139571, "grad_norm": 2.2691049575805664, "learning_rate": 1.5664807964884275e-05, "loss": 0.3549, "num_input_tokens_seen": 77219424, "step": 80865 }, { "epoch": 6.59678603474998, "grad_norm": 0.9688062071800232, "learning_rate": 1.5661506230331527e-05, "loss": 0.3169, "num_input_tokens_seen": 77223312, "step": 80870 }, { "epoch": 6.597193898360389, "grad_norm": 0.43251925706863403, "learning_rate": 1.565820468506608e-05, "loss": 0.3175, "num_input_tokens_seen": 77228928, "step": 80875 }, { "epoch": 6.597601761970797, "grad_norm": 1.386824131011963, "learning_rate": 1.565490332915484e-05, "loss": 0.3529, "num_input_tokens_seen": 77233728, "step": 80880 }, { "epoch": 6.598009625581206, "grad_norm": 8.042566299438477, "learning_rate": 1.5651602162664753e-05, "loss": 0.2749, "num_input_tokens_seen": 77237904, "step": 80885 }, { "epoch": 6.598417489191615, "grad_norm": 4.517767906188965, "learning_rate": 1.5648301185662717e-05, "loss": 0.3715, "num_input_tokens_seen": 77242544, "step": 80890 }, { "epoch": 6.598825352802023, "grad_norm": 2.2734055519104004, "learning_rate": 1.5645000398215644e-05, "loss": 0.3083, "num_input_tokens_seen": 77247072, "step": 80895 }, { "epoch": 6.599233216412432, "grad_norm": 2.162623167037964, "learning_rate": 1.564169980039043e-05, "loss": 0.3807, "num_input_tokens_seen": 77251712, "step": 80900 }, { "epoch": 6.59964108002284, "grad_norm": 0.5072959661483765, "learning_rate": 1.563839939225399e-05, "loss": 0.3487, "num_input_tokens_seen": 77256960, "step": 80905 }, { "epoch": 6.600048943633249, "grad_norm": 3.591525077819824, "learning_rate": 1.563509917387322e-05, "loss": 0.2209, "num_input_tokens_seen": 77261696, "step": 80910 }, { "epoch": 6.600456807243658, "grad_norm": 30.772977828979492, "learning_rate": 1.5631799145315003e-05, "loss": 0.4038, "num_input_tokens_seen": 77267120, "step": 80915 }, { "epoch": 6.600864670854066, "grad_norm": 2.6829018592834473, "learning_rate": 1.5628499306646234e-05, "loss": 0.1913, "num_input_tokens_seen": 77271136, "step": 80920 }, { "epoch": 6.601272534464475, "grad_norm": 4.4439568519592285, "learning_rate": 1.5625199657933805e-05, "loss": 0.5024, "num_input_tokens_seen": 77275104, "step": 80925 }, { "epoch": 6.601680398074883, "grad_norm": 1.2391703128814697, "learning_rate": 1.56219001992446e-05, "loss": 0.2872, "num_input_tokens_seen": 77278624, "step": 80930 }, { "epoch": 6.602088261685292, "grad_norm": 6.027096748352051, "learning_rate": 1.561860093064549e-05, "loss": 0.2923, "num_input_tokens_seen": 77282816, "step": 80935 }, { "epoch": 6.602496125295701, "grad_norm": 0.35451576113700867, "learning_rate": 1.5615301852203357e-05, "loss": 0.2139, "num_input_tokens_seen": 77287920, "step": 80940 }, { "epoch": 6.602903988906109, "grad_norm": 3.398693084716797, "learning_rate": 1.5612002963985054e-05, "loss": 0.3572, "num_input_tokens_seen": 77293552, "step": 80945 }, { "epoch": 6.603311852516518, "grad_norm": 11.652917861938477, "learning_rate": 1.5608704266057473e-05, "loss": 0.5719, "num_input_tokens_seen": 77298560, "step": 80950 }, { "epoch": 6.603719716126927, "grad_norm": 0.5544677376747131, "learning_rate": 1.5605405758487465e-05, "loss": 0.3458, "num_input_tokens_seen": 77303824, "step": 80955 }, { "epoch": 6.6041275797373356, "grad_norm": 3.3035669326782227, "learning_rate": 1.560210744134189e-05, "loss": 0.3545, "num_input_tokens_seen": 77308080, "step": 80960 }, { "epoch": 6.604535443347745, "grad_norm": 1.9976252317428589, "learning_rate": 1.5598809314687602e-05, "loss": 0.2849, "num_input_tokens_seen": 77312592, "step": 80965 }, { "epoch": 6.604943306958154, "grad_norm": 1.2488703727722168, "learning_rate": 1.5595511378591456e-05, "loss": 0.363, "num_input_tokens_seen": 77318144, "step": 80970 }, { "epoch": 6.605351170568562, "grad_norm": 2.5152790546417236, "learning_rate": 1.55922136331203e-05, "loss": 0.3837, "num_input_tokens_seen": 77321920, "step": 80975 }, { "epoch": 6.605759034178971, "grad_norm": 5.219788074493408, "learning_rate": 1.5588916078340976e-05, "loss": 0.357, "num_input_tokens_seen": 77327072, "step": 80980 }, { "epoch": 6.606166897789379, "grad_norm": 3.360405445098877, "learning_rate": 1.558561871432032e-05, "loss": 0.3298, "num_input_tokens_seen": 77331520, "step": 80985 }, { "epoch": 6.606574761399788, "grad_norm": 0.8116345405578613, "learning_rate": 1.5582321541125168e-05, "loss": 0.3417, "num_input_tokens_seen": 77335872, "step": 80990 }, { "epoch": 6.606982625010197, "grad_norm": 2.8482956886291504, "learning_rate": 1.5579024558822364e-05, "loss": 0.3735, "num_input_tokens_seen": 77340464, "step": 80995 }, { "epoch": 6.607390488620605, "grad_norm": 4.770092010498047, "learning_rate": 1.557572776747873e-05, "loss": 0.2774, "num_input_tokens_seen": 77344880, "step": 81000 }, { "epoch": 6.607798352231014, "grad_norm": 4.1872663497924805, "learning_rate": 1.557243116716109e-05, "loss": 0.2734, "num_input_tokens_seen": 77349600, "step": 81005 }, { "epoch": 6.608206215841423, "grad_norm": 11.190587997436523, "learning_rate": 1.5569134757936248e-05, "loss": 0.397, "num_input_tokens_seen": 77354352, "step": 81010 }, { "epoch": 6.608614079451831, "grad_norm": 1.893981695175171, "learning_rate": 1.556583853987105e-05, "loss": 0.4668, "num_input_tokens_seen": 77358464, "step": 81015 }, { "epoch": 6.60902194306224, "grad_norm": 1.475806474685669, "learning_rate": 1.55625425130323e-05, "loss": 0.3956, "num_input_tokens_seen": 77363312, "step": 81020 }, { "epoch": 6.609429806672649, "grad_norm": 0.9593504071235657, "learning_rate": 1.55592466774868e-05, "loss": 0.3102, "num_input_tokens_seen": 77367696, "step": 81025 }, { "epoch": 6.609837670283057, "grad_norm": 0.7697240710258484, "learning_rate": 1.5555951033301362e-05, "loss": 0.1761, "num_input_tokens_seen": 77372048, "step": 81030 }, { "epoch": 6.610245533893466, "grad_norm": 0.24496906995773315, "learning_rate": 1.5552655580542773e-05, "loss": 0.4129, "num_input_tokens_seen": 77377360, "step": 81035 }, { "epoch": 6.610653397503874, "grad_norm": 61.786376953125, "learning_rate": 1.554936031927785e-05, "loss": 0.3401, "num_input_tokens_seen": 77382448, "step": 81040 }, { "epoch": 6.611061261114283, "grad_norm": 5.456145286560059, "learning_rate": 1.5546065249573377e-05, "loss": 0.2551, "num_input_tokens_seen": 77387296, "step": 81045 }, { "epoch": 6.611469124724692, "grad_norm": 42.56444549560547, "learning_rate": 1.5542770371496143e-05, "loss": 0.2927, "num_input_tokens_seen": 77392144, "step": 81050 }, { "epoch": 6.6118769883351005, "grad_norm": 0.4159247875213623, "learning_rate": 1.5539475685112924e-05, "loss": 0.4096, "num_input_tokens_seen": 77396352, "step": 81055 }, { "epoch": 6.6122848519455095, "grad_norm": 1.378330111503601, "learning_rate": 1.5536181190490527e-05, "loss": 0.2375, "num_input_tokens_seen": 77401056, "step": 81060 }, { "epoch": 6.612692715555918, "grad_norm": 18.597869873046875, "learning_rate": 1.5532886887695713e-05, "loss": 0.3507, "num_input_tokens_seen": 77406528, "step": 81065 }, { "epoch": 6.613100579166327, "grad_norm": 15.78926944732666, "learning_rate": 1.552959277679526e-05, "loss": 0.3624, "num_input_tokens_seen": 77411776, "step": 81070 }, { "epoch": 6.613508442776736, "grad_norm": 0.7477591633796692, "learning_rate": 1.5526298857855933e-05, "loss": 0.4368, "num_input_tokens_seen": 77417360, "step": 81075 }, { "epoch": 6.613916306387144, "grad_norm": 2.511986255645752, "learning_rate": 1.5523005130944494e-05, "loss": 0.4413, "num_input_tokens_seen": 77422800, "step": 81080 }, { "epoch": 6.614324169997553, "grad_norm": 4.689981460571289, "learning_rate": 1.5519711596127727e-05, "loss": 0.4016, "num_input_tokens_seen": 77426832, "step": 81085 }, { "epoch": 6.614732033607962, "grad_norm": 0.707593560218811, "learning_rate": 1.551641825347238e-05, "loss": 0.5415, "num_input_tokens_seen": 77430672, "step": 81090 }, { "epoch": 6.61513989721837, "grad_norm": 0.776092529296875, "learning_rate": 1.55131251030452e-05, "loss": 0.4461, "num_input_tokens_seen": 77435888, "step": 81095 }, { "epoch": 6.615547760828779, "grad_norm": 9.795284271240234, "learning_rate": 1.550983214491294e-05, "loss": 0.3299, "num_input_tokens_seen": 77439520, "step": 81100 }, { "epoch": 6.615955624439188, "grad_norm": 0.12596963346004486, "learning_rate": 1.5506539379142354e-05, "loss": 0.2682, "num_input_tokens_seen": 77443984, "step": 81105 }, { "epoch": 6.616363488049596, "grad_norm": 0.48876839876174927, "learning_rate": 1.550324680580018e-05, "loss": 0.2825, "num_input_tokens_seen": 77449024, "step": 81110 }, { "epoch": 6.616771351660005, "grad_norm": 1.928813099861145, "learning_rate": 1.5499954424953158e-05, "loss": 0.6908, "num_input_tokens_seen": 77454144, "step": 81115 }, { "epoch": 6.617179215270413, "grad_norm": 2.895585536956787, "learning_rate": 1.5496662236668015e-05, "loss": 0.4089, "num_input_tokens_seen": 77459312, "step": 81120 }, { "epoch": 6.617587078880822, "grad_norm": 54.474037170410156, "learning_rate": 1.54933702410115e-05, "loss": 0.2876, "num_input_tokens_seen": 77464032, "step": 81125 }, { "epoch": 6.617994942491231, "grad_norm": 2.055675506591797, "learning_rate": 1.5490078438050325e-05, "loss": 0.2723, "num_input_tokens_seen": 77469056, "step": 81130 }, { "epoch": 6.618402806101639, "grad_norm": 0.7747960686683655, "learning_rate": 1.5486786827851226e-05, "loss": 0.2868, "num_input_tokens_seen": 77473200, "step": 81135 }, { "epoch": 6.618810669712048, "grad_norm": 0.8982481956481934, "learning_rate": 1.5483495410480912e-05, "loss": 0.7232, "num_input_tokens_seen": 77478448, "step": 81140 }, { "epoch": 6.619218533322457, "grad_norm": 1.774079442024231, "learning_rate": 1.5480204186006096e-05, "loss": 0.326, "num_input_tokens_seen": 77483920, "step": 81145 }, { "epoch": 6.6196263969328655, "grad_norm": 7.680391311645508, "learning_rate": 1.5476913154493503e-05, "loss": 0.343, "num_input_tokens_seen": 77488400, "step": 81150 }, { "epoch": 6.6200342605432745, "grad_norm": 0.471023827791214, "learning_rate": 1.5473622316009833e-05, "loss": 0.242, "num_input_tokens_seen": 77492960, "step": 81155 }, { "epoch": 6.6204421241536835, "grad_norm": 0.9993252754211426, "learning_rate": 1.547033167062179e-05, "loss": 0.3305, "num_input_tokens_seen": 77498000, "step": 81160 }, { "epoch": 6.620849987764092, "grad_norm": 2.8673439025878906, "learning_rate": 1.5467041218396073e-05, "loss": 0.5854, "num_input_tokens_seen": 77503168, "step": 81165 }, { "epoch": 6.621257851374501, "grad_norm": 1.3500902652740479, "learning_rate": 1.5463750959399385e-05, "loss": 0.2926, "num_input_tokens_seen": 77507712, "step": 81170 }, { "epoch": 6.621665714984909, "grad_norm": 1.0560754537582397, "learning_rate": 1.5460460893698407e-05, "loss": 0.2292, "num_input_tokens_seen": 77512752, "step": 81175 }, { "epoch": 6.622073578595318, "grad_norm": 14.398918151855469, "learning_rate": 1.545717102135984e-05, "loss": 0.3838, "num_input_tokens_seen": 77516784, "step": 81180 }, { "epoch": 6.622481442205727, "grad_norm": 1.0565619468688965, "learning_rate": 1.545388134245036e-05, "loss": 0.3468, "num_input_tokens_seen": 77521184, "step": 81185 }, { "epoch": 6.622889305816135, "grad_norm": 0.5446974635124207, "learning_rate": 1.5450591857036633e-05, "loss": 0.2608, "num_input_tokens_seen": 77526096, "step": 81190 }, { "epoch": 6.623297169426544, "grad_norm": 0.4649943709373474, "learning_rate": 1.544730256518537e-05, "loss": 0.3004, "num_input_tokens_seen": 77531136, "step": 81195 }, { "epoch": 6.623705033036952, "grad_norm": 0.8694003224372864, "learning_rate": 1.5444013466963218e-05, "loss": 0.3031, "num_input_tokens_seen": 77535600, "step": 81200 }, { "epoch": 6.624112896647361, "grad_norm": 7.581872463226318, "learning_rate": 1.5440724562436855e-05, "loss": 0.3639, "num_input_tokens_seen": 77539824, "step": 81205 }, { "epoch": 6.62452076025777, "grad_norm": 0.8577446937561035, "learning_rate": 1.543743585167293e-05, "loss": 0.5135, "num_input_tokens_seen": 77545056, "step": 81210 }, { "epoch": 6.624928623868178, "grad_norm": 2.734957218170166, "learning_rate": 1.5434147334738134e-05, "loss": 0.2799, "num_input_tokens_seen": 77549744, "step": 81215 }, { "epoch": 6.625336487478587, "grad_norm": 2.1753201484680176, "learning_rate": 1.54308590116991e-05, "loss": 0.3784, "num_input_tokens_seen": 77555456, "step": 81220 }, { "epoch": 6.625744351088996, "grad_norm": 0.6216212511062622, "learning_rate": 1.5427570882622492e-05, "loss": 0.3871, "num_input_tokens_seen": 77559712, "step": 81225 }, { "epoch": 6.626152214699404, "grad_norm": 1.1318310499191284, "learning_rate": 1.5424282947574953e-05, "loss": 0.3354, "num_input_tokens_seen": 77564896, "step": 81230 }, { "epoch": 6.626560078309813, "grad_norm": 0.6911316514015198, "learning_rate": 1.5420995206623123e-05, "loss": 0.3133, "num_input_tokens_seen": 77570672, "step": 81235 }, { "epoch": 6.626967941920222, "grad_norm": 0.9591699838638306, "learning_rate": 1.5417707659833657e-05, "loss": 0.3846, "num_input_tokens_seen": 77575728, "step": 81240 }, { "epoch": 6.62737580553063, "grad_norm": 29.054601669311523, "learning_rate": 1.5414420307273186e-05, "loss": 0.463, "num_input_tokens_seen": 77580256, "step": 81245 }, { "epoch": 6.6277836691410394, "grad_norm": 0.4941639304161072, "learning_rate": 1.5411133149008338e-05, "loss": 0.3156, "num_input_tokens_seen": 77584656, "step": 81250 }, { "epoch": 6.628191532751448, "grad_norm": 2.8257100582122803, "learning_rate": 1.540784618510574e-05, "loss": 0.2464, "num_input_tokens_seen": 77589328, "step": 81255 }, { "epoch": 6.628599396361857, "grad_norm": 0.5600850582122803, "learning_rate": 1.5404559415632033e-05, "loss": 0.2748, "num_input_tokens_seen": 77593936, "step": 81260 }, { "epoch": 6.629007259972266, "grad_norm": 1.1330622434616089, "learning_rate": 1.540127284065383e-05, "loss": 0.3006, "num_input_tokens_seen": 77599456, "step": 81265 }, { "epoch": 6.629415123582674, "grad_norm": 1.3351796865463257, "learning_rate": 1.5397986460237746e-05, "loss": 0.3311, "num_input_tokens_seen": 77604384, "step": 81270 }, { "epoch": 6.629822987193083, "grad_norm": 9.009532928466797, "learning_rate": 1.5394700274450398e-05, "loss": 0.5398, "num_input_tokens_seen": 77608512, "step": 81275 }, { "epoch": 6.630230850803491, "grad_norm": 9.464460372924805, "learning_rate": 1.5391414283358384e-05, "loss": 0.3531, "num_input_tokens_seen": 77613136, "step": 81280 }, { "epoch": 6.6306387144139, "grad_norm": 2.3076717853546143, "learning_rate": 1.5388128487028328e-05, "loss": 0.3074, "num_input_tokens_seen": 77617072, "step": 81285 }, { "epoch": 6.631046578024309, "grad_norm": 0.5858039855957031, "learning_rate": 1.5384842885526824e-05, "loss": 0.3929, "num_input_tokens_seen": 77621088, "step": 81290 }, { "epoch": 6.631454441634717, "grad_norm": 2.1387341022491455, "learning_rate": 1.538155747892047e-05, "loss": 0.2524, "num_input_tokens_seen": 77626256, "step": 81295 }, { "epoch": 6.631862305245126, "grad_norm": 19.382400512695312, "learning_rate": 1.537827226727585e-05, "loss": 0.3467, "num_input_tokens_seen": 77630880, "step": 81300 }, { "epoch": 6.632270168855535, "grad_norm": 1.915914535522461, "learning_rate": 1.5374987250659572e-05, "loss": 0.2537, "num_input_tokens_seen": 77635824, "step": 81305 }, { "epoch": 6.632678032465943, "grad_norm": 1.0684975385665894, "learning_rate": 1.5371702429138214e-05, "loss": 0.2483, "num_input_tokens_seen": 77640368, "step": 81310 }, { "epoch": 6.633085896076352, "grad_norm": 3.4297053813934326, "learning_rate": 1.5368417802778352e-05, "loss": 0.3014, "num_input_tokens_seen": 77646144, "step": 81315 }, { "epoch": 6.633493759686761, "grad_norm": 3.841245412826538, "learning_rate": 1.5365133371646558e-05, "loss": 0.4875, "num_input_tokens_seen": 77651376, "step": 81320 }, { "epoch": 6.633901623297169, "grad_norm": 0.5370787382125854, "learning_rate": 1.5361849135809427e-05, "loss": 0.3976, "num_input_tokens_seen": 77656752, "step": 81325 }, { "epoch": 6.634309486907578, "grad_norm": 1.3809162378311157, "learning_rate": 1.535856509533352e-05, "loss": 0.4095, "num_input_tokens_seen": 77661232, "step": 81330 }, { "epoch": 6.634717350517986, "grad_norm": 12.578381538391113, "learning_rate": 1.53552812502854e-05, "loss": 0.4183, "num_input_tokens_seen": 77666224, "step": 81335 }, { "epoch": 6.635125214128395, "grad_norm": 8.587124824523926, "learning_rate": 1.5351997600731636e-05, "loss": 0.295, "num_input_tokens_seen": 77670976, "step": 81340 }, { "epoch": 6.635533077738804, "grad_norm": 2.0381603240966797, "learning_rate": 1.5348714146738767e-05, "loss": 0.2638, "num_input_tokens_seen": 77676160, "step": 81345 }, { "epoch": 6.6359409413492125, "grad_norm": 4.4642510414123535, "learning_rate": 1.534543088837337e-05, "loss": 0.2872, "num_input_tokens_seen": 77682032, "step": 81350 }, { "epoch": 6.6363488049596215, "grad_norm": 3.5637435913085938, "learning_rate": 1.5342147825701986e-05, "loss": 0.3871, "num_input_tokens_seen": 77686560, "step": 81355 }, { "epoch": 6.6367566685700305, "grad_norm": 0.4974430501461029, "learning_rate": 1.5338864958791167e-05, "loss": 0.4155, "num_input_tokens_seen": 77691760, "step": 81360 }, { "epoch": 6.637164532180439, "grad_norm": 3.1809587478637695, "learning_rate": 1.533558228770744e-05, "loss": 0.3275, "num_input_tokens_seen": 77696000, "step": 81365 }, { "epoch": 6.637572395790848, "grad_norm": 4.599040985107422, "learning_rate": 1.533229981251736e-05, "loss": 0.3098, "num_input_tokens_seen": 77701072, "step": 81370 }, { "epoch": 6.637980259401257, "grad_norm": 0.919308602809906, "learning_rate": 1.5329017533287453e-05, "loss": 0.4482, "num_input_tokens_seen": 77706272, "step": 81375 }, { "epoch": 6.638388123011665, "grad_norm": 0.8932023048400879, "learning_rate": 1.5325735450084255e-05, "loss": 0.2322, "num_input_tokens_seen": 77711696, "step": 81380 }, { "epoch": 6.638795986622074, "grad_norm": 2.4227373600006104, "learning_rate": 1.5322453562974285e-05, "loss": 0.2879, "num_input_tokens_seen": 77716608, "step": 81385 }, { "epoch": 6.639203850232482, "grad_norm": 10.62867259979248, "learning_rate": 1.531917187202406e-05, "loss": 0.3355, "num_input_tokens_seen": 77721136, "step": 81390 }, { "epoch": 6.639611713842891, "grad_norm": 1.623202919960022, "learning_rate": 1.531589037730012e-05, "loss": 0.2595, "num_input_tokens_seen": 77725552, "step": 81395 }, { "epoch": 6.6400195774533, "grad_norm": 1.6305969953536987, "learning_rate": 1.531260907886896e-05, "loss": 0.2933, "num_input_tokens_seen": 77730640, "step": 81400 }, { "epoch": 6.640427441063708, "grad_norm": 1.1042039394378662, "learning_rate": 1.5309327976797107e-05, "loss": 0.3466, "num_input_tokens_seen": 77736032, "step": 81405 }, { "epoch": 6.640835304674117, "grad_norm": 1.888425350189209, "learning_rate": 1.5306047071151043e-05, "loss": 0.3328, "num_input_tokens_seen": 77741200, "step": 81410 }, { "epoch": 6.641243168284525, "grad_norm": 0.7978554368019104, "learning_rate": 1.5302766361997296e-05, "loss": 0.3155, "num_input_tokens_seen": 77745072, "step": 81415 }, { "epoch": 6.641651031894934, "grad_norm": 0.6796456575393677, "learning_rate": 1.529948584940235e-05, "loss": 0.3359, "num_input_tokens_seen": 77749520, "step": 81420 }, { "epoch": 6.642058895505343, "grad_norm": 48.95607376098633, "learning_rate": 1.529620553343271e-05, "loss": 0.4266, "num_input_tokens_seen": 77754256, "step": 81425 }, { "epoch": 6.642466759115751, "grad_norm": 4.1716437339782715, "learning_rate": 1.529292541415486e-05, "loss": 0.4401, "num_input_tokens_seen": 77760016, "step": 81430 }, { "epoch": 6.64287462272616, "grad_norm": 1.712448239326477, "learning_rate": 1.528964549163528e-05, "loss": 0.2885, "num_input_tokens_seen": 77764576, "step": 81435 }, { "epoch": 6.643282486336569, "grad_norm": 1.148852825164795, "learning_rate": 1.5286365765940467e-05, "loss": 0.4407, "num_input_tokens_seen": 77770720, "step": 81440 }, { "epoch": 6.6436903499469775, "grad_norm": 3.790942907333374, "learning_rate": 1.528308623713689e-05, "loss": 0.3362, "num_input_tokens_seen": 77774944, "step": 81445 }, { "epoch": 6.6440982135573865, "grad_norm": 0.40090593695640564, "learning_rate": 1.527980690529102e-05, "loss": 0.3759, "num_input_tokens_seen": 77780016, "step": 81450 }, { "epoch": 6.6445060771677955, "grad_norm": 1.867434024810791, "learning_rate": 1.5276527770469337e-05, "loss": 0.372, "num_input_tokens_seen": 77784912, "step": 81455 }, { "epoch": 6.644913940778204, "grad_norm": 0.7607150673866272, "learning_rate": 1.5273248832738307e-05, "loss": 0.3868, "num_input_tokens_seen": 77789424, "step": 81460 }, { "epoch": 6.645321804388613, "grad_norm": 0.44853684306144714, "learning_rate": 1.5269970092164394e-05, "loss": 0.331, "num_input_tokens_seen": 77794656, "step": 81465 }, { "epoch": 6.645729667999021, "grad_norm": 0.7213157415390015, "learning_rate": 1.5266691548814045e-05, "loss": 0.3071, "num_input_tokens_seen": 77799248, "step": 81470 }, { "epoch": 6.64613753160943, "grad_norm": 1.5592933893203735, "learning_rate": 1.5263413202753717e-05, "loss": 0.2689, "num_input_tokens_seen": 77804304, "step": 81475 }, { "epoch": 6.646545395219839, "grad_norm": 2.1821177005767822, "learning_rate": 1.526013505404988e-05, "loss": 0.3133, "num_input_tokens_seen": 77808576, "step": 81480 }, { "epoch": 6.646953258830247, "grad_norm": 1.806868553161621, "learning_rate": 1.5256857102768963e-05, "loss": 0.2561, "num_input_tokens_seen": 77813056, "step": 81485 }, { "epoch": 6.647361122440656, "grad_norm": 0.3870714008808136, "learning_rate": 1.525357934897741e-05, "loss": 0.3026, "num_input_tokens_seen": 77817216, "step": 81490 }, { "epoch": 6.647768986051064, "grad_norm": 0.32753658294677734, "learning_rate": 1.5250301792741664e-05, "loss": 0.3832, "num_input_tokens_seen": 77821472, "step": 81495 }, { "epoch": 6.648176849661473, "grad_norm": 2.838961124420166, "learning_rate": 1.5247024434128154e-05, "loss": 0.3695, "num_input_tokens_seen": 77825888, "step": 81500 }, { "epoch": 6.648584713271882, "grad_norm": 0.43661776185035706, "learning_rate": 1.5243747273203318e-05, "loss": 0.2838, "num_input_tokens_seen": 77830256, "step": 81505 }, { "epoch": 6.648992576882291, "grad_norm": 0.4827563762664795, "learning_rate": 1.5240470310033571e-05, "loss": 0.3085, "num_input_tokens_seen": 77834144, "step": 81510 }, { "epoch": 6.649400440492699, "grad_norm": 0.5575986504554749, "learning_rate": 1.5237193544685355e-05, "loss": 0.3351, "num_input_tokens_seen": 77839248, "step": 81515 }, { "epoch": 6.649808304103108, "grad_norm": 1.022252082824707, "learning_rate": 1.5233916977225068e-05, "loss": 0.3096, "num_input_tokens_seen": 77843328, "step": 81520 }, { "epoch": 6.650216167713516, "grad_norm": 4.823270320892334, "learning_rate": 1.5230640607719144e-05, "loss": 0.3474, "num_input_tokens_seen": 77848160, "step": 81525 }, { "epoch": 6.650624031323925, "grad_norm": 2.10498309135437, "learning_rate": 1.5227364436233976e-05, "loss": 0.3425, "num_input_tokens_seen": 77852224, "step": 81530 }, { "epoch": 6.651031894934334, "grad_norm": 2.289576292037964, "learning_rate": 1.5224088462835983e-05, "loss": 0.3036, "num_input_tokens_seen": 77857296, "step": 81535 }, { "epoch": 6.6514397585447425, "grad_norm": 0.42161163687705994, "learning_rate": 1.5220812687591563e-05, "loss": 0.4158, "num_input_tokens_seen": 77861744, "step": 81540 }, { "epoch": 6.6518476221551515, "grad_norm": 3.1363863945007324, "learning_rate": 1.5217537110567104e-05, "loss": 0.3093, "num_input_tokens_seen": 77865600, "step": 81545 }, { "epoch": 6.65225548576556, "grad_norm": 25.24979019165039, "learning_rate": 1.5214261731829022e-05, "loss": 0.339, "num_input_tokens_seen": 77870288, "step": 81550 }, { "epoch": 6.652663349375969, "grad_norm": 2.8738839626312256, "learning_rate": 1.5210986551443696e-05, "loss": 0.2957, "num_input_tokens_seen": 77875056, "step": 81555 }, { "epoch": 6.653071212986378, "grad_norm": 0.34836676716804504, "learning_rate": 1.5207711569477511e-05, "loss": 0.2892, "num_input_tokens_seen": 77880064, "step": 81560 }, { "epoch": 6.653479076596786, "grad_norm": 0.46552804112434387, "learning_rate": 1.5204436785996846e-05, "loss": 0.2946, "num_input_tokens_seen": 77885344, "step": 81565 }, { "epoch": 6.653886940207195, "grad_norm": 1.9952383041381836, "learning_rate": 1.5201162201068087e-05, "loss": 0.345, "num_input_tokens_seen": 77889408, "step": 81570 }, { "epoch": 6.654294803817604, "grad_norm": 65.46461486816406, "learning_rate": 1.519788781475761e-05, "loss": 0.5031, "num_input_tokens_seen": 77895072, "step": 81575 }, { "epoch": 6.654702667428012, "grad_norm": 2.7851736545562744, "learning_rate": 1.5194613627131782e-05, "loss": 0.3768, "num_input_tokens_seen": 77899952, "step": 81580 }, { "epoch": 6.655110531038421, "grad_norm": 17.954221725463867, "learning_rate": 1.5191339638256971e-05, "loss": 0.3513, "num_input_tokens_seen": 77904672, "step": 81585 }, { "epoch": 6.65551839464883, "grad_norm": 10.190902709960938, "learning_rate": 1.5188065848199534e-05, "loss": 0.5945, "num_input_tokens_seen": 77909952, "step": 81590 }, { "epoch": 6.655926258259238, "grad_norm": 0.2311316579580307, "learning_rate": 1.5184792257025831e-05, "loss": 0.4541, "num_input_tokens_seen": 77915040, "step": 81595 }, { "epoch": 6.656334121869647, "grad_norm": 0.8215169906616211, "learning_rate": 1.5181518864802224e-05, "loss": 0.2976, "num_input_tokens_seen": 77919744, "step": 81600 }, { "epoch": 6.656741985480055, "grad_norm": 5.734445095062256, "learning_rate": 1.517824567159506e-05, "loss": 0.362, "num_input_tokens_seen": 77924480, "step": 81605 }, { "epoch": 6.657149849090464, "grad_norm": 0.5150310397148132, "learning_rate": 1.5174972677470668e-05, "loss": 0.2786, "num_input_tokens_seen": 77929296, "step": 81610 }, { "epoch": 6.657557712700873, "grad_norm": 8.841146469116211, "learning_rate": 1.5171699882495416e-05, "loss": 0.3256, "num_input_tokens_seen": 77934288, "step": 81615 }, { "epoch": 6.657965576311281, "grad_norm": 0.3493029773235321, "learning_rate": 1.5168427286735629e-05, "loss": 0.3583, "num_input_tokens_seen": 77939312, "step": 81620 }, { "epoch": 6.65837343992169, "grad_norm": 1.9750155210494995, "learning_rate": 1.5165154890257644e-05, "loss": 0.2078, "num_input_tokens_seen": 77944128, "step": 81625 }, { "epoch": 6.658781303532098, "grad_norm": 1.9616903066635132, "learning_rate": 1.5161882693127793e-05, "loss": 0.3043, "num_input_tokens_seen": 77947152, "step": 81630 }, { "epoch": 6.659189167142507, "grad_norm": 0.7023640871047974, "learning_rate": 1.5158610695412388e-05, "loss": 0.2752, "num_input_tokens_seen": 77951264, "step": 81635 }, { "epoch": 6.659597030752916, "grad_norm": 1.5103026628494263, "learning_rate": 1.5155338897177773e-05, "loss": 0.3827, "num_input_tokens_seen": 77955440, "step": 81640 }, { "epoch": 6.6600048943633245, "grad_norm": 1.9906455278396606, "learning_rate": 1.5152067298490252e-05, "loss": 0.4751, "num_input_tokens_seen": 77960032, "step": 81645 }, { "epoch": 6.6604127579737336, "grad_norm": 1.7037264108657837, "learning_rate": 1.5148795899416146e-05, "loss": 0.2981, "num_input_tokens_seen": 77964496, "step": 81650 }, { "epoch": 6.660820621584143, "grad_norm": 0.7687446475028992, "learning_rate": 1.5145524700021755e-05, "loss": 0.2999, "num_input_tokens_seen": 77969872, "step": 81655 }, { "epoch": 6.661228485194551, "grad_norm": 10.748919486999512, "learning_rate": 1.5142253700373392e-05, "loss": 0.3284, "num_input_tokens_seen": 77974960, "step": 81660 }, { "epoch": 6.66163634880496, "grad_norm": 1.6038211584091187, "learning_rate": 1.5138982900537363e-05, "loss": 0.297, "num_input_tokens_seen": 77980080, "step": 81665 }, { "epoch": 6.662044212415369, "grad_norm": 0.8372916579246521, "learning_rate": 1.513571230057996e-05, "loss": 0.3829, "num_input_tokens_seen": 77984704, "step": 81670 }, { "epoch": 6.662452076025777, "grad_norm": 0.564260721206665, "learning_rate": 1.5132441900567466e-05, "loss": 0.4005, "num_input_tokens_seen": 77989872, "step": 81675 }, { "epoch": 6.662859939636186, "grad_norm": 13.18408203125, "learning_rate": 1.5129171700566192e-05, "loss": 0.5645, "num_input_tokens_seen": 77994416, "step": 81680 }, { "epoch": 6.663267803246594, "grad_norm": 0.8976349234580994, "learning_rate": 1.5125901700642411e-05, "loss": 0.2658, "num_input_tokens_seen": 77999616, "step": 81685 }, { "epoch": 6.663675666857003, "grad_norm": 9.801918029785156, "learning_rate": 1.512263190086241e-05, "loss": 0.3134, "num_input_tokens_seen": 78004240, "step": 81690 }, { "epoch": 6.664083530467412, "grad_norm": 1.1080641746520996, "learning_rate": 1.5119362301292461e-05, "loss": 0.3535, "num_input_tokens_seen": 78009696, "step": 81695 }, { "epoch": 6.66449139407782, "grad_norm": 2.0509402751922607, "learning_rate": 1.511609290199883e-05, "loss": 0.3748, "num_input_tokens_seen": 78015024, "step": 81700 }, { "epoch": 6.664899257688229, "grad_norm": 0.6898058652877808, "learning_rate": 1.5112823703047807e-05, "loss": 0.3989, "num_input_tokens_seen": 78019712, "step": 81705 }, { "epoch": 6.665307121298638, "grad_norm": 0.4004705250263214, "learning_rate": 1.5109554704505647e-05, "loss": 0.3466, "num_input_tokens_seen": 78024656, "step": 81710 }, { "epoch": 6.665714984909046, "grad_norm": 0.927245557308197, "learning_rate": 1.5106285906438606e-05, "loss": 0.3703, "num_input_tokens_seen": 78029360, "step": 81715 }, { "epoch": 6.666122848519455, "grad_norm": 4.624459743499756, "learning_rate": 1.5103017308912945e-05, "loss": 0.2566, "num_input_tokens_seen": 78033840, "step": 81720 }, { "epoch": 6.666530712129864, "grad_norm": 0.4131532609462738, "learning_rate": 1.5099748911994921e-05, "loss": 0.372, "num_input_tokens_seen": 78038064, "step": 81725 }, { "epoch": 6.666938575740272, "grad_norm": 0.5334758162498474, "learning_rate": 1.509648071575078e-05, "loss": 0.2325, "num_input_tokens_seen": 78043680, "step": 81730 }, { "epoch": 6.667346439350681, "grad_norm": 0.6421825885772705, "learning_rate": 1.5093212720246766e-05, "loss": 0.4152, "num_input_tokens_seen": 78048672, "step": 81735 }, { "epoch": 6.6677543029610895, "grad_norm": 0.32685884833335876, "learning_rate": 1.5089944925549122e-05, "loss": 0.2951, "num_input_tokens_seen": 78052944, "step": 81740 }, { "epoch": 6.6681621665714985, "grad_norm": 0.1738857626914978, "learning_rate": 1.5086677331724072e-05, "loss": 0.3124, "num_input_tokens_seen": 78057504, "step": 81745 }, { "epoch": 6.6685700301819075, "grad_norm": 0.22330142557621002, "learning_rate": 1.508340993883787e-05, "loss": 0.4301, "num_input_tokens_seen": 78063408, "step": 81750 }, { "epoch": 6.668977893792316, "grad_norm": 28.099634170532227, "learning_rate": 1.5080142746956734e-05, "loss": 0.3758, "num_input_tokens_seen": 78069040, "step": 81755 }, { "epoch": 6.669385757402725, "grad_norm": 0.4454864263534546, "learning_rate": 1.5076875756146891e-05, "loss": 0.2901, "num_input_tokens_seen": 78074544, "step": 81760 }, { "epoch": 6.669793621013133, "grad_norm": 0.9898146390914917, "learning_rate": 1.507360896647455e-05, "loss": 0.3631, "num_input_tokens_seen": 78079728, "step": 81765 }, { "epoch": 6.670201484623542, "grad_norm": 0.6648833155632019, "learning_rate": 1.5070342378005948e-05, "loss": 0.3117, "num_input_tokens_seen": 78084560, "step": 81770 }, { "epoch": 6.670609348233951, "grad_norm": 2.4016385078430176, "learning_rate": 1.5067075990807283e-05, "loss": 0.3413, "num_input_tokens_seen": 78089152, "step": 81775 }, { "epoch": 6.671017211844359, "grad_norm": 0.40733468532562256, "learning_rate": 1.506380980494477e-05, "loss": 0.2818, "num_input_tokens_seen": 78092896, "step": 81780 }, { "epoch": 6.671425075454768, "grad_norm": 5.205758571624756, "learning_rate": 1.5060543820484607e-05, "loss": 0.2539, "num_input_tokens_seen": 78098512, "step": 81785 }, { "epoch": 6.671832939065177, "grad_norm": 1.0784187316894531, "learning_rate": 1.5057278037492994e-05, "loss": 0.2755, "num_input_tokens_seen": 78102944, "step": 81790 }, { "epoch": 6.672240802675585, "grad_norm": 0.24538792669773102, "learning_rate": 1.5054012456036137e-05, "loss": 0.362, "num_input_tokens_seen": 78106992, "step": 81795 }, { "epoch": 6.672648666285994, "grad_norm": 0.3755115270614624, "learning_rate": 1.5050747076180222e-05, "loss": 0.3497, "num_input_tokens_seen": 78112176, "step": 81800 }, { "epoch": 6.673056529896403, "grad_norm": 0.6487245559692383, "learning_rate": 1.5047481897991433e-05, "loss": 0.343, "num_input_tokens_seen": 78116768, "step": 81805 }, { "epoch": 6.673464393506811, "grad_norm": 0.42130246758461, "learning_rate": 1.5044216921535947e-05, "loss": 0.3587, "num_input_tokens_seen": 78121808, "step": 81810 }, { "epoch": 6.67387225711722, "grad_norm": 0.5095774531364441, "learning_rate": 1.5040952146879966e-05, "loss": 0.3602, "num_input_tokens_seen": 78126528, "step": 81815 }, { "epoch": 6.674280120727628, "grad_norm": 1.1638473272323608, "learning_rate": 1.5037687574089651e-05, "loss": 0.3584, "num_input_tokens_seen": 78131920, "step": 81820 }, { "epoch": 6.674687984338037, "grad_norm": 1.5566790103912354, "learning_rate": 1.5034423203231177e-05, "loss": 0.2576, "num_input_tokens_seen": 78137360, "step": 81825 }, { "epoch": 6.675095847948446, "grad_norm": 11.007943153381348, "learning_rate": 1.5031159034370707e-05, "loss": 0.293, "num_input_tokens_seen": 78142480, "step": 81830 }, { "epoch": 6.6755037115588545, "grad_norm": 0.2611081600189209, "learning_rate": 1.50278950675744e-05, "loss": 0.4174, "num_input_tokens_seen": 78147424, "step": 81835 }, { "epoch": 6.6759115751692635, "grad_norm": 1.5419549942016602, "learning_rate": 1.5024631302908431e-05, "loss": 0.2636, "num_input_tokens_seen": 78152144, "step": 81840 }, { "epoch": 6.676319438779672, "grad_norm": 15.296568870544434, "learning_rate": 1.502136774043895e-05, "loss": 0.3895, "num_input_tokens_seen": 78157120, "step": 81845 }, { "epoch": 6.676727302390081, "grad_norm": 0.32989752292633057, "learning_rate": 1.5018104380232101e-05, "loss": 0.369, "num_input_tokens_seen": 78162464, "step": 81850 }, { "epoch": 6.67713516600049, "grad_norm": 0.8265833854675293, "learning_rate": 1.501484122235403e-05, "loss": 0.3151, "num_input_tokens_seen": 78167248, "step": 81855 }, { "epoch": 6.677543029610898, "grad_norm": 0.22988125681877136, "learning_rate": 1.5011578266870893e-05, "loss": 0.3498, "num_input_tokens_seen": 78172144, "step": 81860 }, { "epoch": 6.677950893221307, "grad_norm": 0.34289947152137756, "learning_rate": 1.500831551384882e-05, "loss": 0.3093, "num_input_tokens_seen": 78176464, "step": 81865 }, { "epoch": 6.678358756831716, "grad_norm": 2.027867078781128, "learning_rate": 1.500505296335394e-05, "loss": 0.3102, "num_input_tokens_seen": 78181408, "step": 81870 }, { "epoch": 6.678766620442124, "grad_norm": 0.303659588098526, "learning_rate": 1.5001790615452383e-05, "loss": 0.3597, "num_input_tokens_seen": 78187136, "step": 81875 }, { "epoch": 6.679174484052533, "grad_norm": 0.6208794116973877, "learning_rate": 1.4998528470210293e-05, "loss": 0.3819, "num_input_tokens_seen": 78191552, "step": 81880 }, { "epoch": 6.679582347662942, "grad_norm": 0.5428104996681213, "learning_rate": 1.499526652769378e-05, "loss": 0.334, "num_input_tokens_seen": 78196032, "step": 81885 }, { "epoch": 6.67999021127335, "grad_norm": 1.4167200326919556, "learning_rate": 1.4992004787968961e-05, "loss": 0.3292, "num_input_tokens_seen": 78200848, "step": 81890 }, { "epoch": 6.680398074883759, "grad_norm": 0.5752173066139221, "learning_rate": 1.4988743251101955e-05, "loss": 0.4361, "num_input_tokens_seen": 78204544, "step": 81895 }, { "epoch": 6.680805938494167, "grad_norm": 1.7194452285766602, "learning_rate": 1.4985481917158861e-05, "loss": 0.2631, "num_input_tokens_seen": 78208992, "step": 81900 }, { "epoch": 6.681213802104576, "grad_norm": 0.3927082121372223, "learning_rate": 1.4982220786205797e-05, "loss": 0.3367, "num_input_tokens_seen": 78213856, "step": 81905 }, { "epoch": 6.681621665714985, "grad_norm": 0.8749545216560364, "learning_rate": 1.4978959858308866e-05, "loss": 0.3866, "num_input_tokens_seen": 78218624, "step": 81910 }, { "epoch": 6.682029529325393, "grad_norm": 7.6907958984375, "learning_rate": 1.497569913353416e-05, "loss": 0.2983, "num_input_tokens_seen": 78223728, "step": 81915 }, { "epoch": 6.682437392935802, "grad_norm": 5.767385482788086, "learning_rate": 1.4972438611947765e-05, "loss": 0.3125, "num_input_tokens_seen": 78228544, "step": 81920 }, { "epoch": 6.682845256546211, "grad_norm": 22.07881736755371, "learning_rate": 1.4969178293615788e-05, "loss": 0.37, "num_input_tokens_seen": 78232976, "step": 81925 }, { "epoch": 6.683253120156619, "grad_norm": 0.7092031240463257, "learning_rate": 1.49659181786043e-05, "loss": 0.3164, "num_input_tokens_seen": 78238080, "step": 81930 }, { "epoch": 6.683660983767028, "grad_norm": 6.378195285797119, "learning_rate": 1.496265826697939e-05, "loss": 0.2774, "num_input_tokens_seen": 78243168, "step": 81935 }, { "epoch": 6.6840688473774375, "grad_norm": 0.5825335383415222, "learning_rate": 1.4959398558807131e-05, "loss": 0.2856, "num_input_tokens_seen": 78247408, "step": 81940 }, { "epoch": 6.684476710987846, "grad_norm": 0.9820077419281006, "learning_rate": 1.4956139054153583e-05, "loss": 0.3287, "num_input_tokens_seen": 78251472, "step": 81945 }, { "epoch": 6.684884574598255, "grad_norm": 58.364837646484375, "learning_rate": 1.4952879753084842e-05, "loss": 0.3226, "num_input_tokens_seen": 78256464, "step": 81950 }, { "epoch": 6.685292438208663, "grad_norm": 2.280059337615967, "learning_rate": 1.494962065566696e-05, "loss": 0.3621, "num_input_tokens_seen": 78261312, "step": 81955 }, { "epoch": 6.685700301819072, "grad_norm": 0.4449354410171509, "learning_rate": 1.4946361761965996e-05, "loss": 0.2892, "num_input_tokens_seen": 78266480, "step": 81960 }, { "epoch": 6.686108165429481, "grad_norm": 10.320014953613281, "learning_rate": 1.4943103072047995e-05, "loss": 0.3033, "num_input_tokens_seen": 78270880, "step": 81965 }, { "epoch": 6.686516029039889, "grad_norm": 9.45452880859375, "learning_rate": 1.4939844585979035e-05, "loss": 0.3917, "num_input_tokens_seen": 78276544, "step": 81970 }, { "epoch": 6.686923892650298, "grad_norm": 35.702049255371094, "learning_rate": 1.4936586303825145e-05, "loss": 0.3526, "num_input_tokens_seen": 78281264, "step": 81975 }, { "epoch": 6.687331756260706, "grad_norm": 0.3640226721763611, "learning_rate": 1.4933328225652379e-05, "loss": 0.4309, "num_input_tokens_seen": 78285984, "step": 81980 }, { "epoch": 6.687739619871115, "grad_norm": 0.767989993095398, "learning_rate": 1.493007035152677e-05, "loss": 0.3221, "num_input_tokens_seen": 78290032, "step": 81985 }, { "epoch": 6.688147483481524, "grad_norm": 34.44675827026367, "learning_rate": 1.4926812681514351e-05, "loss": 0.3869, "num_input_tokens_seen": 78295472, "step": 81990 }, { "epoch": 6.688555347091932, "grad_norm": 0.40533557534217834, "learning_rate": 1.4923555215681167e-05, "loss": 0.3877, "num_input_tokens_seen": 78299760, "step": 81995 }, { "epoch": 6.688963210702341, "grad_norm": 0.9687994718551636, "learning_rate": 1.4920297954093235e-05, "loss": 0.3115, "num_input_tokens_seen": 78304496, "step": 82000 }, { "epoch": 6.68937107431275, "grad_norm": 0.41746383905410767, "learning_rate": 1.4917040896816581e-05, "loss": 0.3664, "num_input_tokens_seen": 78309248, "step": 82005 }, { "epoch": 6.689778937923158, "grad_norm": 0.7876628041267395, "learning_rate": 1.4913784043917212e-05, "loss": 0.3938, "num_input_tokens_seen": 78313056, "step": 82010 }, { "epoch": 6.690186801533567, "grad_norm": 0.47415608167648315, "learning_rate": 1.4910527395461163e-05, "loss": 0.2013, "num_input_tokens_seen": 78316912, "step": 82015 }, { "epoch": 6.690594665143976, "grad_norm": 5.531936168670654, "learning_rate": 1.4907270951514441e-05, "loss": 0.3583, "num_input_tokens_seen": 78321952, "step": 82020 }, { "epoch": 6.691002528754384, "grad_norm": 0.776351273059845, "learning_rate": 1.4904014712143043e-05, "loss": 0.5216, "num_input_tokens_seen": 78327152, "step": 82025 }, { "epoch": 6.691410392364793, "grad_norm": 0.40038055181503296, "learning_rate": 1.490075867741298e-05, "loss": 0.3747, "num_input_tokens_seen": 78330608, "step": 82030 }, { "epoch": 6.6918182559752015, "grad_norm": 0.46947401762008667, "learning_rate": 1.4897502847390237e-05, "loss": 0.2903, "num_input_tokens_seen": 78335712, "step": 82035 }, { "epoch": 6.6922261195856105, "grad_norm": 0.34982332587242126, "learning_rate": 1.4894247222140829e-05, "loss": 0.3062, "num_input_tokens_seen": 78339744, "step": 82040 }, { "epoch": 6.6926339831960195, "grad_norm": 0.2798812687397003, "learning_rate": 1.489099180173073e-05, "loss": 0.3928, "num_input_tokens_seen": 78344960, "step": 82045 }, { "epoch": 6.693041846806428, "grad_norm": 0.4902162253856659, "learning_rate": 1.4887736586225936e-05, "loss": 0.4173, "num_input_tokens_seen": 78349600, "step": 82050 }, { "epoch": 6.693449710416837, "grad_norm": 0.24816228449344635, "learning_rate": 1.4884481575692422e-05, "loss": 0.3405, "num_input_tokens_seen": 78353904, "step": 82055 }, { "epoch": 6.693857574027245, "grad_norm": 0.718176007270813, "learning_rate": 1.4881226770196171e-05, "loss": 0.3855, "num_input_tokens_seen": 78358592, "step": 82060 }, { "epoch": 6.694265437637654, "grad_norm": 0.8817863464355469, "learning_rate": 1.4877972169803154e-05, "loss": 0.3507, "num_input_tokens_seen": 78364240, "step": 82065 }, { "epoch": 6.694673301248063, "grad_norm": 0.3751767873764038, "learning_rate": 1.4874717774579338e-05, "loss": 0.3688, "num_input_tokens_seen": 78369232, "step": 82070 }, { "epoch": 6.695081164858472, "grad_norm": 1.5444210767745972, "learning_rate": 1.4871463584590683e-05, "loss": 0.457, "num_input_tokens_seen": 78374384, "step": 82075 }, { "epoch": 6.69548902846888, "grad_norm": 0.6847425103187561, "learning_rate": 1.4868209599903166e-05, "loss": 0.5235, "num_input_tokens_seen": 78379648, "step": 82080 }, { "epoch": 6.695896892079289, "grad_norm": 0.5901159644126892, "learning_rate": 1.4864955820582732e-05, "loss": 0.3409, "num_input_tokens_seen": 78384096, "step": 82085 }, { "epoch": 6.696304755689697, "grad_norm": 0.9791353940963745, "learning_rate": 1.4861702246695341e-05, "loss": 0.4339, "num_input_tokens_seen": 78387888, "step": 82090 }, { "epoch": 6.696712619300106, "grad_norm": 2.3381283283233643, "learning_rate": 1.4858448878306935e-05, "loss": 0.3126, "num_input_tokens_seen": 78392384, "step": 82095 }, { "epoch": 6.697120482910515, "grad_norm": 9.373542785644531, "learning_rate": 1.4855195715483455e-05, "loss": 0.431, "num_input_tokens_seen": 78396896, "step": 82100 }, { "epoch": 6.697528346520923, "grad_norm": 11.261598587036133, "learning_rate": 1.4851942758290855e-05, "loss": 0.3384, "num_input_tokens_seen": 78402160, "step": 82105 }, { "epoch": 6.697936210131332, "grad_norm": 0.7387994527816772, "learning_rate": 1.4848690006795061e-05, "loss": 0.3134, "num_input_tokens_seen": 78406720, "step": 82110 }, { "epoch": 6.69834407374174, "grad_norm": 0.9649766683578491, "learning_rate": 1.4845437461062007e-05, "loss": 0.3296, "num_input_tokens_seen": 78411808, "step": 82115 }, { "epoch": 6.698751937352149, "grad_norm": 5.217876434326172, "learning_rate": 1.4842185121157618e-05, "loss": 0.3231, "num_input_tokens_seen": 78416224, "step": 82120 }, { "epoch": 6.699159800962558, "grad_norm": 0.6498801708221436, "learning_rate": 1.4838932987147825e-05, "loss": 0.3321, "num_input_tokens_seen": 78421360, "step": 82125 }, { "epoch": 6.6995676645729665, "grad_norm": 1.0933862924575806, "learning_rate": 1.4835681059098538e-05, "loss": 0.321, "num_input_tokens_seen": 78426448, "step": 82130 }, { "epoch": 6.6999755281833755, "grad_norm": 11.027649879455566, "learning_rate": 1.4832429337075682e-05, "loss": 0.282, "num_input_tokens_seen": 78432448, "step": 82135 }, { "epoch": 6.7003833917937845, "grad_norm": 0.4935518801212311, "learning_rate": 1.4829177821145158e-05, "loss": 0.2377, "num_input_tokens_seen": 78436992, "step": 82140 }, { "epoch": 6.700791255404193, "grad_norm": 1.650138258934021, "learning_rate": 1.4825926511372877e-05, "loss": 0.3071, "num_input_tokens_seen": 78441616, "step": 82145 }, { "epoch": 6.701199119014602, "grad_norm": 3.23762845993042, "learning_rate": 1.4822675407824746e-05, "loss": 0.3701, "num_input_tokens_seen": 78445872, "step": 82150 }, { "epoch": 6.701606982625011, "grad_norm": 2.0055007934570312, "learning_rate": 1.481942451056666e-05, "loss": 0.2982, "num_input_tokens_seen": 78450592, "step": 82155 }, { "epoch": 6.702014846235419, "grad_norm": 11.685145378112793, "learning_rate": 1.4816173819664513e-05, "loss": 0.4016, "num_input_tokens_seen": 78455296, "step": 82160 }, { "epoch": 6.702422709845828, "grad_norm": 0.5390825867652893, "learning_rate": 1.4812923335184186e-05, "loss": 0.3854, "num_input_tokens_seen": 78460448, "step": 82165 }, { "epoch": 6.702830573456236, "grad_norm": 1.1737686395645142, "learning_rate": 1.4809673057191586e-05, "loss": 0.3259, "num_input_tokens_seen": 78465344, "step": 82170 }, { "epoch": 6.703238437066645, "grad_norm": 1.259644865989685, "learning_rate": 1.4806422985752583e-05, "loss": 0.3458, "num_input_tokens_seen": 78469360, "step": 82175 }, { "epoch": 6.703646300677054, "grad_norm": 1.7044243812561035, "learning_rate": 1.480317312093305e-05, "loss": 0.3157, "num_input_tokens_seen": 78474048, "step": 82180 }, { "epoch": 6.704054164287462, "grad_norm": 0.5312287211418152, "learning_rate": 1.479992346279887e-05, "loss": 0.2942, "num_input_tokens_seen": 78479264, "step": 82185 }, { "epoch": 6.704462027897871, "grad_norm": 2.7753994464874268, "learning_rate": 1.47966740114159e-05, "loss": 0.3817, "num_input_tokens_seen": 78484080, "step": 82190 }, { "epoch": 6.704869891508279, "grad_norm": 0.7011152505874634, "learning_rate": 1.4793424766850017e-05, "loss": 0.3141, "num_input_tokens_seen": 78488736, "step": 82195 }, { "epoch": 6.705277755118688, "grad_norm": 0.5099398493766785, "learning_rate": 1.4790175729167077e-05, "loss": 0.31, "num_input_tokens_seen": 78493568, "step": 82200 }, { "epoch": 6.705685618729097, "grad_norm": 9.09512710571289, "learning_rate": 1.478692689843293e-05, "loss": 0.2407, "num_input_tokens_seen": 78499008, "step": 82205 }, { "epoch": 6.706093482339505, "grad_norm": 0.7268301844596863, "learning_rate": 1.4783678274713436e-05, "loss": 0.2993, "num_input_tokens_seen": 78504352, "step": 82210 }, { "epoch": 6.706501345949914, "grad_norm": 0.5027362704277039, "learning_rate": 1.478042985807445e-05, "loss": 0.3561, "num_input_tokens_seen": 78508384, "step": 82215 }, { "epoch": 6.706909209560323, "grad_norm": 0.804724395275116, "learning_rate": 1.4777181648581806e-05, "loss": 0.3465, "num_input_tokens_seen": 78513136, "step": 82220 }, { "epoch": 6.7073170731707314, "grad_norm": 12.491410255432129, "learning_rate": 1.4773933646301346e-05, "loss": 0.337, "num_input_tokens_seen": 78518160, "step": 82225 }, { "epoch": 6.7077249367811405, "grad_norm": 0.5253143310546875, "learning_rate": 1.4770685851298893e-05, "loss": 0.2991, "num_input_tokens_seen": 78523088, "step": 82230 }, { "epoch": 6.7081328003915495, "grad_norm": 0.27023783326148987, "learning_rate": 1.4767438263640303e-05, "loss": 0.4892, "num_input_tokens_seen": 78528160, "step": 82235 }, { "epoch": 6.708540664001958, "grad_norm": 1.0804576873779297, "learning_rate": 1.4764190883391394e-05, "loss": 0.3336, "num_input_tokens_seen": 78533008, "step": 82240 }, { "epoch": 6.708948527612367, "grad_norm": 2.612639904022217, "learning_rate": 1.4760943710617987e-05, "loss": 0.3115, "num_input_tokens_seen": 78538800, "step": 82245 }, { "epoch": 6.709356391222775, "grad_norm": 0.603137195110321, "learning_rate": 1.4757696745385897e-05, "loss": 0.3411, "num_input_tokens_seen": 78543296, "step": 82250 }, { "epoch": 6.709764254833184, "grad_norm": 3.101806879043579, "learning_rate": 1.4754449987760937e-05, "loss": 0.4925, "num_input_tokens_seen": 78548752, "step": 82255 }, { "epoch": 6.710172118443593, "grad_norm": 4.671218395233154, "learning_rate": 1.4751203437808925e-05, "loss": 0.4066, "num_input_tokens_seen": 78553648, "step": 82260 }, { "epoch": 6.710579982054001, "grad_norm": 2.0511748790740967, "learning_rate": 1.4747957095595663e-05, "loss": 0.3503, "num_input_tokens_seen": 78558320, "step": 82265 }, { "epoch": 6.71098784566441, "grad_norm": 0.35945022106170654, "learning_rate": 1.4744710961186958e-05, "loss": 0.2005, "num_input_tokens_seen": 78563328, "step": 82270 }, { "epoch": 6.711395709274819, "grad_norm": 0.5581045150756836, "learning_rate": 1.47414650346486e-05, "loss": 0.2989, "num_input_tokens_seen": 78567696, "step": 82275 }, { "epoch": 6.711803572885227, "grad_norm": 1.1106809377670288, "learning_rate": 1.4738219316046393e-05, "loss": 0.3508, "num_input_tokens_seen": 78572400, "step": 82280 }, { "epoch": 6.712211436495636, "grad_norm": 0.46395453810691833, "learning_rate": 1.4734973805446118e-05, "loss": 0.3028, "num_input_tokens_seen": 78576416, "step": 82285 }, { "epoch": 6.712619300106045, "grad_norm": 12.719544410705566, "learning_rate": 1.473172850291356e-05, "loss": 0.5529, "num_input_tokens_seen": 78581152, "step": 82290 }, { "epoch": 6.713027163716453, "grad_norm": 1.3281532526016235, "learning_rate": 1.4728483408514504e-05, "loss": 0.4717, "num_input_tokens_seen": 78586208, "step": 82295 }, { "epoch": 6.713435027326862, "grad_norm": 2.0847973823547363, "learning_rate": 1.4725238522314717e-05, "loss": 0.3221, "num_input_tokens_seen": 78591248, "step": 82300 }, { "epoch": 6.71384289093727, "grad_norm": 0.7081184983253479, "learning_rate": 1.4721993844379984e-05, "loss": 0.3273, "num_input_tokens_seen": 78595936, "step": 82305 }, { "epoch": 6.714250754547679, "grad_norm": 0.4708791673183441, "learning_rate": 1.471874937477607e-05, "loss": 0.2692, "num_input_tokens_seen": 78600720, "step": 82310 }, { "epoch": 6.714658618158088, "grad_norm": 101.26553344726562, "learning_rate": 1.471550511356874e-05, "loss": 0.4709, "num_input_tokens_seen": 78605632, "step": 82315 }, { "epoch": 6.715066481768496, "grad_norm": 1.2846399545669556, "learning_rate": 1.4712261060823745e-05, "loss": 0.3394, "num_input_tokens_seen": 78610128, "step": 82320 }, { "epoch": 6.715474345378905, "grad_norm": 1.1096214056015015, "learning_rate": 1.4709017216606846e-05, "loss": 0.3556, "num_input_tokens_seen": 78614432, "step": 82325 }, { "epoch": 6.7158822089893135, "grad_norm": 1.7333955764770508, "learning_rate": 1.47057735809838e-05, "loss": 0.336, "num_input_tokens_seen": 78617760, "step": 82330 }, { "epoch": 6.7162900725997225, "grad_norm": 1.4615367650985718, "learning_rate": 1.470253015402035e-05, "loss": 0.34, "num_input_tokens_seen": 78622832, "step": 82335 }, { "epoch": 6.716697936210132, "grad_norm": 1.6993606090545654, "learning_rate": 1.4699286935782239e-05, "loss": 0.2665, "num_input_tokens_seen": 78627456, "step": 82340 }, { "epoch": 6.71710579982054, "grad_norm": 2.4340946674346924, "learning_rate": 1.4696043926335198e-05, "loss": 0.2224, "num_input_tokens_seen": 78632496, "step": 82345 }, { "epoch": 6.717513663430949, "grad_norm": 55.312374114990234, "learning_rate": 1.4692801125744976e-05, "loss": 0.257, "num_input_tokens_seen": 78637264, "step": 82350 }, { "epoch": 6.717921527041358, "grad_norm": 2.3610479831695557, "learning_rate": 1.4689558534077291e-05, "loss": 0.4163, "num_input_tokens_seen": 78641776, "step": 82355 }, { "epoch": 6.718329390651766, "grad_norm": 8.043280601501465, "learning_rate": 1.4686316151397877e-05, "loss": 0.2309, "num_input_tokens_seen": 78646416, "step": 82360 }, { "epoch": 6.718737254262175, "grad_norm": 2.787440061569214, "learning_rate": 1.4683073977772438e-05, "loss": 0.2504, "num_input_tokens_seen": 78650896, "step": 82365 }, { "epoch": 6.719145117872584, "grad_norm": 177.4140625, "learning_rate": 1.467983201326672e-05, "loss": 0.5236, "num_input_tokens_seen": 78656112, "step": 82370 }, { "epoch": 6.719552981482992, "grad_norm": 38.808746337890625, "learning_rate": 1.467659025794642e-05, "loss": 0.4687, "num_input_tokens_seen": 78661456, "step": 82375 }, { "epoch": 6.719960845093401, "grad_norm": 66.18097686767578, "learning_rate": 1.467334871187725e-05, "loss": 0.4952, "num_input_tokens_seen": 78666976, "step": 82380 }, { "epoch": 6.720368708703809, "grad_norm": 24.55293083190918, "learning_rate": 1.4670107375124912e-05, "loss": 0.4252, "num_input_tokens_seen": 78671728, "step": 82385 }, { "epoch": 6.720776572314218, "grad_norm": 1.84586501121521, "learning_rate": 1.4666866247755098e-05, "loss": 0.3183, "num_input_tokens_seen": 78676144, "step": 82390 }, { "epoch": 6.721184435924627, "grad_norm": 5.141228675842285, "learning_rate": 1.4663625329833525e-05, "loss": 0.325, "num_input_tokens_seen": 78681440, "step": 82395 }, { "epoch": 6.721592299535035, "grad_norm": 0.4874224364757538, "learning_rate": 1.4660384621425873e-05, "loss": 0.2618, "num_input_tokens_seen": 78686128, "step": 82400 }, { "epoch": 6.722000163145444, "grad_norm": 5.38601541519165, "learning_rate": 1.4657144122597832e-05, "loss": 0.2469, "num_input_tokens_seen": 78690496, "step": 82405 }, { "epoch": 6.722408026755852, "grad_norm": 32.66349792480469, "learning_rate": 1.4653903833415084e-05, "loss": 0.3416, "num_input_tokens_seen": 78695664, "step": 82410 }, { "epoch": 6.722815890366261, "grad_norm": 7.381059169769287, "learning_rate": 1.4650663753943309e-05, "loss": 0.3433, "num_input_tokens_seen": 78700368, "step": 82415 }, { "epoch": 6.72322375397667, "grad_norm": 9.065835952758789, "learning_rate": 1.4647423884248183e-05, "loss": 0.6179, "num_input_tokens_seen": 78705040, "step": 82420 }, { "epoch": 6.7236316175870785, "grad_norm": 4.696211814880371, "learning_rate": 1.4644184224395375e-05, "loss": 0.3728, "num_input_tokens_seen": 78709616, "step": 82425 }, { "epoch": 6.7240394811974875, "grad_norm": 2.3884220123291016, "learning_rate": 1.4640944774450543e-05, "loss": 0.3079, "num_input_tokens_seen": 78714256, "step": 82430 }, { "epoch": 6.7244473448078965, "grad_norm": 2.612112045288086, "learning_rate": 1.4637705534479368e-05, "loss": 0.4498, "num_input_tokens_seen": 78718864, "step": 82435 }, { "epoch": 6.724855208418305, "grad_norm": 6.35935115814209, "learning_rate": 1.46344665045475e-05, "loss": 0.2079, "num_input_tokens_seen": 78723632, "step": 82440 }, { "epoch": 6.725263072028714, "grad_norm": 0.5474647879600525, "learning_rate": 1.463122768472059e-05, "loss": 0.3757, "num_input_tokens_seen": 78728032, "step": 82445 }, { "epoch": 6.725670935639123, "grad_norm": 110.36003112792969, "learning_rate": 1.4627989075064291e-05, "loss": 0.3042, "num_input_tokens_seen": 78733520, "step": 82450 }, { "epoch": 6.726078799249531, "grad_norm": 27.759355545043945, "learning_rate": 1.4624750675644234e-05, "loss": 0.2673, "num_input_tokens_seen": 78738544, "step": 82455 }, { "epoch": 6.72648666285994, "grad_norm": 0.9962427020072937, "learning_rate": 1.4621512486526084e-05, "loss": 0.3248, "num_input_tokens_seen": 78743296, "step": 82460 }, { "epoch": 6.726894526470348, "grad_norm": 0.5259616374969482, "learning_rate": 1.4618274507775465e-05, "loss": 0.2961, "num_input_tokens_seen": 78748288, "step": 82465 }, { "epoch": 6.727302390080757, "grad_norm": 18.901865005493164, "learning_rate": 1.461503673945801e-05, "loss": 0.3229, "num_input_tokens_seen": 78753632, "step": 82470 }, { "epoch": 6.727710253691166, "grad_norm": 23.640884399414062, "learning_rate": 1.4611799181639341e-05, "loss": 0.3788, "num_input_tokens_seen": 78757392, "step": 82475 }, { "epoch": 6.728118117301574, "grad_norm": 16.693387985229492, "learning_rate": 1.4608561834385093e-05, "loss": 0.4988, "num_input_tokens_seen": 78761984, "step": 82480 }, { "epoch": 6.728525980911983, "grad_norm": 1.9357006549835205, "learning_rate": 1.4605324697760883e-05, "loss": 0.3579, "num_input_tokens_seen": 78766864, "step": 82485 }, { "epoch": 6.728933844522392, "grad_norm": 6.339968204498291, "learning_rate": 1.4602087771832324e-05, "loss": 0.3178, "num_input_tokens_seen": 78771984, "step": 82490 }, { "epoch": 6.7293417081328, "grad_norm": 33.73308563232422, "learning_rate": 1.459885105666503e-05, "loss": 0.3784, "num_input_tokens_seen": 78777648, "step": 82495 }, { "epoch": 6.729749571743209, "grad_norm": 0.518538236618042, "learning_rate": 1.4595614552324596e-05, "loss": 0.2907, "num_input_tokens_seen": 78782544, "step": 82500 }, { "epoch": 6.730157435353618, "grad_norm": 1.5753253698349, "learning_rate": 1.4592378258876643e-05, "loss": 0.3187, "num_input_tokens_seen": 78786704, "step": 82505 }, { "epoch": 6.730565298964026, "grad_norm": 3.3897619247436523, "learning_rate": 1.4589142176386755e-05, "loss": 0.3202, "num_input_tokens_seen": 78792720, "step": 82510 }, { "epoch": 6.730973162574435, "grad_norm": 21.881162643432617, "learning_rate": 1.4585906304920538e-05, "loss": 0.281, "num_input_tokens_seen": 78797472, "step": 82515 }, { "epoch": 6.7313810261848435, "grad_norm": 27.164859771728516, "learning_rate": 1.4582670644543561e-05, "loss": 0.2603, "num_input_tokens_seen": 78801968, "step": 82520 }, { "epoch": 6.7317888897952525, "grad_norm": 1.7937817573547363, "learning_rate": 1.4579435195321434e-05, "loss": 0.3354, "num_input_tokens_seen": 78806288, "step": 82525 }, { "epoch": 6.7321967534056615, "grad_norm": 1.7337912321090698, "learning_rate": 1.4576199957319725e-05, "loss": 0.204, "num_input_tokens_seen": 78811072, "step": 82530 }, { "epoch": 6.73260461701607, "grad_norm": 3.131135940551758, "learning_rate": 1.4572964930604015e-05, "loss": 0.4504, "num_input_tokens_seen": 78816016, "step": 82535 }, { "epoch": 6.733012480626479, "grad_norm": 46.832088470458984, "learning_rate": 1.4569730115239877e-05, "loss": 0.5702, "num_input_tokens_seen": 78820592, "step": 82540 }, { "epoch": 6.733420344236887, "grad_norm": 9.246827125549316, "learning_rate": 1.4566495511292866e-05, "loss": 0.3085, "num_input_tokens_seen": 78824288, "step": 82545 }, { "epoch": 6.733828207847296, "grad_norm": 4.403851509094238, "learning_rate": 1.456326111882857e-05, "loss": 0.2984, "num_input_tokens_seen": 78830192, "step": 82550 }, { "epoch": 6.734236071457705, "grad_norm": 13.98593807220459, "learning_rate": 1.4560026937912536e-05, "loss": 0.6005, "num_input_tokens_seen": 78834960, "step": 82555 }, { "epoch": 6.734643935068113, "grad_norm": 0.9812548756599426, "learning_rate": 1.4556792968610316e-05, "loss": 0.2774, "num_input_tokens_seen": 78839056, "step": 82560 }, { "epoch": 6.735051798678522, "grad_norm": 0.7131803035736084, "learning_rate": 1.4553559210987457e-05, "loss": 0.2234, "num_input_tokens_seen": 78844112, "step": 82565 }, { "epoch": 6.735459662288931, "grad_norm": 63.4212646484375, "learning_rate": 1.4550325665109527e-05, "loss": 0.5864, "num_input_tokens_seen": 78850016, "step": 82570 }, { "epoch": 6.735867525899339, "grad_norm": 1.3509539365768433, "learning_rate": 1.4547092331042051e-05, "loss": 0.5209, "num_input_tokens_seen": 78855104, "step": 82575 }, { "epoch": 6.736275389509748, "grad_norm": 75.21831512451172, "learning_rate": 1.4543859208850572e-05, "loss": 0.3486, "num_input_tokens_seen": 78860752, "step": 82580 }, { "epoch": 6.736683253120157, "grad_norm": 2.1253767013549805, "learning_rate": 1.4540626298600624e-05, "loss": 0.3111, "num_input_tokens_seen": 78866064, "step": 82585 }, { "epoch": 6.737091116730565, "grad_norm": 0.4804568588733673, "learning_rate": 1.4537393600357729e-05, "loss": 0.2906, "num_input_tokens_seen": 78870368, "step": 82590 }, { "epoch": 6.737498980340974, "grad_norm": 16.51392936706543, "learning_rate": 1.4534161114187428e-05, "loss": 0.2906, "num_input_tokens_seen": 78873872, "step": 82595 }, { "epoch": 6.737906843951382, "grad_norm": 0.3035329282283783, "learning_rate": 1.4530928840155233e-05, "loss": 0.1997, "num_input_tokens_seen": 78878368, "step": 82600 }, { "epoch": 6.738314707561791, "grad_norm": 0.6868083477020264, "learning_rate": 1.4527696778326667e-05, "loss": 0.1041, "num_input_tokens_seen": 78883264, "step": 82605 }, { "epoch": 6.7387225711722, "grad_norm": 22.498645782470703, "learning_rate": 1.452446492876722e-05, "loss": 0.3332, "num_input_tokens_seen": 78887872, "step": 82610 }, { "epoch": 6.739130434782608, "grad_norm": 11.093698501586914, "learning_rate": 1.452123329154243e-05, "loss": 0.3845, "num_input_tokens_seen": 78892272, "step": 82615 }, { "epoch": 6.739538298393017, "grad_norm": 28.398481369018555, "learning_rate": 1.451800186671779e-05, "loss": 0.5009, "num_input_tokens_seen": 78896624, "step": 82620 }, { "epoch": 6.739946162003426, "grad_norm": 2.9297749996185303, "learning_rate": 1.45147706543588e-05, "loss": 0.8276, "num_input_tokens_seen": 78901248, "step": 82625 }, { "epoch": 6.740354025613835, "grad_norm": 2.753896474838257, "learning_rate": 1.4511539654530947e-05, "loss": 0.4871, "num_input_tokens_seen": 78905872, "step": 82630 }, { "epoch": 6.740761889224244, "grad_norm": 0.47688931226730347, "learning_rate": 1.4508308867299735e-05, "loss": 0.312, "num_input_tokens_seen": 78910080, "step": 82635 }, { "epoch": 6.741169752834653, "grad_norm": 5.665141582489014, "learning_rate": 1.4505078292730632e-05, "loss": 0.4706, "num_input_tokens_seen": 78914592, "step": 82640 }, { "epoch": 6.741577616445061, "grad_norm": 2.9823663234710693, "learning_rate": 1.450184793088914e-05, "loss": 0.5573, "num_input_tokens_seen": 78918848, "step": 82645 }, { "epoch": 6.74198548005547, "grad_norm": 1.1627026796340942, "learning_rate": 1.4498617781840735e-05, "loss": 0.346, "num_input_tokens_seen": 78923136, "step": 82650 }, { "epoch": 6.742393343665878, "grad_norm": 0.6222413778305054, "learning_rate": 1.4495387845650879e-05, "loss": 0.3421, "num_input_tokens_seen": 78927440, "step": 82655 }, { "epoch": 6.742801207276287, "grad_norm": 0.4586789906024933, "learning_rate": 1.4492158122385053e-05, "loss": 0.4379, "num_input_tokens_seen": 78932912, "step": 82660 }, { "epoch": 6.743209070886696, "grad_norm": 1.1567795276641846, "learning_rate": 1.448892861210871e-05, "loss": 0.3304, "num_input_tokens_seen": 78938752, "step": 82665 }, { "epoch": 6.743616934497104, "grad_norm": 0.5788429379463196, "learning_rate": 1.4485699314887324e-05, "loss": 0.2803, "num_input_tokens_seen": 78942960, "step": 82670 }, { "epoch": 6.744024798107513, "grad_norm": 5.9311699867248535, "learning_rate": 1.4482470230786332e-05, "loss": 0.3523, "num_input_tokens_seen": 78948112, "step": 82675 }, { "epoch": 6.744432661717921, "grad_norm": 3.5640902519226074, "learning_rate": 1.447924135987121e-05, "loss": 0.2554, "num_input_tokens_seen": 78952448, "step": 82680 }, { "epoch": 6.74484052532833, "grad_norm": 0.43264040350914, "learning_rate": 1.4476012702207396e-05, "loss": 0.359, "num_input_tokens_seen": 78957648, "step": 82685 }, { "epoch": 6.745248388938739, "grad_norm": 2.782266139984131, "learning_rate": 1.447278425786033e-05, "loss": 0.4102, "num_input_tokens_seen": 78961584, "step": 82690 }, { "epoch": 6.745656252549147, "grad_norm": 7.601108551025391, "learning_rate": 1.4469556026895457e-05, "loss": 0.4106, "num_input_tokens_seen": 78966624, "step": 82695 }, { "epoch": 6.746064116159556, "grad_norm": 0.5751504302024841, "learning_rate": 1.4466328009378197e-05, "loss": 0.2374, "num_input_tokens_seen": 78971792, "step": 82700 }, { "epoch": 6.746471979769965, "grad_norm": 2.390266180038452, "learning_rate": 1.4463100205374003e-05, "loss": 0.3125, "num_input_tokens_seen": 78976848, "step": 82705 }, { "epoch": 6.746879843380373, "grad_norm": 30.15576171875, "learning_rate": 1.4459872614948288e-05, "loss": 0.2773, "num_input_tokens_seen": 78981600, "step": 82710 }, { "epoch": 6.747287706990782, "grad_norm": 15.465001106262207, "learning_rate": 1.445664523816648e-05, "loss": 0.3656, "num_input_tokens_seen": 78986720, "step": 82715 }, { "epoch": 6.747695570601191, "grad_norm": 4.478330612182617, "learning_rate": 1.445341807509398e-05, "loss": 0.3161, "num_input_tokens_seen": 78992128, "step": 82720 }, { "epoch": 6.7481034342115995, "grad_norm": 0.4984814524650574, "learning_rate": 1.4450191125796225e-05, "loss": 0.3379, "num_input_tokens_seen": 78997040, "step": 82725 }, { "epoch": 6.7485112978220085, "grad_norm": 12.92012882232666, "learning_rate": 1.4446964390338614e-05, "loss": 0.2233, "num_input_tokens_seen": 79002016, "step": 82730 }, { "epoch": 6.748919161432417, "grad_norm": 8.011218070983887, "learning_rate": 1.444373786878655e-05, "loss": 0.3922, "num_input_tokens_seen": 79006912, "step": 82735 }, { "epoch": 6.749327025042826, "grad_norm": 0.9115833044052124, "learning_rate": 1.4440511561205433e-05, "loss": 0.2835, "num_input_tokens_seen": 79011648, "step": 82740 }, { "epoch": 6.749734888653235, "grad_norm": 3.3417887687683105, "learning_rate": 1.443728546766065e-05, "loss": 0.3506, "num_input_tokens_seen": 79017344, "step": 82745 }, { "epoch": 6.750142752263643, "grad_norm": 8.31637191772461, "learning_rate": 1.4434059588217613e-05, "loss": 0.5224, "num_input_tokens_seen": 79021728, "step": 82750 }, { "epoch": 6.750550615874052, "grad_norm": 7.49647855758667, "learning_rate": 1.44308339229417e-05, "loss": 0.2031, "num_input_tokens_seen": 79025888, "step": 82755 }, { "epoch": 6.75095847948446, "grad_norm": 5.36216402053833, "learning_rate": 1.4427608471898288e-05, "loss": 0.4326, "num_input_tokens_seen": 79030720, "step": 82760 }, { "epoch": 6.751366343094869, "grad_norm": 2.166208505630493, "learning_rate": 1.4424383235152752e-05, "loss": 0.3438, "num_input_tokens_seen": 79035616, "step": 82765 }, { "epoch": 6.751774206705278, "grad_norm": 20.945110321044922, "learning_rate": 1.4421158212770485e-05, "loss": 0.5971, "num_input_tokens_seen": 79040448, "step": 82770 }, { "epoch": 6.752182070315686, "grad_norm": 0.4117853343486786, "learning_rate": 1.4417933404816844e-05, "loss": 0.2965, "num_input_tokens_seen": 79046112, "step": 82775 }, { "epoch": 6.752589933926095, "grad_norm": 3.5461738109588623, "learning_rate": 1.4414708811357196e-05, "loss": 0.4861, "num_input_tokens_seen": 79051168, "step": 82780 }, { "epoch": 6.752997797536504, "grad_norm": 3.5921003818511963, "learning_rate": 1.4411484432456903e-05, "loss": 0.2414, "num_input_tokens_seen": 79055776, "step": 82785 }, { "epoch": 6.753405661146912, "grad_norm": 4.695737838745117, "learning_rate": 1.4408260268181324e-05, "loss": 0.3062, "num_input_tokens_seen": 79060240, "step": 82790 }, { "epoch": 6.753813524757321, "grad_norm": 41.41193389892578, "learning_rate": 1.4405036318595805e-05, "loss": 0.4289, "num_input_tokens_seen": 79065120, "step": 82795 }, { "epoch": 6.75422138836773, "grad_norm": 5.550136566162109, "learning_rate": 1.44018125837657e-05, "loss": 0.5633, "num_input_tokens_seen": 79070672, "step": 82800 }, { "epoch": 6.754629251978138, "grad_norm": 3.714700698852539, "learning_rate": 1.4398589063756349e-05, "loss": 0.369, "num_input_tokens_seen": 79075776, "step": 82805 }, { "epoch": 6.755037115588547, "grad_norm": 3.848653554916382, "learning_rate": 1.4395365758633084e-05, "loss": 0.2847, "num_input_tokens_seen": 79080336, "step": 82810 }, { "epoch": 6.7554449791989555, "grad_norm": 0.36550644040107727, "learning_rate": 1.4392142668461255e-05, "loss": 0.5045, "num_input_tokens_seen": 79085184, "step": 82815 }, { "epoch": 6.7558528428093645, "grad_norm": 70.95185852050781, "learning_rate": 1.4388919793306194e-05, "loss": 0.3264, "num_input_tokens_seen": 79089968, "step": 82820 }, { "epoch": 6.7562607064197735, "grad_norm": 3.2323434352874756, "learning_rate": 1.4385697133233214e-05, "loss": 0.4069, "num_input_tokens_seen": 79094992, "step": 82825 }, { "epoch": 6.756668570030182, "grad_norm": 5.758941173553467, "learning_rate": 1.4382474688307635e-05, "loss": 0.1587, "num_input_tokens_seen": 79099184, "step": 82830 }, { "epoch": 6.757076433640591, "grad_norm": 0.48082977533340454, "learning_rate": 1.4379252458594791e-05, "loss": 0.2049, "num_input_tokens_seen": 79104208, "step": 82835 }, { "epoch": 6.757484297251, "grad_norm": 1.8209235668182373, "learning_rate": 1.4376030444159988e-05, "loss": 0.3028, "num_input_tokens_seen": 79109120, "step": 82840 }, { "epoch": 6.757892160861408, "grad_norm": 2.5473031997680664, "learning_rate": 1.437280864506853e-05, "loss": 0.517, "num_input_tokens_seen": 79114848, "step": 82845 }, { "epoch": 6.758300024471817, "grad_norm": 27.017732620239258, "learning_rate": 1.4369587061385731e-05, "loss": 0.3562, "num_input_tokens_seen": 79119168, "step": 82850 }, { "epoch": 6.758707888082226, "grad_norm": 1.8703759908676147, "learning_rate": 1.4366365693176869e-05, "loss": 0.2458, "num_input_tokens_seen": 79124496, "step": 82855 }, { "epoch": 6.759115751692634, "grad_norm": 0.7164052724838257, "learning_rate": 1.4363144540507274e-05, "loss": 0.2992, "num_input_tokens_seen": 79129744, "step": 82860 }, { "epoch": 6.759523615303043, "grad_norm": 0.312825083732605, "learning_rate": 1.435992360344221e-05, "loss": 0.3068, "num_input_tokens_seen": 79134944, "step": 82865 }, { "epoch": 6.759931478913451, "grad_norm": 1.2165343761444092, "learning_rate": 1.435670288204698e-05, "loss": 0.3648, "num_input_tokens_seen": 79139680, "step": 82870 }, { "epoch": 6.76033934252386, "grad_norm": 6.278604984283447, "learning_rate": 1.4353482376386845e-05, "loss": 0.3871, "num_input_tokens_seen": 79144736, "step": 82875 }, { "epoch": 6.760747206134269, "grad_norm": 1.670117735862732, "learning_rate": 1.4350262086527114e-05, "loss": 0.4057, "num_input_tokens_seen": 79148944, "step": 82880 }, { "epoch": 6.761155069744677, "grad_norm": 0.7766255736351013, "learning_rate": 1.434704201253304e-05, "loss": 0.4506, "num_input_tokens_seen": 79154224, "step": 82885 }, { "epoch": 6.761562933355086, "grad_norm": 0.976678729057312, "learning_rate": 1.4343822154469899e-05, "loss": 0.3162, "num_input_tokens_seen": 79158832, "step": 82890 }, { "epoch": 6.761970796965494, "grad_norm": 2.9572994709014893, "learning_rate": 1.4340602512402956e-05, "loss": 0.4526, "num_input_tokens_seen": 79162896, "step": 82895 }, { "epoch": 6.762378660575903, "grad_norm": 4.887850761413574, "learning_rate": 1.433738308639746e-05, "loss": 0.3296, "num_input_tokens_seen": 79166880, "step": 82900 }, { "epoch": 6.762786524186312, "grad_norm": 1.5795539617538452, "learning_rate": 1.4334163876518692e-05, "loss": 0.3224, "num_input_tokens_seen": 79171696, "step": 82905 }, { "epoch": 6.76319438779672, "grad_norm": 31.077919006347656, "learning_rate": 1.4330944882831884e-05, "loss": 0.2698, "num_input_tokens_seen": 79176992, "step": 82910 }, { "epoch": 6.7636022514071295, "grad_norm": 0.9951412677764893, "learning_rate": 1.4327726105402293e-05, "loss": 0.3173, "num_input_tokens_seen": 79181360, "step": 82915 }, { "epoch": 6.7640101150175385, "grad_norm": 28.30438995361328, "learning_rate": 1.4324507544295156e-05, "loss": 0.3094, "num_input_tokens_seen": 79186400, "step": 82920 }, { "epoch": 6.764417978627947, "grad_norm": 0.8941940665245056, "learning_rate": 1.4321289199575715e-05, "loss": 0.2611, "num_input_tokens_seen": 79190096, "step": 82925 }, { "epoch": 6.764825842238356, "grad_norm": 2.1246697902679443, "learning_rate": 1.4318071071309208e-05, "loss": 0.2855, "num_input_tokens_seen": 79194224, "step": 82930 }, { "epoch": 6.765233705848765, "grad_norm": 61.480613708496094, "learning_rate": 1.4314853159560859e-05, "loss": 0.3232, "num_input_tokens_seen": 79199232, "step": 82935 }, { "epoch": 6.765641569459173, "grad_norm": 1.5729327201843262, "learning_rate": 1.4311635464395897e-05, "loss": 0.3372, "num_input_tokens_seen": 79204112, "step": 82940 }, { "epoch": 6.766049433069582, "grad_norm": 0.9087342619895935, "learning_rate": 1.4308417985879533e-05, "loss": 0.3273, "num_input_tokens_seen": 79209360, "step": 82945 }, { "epoch": 6.76645729667999, "grad_norm": 1.302594780921936, "learning_rate": 1.4305200724077e-05, "loss": 0.5948, "num_input_tokens_seen": 79214544, "step": 82950 }, { "epoch": 6.766865160290399, "grad_norm": 0.9119983315467834, "learning_rate": 1.4301983679053506e-05, "loss": 0.35, "num_input_tokens_seen": 79219248, "step": 82955 }, { "epoch": 6.767273023900808, "grad_norm": 0.7418842911720276, "learning_rate": 1.4298766850874257e-05, "loss": 0.4843, "num_input_tokens_seen": 79223552, "step": 82960 }, { "epoch": 6.767680887511216, "grad_norm": 1.8675612211227417, "learning_rate": 1.4295550239604447e-05, "loss": 0.2638, "num_input_tokens_seen": 79227632, "step": 82965 }, { "epoch": 6.768088751121625, "grad_norm": 0.42683306336402893, "learning_rate": 1.4292333845309293e-05, "loss": 0.3902, "num_input_tokens_seen": 79232160, "step": 82970 }, { "epoch": 6.768496614732033, "grad_norm": 0.7427059412002563, "learning_rate": 1.4289117668053986e-05, "loss": 0.3447, "num_input_tokens_seen": 79236960, "step": 82975 }, { "epoch": 6.768904478342442, "grad_norm": 4.373655319213867, "learning_rate": 1.4285901707903709e-05, "loss": 0.2794, "num_input_tokens_seen": 79242336, "step": 82980 }, { "epoch": 6.769312341952851, "grad_norm": 1.367672085762024, "learning_rate": 1.4282685964923642e-05, "loss": 0.1992, "num_input_tokens_seen": 79246704, "step": 82985 }, { "epoch": 6.769720205563259, "grad_norm": 12.992233276367188, "learning_rate": 1.4279470439178988e-05, "loss": 0.4829, "num_input_tokens_seen": 79251072, "step": 82990 }, { "epoch": 6.770128069173668, "grad_norm": 0.36244532465934753, "learning_rate": 1.4276255130734912e-05, "loss": 0.5297, "num_input_tokens_seen": 79256656, "step": 82995 }, { "epoch": 6.770535932784077, "grad_norm": 0.2557428479194641, "learning_rate": 1.4273040039656588e-05, "loss": 0.2311, "num_input_tokens_seen": 79260848, "step": 83000 }, { "epoch": 6.770943796394485, "grad_norm": 40.26640701293945, "learning_rate": 1.4269825166009181e-05, "loss": 0.284, "num_input_tokens_seen": 79264976, "step": 83005 }, { "epoch": 6.771351660004894, "grad_norm": 3.109814167022705, "learning_rate": 1.4266610509857853e-05, "loss": 0.5158, "num_input_tokens_seen": 79269040, "step": 83010 }, { "epoch": 6.771759523615303, "grad_norm": 3.0671370029449463, "learning_rate": 1.4263396071267779e-05, "loss": 0.4282, "num_input_tokens_seen": 79274304, "step": 83015 }, { "epoch": 6.7721673872257115, "grad_norm": 0.23400801420211792, "learning_rate": 1.4260181850304103e-05, "loss": 0.3459, "num_input_tokens_seen": 79279440, "step": 83020 }, { "epoch": 6.7725752508361206, "grad_norm": 1.3734909296035767, "learning_rate": 1.4256967847031977e-05, "loss": 0.3179, "num_input_tokens_seen": 79284336, "step": 83025 }, { "epoch": 6.772983114446529, "grad_norm": 0.30939826369285583, "learning_rate": 1.4253754061516537e-05, "loss": 0.4519, "num_input_tokens_seen": 79289568, "step": 83030 }, { "epoch": 6.773390978056938, "grad_norm": 1.1170850992202759, "learning_rate": 1.4250540493822945e-05, "loss": 0.4603, "num_input_tokens_seen": 79293488, "step": 83035 }, { "epoch": 6.773798841667347, "grad_norm": 0.4328797161579132, "learning_rate": 1.4247327144016332e-05, "loss": 0.3642, "num_input_tokens_seen": 79297472, "step": 83040 }, { "epoch": 6.774206705277755, "grad_norm": 0.8725878000259399, "learning_rate": 1.424411401216183e-05, "loss": 0.361, "num_input_tokens_seen": 79301760, "step": 83045 }, { "epoch": 6.774614568888164, "grad_norm": 1.4313838481903076, "learning_rate": 1.4240901098324561e-05, "loss": 0.2344, "num_input_tokens_seen": 79305952, "step": 83050 }, { "epoch": 6.775022432498573, "grad_norm": 1.855324149131775, "learning_rate": 1.423768840256966e-05, "loss": 0.3556, "num_input_tokens_seen": 79311264, "step": 83055 }, { "epoch": 6.775430296108981, "grad_norm": 0.8697689175605774, "learning_rate": 1.4234475924962237e-05, "loss": 0.2818, "num_input_tokens_seen": 79316464, "step": 83060 }, { "epoch": 6.77583815971939, "grad_norm": 1.1436272859573364, "learning_rate": 1.4231263665567418e-05, "loss": 0.3154, "num_input_tokens_seen": 79321728, "step": 83065 }, { "epoch": 6.776246023329799, "grad_norm": 0.4054446518421173, "learning_rate": 1.4228051624450306e-05, "loss": 0.3221, "num_input_tokens_seen": 79325888, "step": 83070 }, { "epoch": 6.776653886940207, "grad_norm": 70.34465789794922, "learning_rate": 1.4224839801675998e-05, "loss": 0.2898, "num_input_tokens_seen": 79331184, "step": 83075 }, { "epoch": 6.777061750550616, "grad_norm": 0.5021864771842957, "learning_rate": 1.4221628197309623e-05, "loss": 0.3547, "num_input_tokens_seen": 79335888, "step": 83080 }, { "epoch": 6.777469614161024, "grad_norm": 13.643446922302246, "learning_rate": 1.4218416811416264e-05, "loss": 0.4572, "num_input_tokens_seen": 79340592, "step": 83085 }, { "epoch": 6.777877477771433, "grad_norm": 0.3573550283908844, "learning_rate": 1.4215205644061013e-05, "loss": 0.2218, "num_input_tokens_seen": 79345120, "step": 83090 }, { "epoch": 6.778285341381842, "grad_norm": 3.6259689331054688, "learning_rate": 1.421199469530896e-05, "loss": 0.2881, "num_input_tokens_seen": 79349680, "step": 83095 }, { "epoch": 6.77869320499225, "grad_norm": 0.9625717401504517, "learning_rate": 1.420878396522518e-05, "loss": 0.625, "num_input_tokens_seen": 79355152, "step": 83100 }, { "epoch": 6.779101068602659, "grad_norm": 0.3375711739063263, "learning_rate": 1.4205573453874776e-05, "loss": 0.2529, "num_input_tokens_seen": 79360288, "step": 83105 }, { "epoch": 6.7795089322130675, "grad_norm": 1.1837353706359863, "learning_rate": 1.4202363161322812e-05, "loss": 0.2629, "num_input_tokens_seen": 79365552, "step": 83110 }, { "epoch": 6.7799167958234765, "grad_norm": 0.7991902232170105, "learning_rate": 1.4199153087634353e-05, "loss": 0.271, "num_input_tokens_seen": 79370000, "step": 83115 }, { "epoch": 6.7803246594338855, "grad_norm": 0.6375716328620911, "learning_rate": 1.4195943232874467e-05, "loss": 0.2124, "num_input_tokens_seen": 79375312, "step": 83120 }, { "epoch": 6.780732523044294, "grad_norm": 0.7294853925704956, "learning_rate": 1.4192733597108232e-05, "loss": 0.5046, "num_input_tokens_seen": 79380240, "step": 83125 }, { "epoch": 6.781140386654703, "grad_norm": 0.4887073040008545, "learning_rate": 1.418952418040069e-05, "loss": 0.3307, "num_input_tokens_seen": 79385136, "step": 83130 }, { "epoch": 6.781548250265112, "grad_norm": 0.3731231093406677, "learning_rate": 1.4186314982816901e-05, "loss": 0.3403, "num_input_tokens_seen": 79389920, "step": 83135 }, { "epoch": 6.78195611387552, "grad_norm": 2.041273593902588, "learning_rate": 1.4183106004421911e-05, "loss": 0.2886, "num_input_tokens_seen": 79394864, "step": 83140 }, { "epoch": 6.782363977485929, "grad_norm": 3.4232189655303955, "learning_rate": 1.4179897245280756e-05, "loss": 0.2938, "num_input_tokens_seen": 79399520, "step": 83145 }, { "epoch": 6.782771841096338, "grad_norm": 3.3222928047180176, "learning_rate": 1.4176688705458498e-05, "loss": 0.4339, "num_input_tokens_seen": 79404096, "step": 83150 }, { "epoch": 6.783179704706746, "grad_norm": 0.4476291239261627, "learning_rate": 1.417348038502016e-05, "loss": 0.267, "num_input_tokens_seen": 79408592, "step": 83155 }, { "epoch": 6.783587568317155, "grad_norm": 1.9929240942001343, "learning_rate": 1.4170272284030775e-05, "loss": 0.3651, "num_input_tokens_seen": 79413632, "step": 83160 }, { "epoch": 6.783995431927563, "grad_norm": 2.5211212635040283, "learning_rate": 1.4167064402555358e-05, "loss": 0.4104, "num_input_tokens_seen": 79418128, "step": 83165 }, { "epoch": 6.784403295537972, "grad_norm": 54.599735260009766, "learning_rate": 1.4163856740658955e-05, "loss": 0.4649, "num_input_tokens_seen": 79423664, "step": 83170 }, { "epoch": 6.784811159148381, "grad_norm": 2.0596423149108887, "learning_rate": 1.4160649298406567e-05, "loss": 0.3165, "num_input_tokens_seen": 79428400, "step": 83175 }, { "epoch": 6.785219022758789, "grad_norm": 2.0595216751098633, "learning_rate": 1.4157442075863214e-05, "loss": 0.3803, "num_input_tokens_seen": 79433088, "step": 83180 }, { "epoch": 6.785626886369198, "grad_norm": 2.426600456237793, "learning_rate": 1.4154235073093902e-05, "loss": 0.2783, "num_input_tokens_seen": 79438656, "step": 83185 }, { "epoch": 6.786034749979607, "grad_norm": 40.31120300292969, "learning_rate": 1.4151028290163637e-05, "loss": 0.2153, "num_input_tokens_seen": 79442944, "step": 83190 }, { "epoch": 6.786442613590015, "grad_norm": 4.297548770904541, "learning_rate": 1.4147821727137417e-05, "loss": 0.3211, "num_input_tokens_seen": 79447792, "step": 83195 }, { "epoch": 6.786850477200424, "grad_norm": 19.850040435791016, "learning_rate": 1.414461538408024e-05, "loss": 0.3456, "num_input_tokens_seen": 79452128, "step": 83200 }, { "epoch": 6.787258340810833, "grad_norm": 1.7954753637313843, "learning_rate": 1.4141409261057099e-05, "loss": 0.262, "num_input_tokens_seen": 79456736, "step": 83205 }, { "epoch": 6.7876662044212415, "grad_norm": 54.08292770385742, "learning_rate": 1.4138203358132968e-05, "loss": 0.2268, "num_input_tokens_seen": 79461840, "step": 83210 }, { "epoch": 6.7880740680316505, "grad_norm": 3.885538101196289, "learning_rate": 1.4134997675372846e-05, "loss": 0.3821, "num_input_tokens_seen": 79466768, "step": 83215 }, { "epoch": 6.788481931642059, "grad_norm": 14.107894897460938, "learning_rate": 1.4131792212841704e-05, "loss": 0.4187, "num_input_tokens_seen": 79471248, "step": 83220 }, { "epoch": 6.788889795252468, "grad_norm": 3.7036943435668945, "learning_rate": 1.4128586970604518e-05, "loss": 0.3768, "num_input_tokens_seen": 79476304, "step": 83225 }, { "epoch": 6.789297658862877, "grad_norm": 0.8909051418304443, "learning_rate": 1.4125381948726246e-05, "loss": 0.3773, "num_input_tokens_seen": 79481504, "step": 83230 }, { "epoch": 6.789705522473285, "grad_norm": 2.193359613418579, "learning_rate": 1.412217714727187e-05, "loss": 0.2927, "num_input_tokens_seen": 79486800, "step": 83235 }, { "epoch": 6.790113386083694, "grad_norm": 2.821716547012329, "learning_rate": 1.4118972566306338e-05, "loss": 0.3634, "num_input_tokens_seen": 79491936, "step": 83240 }, { "epoch": 6.790521249694102, "grad_norm": 63.787471771240234, "learning_rate": 1.4115768205894614e-05, "loss": 0.3472, "num_input_tokens_seen": 79496560, "step": 83245 }, { "epoch": 6.790929113304511, "grad_norm": 1.4410468339920044, "learning_rate": 1.4112564066101639e-05, "loss": 0.2094, "num_input_tokens_seen": 79502176, "step": 83250 }, { "epoch": 6.79133697691492, "grad_norm": 0.35154685378074646, "learning_rate": 1.4109360146992354e-05, "loss": 0.2138, "num_input_tokens_seen": 79506976, "step": 83255 }, { "epoch": 6.791744840525328, "grad_norm": 23.337186813354492, "learning_rate": 1.4106156448631724e-05, "loss": 0.2587, "num_input_tokens_seen": 79511760, "step": 83260 }, { "epoch": 6.792152704135737, "grad_norm": 15.135281562805176, "learning_rate": 1.410295297108467e-05, "loss": 0.5275, "num_input_tokens_seen": 79516848, "step": 83265 }, { "epoch": 6.792560567746146, "grad_norm": 10.202065467834473, "learning_rate": 1.4099749714416133e-05, "loss": 0.4179, "num_input_tokens_seen": 79521472, "step": 83270 }, { "epoch": 6.792968431356554, "grad_norm": 15.919931411743164, "learning_rate": 1.4096546678691024e-05, "loss": 0.5959, "num_input_tokens_seen": 79526672, "step": 83275 }, { "epoch": 6.793376294966963, "grad_norm": 115.93012237548828, "learning_rate": 1.409334386397429e-05, "loss": 0.5939, "num_input_tokens_seen": 79530976, "step": 83280 }, { "epoch": 6.793784158577372, "grad_norm": 2.3548777103424072, "learning_rate": 1.4090141270330844e-05, "loss": 0.2594, "num_input_tokens_seen": 79535456, "step": 83285 }, { "epoch": 6.79419202218778, "grad_norm": 6.8629679679870605, "learning_rate": 1.40869388978256e-05, "loss": 0.2948, "num_input_tokens_seen": 79540288, "step": 83290 }, { "epoch": 6.794599885798189, "grad_norm": 19.637470245361328, "learning_rate": 1.4083736746523463e-05, "loss": 0.2971, "num_input_tokens_seen": 79545008, "step": 83295 }, { "epoch": 6.795007749408597, "grad_norm": 0.5126477479934692, "learning_rate": 1.4080534816489339e-05, "loss": 0.3355, "num_input_tokens_seen": 79549184, "step": 83300 }, { "epoch": 6.795415613019006, "grad_norm": 2.43943452835083, "learning_rate": 1.4077333107788138e-05, "loss": 0.2822, "num_input_tokens_seen": 79554064, "step": 83305 }, { "epoch": 6.795823476629415, "grad_norm": 4.4925408363342285, "learning_rate": 1.4074131620484759e-05, "loss": 0.4657, "num_input_tokens_seen": 79559200, "step": 83310 }, { "epoch": 6.796231340239824, "grad_norm": 0.2628510296344757, "learning_rate": 1.4070930354644086e-05, "loss": 0.2949, "num_input_tokens_seen": 79562944, "step": 83315 }, { "epoch": 6.796639203850233, "grad_norm": 10.35218620300293, "learning_rate": 1.4067729310331012e-05, "loss": 0.4483, "num_input_tokens_seen": 79566896, "step": 83320 }, { "epoch": 6.797047067460641, "grad_norm": 0.5882713794708252, "learning_rate": 1.4064528487610417e-05, "loss": 0.4012, "num_input_tokens_seen": 79571424, "step": 83325 }, { "epoch": 6.79745493107105, "grad_norm": 0.6670070290565491, "learning_rate": 1.4061327886547188e-05, "loss": 0.2782, "num_input_tokens_seen": 79576848, "step": 83330 }, { "epoch": 6.797862794681459, "grad_norm": 4.313033580780029, "learning_rate": 1.405812750720618e-05, "loss": 0.3568, "num_input_tokens_seen": 79581184, "step": 83335 }, { "epoch": 6.798270658291867, "grad_norm": 4.495680332183838, "learning_rate": 1.4054927349652292e-05, "loss": 0.3642, "num_input_tokens_seen": 79585728, "step": 83340 }, { "epoch": 6.798678521902276, "grad_norm": 47.612342834472656, "learning_rate": 1.4051727413950372e-05, "loss": 0.4085, "num_input_tokens_seen": 79590560, "step": 83345 }, { "epoch": 6.799086385512685, "grad_norm": 0.36692726612091064, "learning_rate": 1.4048527700165288e-05, "loss": 0.2612, "num_input_tokens_seen": 79595600, "step": 83350 }, { "epoch": 6.799494249123093, "grad_norm": 46.1118278503418, "learning_rate": 1.4045328208361891e-05, "loss": 0.5015, "num_input_tokens_seen": 79601232, "step": 83355 }, { "epoch": 6.799902112733502, "grad_norm": 3.8757786750793457, "learning_rate": 1.4042128938605038e-05, "loss": 0.4445, "num_input_tokens_seen": 79605680, "step": 83360 }, { "epoch": 6.800309976343911, "grad_norm": 0.701432466506958, "learning_rate": 1.4038929890959562e-05, "loss": 0.3453, "num_input_tokens_seen": 79610672, "step": 83365 }, { "epoch": 6.800717839954319, "grad_norm": 0.31012865900993347, "learning_rate": 1.4035731065490333e-05, "loss": 0.3506, "num_input_tokens_seen": 79615456, "step": 83370 }, { "epoch": 6.801125703564728, "grad_norm": 3.150310754776001, "learning_rate": 1.4032532462262177e-05, "loss": 0.2967, "num_input_tokens_seen": 79620720, "step": 83375 }, { "epoch": 6.801533567175136, "grad_norm": 5.078259468078613, "learning_rate": 1.4029334081339923e-05, "loss": 0.3766, "num_input_tokens_seen": 79625536, "step": 83380 }, { "epoch": 6.801941430785545, "grad_norm": 6.91834831237793, "learning_rate": 1.4026135922788403e-05, "loss": 0.2508, "num_input_tokens_seen": 79630448, "step": 83385 }, { "epoch": 6.802349294395954, "grad_norm": 1.3414595127105713, "learning_rate": 1.4022937986672446e-05, "loss": 0.2902, "num_input_tokens_seen": 79634880, "step": 83390 }, { "epoch": 6.802757158006362, "grad_norm": 56.36640548706055, "learning_rate": 1.4019740273056878e-05, "loss": 0.265, "num_input_tokens_seen": 79639472, "step": 83395 }, { "epoch": 6.803165021616771, "grad_norm": 12.116372108459473, "learning_rate": 1.401654278200651e-05, "loss": 0.4005, "num_input_tokens_seen": 79644336, "step": 83400 }, { "epoch": 6.80357288522718, "grad_norm": 3.9375643730163574, "learning_rate": 1.4013345513586146e-05, "loss": 0.3912, "num_input_tokens_seen": 79649216, "step": 83405 }, { "epoch": 6.8039807488375885, "grad_norm": 0.8914690017700195, "learning_rate": 1.4010148467860596e-05, "loss": 0.2404, "num_input_tokens_seen": 79653792, "step": 83410 }, { "epoch": 6.8043886124479975, "grad_norm": 3.869032621383667, "learning_rate": 1.4006951644894673e-05, "loss": 0.3434, "num_input_tokens_seen": 79659040, "step": 83415 }, { "epoch": 6.8047964760584065, "grad_norm": 6.674570560455322, "learning_rate": 1.400375504475317e-05, "loss": 0.3171, "num_input_tokens_seen": 79663552, "step": 83420 }, { "epoch": 6.805204339668815, "grad_norm": 12.676031112670898, "learning_rate": 1.400055866750088e-05, "loss": 0.3149, "num_input_tokens_seen": 79668320, "step": 83425 }, { "epoch": 6.805612203279224, "grad_norm": 0.8107587695121765, "learning_rate": 1.399736251320258e-05, "loss": 0.2617, "num_input_tokens_seen": 79672784, "step": 83430 }, { "epoch": 6.806020066889632, "grad_norm": 76.7428207397461, "learning_rate": 1.3994166581923079e-05, "loss": 0.2792, "num_input_tokens_seen": 79677744, "step": 83435 }, { "epoch": 6.806427930500041, "grad_norm": 70.32929992675781, "learning_rate": 1.3990970873727143e-05, "loss": 0.4405, "num_input_tokens_seen": 79683312, "step": 83440 }, { "epoch": 6.80683579411045, "grad_norm": 1.8470994234085083, "learning_rate": 1.3987775388679552e-05, "loss": 0.3429, "num_input_tokens_seen": 79688432, "step": 83445 }, { "epoch": 6.807243657720858, "grad_norm": 1.1700259447097778, "learning_rate": 1.3984580126845071e-05, "loss": 0.2779, "num_input_tokens_seen": 79692224, "step": 83450 }, { "epoch": 6.807651521331267, "grad_norm": 1.0732747316360474, "learning_rate": 1.3981385088288462e-05, "loss": 0.5081, "num_input_tokens_seen": 79696848, "step": 83455 }, { "epoch": 6.808059384941675, "grad_norm": 20.869640350341797, "learning_rate": 1.3978190273074503e-05, "loss": 0.2827, "num_input_tokens_seen": 79701424, "step": 83460 }, { "epoch": 6.808467248552084, "grad_norm": 22.368104934692383, "learning_rate": 1.397499568126795e-05, "loss": 0.3539, "num_input_tokens_seen": 79706176, "step": 83465 }, { "epoch": 6.808875112162493, "grad_norm": 23.19632911682129, "learning_rate": 1.3971801312933544e-05, "loss": 0.2551, "num_input_tokens_seen": 79711456, "step": 83470 }, { "epoch": 6.809282975772901, "grad_norm": 61.45298767089844, "learning_rate": 1.3968607168136038e-05, "loss": 0.2366, "num_input_tokens_seen": 79716704, "step": 83475 }, { "epoch": 6.80969083938331, "grad_norm": 1.856315016746521, "learning_rate": 1.3965413246940181e-05, "loss": 0.3111, "num_input_tokens_seen": 79721152, "step": 83480 }, { "epoch": 6.810098702993719, "grad_norm": 0.43481019139289856, "learning_rate": 1.396221954941071e-05, "loss": 0.513, "num_input_tokens_seen": 79725856, "step": 83485 }, { "epoch": 6.810506566604127, "grad_norm": 1.191444754600525, "learning_rate": 1.3959026075612352e-05, "loss": 0.362, "num_input_tokens_seen": 79730048, "step": 83490 }, { "epoch": 6.810914430214536, "grad_norm": 2.394641876220703, "learning_rate": 1.395583282560985e-05, "loss": 0.2928, "num_input_tokens_seen": 79735040, "step": 83495 }, { "epoch": 6.811322293824945, "grad_norm": 1.8380430936813354, "learning_rate": 1.3952639799467911e-05, "loss": 0.2327, "num_input_tokens_seen": 79739408, "step": 83500 }, { "epoch": 6.8117301574353535, "grad_norm": 3.0719001293182373, "learning_rate": 1.394944699725128e-05, "loss": 0.2589, "num_input_tokens_seen": 79744336, "step": 83505 }, { "epoch": 6.8121380210457625, "grad_norm": 0.6706134676933289, "learning_rate": 1.3946254419024663e-05, "loss": 0.2803, "num_input_tokens_seen": 79749616, "step": 83510 }, { "epoch": 6.812545884656171, "grad_norm": 7.845348358154297, "learning_rate": 1.3943062064852769e-05, "loss": 0.3024, "num_input_tokens_seen": 79754560, "step": 83515 }, { "epoch": 6.81295374826658, "grad_norm": 2.5606110095977783, "learning_rate": 1.3939869934800304e-05, "loss": 0.5636, "num_input_tokens_seen": 79759104, "step": 83520 }, { "epoch": 6.813361611876989, "grad_norm": 2.6016833782196045, "learning_rate": 1.3936678028931982e-05, "loss": 0.3794, "num_input_tokens_seen": 79763488, "step": 83525 }, { "epoch": 6.813769475487397, "grad_norm": 30.384695053100586, "learning_rate": 1.3933486347312494e-05, "loss": 0.5815, "num_input_tokens_seen": 79768688, "step": 83530 }, { "epoch": 6.814177339097806, "grad_norm": 1.8224763870239258, "learning_rate": 1.3930294890006537e-05, "loss": 0.2861, "num_input_tokens_seen": 79773488, "step": 83535 }, { "epoch": 6.814585202708214, "grad_norm": 1.2512673139572144, "learning_rate": 1.3927103657078788e-05, "loss": 0.5598, "num_input_tokens_seen": 79778656, "step": 83540 }, { "epoch": 6.814993066318623, "grad_norm": 1.3182541131973267, "learning_rate": 1.3923912648593953e-05, "loss": 0.2436, "num_input_tokens_seen": 79784112, "step": 83545 }, { "epoch": 6.815400929929032, "grad_norm": 2.134552240371704, "learning_rate": 1.3920721864616704e-05, "loss": 0.2874, "num_input_tokens_seen": 79788528, "step": 83550 }, { "epoch": 6.81580879353944, "grad_norm": 21.371549606323242, "learning_rate": 1.3917531305211712e-05, "loss": 0.4204, "num_input_tokens_seen": 79793632, "step": 83555 }, { "epoch": 6.816216657149849, "grad_norm": 2.58506178855896, "learning_rate": 1.3914340970443651e-05, "loss": 0.4918, "num_input_tokens_seen": 79799584, "step": 83560 }, { "epoch": 6.816624520760258, "grad_norm": 1.7384896278381348, "learning_rate": 1.3911150860377182e-05, "loss": 0.4134, "num_input_tokens_seen": 79804800, "step": 83565 }, { "epoch": 6.817032384370666, "grad_norm": 36.559471130371094, "learning_rate": 1.390796097507698e-05, "loss": 0.5156, "num_input_tokens_seen": 79810288, "step": 83570 }, { "epoch": 6.817440247981075, "grad_norm": 9.217495918273926, "learning_rate": 1.3904771314607695e-05, "loss": 0.3854, "num_input_tokens_seen": 79814848, "step": 83575 }, { "epoch": 6.817848111591484, "grad_norm": 1.0431584119796753, "learning_rate": 1.3901581879033985e-05, "loss": 0.2727, "num_input_tokens_seen": 79819840, "step": 83580 }, { "epoch": 6.818255975201892, "grad_norm": 1.140371322631836, "learning_rate": 1.3898392668420479e-05, "loss": 0.2252, "num_input_tokens_seen": 79824416, "step": 83585 }, { "epoch": 6.818663838812301, "grad_norm": 2.5851640701293945, "learning_rate": 1.389520368283185e-05, "loss": 0.3053, "num_input_tokens_seen": 79829504, "step": 83590 }, { "epoch": 6.819071702422709, "grad_norm": 33.943878173828125, "learning_rate": 1.3892014922332721e-05, "loss": 0.3388, "num_input_tokens_seen": 79834320, "step": 83595 }, { "epoch": 6.8194795660331184, "grad_norm": 1.3382562398910522, "learning_rate": 1.3888826386987733e-05, "loss": 0.2868, "num_input_tokens_seen": 79839120, "step": 83600 }, { "epoch": 6.8198874296435275, "grad_norm": 0.34815704822540283, "learning_rate": 1.388563807686151e-05, "loss": 0.2748, "num_input_tokens_seen": 79843776, "step": 83605 }, { "epoch": 6.820295293253936, "grad_norm": 44.96052551269531, "learning_rate": 1.388244999201868e-05, "loss": 0.2713, "num_input_tokens_seen": 79848416, "step": 83610 }, { "epoch": 6.820703156864345, "grad_norm": 16.84685516357422, "learning_rate": 1.3879262132523868e-05, "loss": 0.4998, "num_input_tokens_seen": 79853408, "step": 83615 }, { "epoch": 6.821111020474754, "grad_norm": 2.3813674449920654, "learning_rate": 1.3876074498441682e-05, "loss": 0.2299, "num_input_tokens_seen": 79857952, "step": 83620 }, { "epoch": 6.821518884085162, "grad_norm": 0.8037506937980652, "learning_rate": 1.3872887089836741e-05, "loss": 0.2899, "num_input_tokens_seen": 79862544, "step": 83625 }, { "epoch": 6.821926747695571, "grad_norm": 7.127895832061768, "learning_rate": 1.3869699906773642e-05, "loss": 0.2339, "num_input_tokens_seen": 79866752, "step": 83630 }, { "epoch": 6.82233461130598, "grad_norm": 3.4716274738311768, "learning_rate": 1.3866512949317007e-05, "loss": 0.2613, "num_input_tokens_seen": 79870528, "step": 83635 }, { "epoch": 6.822742474916388, "grad_norm": 1.4450184106826782, "learning_rate": 1.386332621753142e-05, "loss": 0.2051, "num_input_tokens_seen": 79875600, "step": 83640 }, { "epoch": 6.823150338526797, "grad_norm": 16.2052001953125, "learning_rate": 1.3860139711481482e-05, "loss": 0.254, "num_input_tokens_seen": 79880672, "step": 83645 }, { "epoch": 6.823558202137205, "grad_norm": 5.8297624588012695, "learning_rate": 1.3856953431231778e-05, "loss": 0.3323, "num_input_tokens_seen": 79884576, "step": 83650 }, { "epoch": 6.823966065747614, "grad_norm": 2.8297972679138184, "learning_rate": 1.3853767376846882e-05, "loss": 0.2507, "num_input_tokens_seen": 79888960, "step": 83655 }, { "epoch": 6.824373929358023, "grad_norm": 4.646392822265625, "learning_rate": 1.3850581548391394e-05, "loss": 0.3973, "num_input_tokens_seen": 79893936, "step": 83660 }, { "epoch": 6.824781792968431, "grad_norm": 4.0294318199157715, "learning_rate": 1.384739594592988e-05, "loss": 0.3102, "num_input_tokens_seen": 79898640, "step": 83665 }, { "epoch": 6.82518965657884, "grad_norm": 28.335506439208984, "learning_rate": 1.3844210569526911e-05, "loss": 0.5911, "num_input_tokens_seen": 79903312, "step": 83670 }, { "epoch": 6.825597520189248, "grad_norm": 29.210590362548828, "learning_rate": 1.3841025419247045e-05, "loss": 0.5459, "num_input_tokens_seen": 79907920, "step": 83675 }, { "epoch": 6.826005383799657, "grad_norm": 0.6394246220588684, "learning_rate": 1.3837840495154864e-05, "loss": 0.2766, "num_input_tokens_seen": 79912432, "step": 83680 }, { "epoch": 6.826413247410066, "grad_norm": 2.0347273349761963, "learning_rate": 1.383465579731491e-05, "loss": 0.1657, "num_input_tokens_seen": 79917056, "step": 83685 }, { "epoch": 6.826821111020474, "grad_norm": 0.8950735330581665, "learning_rate": 1.383147132579174e-05, "loss": 0.2505, "num_input_tokens_seen": 79921248, "step": 83690 }, { "epoch": 6.827228974630883, "grad_norm": 2.168267250061035, "learning_rate": 1.3828287080649899e-05, "loss": 0.381, "num_input_tokens_seen": 79925840, "step": 83695 }, { "epoch": 6.827636838241292, "grad_norm": 9.283866882324219, "learning_rate": 1.382510306195392e-05, "loss": 0.2639, "num_input_tokens_seen": 79931152, "step": 83700 }, { "epoch": 6.8280447018517005, "grad_norm": 1.033583641052246, "learning_rate": 1.3821919269768368e-05, "loss": 0.4878, "num_input_tokens_seen": 79935584, "step": 83705 }, { "epoch": 6.8284525654621095, "grad_norm": 1.1740168333053589, "learning_rate": 1.3818735704157758e-05, "loss": 0.257, "num_input_tokens_seen": 79939856, "step": 83710 }, { "epoch": 6.828860429072519, "grad_norm": 2.048957347869873, "learning_rate": 1.381555236518663e-05, "loss": 0.4094, "num_input_tokens_seen": 79944528, "step": 83715 }, { "epoch": 6.829268292682927, "grad_norm": 28.0889835357666, "learning_rate": 1.3812369252919488e-05, "loss": 0.4151, "num_input_tokens_seen": 79949232, "step": 83720 }, { "epoch": 6.829676156293336, "grad_norm": 1.3043456077575684, "learning_rate": 1.3809186367420878e-05, "loss": 0.3007, "num_input_tokens_seen": 79954480, "step": 83725 }, { "epoch": 6.830084019903744, "grad_norm": 13.039791107177734, "learning_rate": 1.3806003708755304e-05, "loss": 0.221, "num_input_tokens_seen": 79958512, "step": 83730 }, { "epoch": 6.830491883514153, "grad_norm": 0.39261090755462646, "learning_rate": 1.3802821276987282e-05, "loss": 0.5522, "num_input_tokens_seen": 79963200, "step": 83735 }, { "epoch": 6.830899747124562, "grad_norm": 0.3559359014034271, "learning_rate": 1.3799639072181314e-05, "loss": 0.2254, "num_input_tokens_seen": 79967168, "step": 83740 }, { "epoch": 6.83130761073497, "grad_norm": 0.36876508593559265, "learning_rate": 1.3796457094401901e-05, "loss": 0.5052, "num_input_tokens_seen": 79971648, "step": 83745 }, { "epoch": 6.831715474345379, "grad_norm": 6.045456409454346, "learning_rate": 1.3793275343713541e-05, "loss": 0.3826, "num_input_tokens_seen": 79976160, "step": 83750 }, { "epoch": 6.832123337955788, "grad_norm": 1.3491591215133667, "learning_rate": 1.3790093820180733e-05, "loss": 0.5351, "num_input_tokens_seen": 79981696, "step": 83755 }, { "epoch": 6.832531201566196, "grad_norm": 32.23068618774414, "learning_rate": 1.3786912523867953e-05, "loss": 0.4255, "num_input_tokens_seen": 79986992, "step": 83760 }, { "epoch": 6.832939065176605, "grad_norm": 28.77753257751465, "learning_rate": 1.3783731454839687e-05, "loss": 0.3813, "num_input_tokens_seen": 79991824, "step": 83765 }, { "epoch": 6.833346928787014, "grad_norm": 0.3672415018081665, "learning_rate": 1.3780550613160425e-05, "loss": 0.3101, "num_input_tokens_seen": 79995536, "step": 83770 }, { "epoch": 6.833754792397422, "grad_norm": 0.820439338684082, "learning_rate": 1.3777369998894635e-05, "loss": 0.4349, "num_input_tokens_seen": 80000272, "step": 83775 }, { "epoch": 6.834162656007831, "grad_norm": 0.4155302941799164, "learning_rate": 1.3774189612106786e-05, "loss": 0.3098, "num_input_tokens_seen": 80005200, "step": 83780 }, { "epoch": 6.834570519618239, "grad_norm": 0.8749970197677612, "learning_rate": 1.3771009452861336e-05, "loss": 0.3196, "num_input_tokens_seen": 80010432, "step": 83785 }, { "epoch": 6.834978383228648, "grad_norm": 0.40986549854278564, "learning_rate": 1.376782952122276e-05, "loss": 0.2457, "num_input_tokens_seen": 80014736, "step": 83790 }, { "epoch": 6.835386246839057, "grad_norm": 0.21940332651138306, "learning_rate": 1.376464981725551e-05, "loss": 0.3371, "num_input_tokens_seen": 80020512, "step": 83795 }, { "epoch": 6.8357941104494655, "grad_norm": 1.1324421167373657, "learning_rate": 1.376147034102403e-05, "loss": 0.3426, "num_input_tokens_seen": 80024688, "step": 83800 }, { "epoch": 6.8362019740598745, "grad_norm": 47.915164947509766, "learning_rate": 1.3758291092592773e-05, "loss": 0.4828, "num_input_tokens_seen": 80028928, "step": 83805 }, { "epoch": 6.836609837670283, "grad_norm": 39.85466766357422, "learning_rate": 1.3755112072026169e-05, "loss": 0.2117, "num_input_tokens_seen": 80034160, "step": 83810 }, { "epoch": 6.837017701280692, "grad_norm": 2.00372052192688, "learning_rate": 1.3751933279388674e-05, "loss": 0.3851, "num_input_tokens_seen": 80037856, "step": 83815 }, { "epoch": 6.837425564891101, "grad_norm": 41.54708480834961, "learning_rate": 1.374875471474471e-05, "loss": 0.3559, "num_input_tokens_seen": 80042480, "step": 83820 }, { "epoch": 6.837833428501509, "grad_norm": 18.27403450012207, "learning_rate": 1.3745576378158712e-05, "loss": 0.3181, "num_input_tokens_seen": 80047536, "step": 83825 }, { "epoch": 6.838241292111918, "grad_norm": 1.7701317071914673, "learning_rate": 1.3742398269695084e-05, "loss": 0.3733, "num_input_tokens_seen": 80052944, "step": 83830 }, { "epoch": 6.838649155722327, "grad_norm": 65.7992172241211, "learning_rate": 1.3739220389418273e-05, "loss": 0.3651, "num_input_tokens_seen": 80057520, "step": 83835 }, { "epoch": 6.839057019332735, "grad_norm": 6.097583770751953, "learning_rate": 1.373604273739268e-05, "loss": 0.2414, "num_input_tokens_seen": 80062432, "step": 83840 }, { "epoch": 6.839464882943144, "grad_norm": 0.5183296799659729, "learning_rate": 1.3732865313682709e-05, "loss": 0.3296, "num_input_tokens_seen": 80067392, "step": 83845 }, { "epoch": 6.839872746553553, "grad_norm": 0.42877820134162903, "learning_rate": 1.3729688118352773e-05, "loss": 0.2756, "num_input_tokens_seen": 80072256, "step": 83850 }, { "epoch": 6.840280610163961, "grad_norm": 5.26806640625, "learning_rate": 1.3726511151467262e-05, "loss": 0.4463, "num_input_tokens_seen": 80077072, "step": 83855 }, { "epoch": 6.84068847377437, "grad_norm": 9.186518669128418, "learning_rate": 1.3723334413090588e-05, "loss": 0.3815, "num_input_tokens_seen": 80081136, "step": 83860 }, { "epoch": 6.841096337384778, "grad_norm": 0.9378737211227417, "learning_rate": 1.3720157903287135e-05, "loss": 0.3671, "num_input_tokens_seen": 80084720, "step": 83865 }, { "epoch": 6.841504200995187, "grad_norm": 2.2242956161499023, "learning_rate": 1.3716981622121286e-05, "loss": 0.3658, "num_input_tokens_seen": 80089632, "step": 83870 }, { "epoch": 6.841912064605596, "grad_norm": 0.6155677437782288, "learning_rate": 1.3713805569657426e-05, "loss": 0.2458, "num_input_tokens_seen": 80093984, "step": 83875 }, { "epoch": 6.842319928216004, "grad_norm": 3.836413860321045, "learning_rate": 1.371062974595993e-05, "loss": 0.4537, "num_input_tokens_seen": 80098256, "step": 83880 }, { "epoch": 6.842727791826413, "grad_norm": 3.125373125076294, "learning_rate": 1.3707454151093174e-05, "loss": 0.319, "num_input_tokens_seen": 80102496, "step": 83885 }, { "epoch": 6.8431356554368215, "grad_norm": 0.8307927250862122, "learning_rate": 1.3704278785121522e-05, "loss": 0.3749, "num_input_tokens_seen": 80106544, "step": 83890 }, { "epoch": 6.8435435190472305, "grad_norm": 0.3980134427547455, "learning_rate": 1.3701103648109343e-05, "loss": 0.5482, "num_input_tokens_seen": 80111776, "step": 83895 }, { "epoch": 6.8439513826576395, "grad_norm": 1.3206983804702759, "learning_rate": 1.3697928740120977e-05, "loss": 0.4196, "num_input_tokens_seen": 80116560, "step": 83900 }, { "epoch": 6.844359246268048, "grad_norm": 1.6862187385559082, "learning_rate": 1.3694754061220804e-05, "loss": 0.4273, "num_input_tokens_seen": 80121616, "step": 83905 }, { "epoch": 6.844767109878457, "grad_norm": 0.3090757131576538, "learning_rate": 1.3691579611473165e-05, "loss": 0.282, "num_input_tokens_seen": 80127184, "step": 83910 }, { "epoch": 6.845174973488866, "grad_norm": 1.0159952640533447, "learning_rate": 1.36884053909424e-05, "loss": 0.5147, "num_input_tokens_seen": 80132736, "step": 83915 }, { "epoch": 6.845582837099274, "grad_norm": 7.010452747344971, "learning_rate": 1.368523139969284e-05, "loss": 0.3785, "num_input_tokens_seen": 80138240, "step": 83920 }, { "epoch": 6.845990700709683, "grad_norm": 6.31387186050415, "learning_rate": 1.3682057637788845e-05, "loss": 0.3127, "num_input_tokens_seen": 80142640, "step": 83925 }, { "epoch": 6.846398564320092, "grad_norm": 1.1981110572814941, "learning_rate": 1.3678884105294732e-05, "loss": 0.4097, "num_input_tokens_seen": 80147648, "step": 83930 }, { "epoch": 6.8468064279305, "grad_norm": 3.6390252113342285, "learning_rate": 1.3675710802274824e-05, "loss": 0.341, "num_input_tokens_seen": 80153152, "step": 83935 }, { "epoch": 6.847214291540909, "grad_norm": 2.511636734008789, "learning_rate": 1.3672537728793439e-05, "loss": 0.2936, "num_input_tokens_seen": 80158160, "step": 83940 }, { "epoch": 6.847622155151317, "grad_norm": 0.6317878365516663, "learning_rate": 1.366936488491491e-05, "loss": 0.3216, "num_input_tokens_seen": 80163264, "step": 83945 }, { "epoch": 6.848030018761726, "grad_norm": 1.1309568881988525, "learning_rate": 1.3666192270703537e-05, "loss": 0.2971, "num_input_tokens_seen": 80167664, "step": 83950 }, { "epoch": 6.848437882372135, "grad_norm": 25.60512351989746, "learning_rate": 1.3663019886223635e-05, "loss": 0.2417, "num_input_tokens_seen": 80172768, "step": 83955 }, { "epoch": 6.848845745982543, "grad_norm": 8.745416641235352, "learning_rate": 1.3659847731539501e-05, "loss": 0.349, "num_input_tokens_seen": 80176480, "step": 83960 }, { "epoch": 6.849253609592952, "grad_norm": 1.2599279880523682, "learning_rate": 1.3656675806715425e-05, "loss": 0.3518, "num_input_tokens_seen": 80181216, "step": 83965 }, { "epoch": 6.849661473203361, "grad_norm": 0.428244948387146, "learning_rate": 1.3653504111815722e-05, "loss": 0.2865, "num_input_tokens_seen": 80185712, "step": 83970 }, { "epoch": 6.850069336813769, "grad_norm": 27.026796340942383, "learning_rate": 1.3650332646904667e-05, "loss": 0.3572, "num_input_tokens_seen": 80190368, "step": 83975 }, { "epoch": 6.850477200424178, "grad_norm": 0.9318118095397949, "learning_rate": 1.3647161412046547e-05, "loss": 0.2413, "num_input_tokens_seen": 80195168, "step": 83980 }, { "epoch": 6.850885064034587, "grad_norm": 0.7957158088684082, "learning_rate": 1.364399040730563e-05, "loss": 0.31, "num_input_tokens_seen": 80199184, "step": 83985 }, { "epoch": 6.851292927644995, "grad_norm": 7.108544826507568, "learning_rate": 1.3640819632746216e-05, "loss": 0.3452, "num_input_tokens_seen": 80204192, "step": 83990 }, { "epoch": 6.851700791255404, "grad_norm": 34.272010803222656, "learning_rate": 1.3637649088432558e-05, "loss": 0.5038, "num_input_tokens_seen": 80209472, "step": 83995 }, { "epoch": 6.8521086548658126, "grad_norm": 1.7977526187896729, "learning_rate": 1.3634478774428921e-05, "loss": 0.273, "num_input_tokens_seen": 80214272, "step": 84000 }, { "epoch": 6.852516518476222, "grad_norm": 1.8752785921096802, "learning_rate": 1.3631308690799576e-05, "loss": 0.3535, "num_input_tokens_seen": 80219200, "step": 84005 }, { "epoch": 6.852924382086631, "grad_norm": 0.5052844882011414, "learning_rate": 1.3628138837608772e-05, "loss": 0.3093, "num_input_tokens_seen": 80223760, "step": 84010 }, { "epoch": 6.853332245697039, "grad_norm": 7.371828079223633, "learning_rate": 1.3624969214920757e-05, "loss": 0.3238, "num_input_tokens_seen": 80229104, "step": 84015 }, { "epoch": 6.853740109307448, "grad_norm": 0.4829258620738983, "learning_rate": 1.3621799822799788e-05, "loss": 0.1871, "num_input_tokens_seen": 80234144, "step": 84020 }, { "epoch": 6.854147972917856, "grad_norm": 1.8385200500488281, "learning_rate": 1.3618630661310088e-05, "loss": 0.3006, "num_input_tokens_seen": 80237984, "step": 84025 }, { "epoch": 6.854555836528265, "grad_norm": 7.551334381103516, "learning_rate": 1.3615461730515922e-05, "loss": 0.3092, "num_input_tokens_seen": 80243056, "step": 84030 }, { "epoch": 6.854963700138674, "grad_norm": 30.108463287353516, "learning_rate": 1.3612293030481506e-05, "loss": 0.575, "num_input_tokens_seen": 80248080, "step": 84035 }, { "epoch": 6.855371563749082, "grad_norm": 0.5642094016075134, "learning_rate": 1.3609124561271075e-05, "loss": 0.3054, "num_input_tokens_seen": 80253136, "step": 84040 }, { "epoch": 6.855779427359491, "grad_norm": 19.45245361328125, "learning_rate": 1.3605956322948842e-05, "loss": 0.3286, "num_input_tokens_seen": 80257680, "step": 84045 }, { "epoch": 6.8561872909699, "grad_norm": 0.36318913102149963, "learning_rate": 1.360278831557904e-05, "loss": 0.336, "num_input_tokens_seen": 80262192, "step": 84050 }, { "epoch": 6.856595154580308, "grad_norm": 0.2905828654766083, "learning_rate": 1.359962053922586e-05, "loss": 0.3838, "num_input_tokens_seen": 80267392, "step": 84055 }, { "epoch": 6.857003018190717, "grad_norm": 0.6440751552581787, "learning_rate": 1.3596452993953542e-05, "loss": 0.3106, "num_input_tokens_seen": 80272000, "step": 84060 }, { "epoch": 6.857410881801126, "grad_norm": 0.37070924043655396, "learning_rate": 1.3593285679826273e-05, "loss": 0.2171, "num_input_tokens_seen": 80276848, "step": 84065 }, { "epoch": 6.857818745411534, "grad_norm": 0.857780396938324, "learning_rate": 1.3590118596908258e-05, "loss": 0.3458, "num_input_tokens_seen": 80281600, "step": 84070 }, { "epoch": 6.858226609021943, "grad_norm": 2.105889081954956, "learning_rate": 1.3586951745263682e-05, "loss": 0.3785, "num_input_tokens_seen": 80286320, "step": 84075 }, { "epoch": 6.858634472632351, "grad_norm": 1.2490241527557373, "learning_rate": 1.3583785124956753e-05, "loss": 0.3586, "num_input_tokens_seen": 80290608, "step": 84080 }, { "epoch": 6.85904233624276, "grad_norm": 0.5573241710662842, "learning_rate": 1.3580618736051647e-05, "loss": 0.3326, "num_input_tokens_seen": 80294624, "step": 84085 }, { "epoch": 6.859450199853169, "grad_norm": 11.039332389831543, "learning_rate": 1.3577452578612549e-05, "loss": 0.5871, "num_input_tokens_seen": 80298016, "step": 84090 }, { "epoch": 6.8598580634635775, "grad_norm": 7.702645301818848, "learning_rate": 1.3574286652703621e-05, "loss": 0.3352, "num_input_tokens_seen": 80301696, "step": 84095 }, { "epoch": 6.8602659270739865, "grad_norm": 1.0632030963897705, "learning_rate": 1.3571120958389061e-05, "loss": 0.2305, "num_input_tokens_seen": 80306656, "step": 84100 }, { "epoch": 6.860673790684395, "grad_norm": 0.9167049527168274, "learning_rate": 1.356795549573302e-05, "loss": 0.2377, "num_input_tokens_seen": 80311680, "step": 84105 }, { "epoch": 6.861081654294804, "grad_norm": 1.252935528755188, "learning_rate": 1.3564790264799666e-05, "loss": 0.255, "num_input_tokens_seen": 80316576, "step": 84110 }, { "epoch": 6.861489517905213, "grad_norm": 0.8590273857116699, "learning_rate": 1.356162526565315e-05, "loss": 0.3019, "num_input_tokens_seen": 80321856, "step": 84115 }, { "epoch": 6.861897381515622, "grad_norm": 10.16227912902832, "learning_rate": 1.3558460498357623e-05, "loss": 0.455, "num_input_tokens_seen": 80326112, "step": 84120 }, { "epoch": 6.86230524512603, "grad_norm": 12.609025001525879, "learning_rate": 1.355529596297725e-05, "loss": 0.3754, "num_input_tokens_seen": 80331104, "step": 84125 }, { "epoch": 6.862713108736439, "grad_norm": 0.92063969373703, "learning_rate": 1.3552131659576162e-05, "loss": 0.3653, "num_input_tokens_seen": 80335344, "step": 84130 }, { "epoch": 6.863120972346847, "grad_norm": 0.2520107328891754, "learning_rate": 1.35489675882185e-05, "loss": 0.3059, "num_input_tokens_seen": 80339952, "step": 84135 }, { "epoch": 6.863528835957256, "grad_norm": 0.6547582745552063, "learning_rate": 1.35458037489684e-05, "loss": 0.312, "num_input_tokens_seen": 80344448, "step": 84140 }, { "epoch": 6.863936699567665, "grad_norm": 36.54917526245117, "learning_rate": 1.3542640141889985e-05, "loss": 0.2965, "num_input_tokens_seen": 80350336, "step": 84145 }, { "epoch": 6.864344563178073, "grad_norm": 6.551475524902344, "learning_rate": 1.3539476767047393e-05, "loss": 0.3851, "num_input_tokens_seen": 80355488, "step": 84150 }, { "epoch": 6.864752426788482, "grad_norm": 0.21474763751029968, "learning_rate": 1.3536313624504734e-05, "loss": 0.3145, "num_input_tokens_seen": 80360128, "step": 84155 }, { "epoch": 6.86516029039889, "grad_norm": 1.7880345582962036, "learning_rate": 1.3533150714326131e-05, "loss": 0.3748, "num_input_tokens_seen": 80364912, "step": 84160 }, { "epoch": 6.865568154009299, "grad_norm": 0.8896516561508179, "learning_rate": 1.3529988036575689e-05, "loss": 0.3698, "num_input_tokens_seen": 80368864, "step": 84165 }, { "epoch": 6.865976017619708, "grad_norm": 20.109006881713867, "learning_rate": 1.3526825591317516e-05, "loss": 0.413, "num_input_tokens_seen": 80373888, "step": 84170 }, { "epoch": 6.866383881230116, "grad_norm": 0.3957599401473999, "learning_rate": 1.352366337861571e-05, "loss": 0.4281, "num_input_tokens_seen": 80378992, "step": 84175 }, { "epoch": 6.866791744840525, "grad_norm": 0.8980516791343689, "learning_rate": 1.3520501398534375e-05, "loss": 0.2828, "num_input_tokens_seen": 80384032, "step": 84180 }, { "epoch": 6.867199608450934, "grad_norm": 0.43299219012260437, "learning_rate": 1.3517339651137586e-05, "loss": 0.3859, "num_input_tokens_seen": 80388352, "step": 84185 }, { "epoch": 6.8676074720613425, "grad_norm": 0.6352789998054504, "learning_rate": 1.3514178136489456e-05, "loss": 0.2704, "num_input_tokens_seen": 80393312, "step": 84190 }, { "epoch": 6.8680153356717515, "grad_norm": 47.39670944213867, "learning_rate": 1.3511016854654052e-05, "loss": 0.5018, "num_input_tokens_seen": 80398832, "step": 84195 }, { "epoch": 6.8684231992821605, "grad_norm": 1.9487262964248657, "learning_rate": 1.3507855805695454e-05, "loss": 0.2932, "num_input_tokens_seen": 80403520, "step": 84200 }, { "epoch": 6.868831062892569, "grad_norm": 0.7311748266220093, "learning_rate": 1.3504694989677736e-05, "loss": 0.3175, "num_input_tokens_seen": 80408064, "step": 84205 }, { "epoch": 6.869238926502978, "grad_norm": 0.4959729015827179, "learning_rate": 1.3501534406664955e-05, "loss": 0.3664, "num_input_tokens_seen": 80412512, "step": 84210 }, { "epoch": 6.869646790113386, "grad_norm": 0.4867069125175476, "learning_rate": 1.3498374056721197e-05, "loss": 0.286, "num_input_tokens_seen": 80416784, "step": 84215 }, { "epoch": 6.870054653723795, "grad_norm": 0.45955944061279297, "learning_rate": 1.3495213939910511e-05, "loss": 0.2514, "num_input_tokens_seen": 80421248, "step": 84220 }, { "epoch": 6.870462517334204, "grad_norm": 0.39780983328819275, "learning_rate": 1.3492054056296949e-05, "loss": 0.2366, "num_input_tokens_seen": 80425376, "step": 84225 }, { "epoch": 6.870870380944612, "grad_norm": 20.38936996459961, "learning_rate": 1.3488894405944549e-05, "loss": 0.4133, "num_input_tokens_seen": 80430272, "step": 84230 }, { "epoch": 6.871278244555021, "grad_norm": 0.8554453253746033, "learning_rate": 1.3485734988917381e-05, "loss": 0.3241, "num_input_tokens_seen": 80435136, "step": 84235 }, { "epoch": 6.871686108165429, "grad_norm": 3.519183397293091, "learning_rate": 1.3482575805279468e-05, "loss": 0.5164, "num_input_tokens_seen": 80440256, "step": 84240 }, { "epoch": 6.872093971775838, "grad_norm": 1.4070161581039429, "learning_rate": 1.3479416855094853e-05, "loss": 0.3591, "num_input_tokens_seen": 80444656, "step": 84245 }, { "epoch": 6.872501835386247, "grad_norm": 11.238201141357422, "learning_rate": 1.347625813842756e-05, "loss": 0.3319, "num_input_tokens_seen": 80449456, "step": 84250 }, { "epoch": 6.872909698996655, "grad_norm": 0.6126213669776917, "learning_rate": 1.3473099655341614e-05, "loss": 0.2477, "num_input_tokens_seen": 80453648, "step": 84255 }, { "epoch": 6.873317562607064, "grad_norm": 0.5032260417938232, "learning_rate": 1.3469941405901043e-05, "loss": 0.3329, "num_input_tokens_seen": 80458352, "step": 84260 }, { "epoch": 6.873725426217473, "grad_norm": 0.7419441342353821, "learning_rate": 1.3466783390169862e-05, "loss": 0.2529, "num_input_tokens_seen": 80462960, "step": 84265 }, { "epoch": 6.874133289827881, "grad_norm": 0.4710434079170227, "learning_rate": 1.3463625608212083e-05, "loss": 0.2908, "num_input_tokens_seen": 80467888, "step": 84270 }, { "epoch": 6.87454115343829, "grad_norm": 0.3769145905971527, "learning_rate": 1.3460468060091702e-05, "loss": 0.3949, "num_input_tokens_seen": 80472576, "step": 84275 }, { "epoch": 6.874949017048699, "grad_norm": 0.6192615032196045, "learning_rate": 1.3457310745872737e-05, "loss": 0.4246, "num_input_tokens_seen": 80477856, "step": 84280 }, { "epoch": 6.875356880659107, "grad_norm": 6.729227542877197, "learning_rate": 1.3454153665619178e-05, "loss": 0.3071, "num_input_tokens_seen": 80482160, "step": 84285 }, { "epoch": 6.8757647442695164, "grad_norm": 0.7003908753395081, "learning_rate": 1.3450996819395017e-05, "loss": 0.3624, "num_input_tokens_seen": 80486656, "step": 84290 }, { "epoch": 6.876172607879925, "grad_norm": 0.3290654122829437, "learning_rate": 1.3447840207264246e-05, "loss": 0.4117, "num_input_tokens_seen": 80491376, "step": 84295 }, { "epoch": 6.876580471490334, "grad_norm": 0.40014636516571045, "learning_rate": 1.3444683829290845e-05, "loss": 0.248, "num_input_tokens_seen": 80495728, "step": 84300 }, { "epoch": 6.876988335100743, "grad_norm": 25.9444637298584, "learning_rate": 1.3441527685538791e-05, "loss": 0.3718, "num_input_tokens_seen": 80500192, "step": 84305 }, { "epoch": 6.877396198711151, "grad_norm": 5.69548225402832, "learning_rate": 1.3438371776072061e-05, "loss": 0.3487, "num_input_tokens_seen": 80505392, "step": 84310 }, { "epoch": 6.87780406232156, "grad_norm": 8.927960395812988, "learning_rate": 1.3435216100954623e-05, "loss": 0.2919, "num_input_tokens_seen": 80510448, "step": 84315 }, { "epoch": 6.878211925931969, "grad_norm": 36.84080123901367, "learning_rate": 1.343206066025043e-05, "loss": 0.5378, "num_input_tokens_seen": 80515072, "step": 84320 }, { "epoch": 6.878619789542377, "grad_norm": 24.81707000732422, "learning_rate": 1.3428905454023461e-05, "loss": 0.3171, "num_input_tokens_seen": 80519984, "step": 84325 }, { "epoch": 6.879027653152786, "grad_norm": 0.3586394488811493, "learning_rate": 1.342575048233766e-05, "loss": 0.4101, "num_input_tokens_seen": 80525424, "step": 84330 }, { "epoch": 6.879435516763195, "grad_norm": 25.5072078704834, "learning_rate": 1.3422595745256977e-05, "loss": 0.4307, "num_input_tokens_seen": 80530272, "step": 84335 }, { "epoch": 6.879843380373603, "grad_norm": 0.3861081600189209, "learning_rate": 1.3419441242845353e-05, "loss": 0.2548, "num_input_tokens_seen": 80535296, "step": 84340 }, { "epoch": 6.880251243984012, "grad_norm": 0.46261388063430786, "learning_rate": 1.3416286975166742e-05, "loss": 0.3158, "num_input_tokens_seen": 80539648, "step": 84345 }, { "epoch": 6.88065910759442, "grad_norm": 16.22427749633789, "learning_rate": 1.3413132942285073e-05, "loss": 0.4544, "num_input_tokens_seen": 80544832, "step": 84350 }, { "epoch": 6.881066971204829, "grad_norm": 0.3380235731601715, "learning_rate": 1.3409979144264276e-05, "loss": 0.3905, "num_input_tokens_seen": 80548944, "step": 84355 }, { "epoch": 6.881474834815238, "grad_norm": 52.61842727661133, "learning_rate": 1.3406825581168275e-05, "loss": 0.4975, "num_input_tokens_seen": 80553472, "step": 84360 }, { "epoch": 6.881882698425646, "grad_norm": 1.2473562955856323, "learning_rate": 1.3403672253060987e-05, "loss": 0.2558, "num_input_tokens_seen": 80557568, "step": 84365 }, { "epoch": 6.882290562036055, "grad_norm": 19.324907302856445, "learning_rate": 1.3400519160006342e-05, "loss": 0.2858, "num_input_tokens_seen": 80562592, "step": 84370 }, { "epoch": 6.882698425646463, "grad_norm": 0.9598253965377808, "learning_rate": 1.3397366302068243e-05, "loss": 0.3659, "num_input_tokens_seen": 80567152, "step": 84375 }, { "epoch": 6.883106289256872, "grad_norm": 5.66910457611084, "learning_rate": 1.3394213679310602e-05, "loss": 0.4372, "num_input_tokens_seen": 80572000, "step": 84380 }, { "epoch": 6.883514152867281, "grad_norm": 1.0106972455978394, "learning_rate": 1.3391061291797307e-05, "loss": 0.3802, "num_input_tokens_seen": 80576912, "step": 84385 }, { "epoch": 6.8839220164776895, "grad_norm": 18.845829010009766, "learning_rate": 1.3387909139592275e-05, "loss": 0.3335, "num_input_tokens_seen": 80580816, "step": 84390 }, { "epoch": 6.8843298800880985, "grad_norm": 1.302809476852417, "learning_rate": 1.3384757222759392e-05, "loss": 0.283, "num_input_tokens_seen": 80585248, "step": 84395 }, { "epoch": 6.8847377436985076, "grad_norm": 13.136661529541016, "learning_rate": 1.3381605541362546e-05, "loss": 0.4131, "num_input_tokens_seen": 80590272, "step": 84400 }, { "epoch": 6.885145607308916, "grad_norm": 0.7407105565071106, "learning_rate": 1.3378454095465617e-05, "loss": 0.2508, "num_input_tokens_seen": 80595344, "step": 84405 }, { "epoch": 6.885553470919325, "grad_norm": 2.7178988456726074, "learning_rate": 1.3375302885132473e-05, "loss": 0.4551, "num_input_tokens_seen": 80599744, "step": 84410 }, { "epoch": 6.885961334529734, "grad_norm": 0.6912670135498047, "learning_rate": 1.3372151910427017e-05, "loss": 0.2769, "num_input_tokens_seen": 80604496, "step": 84415 }, { "epoch": 6.886369198140142, "grad_norm": 2.1130073070526123, "learning_rate": 1.3369001171413093e-05, "loss": 0.38, "num_input_tokens_seen": 80609184, "step": 84420 }, { "epoch": 6.886777061750551, "grad_norm": 37.591365814208984, "learning_rate": 1.3365850668154578e-05, "loss": 0.2836, "num_input_tokens_seen": 80614496, "step": 84425 }, { "epoch": 6.887184925360959, "grad_norm": 0.3024940490722656, "learning_rate": 1.3362700400715322e-05, "loss": 0.5151, "num_input_tokens_seen": 80618944, "step": 84430 }, { "epoch": 6.887592788971368, "grad_norm": 1.6022812128067017, "learning_rate": 1.3359550369159184e-05, "loss": 0.3882, "num_input_tokens_seen": 80623392, "step": 84435 }, { "epoch": 6.888000652581777, "grad_norm": 0.47041258215904236, "learning_rate": 1.335640057355002e-05, "loss": 0.3319, "num_input_tokens_seen": 80628032, "step": 84440 }, { "epoch": 6.888408516192185, "grad_norm": 0.7367129921913147, "learning_rate": 1.3353251013951662e-05, "loss": 0.4096, "num_input_tokens_seen": 80632688, "step": 84445 }, { "epoch": 6.888816379802594, "grad_norm": 3.411046028137207, "learning_rate": 1.3350101690427964e-05, "loss": 0.2281, "num_input_tokens_seen": 80636944, "step": 84450 }, { "epoch": 6.889224243413002, "grad_norm": 23.854942321777344, "learning_rate": 1.3346952603042737e-05, "loss": 0.3803, "num_input_tokens_seen": 80641600, "step": 84455 }, { "epoch": 6.889632107023411, "grad_norm": 23.216169357299805, "learning_rate": 1.334380375185984e-05, "loss": 0.3313, "num_input_tokens_seen": 80646304, "step": 84460 }, { "epoch": 6.89003997063382, "grad_norm": 2.5034759044647217, "learning_rate": 1.3340655136943092e-05, "loss": 0.2674, "num_input_tokens_seen": 80651360, "step": 84465 }, { "epoch": 6.890447834244228, "grad_norm": 3.8037843704223633, "learning_rate": 1.3337506758356308e-05, "loss": 0.2695, "num_input_tokens_seen": 80656352, "step": 84470 }, { "epoch": 6.890855697854637, "grad_norm": 1.196772575378418, "learning_rate": 1.3334358616163295e-05, "loss": 0.3918, "num_input_tokens_seen": 80661760, "step": 84475 }, { "epoch": 6.891263561465046, "grad_norm": 23.877361297607422, "learning_rate": 1.3331210710427888e-05, "loss": 0.3143, "num_input_tokens_seen": 80666288, "step": 84480 }, { "epoch": 6.8916714250754545, "grad_norm": 39.57975387573242, "learning_rate": 1.3328063041213881e-05, "loss": 0.314, "num_input_tokens_seen": 80671088, "step": 84485 }, { "epoch": 6.8920792886858635, "grad_norm": 1.1885018348693848, "learning_rate": 1.3324915608585076e-05, "loss": 0.2958, "num_input_tokens_seen": 80675840, "step": 84490 }, { "epoch": 6.8924871522962725, "grad_norm": 0.6419830322265625, "learning_rate": 1.3321768412605262e-05, "loss": 0.2894, "num_input_tokens_seen": 80680000, "step": 84495 }, { "epoch": 6.892895015906681, "grad_norm": 1.277109980583191, "learning_rate": 1.3318621453338248e-05, "loss": 0.3918, "num_input_tokens_seen": 80685232, "step": 84500 }, { "epoch": 6.89330287951709, "grad_norm": 40.95533752441406, "learning_rate": 1.3315474730847813e-05, "loss": 0.3521, "num_input_tokens_seen": 80690416, "step": 84505 }, { "epoch": 6.893710743127498, "grad_norm": 8.487948417663574, "learning_rate": 1.3312328245197742e-05, "loss": 0.2164, "num_input_tokens_seen": 80695488, "step": 84510 }, { "epoch": 6.894118606737907, "grad_norm": 22.344820022583008, "learning_rate": 1.3309181996451808e-05, "loss": 0.3584, "num_input_tokens_seen": 80700688, "step": 84515 }, { "epoch": 6.894526470348316, "grad_norm": 0.6717753410339355, "learning_rate": 1.3306035984673782e-05, "loss": 0.2796, "num_input_tokens_seen": 80705904, "step": 84520 }, { "epoch": 6.894934333958724, "grad_norm": 124.95014190673828, "learning_rate": 1.3302890209927443e-05, "loss": 0.4982, "num_input_tokens_seen": 80710672, "step": 84525 }, { "epoch": 6.895342197569133, "grad_norm": 25.455108642578125, "learning_rate": 1.3299744672276551e-05, "loss": 0.4555, "num_input_tokens_seen": 80715776, "step": 84530 }, { "epoch": 6.895750061179542, "grad_norm": 0.4955295920372009, "learning_rate": 1.3296599371784863e-05, "loss": 0.277, "num_input_tokens_seen": 80721296, "step": 84535 }, { "epoch": 6.89615792478995, "grad_norm": 18.770130157470703, "learning_rate": 1.3293454308516123e-05, "loss": 0.3084, "num_input_tokens_seen": 80726336, "step": 84540 }, { "epoch": 6.896565788400359, "grad_norm": 0.5029418468475342, "learning_rate": 1.3290309482534099e-05, "loss": 0.4481, "num_input_tokens_seen": 80731904, "step": 84545 }, { "epoch": 6.896973652010768, "grad_norm": 17.058351516723633, "learning_rate": 1.3287164893902526e-05, "loss": 0.3417, "num_input_tokens_seen": 80736560, "step": 84550 }, { "epoch": 6.897381515621176, "grad_norm": 0.47736889123916626, "learning_rate": 1.3284020542685144e-05, "loss": 0.3326, "num_input_tokens_seen": 80740944, "step": 84555 }, { "epoch": 6.897789379231585, "grad_norm": 0.3739621937274933, "learning_rate": 1.3280876428945688e-05, "loss": 0.3288, "num_input_tokens_seen": 80745872, "step": 84560 }, { "epoch": 6.898197242841993, "grad_norm": 17.515987396240234, "learning_rate": 1.3277732552747885e-05, "loss": 0.4537, "num_input_tokens_seen": 80750560, "step": 84565 }, { "epoch": 6.898605106452402, "grad_norm": 0.42310860753059387, "learning_rate": 1.3274588914155462e-05, "loss": 0.5377, "num_input_tokens_seen": 80755712, "step": 84570 }, { "epoch": 6.899012970062811, "grad_norm": 0.30741479992866516, "learning_rate": 1.327144551323214e-05, "loss": 0.277, "num_input_tokens_seen": 80759776, "step": 84575 }, { "epoch": 6.8994208336732195, "grad_norm": 0.8162696361541748, "learning_rate": 1.3268302350041634e-05, "loss": 0.2835, "num_input_tokens_seen": 80765360, "step": 84580 }, { "epoch": 6.8998286972836285, "grad_norm": 0.6405401229858398, "learning_rate": 1.3265159424647645e-05, "loss": 0.3129, "num_input_tokens_seen": 80769744, "step": 84585 }, { "epoch": 6.900236560894037, "grad_norm": 3.7368791103363037, "learning_rate": 1.3262016737113897e-05, "loss": 0.5132, "num_input_tokens_seen": 80775088, "step": 84590 }, { "epoch": 6.900644424504446, "grad_norm": 1.2845361232757568, "learning_rate": 1.325887428750408e-05, "loss": 0.2893, "num_input_tokens_seen": 80779808, "step": 84595 }, { "epoch": 6.901052288114855, "grad_norm": 0.6008656620979309, "learning_rate": 1.3255732075881897e-05, "loss": 0.3666, "num_input_tokens_seen": 80784064, "step": 84600 }, { "epoch": 6.901460151725263, "grad_norm": 0.3671223521232605, "learning_rate": 1.325259010231103e-05, "loss": 0.4946, "num_input_tokens_seen": 80789120, "step": 84605 }, { "epoch": 6.901868015335672, "grad_norm": 0.5240130424499512, "learning_rate": 1.324944836685516e-05, "loss": 0.3272, "num_input_tokens_seen": 80793776, "step": 84610 }, { "epoch": 6.902275878946081, "grad_norm": 0.7522008419036865, "learning_rate": 1.3246306869577993e-05, "loss": 0.4094, "num_input_tokens_seen": 80799040, "step": 84615 }, { "epoch": 6.902683742556489, "grad_norm": 31.350080490112305, "learning_rate": 1.3243165610543188e-05, "loss": 0.4767, "num_input_tokens_seen": 80804192, "step": 84620 }, { "epoch": 6.903091606166898, "grad_norm": 0.5274046659469604, "learning_rate": 1.3240024589814421e-05, "loss": 0.3552, "num_input_tokens_seen": 80809376, "step": 84625 }, { "epoch": 6.903499469777307, "grad_norm": 7.654291152954102, "learning_rate": 1.3236883807455347e-05, "loss": 0.5678, "num_input_tokens_seen": 80814256, "step": 84630 }, { "epoch": 6.903907333387715, "grad_norm": 0.6850792765617371, "learning_rate": 1.3233743263529652e-05, "loss": 0.3045, "num_input_tokens_seen": 80819040, "step": 84635 }, { "epoch": 6.904315196998124, "grad_norm": 11.55855941772461, "learning_rate": 1.323060295810098e-05, "loss": 0.4455, "num_input_tokens_seen": 80824464, "step": 84640 }, { "epoch": 6.904723060608532, "grad_norm": 0.626774787902832, "learning_rate": 1.3227462891232984e-05, "loss": 0.3095, "num_input_tokens_seen": 80829712, "step": 84645 }, { "epoch": 6.905130924218941, "grad_norm": 0.46487098932266235, "learning_rate": 1.3224323062989303e-05, "loss": 0.3714, "num_input_tokens_seen": 80834096, "step": 84650 }, { "epoch": 6.90553878782935, "grad_norm": 20.722929000854492, "learning_rate": 1.3221183473433602e-05, "loss": 0.4057, "num_input_tokens_seen": 80838768, "step": 84655 }, { "epoch": 6.905946651439758, "grad_norm": 11.732979774475098, "learning_rate": 1.3218044122629503e-05, "loss": 0.3998, "num_input_tokens_seen": 80843552, "step": 84660 }, { "epoch": 6.906354515050167, "grad_norm": 0.5066539645195007, "learning_rate": 1.321490501064065e-05, "loss": 0.4246, "num_input_tokens_seen": 80848048, "step": 84665 }, { "epoch": 6.906762378660575, "grad_norm": 1.5392022132873535, "learning_rate": 1.3211766137530657e-05, "loss": 0.3101, "num_input_tokens_seen": 80852976, "step": 84670 }, { "epoch": 6.907170242270984, "grad_norm": 1.0396969318389893, "learning_rate": 1.3208627503363147e-05, "loss": 0.2897, "num_input_tokens_seen": 80858432, "step": 84675 }, { "epoch": 6.907578105881393, "grad_norm": 1.0596097707748413, "learning_rate": 1.320548910820176e-05, "loss": 0.5909, "num_input_tokens_seen": 80863968, "step": 84680 }, { "epoch": 6.907985969491802, "grad_norm": 6.923141002655029, "learning_rate": 1.3202350952110096e-05, "loss": 0.333, "num_input_tokens_seen": 80868768, "step": 84685 }, { "epoch": 6.908393833102211, "grad_norm": 0.7231674790382385, "learning_rate": 1.3199213035151763e-05, "loss": 0.3018, "num_input_tokens_seen": 80873728, "step": 84690 }, { "epoch": 6.90880169671262, "grad_norm": 1.4863890409469604, "learning_rate": 1.3196075357390369e-05, "loss": 0.3163, "num_input_tokens_seen": 80879344, "step": 84695 }, { "epoch": 6.909209560323028, "grad_norm": 2.3652946949005127, "learning_rate": 1.3192937918889513e-05, "loss": 0.3357, "num_input_tokens_seen": 80883904, "step": 84700 }, { "epoch": 6.909617423933437, "grad_norm": 0.5249304175376892, "learning_rate": 1.3189800719712786e-05, "loss": 0.3364, "num_input_tokens_seen": 80888352, "step": 84705 }, { "epoch": 6.910025287543846, "grad_norm": 0.4181942939758301, "learning_rate": 1.3186663759923782e-05, "loss": 0.3166, "num_input_tokens_seen": 80893136, "step": 84710 }, { "epoch": 6.910433151154254, "grad_norm": 0.5732058882713318, "learning_rate": 1.318352703958608e-05, "loss": 0.3828, "num_input_tokens_seen": 80897728, "step": 84715 }, { "epoch": 6.910841014764663, "grad_norm": 0.6099350452423096, "learning_rate": 1.3180390558763255e-05, "loss": 0.2669, "num_input_tokens_seen": 80901488, "step": 84720 }, { "epoch": 6.911248878375071, "grad_norm": 25.023595809936523, "learning_rate": 1.3177254317518905e-05, "loss": 0.2761, "num_input_tokens_seen": 80905680, "step": 84725 }, { "epoch": 6.91165674198548, "grad_norm": 1.4086726903915405, "learning_rate": 1.3174118315916583e-05, "loss": 0.3105, "num_input_tokens_seen": 80909632, "step": 84730 }, { "epoch": 6.912064605595889, "grad_norm": 9.632359504699707, "learning_rate": 1.3170982554019858e-05, "loss": 0.2811, "num_input_tokens_seen": 80914880, "step": 84735 }, { "epoch": 6.912472469206297, "grad_norm": 0.8229913711547852, "learning_rate": 1.3167847031892281e-05, "loss": 0.3062, "num_input_tokens_seen": 80919392, "step": 84740 }, { "epoch": 6.912880332816706, "grad_norm": 1.3947237730026245, "learning_rate": 1.3164711749597426e-05, "loss": 0.398, "num_input_tokens_seen": 80924128, "step": 84745 }, { "epoch": 6.913288196427115, "grad_norm": 1.9549123048782349, "learning_rate": 1.3161576707198837e-05, "loss": 0.375, "num_input_tokens_seen": 80929104, "step": 84750 }, { "epoch": 6.913696060037523, "grad_norm": 5.0244140625, "learning_rate": 1.3158441904760055e-05, "loss": 0.5007, "num_input_tokens_seen": 80933584, "step": 84755 }, { "epoch": 6.914103923647932, "grad_norm": 26.391813278198242, "learning_rate": 1.3155307342344622e-05, "loss": 0.3536, "num_input_tokens_seen": 80938512, "step": 84760 }, { "epoch": 6.914511787258341, "grad_norm": 16.844812393188477, "learning_rate": 1.3152173020016068e-05, "loss": 0.3536, "num_input_tokens_seen": 80943696, "step": 84765 }, { "epoch": 6.914919650868749, "grad_norm": 1.0173208713531494, "learning_rate": 1.3149038937837943e-05, "loss": 0.2404, "num_input_tokens_seen": 80948688, "step": 84770 }, { "epoch": 6.915327514479158, "grad_norm": 1.418335199356079, "learning_rate": 1.3145905095873761e-05, "loss": 0.3415, "num_input_tokens_seen": 80953152, "step": 84775 }, { "epoch": 6.9157353780895665, "grad_norm": 1.7911840677261353, "learning_rate": 1.3142771494187051e-05, "loss": 0.2512, "num_input_tokens_seen": 80958288, "step": 84780 }, { "epoch": 6.9161432416999755, "grad_norm": 1.907366394996643, "learning_rate": 1.3139638132841308e-05, "loss": 0.3209, "num_input_tokens_seen": 80963808, "step": 84785 }, { "epoch": 6.9165511053103845, "grad_norm": 0.45171135663986206, "learning_rate": 1.3136505011900075e-05, "loss": 0.2848, "num_input_tokens_seen": 80969200, "step": 84790 }, { "epoch": 6.916958968920793, "grad_norm": 3.2725374698638916, "learning_rate": 1.313337213142684e-05, "loss": 0.3111, "num_input_tokens_seen": 80973920, "step": 84795 }, { "epoch": 6.917366832531202, "grad_norm": 4.1476054191589355, "learning_rate": 1.3130239491485113e-05, "loss": 0.4464, "num_input_tokens_seen": 80978592, "step": 84800 }, { "epoch": 6.91777469614161, "grad_norm": 0.6281992197036743, "learning_rate": 1.3127107092138388e-05, "loss": 0.3726, "num_input_tokens_seen": 80983120, "step": 84805 }, { "epoch": 6.918182559752019, "grad_norm": 0.8836140036582947, "learning_rate": 1.3123974933450144e-05, "loss": 0.3324, "num_input_tokens_seen": 80987216, "step": 84810 }, { "epoch": 6.918590423362428, "grad_norm": 4.948895454406738, "learning_rate": 1.3120843015483893e-05, "loss": 0.3129, "num_input_tokens_seen": 80992224, "step": 84815 }, { "epoch": 6.918998286972836, "grad_norm": 3.485271692276001, "learning_rate": 1.3117711338303107e-05, "loss": 0.2892, "num_input_tokens_seen": 80996496, "step": 84820 }, { "epoch": 6.919406150583245, "grad_norm": 1.8047242164611816, "learning_rate": 1.3114579901971263e-05, "loss": 0.3592, "num_input_tokens_seen": 81001776, "step": 84825 }, { "epoch": 6.919814014193654, "grad_norm": 1.2105296850204468, "learning_rate": 1.3111448706551834e-05, "loss": 0.326, "num_input_tokens_seen": 81006672, "step": 84830 }, { "epoch": 6.920221877804062, "grad_norm": 14.315291404724121, "learning_rate": 1.310831775210829e-05, "loss": 0.2528, "num_input_tokens_seen": 81011328, "step": 84835 }, { "epoch": 6.920629741414471, "grad_norm": 13.209877967834473, "learning_rate": 1.3105187038704078e-05, "loss": 0.3264, "num_input_tokens_seen": 81015056, "step": 84840 }, { "epoch": 6.92103760502488, "grad_norm": 2.1534311771392822, "learning_rate": 1.3102056566402683e-05, "loss": 0.3623, "num_input_tokens_seen": 81019536, "step": 84845 }, { "epoch": 6.921445468635288, "grad_norm": 0.6222370862960815, "learning_rate": 1.3098926335267545e-05, "loss": 0.4152, "num_input_tokens_seen": 81024480, "step": 84850 }, { "epoch": 6.921853332245697, "grad_norm": 1.7120546102523804, "learning_rate": 1.309579634536211e-05, "loss": 0.2773, "num_input_tokens_seen": 81029376, "step": 84855 }, { "epoch": 6.922261195856105, "grad_norm": 0.734364926815033, "learning_rate": 1.309266659674983e-05, "loss": 0.2553, "num_input_tokens_seen": 81033584, "step": 84860 }, { "epoch": 6.922669059466514, "grad_norm": 0.7514001727104187, "learning_rate": 1.3089537089494135e-05, "loss": 0.3152, "num_input_tokens_seen": 81038928, "step": 84865 }, { "epoch": 6.923076923076923, "grad_norm": 4.875267028808594, "learning_rate": 1.3086407823658465e-05, "loss": 0.3283, "num_input_tokens_seen": 81043888, "step": 84870 }, { "epoch": 6.9234847866873315, "grad_norm": 0.6175082325935364, "learning_rate": 1.308327879930623e-05, "loss": 0.4162, "num_input_tokens_seen": 81048096, "step": 84875 }, { "epoch": 6.9238926502977405, "grad_norm": 16.256595611572266, "learning_rate": 1.3080150016500887e-05, "loss": 0.3168, "num_input_tokens_seen": 81052432, "step": 84880 }, { "epoch": 6.9243005139081495, "grad_norm": 8.424556732177734, "learning_rate": 1.3077021475305834e-05, "loss": 0.4368, "num_input_tokens_seen": 81056816, "step": 84885 }, { "epoch": 6.924708377518558, "grad_norm": 1.0828242301940918, "learning_rate": 1.3073893175784491e-05, "loss": 0.3987, "num_input_tokens_seen": 81060352, "step": 84890 }, { "epoch": 6.925116241128967, "grad_norm": 11.60687255859375, "learning_rate": 1.3070765118000256e-05, "loss": 0.3141, "num_input_tokens_seen": 81064640, "step": 84895 }, { "epoch": 6.925524104739376, "grad_norm": 24.781972885131836, "learning_rate": 1.306763730201655e-05, "loss": 0.4413, "num_input_tokens_seen": 81070256, "step": 84900 }, { "epoch": 6.925931968349784, "grad_norm": 1.5501312017440796, "learning_rate": 1.3064509727896768e-05, "loss": 0.2224, "num_input_tokens_seen": 81075424, "step": 84905 }, { "epoch": 6.926339831960193, "grad_norm": 0.562660276889801, "learning_rate": 1.3061382395704302e-05, "loss": 0.6516, "num_input_tokens_seen": 81079648, "step": 84910 }, { "epoch": 6.926747695570601, "grad_norm": 24.231592178344727, "learning_rate": 1.3058255305502543e-05, "loss": 0.3575, "num_input_tokens_seen": 81084608, "step": 84915 }, { "epoch": 6.92715555918101, "grad_norm": 0.8692992925643921, "learning_rate": 1.3055128457354862e-05, "loss": 0.2905, "num_input_tokens_seen": 81089936, "step": 84920 }, { "epoch": 6.927563422791419, "grad_norm": 0.8392590284347534, "learning_rate": 1.3052001851324664e-05, "loss": 0.3665, "num_input_tokens_seen": 81094672, "step": 84925 }, { "epoch": 6.927971286401827, "grad_norm": 0.6155602931976318, "learning_rate": 1.304887548747531e-05, "loss": 0.1937, "num_input_tokens_seen": 81099744, "step": 84930 }, { "epoch": 6.928379150012236, "grad_norm": 2.832628011703491, "learning_rate": 1.304574936587017e-05, "loss": 0.3149, "num_input_tokens_seen": 81104896, "step": 84935 }, { "epoch": 6.928787013622644, "grad_norm": 55.71095657348633, "learning_rate": 1.3042623486572602e-05, "loss": 0.3564, "num_input_tokens_seen": 81109648, "step": 84940 }, { "epoch": 6.929194877233053, "grad_norm": 1.385093331336975, "learning_rate": 1.3039497849645988e-05, "loss": 0.4118, "num_input_tokens_seen": 81113632, "step": 84945 }, { "epoch": 6.929602740843462, "grad_norm": 32.85543441772461, "learning_rate": 1.3036372455153667e-05, "loss": 0.2764, "num_input_tokens_seen": 81118944, "step": 84950 }, { "epoch": 6.93001060445387, "grad_norm": 30.411354064941406, "learning_rate": 1.303324730315899e-05, "loss": 0.2905, "num_input_tokens_seen": 81124032, "step": 84955 }, { "epoch": 6.930418468064279, "grad_norm": 0.9299318194389343, "learning_rate": 1.3030122393725311e-05, "loss": 0.3159, "num_input_tokens_seen": 81128432, "step": 84960 }, { "epoch": 6.930826331674688, "grad_norm": 35.102561950683594, "learning_rate": 1.3026997726915951e-05, "loss": 0.4373, "num_input_tokens_seen": 81132224, "step": 84965 }, { "epoch": 6.931234195285096, "grad_norm": 22.35948944091797, "learning_rate": 1.302387330279427e-05, "loss": 0.3517, "num_input_tokens_seen": 81136768, "step": 84970 }, { "epoch": 6.931642058895505, "grad_norm": 1.4282755851745605, "learning_rate": 1.3020749121423592e-05, "loss": 0.2974, "num_input_tokens_seen": 81141808, "step": 84975 }, { "epoch": 6.9320499225059145, "grad_norm": 1.33880615234375, "learning_rate": 1.3017625182867233e-05, "loss": 0.3144, "num_input_tokens_seen": 81146672, "step": 84980 }, { "epoch": 6.932457786116323, "grad_norm": 0.8149107694625854, "learning_rate": 1.3014501487188525e-05, "loss": 0.1997, "num_input_tokens_seen": 81151712, "step": 84985 }, { "epoch": 6.932865649726732, "grad_norm": 14.681440353393555, "learning_rate": 1.3011378034450775e-05, "loss": 0.3008, "num_input_tokens_seen": 81156800, "step": 84990 }, { "epoch": 6.93327351333714, "grad_norm": 0.9890817403793335, "learning_rate": 1.30082548247173e-05, "loss": 0.4207, "num_input_tokens_seen": 81161296, "step": 84995 }, { "epoch": 6.933681376947549, "grad_norm": 0.6878694295883179, "learning_rate": 1.3005131858051403e-05, "loss": 0.5109, "num_input_tokens_seen": 81166128, "step": 85000 }, { "epoch": 6.934089240557958, "grad_norm": 33.72959518432617, "learning_rate": 1.3002009134516386e-05, "loss": 0.3106, "num_input_tokens_seen": 81170624, "step": 85005 }, { "epoch": 6.934497104168366, "grad_norm": 1.0038820505142212, "learning_rate": 1.2998886654175538e-05, "loss": 0.4404, "num_input_tokens_seen": 81175872, "step": 85010 }, { "epoch": 6.934904967778775, "grad_norm": 8.920344352722168, "learning_rate": 1.2995764417092166e-05, "loss": 0.2462, "num_input_tokens_seen": 81180384, "step": 85015 }, { "epoch": 6.935312831389183, "grad_norm": 23.963722229003906, "learning_rate": 1.2992642423329549e-05, "loss": 0.4718, "num_input_tokens_seen": 81185296, "step": 85020 }, { "epoch": 6.935720694999592, "grad_norm": 32.54267501831055, "learning_rate": 1.2989520672950966e-05, "loss": 0.4195, "num_input_tokens_seen": 81190320, "step": 85025 }, { "epoch": 6.936128558610001, "grad_norm": 42.04136276245117, "learning_rate": 1.2986399166019691e-05, "loss": 0.3861, "num_input_tokens_seen": 81195248, "step": 85030 }, { "epoch": 6.936536422220409, "grad_norm": 6.002299785614014, "learning_rate": 1.2983277902599005e-05, "loss": 0.4692, "num_input_tokens_seen": 81200304, "step": 85035 }, { "epoch": 6.936944285830818, "grad_norm": 16.947975158691406, "learning_rate": 1.298015688275217e-05, "loss": 0.3043, "num_input_tokens_seen": 81205488, "step": 85040 }, { "epoch": 6.937352149441227, "grad_norm": 1.063073754310608, "learning_rate": 1.297703610654245e-05, "loss": 0.3812, "num_input_tokens_seen": 81209968, "step": 85045 }, { "epoch": 6.937760013051635, "grad_norm": 0.8138270378112793, "learning_rate": 1.2973915574033085e-05, "loss": 0.3352, "num_input_tokens_seen": 81214512, "step": 85050 }, { "epoch": 6.938167876662044, "grad_norm": 0.5784633755683899, "learning_rate": 1.2970795285287357e-05, "loss": 0.3048, "num_input_tokens_seen": 81219376, "step": 85055 }, { "epoch": 6.938575740272453, "grad_norm": 1.3405364751815796, "learning_rate": 1.2967675240368494e-05, "loss": 0.207, "num_input_tokens_seen": 81224528, "step": 85060 }, { "epoch": 6.938983603882861, "grad_norm": 0.4282747507095337, "learning_rate": 1.2964555439339737e-05, "loss": 0.2207, "num_input_tokens_seen": 81229232, "step": 85065 }, { "epoch": 6.93939146749327, "grad_norm": 0.9668698906898499, "learning_rate": 1.2961435882264334e-05, "loss": 0.3704, "num_input_tokens_seen": 81234288, "step": 85070 }, { "epoch": 6.9397993311036785, "grad_norm": 4.214616775512695, "learning_rate": 1.2958316569205495e-05, "loss": 0.2697, "num_input_tokens_seen": 81238512, "step": 85075 }, { "epoch": 6.9402071947140875, "grad_norm": 0.9179590344429016, "learning_rate": 1.2955197500226474e-05, "loss": 0.3228, "num_input_tokens_seen": 81242784, "step": 85080 }, { "epoch": 6.9406150583244965, "grad_norm": 1.8700288534164429, "learning_rate": 1.2952078675390483e-05, "loss": 0.2909, "num_input_tokens_seen": 81247408, "step": 85085 }, { "epoch": 6.941022921934905, "grad_norm": 0.9642025232315063, "learning_rate": 1.2948960094760737e-05, "loss": 0.2333, "num_input_tokens_seen": 81251744, "step": 85090 }, { "epoch": 6.941430785545314, "grad_norm": 3.6983554363250732, "learning_rate": 1.294584175840044e-05, "loss": 0.3022, "num_input_tokens_seen": 81256128, "step": 85095 }, { "epoch": 6.941838649155723, "grad_norm": 1.6365522146224976, "learning_rate": 1.2942723666372814e-05, "loss": 0.2201, "num_input_tokens_seen": 81261040, "step": 85100 }, { "epoch": 6.942246512766131, "grad_norm": 22.30355453491211, "learning_rate": 1.293960581874106e-05, "loss": 0.4089, "num_input_tokens_seen": 81266144, "step": 85105 }, { "epoch": 6.94265437637654, "grad_norm": 0.5831505656242371, "learning_rate": 1.2936488215568372e-05, "loss": 0.3927, "num_input_tokens_seen": 81271184, "step": 85110 }, { "epoch": 6.943062239986949, "grad_norm": 2.444014549255371, "learning_rate": 1.2933370856917937e-05, "loss": 0.434, "num_input_tokens_seen": 81275824, "step": 85115 }, { "epoch": 6.943470103597357, "grad_norm": 1.8720601797103882, "learning_rate": 1.2930253742852952e-05, "loss": 0.2722, "num_input_tokens_seen": 81280544, "step": 85120 }, { "epoch": 6.943877967207766, "grad_norm": 1.4625385999679565, "learning_rate": 1.2927136873436593e-05, "loss": 0.4533, "num_input_tokens_seen": 81285216, "step": 85125 }, { "epoch": 6.944285830818174, "grad_norm": 34.40700912475586, "learning_rate": 1.2924020248732038e-05, "loss": 0.3006, "num_input_tokens_seen": 81289904, "step": 85130 }, { "epoch": 6.944693694428583, "grad_norm": 1.4841868877410889, "learning_rate": 1.2920903868802465e-05, "loss": 0.502, "num_input_tokens_seen": 81294960, "step": 85135 }, { "epoch": 6.945101558038992, "grad_norm": 20.527172088623047, "learning_rate": 1.2917787733711023e-05, "loss": 0.4179, "num_input_tokens_seen": 81299680, "step": 85140 }, { "epoch": 6.9455094216494, "grad_norm": 28.020933151245117, "learning_rate": 1.2914671843520904e-05, "loss": 0.4204, "num_input_tokens_seen": 81304224, "step": 85145 }, { "epoch": 6.945917285259809, "grad_norm": 1.2848373651504517, "learning_rate": 1.291155619829525e-05, "loss": 0.2986, "num_input_tokens_seen": 81308848, "step": 85150 }, { "epoch": 6.946325148870217, "grad_norm": 45.777854919433594, "learning_rate": 1.2908440798097216e-05, "loss": 0.3905, "num_input_tokens_seen": 81314176, "step": 85155 }, { "epoch": 6.946733012480626, "grad_norm": 4.812857151031494, "learning_rate": 1.2905325642989946e-05, "loss": 0.2812, "num_input_tokens_seen": 81318624, "step": 85160 }, { "epoch": 6.947140876091035, "grad_norm": 25.60270881652832, "learning_rate": 1.2902210733036579e-05, "loss": 0.3264, "num_input_tokens_seen": 81322848, "step": 85165 }, { "epoch": 6.9475487397014435, "grad_norm": 1.4139297008514404, "learning_rate": 1.289909606830027e-05, "loss": 0.428, "num_input_tokens_seen": 81326464, "step": 85170 }, { "epoch": 6.9479566033118525, "grad_norm": 0.8595473766326904, "learning_rate": 1.2895981648844139e-05, "loss": 0.324, "num_input_tokens_seen": 81329872, "step": 85175 }, { "epoch": 6.9483644669222615, "grad_norm": 0.714431881904602, "learning_rate": 1.2892867474731317e-05, "loss": 0.5049, "num_input_tokens_seen": 81334032, "step": 85180 }, { "epoch": 6.94877233053267, "grad_norm": 0.5687029957771301, "learning_rate": 1.2889753546024919e-05, "loss": 0.4421, "num_input_tokens_seen": 81338640, "step": 85185 }, { "epoch": 6.949180194143079, "grad_norm": 2.623552083969116, "learning_rate": 1.288663986278808e-05, "loss": 0.2877, "num_input_tokens_seen": 81343616, "step": 85190 }, { "epoch": 6.949588057753488, "grad_norm": 29.27977752685547, "learning_rate": 1.2883526425083902e-05, "loss": 0.1641, "num_input_tokens_seen": 81347744, "step": 85195 }, { "epoch": 6.949995921363896, "grad_norm": 29.611915588378906, "learning_rate": 1.2880413232975497e-05, "loss": 0.1946, "num_input_tokens_seen": 81353264, "step": 85200 }, { "epoch": 6.950403784974305, "grad_norm": 0.6748983263969421, "learning_rate": 1.2877300286525953e-05, "loss": 0.2906, "num_input_tokens_seen": 81357136, "step": 85205 }, { "epoch": 6.950811648584713, "grad_norm": 131.70965576171875, "learning_rate": 1.2874187585798392e-05, "loss": 0.307, "num_input_tokens_seen": 81362288, "step": 85210 }, { "epoch": 6.951219512195122, "grad_norm": 0.6161702871322632, "learning_rate": 1.2871075130855897e-05, "loss": 0.3507, "num_input_tokens_seen": 81367040, "step": 85215 }, { "epoch": 6.951627375805531, "grad_norm": 1.7589811086654663, "learning_rate": 1.2867962921761555e-05, "loss": 0.3921, "num_input_tokens_seen": 81372096, "step": 85220 }, { "epoch": 6.952035239415939, "grad_norm": 0.4910737872123718, "learning_rate": 1.2864850958578451e-05, "loss": 0.2685, "num_input_tokens_seen": 81377584, "step": 85225 }, { "epoch": 6.952443103026348, "grad_norm": 1.3712750673294067, "learning_rate": 1.2861739241369647e-05, "loss": 0.2044, "num_input_tokens_seen": 81382448, "step": 85230 }, { "epoch": 6.952850966636756, "grad_norm": 26.474050521850586, "learning_rate": 1.2858627770198244e-05, "loss": 0.4069, "num_input_tokens_seen": 81387104, "step": 85235 }, { "epoch": 6.953258830247165, "grad_norm": 3.6935675144195557, "learning_rate": 1.2855516545127294e-05, "loss": 0.4078, "num_input_tokens_seen": 81391712, "step": 85240 }, { "epoch": 6.953666693857574, "grad_norm": 0.5489800572395325, "learning_rate": 1.2852405566219866e-05, "loss": 0.2789, "num_input_tokens_seen": 81395280, "step": 85245 }, { "epoch": 6.954074557467983, "grad_norm": 29.937368392944336, "learning_rate": 1.2849294833539016e-05, "loss": 0.4584, "num_input_tokens_seen": 81400384, "step": 85250 }, { "epoch": 6.954482421078391, "grad_norm": 10.067400932312012, "learning_rate": 1.2846184347147794e-05, "loss": 0.3353, "num_input_tokens_seen": 81405456, "step": 85255 }, { "epoch": 6.9548902846888, "grad_norm": 28.651613235473633, "learning_rate": 1.2843074107109254e-05, "loss": 0.4778, "num_input_tokens_seen": 81409968, "step": 85260 }, { "epoch": 6.9552981482992084, "grad_norm": 2.668058156967163, "learning_rate": 1.2839964113486435e-05, "loss": 0.151, "num_input_tokens_seen": 81414336, "step": 85265 }, { "epoch": 6.9557060119096175, "grad_norm": 27.255386352539062, "learning_rate": 1.2836854366342375e-05, "loss": 0.2865, "num_input_tokens_seen": 81418480, "step": 85270 }, { "epoch": 6.9561138755200265, "grad_norm": 0.6394271850585938, "learning_rate": 1.28337448657401e-05, "loss": 0.3946, "num_input_tokens_seen": 81423344, "step": 85275 }, { "epoch": 6.956521739130435, "grad_norm": 0.8210806250572205, "learning_rate": 1.2830635611742652e-05, "loss": 0.4039, "num_input_tokens_seen": 81429248, "step": 85280 }, { "epoch": 6.956929602740844, "grad_norm": 16.605804443359375, "learning_rate": 1.2827526604413053e-05, "loss": 0.3775, "num_input_tokens_seen": 81434560, "step": 85285 }, { "epoch": 6.957337466351252, "grad_norm": 0.41876229643821716, "learning_rate": 1.2824417843814319e-05, "loss": 0.1688, "num_input_tokens_seen": 81439616, "step": 85290 }, { "epoch": 6.957745329961661, "grad_norm": 0.8111001253128052, "learning_rate": 1.282130933000945e-05, "loss": 0.4055, "num_input_tokens_seen": 81443920, "step": 85295 }, { "epoch": 6.95815319357207, "grad_norm": 0.892085611820221, "learning_rate": 1.2818201063061474e-05, "loss": 0.4545, "num_input_tokens_seen": 81448496, "step": 85300 }, { "epoch": 6.958561057182478, "grad_norm": 3.891732931137085, "learning_rate": 1.2815093043033392e-05, "loss": 0.4148, "num_input_tokens_seen": 81452752, "step": 85305 }, { "epoch": 6.958968920792887, "grad_norm": 0.46877750754356384, "learning_rate": 1.2811985269988192e-05, "loss": 0.2922, "num_input_tokens_seen": 81456912, "step": 85310 }, { "epoch": 6.959376784403296, "grad_norm": 6.77639102935791, "learning_rate": 1.2808877743988871e-05, "loss": 0.4289, "num_input_tokens_seen": 81462080, "step": 85315 }, { "epoch": 6.959784648013704, "grad_norm": 20.043811798095703, "learning_rate": 1.280577046509841e-05, "loss": 0.2954, "num_input_tokens_seen": 81466256, "step": 85320 }, { "epoch": 6.960192511624113, "grad_norm": 134.4187469482422, "learning_rate": 1.2802663433379807e-05, "loss": 0.3416, "num_input_tokens_seen": 81471824, "step": 85325 }, { "epoch": 6.960600375234522, "grad_norm": 4.807365894317627, "learning_rate": 1.2799556648896033e-05, "loss": 0.3103, "num_input_tokens_seen": 81476016, "step": 85330 }, { "epoch": 6.96100823884493, "grad_norm": 26.08946990966797, "learning_rate": 1.2796450111710065e-05, "loss": 0.2964, "num_input_tokens_seen": 81481344, "step": 85335 }, { "epoch": 6.961416102455339, "grad_norm": 36.085758209228516, "learning_rate": 1.2793343821884857e-05, "loss": 0.3827, "num_input_tokens_seen": 81486880, "step": 85340 }, { "epoch": 6.961823966065747, "grad_norm": 1.886202096939087, "learning_rate": 1.2790237779483389e-05, "loss": 0.2562, "num_input_tokens_seen": 81491376, "step": 85345 }, { "epoch": 6.962231829676156, "grad_norm": 1.3397244215011597, "learning_rate": 1.2787131984568617e-05, "loss": 0.3191, "num_input_tokens_seen": 81496240, "step": 85350 }, { "epoch": 6.962639693286565, "grad_norm": 2.314343214035034, "learning_rate": 1.2784026437203489e-05, "loss": 0.3372, "num_input_tokens_seen": 81500816, "step": 85355 }, { "epoch": 6.963047556896973, "grad_norm": 0.6786313652992249, "learning_rate": 1.2780921137450953e-05, "loss": 0.2798, "num_input_tokens_seen": 81505296, "step": 85360 }, { "epoch": 6.963455420507382, "grad_norm": 1.1998635530471802, "learning_rate": 1.2777816085373945e-05, "loss": 0.4542, "num_input_tokens_seen": 81509824, "step": 85365 }, { "epoch": 6.9638632841177905, "grad_norm": 51.54231643676758, "learning_rate": 1.2774711281035423e-05, "loss": 0.2613, "num_input_tokens_seen": 81513792, "step": 85370 }, { "epoch": 6.9642711477281996, "grad_norm": 0.6220692992210388, "learning_rate": 1.2771606724498303e-05, "loss": 0.3357, "num_input_tokens_seen": 81518064, "step": 85375 }, { "epoch": 6.964679011338609, "grad_norm": 1.1402125358581543, "learning_rate": 1.2768502415825523e-05, "loss": 0.4304, "num_input_tokens_seen": 81523680, "step": 85380 }, { "epoch": 6.965086874949017, "grad_norm": 2.531747817993164, "learning_rate": 1.2765398355079999e-05, "loss": 0.394, "num_input_tokens_seen": 81528272, "step": 85385 }, { "epoch": 6.965494738559426, "grad_norm": 2.9947845935821533, "learning_rate": 1.2762294542324654e-05, "loss": 0.3747, "num_input_tokens_seen": 81533008, "step": 85390 }, { "epoch": 6.965902602169835, "grad_norm": 2.908273696899414, "learning_rate": 1.2759190977622398e-05, "loss": 0.4117, "num_input_tokens_seen": 81537936, "step": 85395 }, { "epoch": 6.966310465780243, "grad_norm": 19.24546241760254, "learning_rate": 1.2756087661036138e-05, "loss": 0.3969, "num_input_tokens_seen": 81543344, "step": 85400 }, { "epoch": 6.966718329390652, "grad_norm": 0.8365691900253296, "learning_rate": 1.275298459262877e-05, "loss": 0.3787, "num_input_tokens_seen": 81548880, "step": 85405 }, { "epoch": 6.967126193001061, "grad_norm": 31.187849044799805, "learning_rate": 1.274988177246321e-05, "loss": 0.3117, "num_input_tokens_seen": 81553536, "step": 85410 }, { "epoch": 6.967534056611469, "grad_norm": 1.5268596410751343, "learning_rate": 1.274677920060234e-05, "loss": 0.3202, "num_input_tokens_seen": 81558416, "step": 85415 }, { "epoch": 6.967941920221878, "grad_norm": 0.6941712498664856, "learning_rate": 1.2743676877109052e-05, "loss": 0.3257, "num_input_tokens_seen": 81563120, "step": 85420 }, { "epoch": 6.968349783832286, "grad_norm": 12.877711296081543, "learning_rate": 1.2740574802046221e-05, "loss": 0.3571, "num_input_tokens_seen": 81567664, "step": 85425 }, { "epoch": 6.968757647442695, "grad_norm": 20.769515991210938, "learning_rate": 1.2737472975476722e-05, "loss": 0.3505, "num_input_tokens_seen": 81572416, "step": 85430 }, { "epoch": 6.969165511053104, "grad_norm": 3.664527654647827, "learning_rate": 1.2734371397463446e-05, "loss": 0.2809, "num_input_tokens_seen": 81576944, "step": 85435 }, { "epoch": 6.969573374663512, "grad_norm": 1.385957956314087, "learning_rate": 1.2731270068069246e-05, "loss": 0.4663, "num_input_tokens_seen": 81581024, "step": 85440 }, { "epoch": 6.969981238273921, "grad_norm": 2.1137349605560303, "learning_rate": 1.2728168987356993e-05, "loss": 0.348, "num_input_tokens_seen": 81586512, "step": 85445 }, { "epoch": 6.97038910188433, "grad_norm": 2.1312410831451416, "learning_rate": 1.2725068155389524e-05, "loss": 0.4454, "num_input_tokens_seen": 81592048, "step": 85450 }, { "epoch": 6.970796965494738, "grad_norm": 0.6101498007774353, "learning_rate": 1.2721967572229724e-05, "loss": 0.4101, "num_input_tokens_seen": 81597696, "step": 85455 }, { "epoch": 6.971204829105147, "grad_norm": 3.072984218597412, "learning_rate": 1.271886723794042e-05, "loss": 0.3116, "num_input_tokens_seen": 81602640, "step": 85460 }, { "epoch": 6.971612692715556, "grad_norm": 3.8053300380706787, "learning_rate": 1.2715767152584462e-05, "loss": 0.4535, "num_input_tokens_seen": 81606752, "step": 85465 }, { "epoch": 6.9720205563259645, "grad_norm": 13.324809074401855, "learning_rate": 1.271266731622468e-05, "loss": 0.3251, "num_input_tokens_seen": 81610816, "step": 85470 }, { "epoch": 6.9724284199363735, "grad_norm": 1.5582435131072998, "learning_rate": 1.2709567728923904e-05, "loss": 0.2707, "num_input_tokens_seen": 81615536, "step": 85475 }, { "epoch": 6.972836283546782, "grad_norm": 4.54357385635376, "learning_rate": 1.2706468390744974e-05, "loss": 0.3704, "num_input_tokens_seen": 81619504, "step": 85480 }, { "epoch": 6.973244147157191, "grad_norm": 1.4034810066223145, "learning_rate": 1.2703369301750708e-05, "loss": 0.6377, "num_input_tokens_seen": 81624496, "step": 85485 }, { "epoch": 6.9736520107676, "grad_norm": 20.889347076416016, "learning_rate": 1.2700270462003922e-05, "loss": 0.2831, "num_input_tokens_seen": 81628448, "step": 85490 }, { "epoch": 6.974059874378008, "grad_norm": 38.43015670776367, "learning_rate": 1.2697171871567414e-05, "loss": 0.3341, "num_input_tokens_seen": 81633728, "step": 85495 }, { "epoch": 6.974467737988417, "grad_norm": 0.4120681583881378, "learning_rate": 1.269407353050402e-05, "loss": 0.2936, "num_input_tokens_seen": 81637760, "step": 85500 }, { "epoch": 6.974875601598825, "grad_norm": 6.212864398956299, "learning_rate": 1.2690975438876523e-05, "loss": 0.3714, "num_input_tokens_seen": 81641984, "step": 85505 }, { "epoch": 6.975283465209234, "grad_norm": 3.1401376724243164, "learning_rate": 1.2687877596747721e-05, "loss": 0.1733, "num_input_tokens_seen": 81647488, "step": 85510 }, { "epoch": 6.975691328819643, "grad_norm": 26.550067901611328, "learning_rate": 1.2684780004180414e-05, "loss": 0.2168, "num_input_tokens_seen": 81653296, "step": 85515 }, { "epoch": 6.976099192430051, "grad_norm": 33.0347785949707, "learning_rate": 1.2681682661237376e-05, "loss": 0.3383, "num_input_tokens_seen": 81657888, "step": 85520 }, { "epoch": 6.97650705604046, "grad_norm": 5.164280414581299, "learning_rate": 1.2678585567981402e-05, "loss": 0.4008, "num_input_tokens_seen": 81662800, "step": 85525 }, { "epoch": 6.976914919650869, "grad_norm": 13.78857135772705, "learning_rate": 1.267548872447526e-05, "loss": 0.2997, "num_input_tokens_seen": 81668224, "step": 85530 }, { "epoch": 6.977322783261277, "grad_norm": 0.4591246247291565, "learning_rate": 1.2672392130781715e-05, "loss": 0.3019, "num_input_tokens_seen": 81673200, "step": 85535 }, { "epoch": 6.977730646871686, "grad_norm": 9.608251571655273, "learning_rate": 1.2669295786963553e-05, "loss": 0.2848, "num_input_tokens_seen": 81678096, "step": 85540 }, { "epoch": 6.978138510482095, "grad_norm": 1.7259284257888794, "learning_rate": 1.266619969308353e-05, "loss": 0.269, "num_input_tokens_seen": 81682496, "step": 85545 }, { "epoch": 6.978546374092503, "grad_norm": 34.268211364746094, "learning_rate": 1.2663103849204395e-05, "loss": 0.3784, "num_input_tokens_seen": 81686288, "step": 85550 }, { "epoch": 6.978954237702912, "grad_norm": 1.5389970541000366, "learning_rate": 1.26600082553889e-05, "loss": 0.365, "num_input_tokens_seen": 81690736, "step": 85555 }, { "epoch": 6.9793621013133205, "grad_norm": 0.7513511776924133, "learning_rate": 1.2656912911699798e-05, "loss": 0.2595, "num_input_tokens_seen": 81695184, "step": 85560 }, { "epoch": 6.9797699649237295, "grad_norm": 4.010027885437012, "learning_rate": 1.2653817818199814e-05, "loss": 0.3712, "num_input_tokens_seen": 81699552, "step": 85565 }, { "epoch": 6.9801778285341385, "grad_norm": 0.5498788356781006, "learning_rate": 1.2650722974951706e-05, "loss": 0.3497, "num_input_tokens_seen": 81703168, "step": 85570 }, { "epoch": 6.980585692144547, "grad_norm": 2.3736143112182617, "learning_rate": 1.2647628382018197e-05, "loss": 0.4384, "num_input_tokens_seen": 81708080, "step": 85575 }, { "epoch": 6.980993555754956, "grad_norm": 0.4864256978034973, "learning_rate": 1.2644534039462013e-05, "loss": 0.3207, "num_input_tokens_seen": 81712016, "step": 85580 }, { "epoch": 6.981401419365364, "grad_norm": 2.019625186920166, "learning_rate": 1.2641439947345862e-05, "loss": 0.2198, "num_input_tokens_seen": 81716736, "step": 85585 }, { "epoch": 6.981809282975773, "grad_norm": 8.77662181854248, "learning_rate": 1.2638346105732485e-05, "loss": 0.4313, "num_input_tokens_seen": 81722064, "step": 85590 }, { "epoch": 6.982217146586182, "grad_norm": 1.1392592191696167, "learning_rate": 1.2635252514684576e-05, "loss": 0.3942, "num_input_tokens_seen": 81726448, "step": 85595 }, { "epoch": 6.98262501019659, "grad_norm": 52.105796813964844, "learning_rate": 1.2632159174264845e-05, "loss": 0.3685, "num_input_tokens_seen": 81730864, "step": 85600 }, { "epoch": 6.983032873806999, "grad_norm": 3.3037710189819336, "learning_rate": 1.2629066084535985e-05, "loss": 0.2165, "num_input_tokens_seen": 81736112, "step": 85605 }, { "epoch": 6.983440737417408, "grad_norm": 10.946784973144531, "learning_rate": 1.2625973245560706e-05, "loss": 0.3134, "num_input_tokens_seen": 81741392, "step": 85610 }, { "epoch": 6.983848601027816, "grad_norm": 23.186054229736328, "learning_rate": 1.2622880657401692e-05, "loss": 0.5286, "num_input_tokens_seen": 81745888, "step": 85615 }, { "epoch": 6.984256464638225, "grad_norm": 22.440866470336914, "learning_rate": 1.2619788320121628e-05, "loss": 0.4091, "num_input_tokens_seen": 81750768, "step": 85620 }, { "epoch": 6.984664328248634, "grad_norm": 77.70673370361328, "learning_rate": 1.2616696233783192e-05, "loss": 0.4751, "num_input_tokens_seen": 81756144, "step": 85625 }, { "epoch": 6.985072191859042, "grad_norm": 0.34460484981536865, "learning_rate": 1.2613604398449055e-05, "loss": 0.2597, "num_input_tokens_seen": 81761200, "step": 85630 }, { "epoch": 6.985480055469451, "grad_norm": 10.230003356933594, "learning_rate": 1.2610512814181901e-05, "loss": 0.363, "num_input_tokens_seen": 81765952, "step": 85635 }, { "epoch": 6.985887919079859, "grad_norm": 0.7795274257659912, "learning_rate": 1.260742148104439e-05, "loss": 0.3282, "num_input_tokens_seen": 81770816, "step": 85640 }, { "epoch": 6.986295782690268, "grad_norm": 1.5304994583129883, "learning_rate": 1.260433039909918e-05, "loss": 0.3452, "num_input_tokens_seen": 81775696, "step": 85645 }, { "epoch": 6.986703646300677, "grad_norm": 0.45711928606033325, "learning_rate": 1.260123956840892e-05, "loss": 0.305, "num_input_tokens_seen": 81780112, "step": 85650 }, { "epoch": 6.987111509911085, "grad_norm": 3.3045578002929688, "learning_rate": 1.259814898903626e-05, "loss": 0.626, "num_input_tokens_seen": 81783840, "step": 85655 }, { "epoch": 6.987519373521494, "grad_norm": 4.780263900756836, "learning_rate": 1.2595058661043859e-05, "loss": 0.5249, "num_input_tokens_seen": 81788560, "step": 85660 }, { "epoch": 6.9879272371319034, "grad_norm": 15.389569282531738, "learning_rate": 1.2591968584494348e-05, "loss": 0.2306, "num_input_tokens_seen": 81793696, "step": 85665 }, { "epoch": 6.988335100742312, "grad_norm": 5.538829803466797, "learning_rate": 1.2588878759450363e-05, "loss": 0.3898, "num_input_tokens_seen": 81798688, "step": 85670 }, { "epoch": 6.988742964352721, "grad_norm": 20.465167999267578, "learning_rate": 1.2585789185974527e-05, "loss": 0.4681, "num_input_tokens_seen": 81803952, "step": 85675 }, { "epoch": 6.98915082796313, "grad_norm": 0.8983767032623291, "learning_rate": 1.2582699864129472e-05, "loss": 0.4703, "num_input_tokens_seen": 81809152, "step": 85680 }, { "epoch": 6.989558691573538, "grad_norm": 2.3250741958618164, "learning_rate": 1.2579610793977813e-05, "loss": 0.3647, "num_input_tokens_seen": 81813984, "step": 85685 }, { "epoch": 6.989966555183947, "grad_norm": 5.713572025299072, "learning_rate": 1.2576521975582167e-05, "loss": 0.6248, "num_input_tokens_seen": 81819472, "step": 85690 }, { "epoch": 6.990374418794355, "grad_norm": 0.42829766869544983, "learning_rate": 1.2573433409005133e-05, "loss": 0.3239, "num_input_tokens_seen": 81824368, "step": 85695 }, { "epoch": 6.990782282404764, "grad_norm": 27.28534698486328, "learning_rate": 1.2570345094309333e-05, "loss": 0.4914, "num_input_tokens_seen": 81828960, "step": 85700 }, { "epoch": 6.991190146015173, "grad_norm": 4.368559837341309, "learning_rate": 1.2567257031557356e-05, "loss": 0.2784, "num_input_tokens_seen": 81833888, "step": 85705 }, { "epoch": 6.991598009625581, "grad_norm": 13.553877830505371, "learning_rate": 1.2564169220811795e-05, "loss": 0.535, "num_input_tokens_seen": 81838576, "step": 85710 }, { "epoch": 6.99200587323599, "grad_norm": 0.7056224346160889, "learning_rate": 1.2561081662135239e-05, "loss": 0.239, "num_input_tokens_seen": 81843536, "step": 85715 }, { "epoch": 6.992413736846398, "grad_norm": 1.902341604232788, "learning_rate": 1.2557994355590263e-05, "loss": 0.2799, "num_input_tokens_seen": 81848640, "step": 85720 }, { "epoch": 6.992821600456807, "grad_norm": 1.958404541015625, "learning_rate": 1.2554907301239463e-05, "loss": 0.3191, "num_input_tokens_seen": 81853184, "step": 85725 }, { "epoch": 6.993229464067216, "grad_norm": 7.557413578033447, "learning_rate": 1.2551820499145407e-05, "loss": 0.4469, "num_input_tokens_seen": 81857616, "step": 85730 }, { "epoch": 6.993637327677624, "grad_norm": 0.46422457695007324, "learning_rate": 1.2548733949370656e-05, "loss": 0.2942, "num_input_tokens_seen": 81862112, "step": 85735 }, { "epoch": 6.994045191288033, "grad_norm": 12.591503143310547, "learning_rate": 1.2545647651977769e-05, "loss": 0.3544, "num_input_tokens_seen": 81866704, "step": 85740 }, { "epoch": 6.994453054898442, "grad_norm": 42.11956024169922, "learning_rate": 1.2542561607029322e-05, "loss": 0.4765, "num_input_tokens_seen": 81870768, "step": 85745 }, { "epoch": 6.99486091850885, "grad_norm": 1.3357946872711182, "learning_rate": 1.2539475814587856e-05, "loss": 0.2795, "num_input_tokens_seen": 81875440, "step": 85750 }, { "epoch": 6.995268782119259, "grad_norm": 0.7210766673088074, "learning_rate": 1.253639027471592e-05, "loss": 0.4018, "num_input_tokens_seen": 81880016, "step": 85755 }, { "epoch": 6.995676645729668, "grad_norm": 0.818259596824646, "learning_rate": 1.2533304987476051e-05, "loss": 0.2662, "num_input_tokens_seen": 81885456, "step": 85760 }, { "epoch": 6.9960845093400765, "grad_norm": 0.7971762418746948, "learning_rate": 1.25302199529308e-05, "loss": 0.4229, "num_input_tokens_seen": 81891024, "step": 85765 }, { "epoch": 6.9964923729504855, "grad_norm": 1.7792816162109375, "learning_rate": 1.2527135171142693e-05, "loss": 0.2947, "num_input_tokens_seen": 81895776, "step": 85770 }, { "epoch": 6.996900236560894, "grad_norm": 0.9523249268531799, "learning_rate": 1.2524050642174257e-05, "loss": 0.3304, "num_input_tokens_seen": 81900064, "step": 85775 }, { "epoch": 6.997308100171303, "grad_norm": 0.5920120477676392, "learning_rate": 1.252096636608801e-05, "loss": 0.361, "num_input_tokens_seen": 81905552, "step": 85780 }, { "epoch": 6.997715963781712, "grad_norm": 22.57669448852539, "learning_rate": 1.2517882342946462e-05, "loss": 0.3494, "num_input_tokens_seen": 81910000, "step": 85785 }, { "epoch": 6.99812382739212, "grad_norm": 0.8891860246658325, "learning_rate": 1.2514798572812148e-05, "loss": 0.3397, "num_input_tokens_seen": 81914032, "step": 85790 }, { "epoch": 6.998531691002529, "grad_norm": 0.6722621917724609, "learning_rate": 1.2511715055747558e-05, "loss": 0.2799, "num_input_tokens_seen": 81918656, "step": 85795 }, { "epoch": 6.998939554612938, "grad_norm": 0.8848140835762024, "learning_rate": 1.25086317918152e-05, "loss": 0.4692, "num_input_tokens_seen": 81923424, "step": 85800 }, { "epoch": 6.999347418223346, "grad_norm": 0.9032717347145081, "learning_rate": 1.2505548781077567e-05, "loss": 0.3263, "num_input_tokens_seen": 81928288, "step": 85805 }, { "epoch": 6.999755281833755, "grad_norm": 1.095925211906433, "learning_rate": 1.250246602359715e-05, "loss": 0.3018, "num_input_tokens_seen": 81932944, "step": 85810 }, { "epoch": 7.000163145444163, "grad_norm": 0.7254227995872498, "learning_rate": 1.2499383519436439e-05, "loss": 0.4136, "num_input_tokens_seen": 81937808, "step": 85815 }, { "epoch": 7.000571009054572, "grad_norm": 1.2762138843536377, "learning_rate": 1.249630126865791e-05, "loss": 0.4356, "num_input_tokens_seen": 81942784, "step": 85820 }, { "epoch": 7.000571009054572, "eval_loss": 0.36838069558143616, "eval_runtime": 570.9162, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 81942784, "step": 85820 }, { "epoch": 7.000978872664981, "grad_norm": 6.133650779724121, "learning_rate": 1.2493219271324041e-05, "loss": 0.3734, "num_input_tokens_seen": 81948432, "step": 85825 }, { "epoch": 7.001386736275389, "grad_norm": 0.35423964262008667, "learning_rate": 1.2490137527497295e-05, "loss": 0.3954, "num_input_tokens_seen": 81953984, "step": 85830 }, { "epoch": 7.001794599885798, "grad_norm": 0.3405199646949768, "learning_rate": 1.2487056037240152e-05, "loss": 0.324, "num_input_tokens_seen": 81958800, "step": 85835 }, { "epoch": 7.002202463496207, "grad_norm": 0.6215469837188721, "learning_rate": 1.2483974800615069e-05, "loss": 0.4282, "num_input_tokens_seen": 81962592, "step": 85840 }, { "epoch": 7.002610327106615, "grad_norm": 0.8802592158317566, "learning_rate": 1.2480893817684498e-05, "loss": 0.3499, "num_input_tokens_seen": 81966592, "step": 85845 }, { "epoch": 7.003018190717024, "grad_norm": 0.31986889243125916, "learning_rate": 1.2477813088510879e-05, "loss": 0.286, "num_input_tokens_seen": 81972112, "step": 85850 }, { "epoch": 7.0034260543274325, "grad_norm": 28.296266555786133, "learning_rate": 1.2474732613156678e-05, "loss": 0.3592, "num_input_tokens_seen": 81976144, "step": 85855 }, { "epoch": 7.0038339179378415, "grad_norm": 0.604913055896759, "learning_rate": 1.2471652391684325e-05, "loss": 0.408, "num_input_tokens_seen": 81980800, "step": 85860 }, { "epoch": 7.0042417815482505, "grad_norm": 44.25483322143555, "learning_rate": 1.2468572424156252e-05, "loss": 0.4193, "num_input_tokens_seen": 81986560, "step": 85865 }, { "epoch": 7.004649645158659, "grad_norm": 0.8822099566459656, "learning_rate": 1.2465492710634894e-05, "loss": 0.3376, "num_input_tokens_seen": 81990400, "step": 85870 }, { "epoch": 7.005057508769068, "grad_norm": 35.83433532714844, "learning_rate": 1.2462413251182662e-05, "loss": 0.4985, "num_input_tokens_seen": 81995104, "step": 85875 }, { "epoch": 7.005465372379477, "grad_norm": 0.7370045185089111, "learning_rate": 1.2459334045861995e-05, "loss": 0.3345, "num_input_tokens_seen": 82000560, "step": 85880 }, { "epoch": 7.005873235989885, "grad_norm": 3.444901943206787, "learning_rate": 1.2456255094735297e-05, "loss": 0.5594, "num_input_tokens_seen": 82005488, "step": 85885 }, { "epoch": 7.006281099600294, "grad_norm": 4.3819451332092285, "learning_rate": 1.245317639786498e-05, "loss": 0.2532, "num_input_tokens_seen": 82009136, "step": 85890 }, { "epoch": 7.006688963210703, "grad_norm": 0.7361308932304382, "learning_rate": 1.2450097955313436e-05, "loss": 0.3896, "num_input_tokens_seen": 82013760, "step": 85895 }, { "epoch": 7.007096826821111, "grad_norm": 47.032562255859375, "learning_rate": 1.2447019767143081e-05, "loss": 0.4594, "num_input_tokens_seen": 82018640, "step": 85900 }, { "epoch": 7.00750469043152, "grad_norm": 1.166259765625, "learning_rate": 1.2443941833416303e-05, "loss": 0.3235, "num_input_tokens_seen": 82023792, "step": 85905 }, { "epoch": 7.007912554041928, "grad_norm": 24.917688369750977, "learning_rate": 1.2440864154195488e-05, "loss": 0.2365, "num_input_tokens_seen": 82027552, "step": 85910 }, { "epoch": 7.008320417652337, "grad_norm": 1.4626452922821045, "learning_rate": 1.2437786729543021e-05, "loss": 0.3631, "num_input_tokens_seen": 82031680, "step": 85915 }, { "epoch": 7.008728281262746, "grad_norm": 17.296457290649414, "learning_rate": 1.243470955952127e-05, "loss": 0.3819, "num_input_tokens_seen": 82036624, "step": 85920 }, { "epoch": 7.009136144873154, "grad_norm": 2.926736354827881, "learning_rate": 1.2431632644192626e-05, "loss": 0.4182, "num_input_tokens_seen": 82041264, "step": 85925 }, { "epoch": 7.009544008483563, "grad_norm": 13.464999198913574, "learning_rate": 1.2428555983619448e-05, "loss": 0.3688, "num_input_tokens_seen": 82046096, "step": 85930 }, { "epoch": 7.009951872093972, "grad_norm": 8.365955352783203, "learning_rate": 1.2425479577864097e-05, "loss": 0.3264, "num_input_tokens_seen": 82050928, "step": 85935 }, { "epoch": 7.01035973570438, "grad_norm": 1.413692593574524, "learning_rate": 1.2422403426988932e-05, "loss": 0.3183, "num_input_tokens_seen": 82055424, "step": 85940 }, { "epoch": 7.010767599314789, "grad_norm": 2.215313196182251, "learning_rate": 1.2419327531056304e-05, "loss": 0.4335, "num_input_tokens_seen": 82059920, "step": 85945 }, { "epoch": 7.011175462925197, "grad_norm": 44.57582473754883, "learning_rate": 1.2416251890128563e-05, "loss": 0.2398, "num_input_tokens_seen": 82064256, "step": 85950 }, { "epoch": 7.0115833265356065, "grad_norm": 24.771528244018555, "learning_rate": 1.2413176504268047e-05, "loss": 0.3397, "num_input_tokens_seen": 82069120, "step": 85955 }, { "epoch": 7.0119911901460155, "grad_norm": 4.1688456535339355, "learning_rate": 1.2410101373537089e-05, "loss": 0.3166, "num_input_tokens_seen": 82073856, "step": 85960 }, { "epoch": 7.012399053756424, "grad_norm": 16.350454330444336, "learning_rate": 1.2407026497998033e-05, "loss": 0.3403, "num_input_tokens_seen": 82078816, "step": 85965 }, { "epoch": 7.012806917366833, "grad_norm": 20.525484085083008, "learning_rate": 1.2403951877713198e-05, "loss": 0.3533, "num_input_tokens_seen": 82083920, "step": 85970 }, { "epoch": 7.013214780977242, "grad_norm": 44.77782440185547, "learning_rate": 1.2400877512744907e-05, "loss": 0.3579, "num_input_tokens_seen": 82087808, "step": 85975 }, { "epoch": 7.01362264458765, "grad_norm": 7.1748948097229, "learning_rate": 1.2397803403155479e-05, "loss": 0.3268, "num_input_tokens_seen": 82091760, "step": 85980 }, { "epoch": 7.014030508198059, "grad_norm": 33.60988235473633, "learning_rate": 1.2394729549007206e-05, "loss": 0.3029, "num_input_tokens_seen": 82096016, "step": 85985 }, { "epoch": 7.014438371808467, "grad_norm": 4.529780387878418, "learning_rate": 1.2391655950362419e-05, "loss": 0.3092, "num_input_tokens_seen": 82100608, "step": 85990 }, { "epoch": 7.014846235418876, "grad_norm": 29.378917694091797, "learning_rate": 1.2388582607283412e-05, "loss": 0.323, "num_input_tokens_seen": 82105936, "step": 85995 }, { "epoch": 7.015254099029285, "grad_norm": 22.928401947021484, "learning_rate": 1.2385509519832473e-05, "loss": 0.3727, "num_input_tokens_seen": 82111232, "step": 86000 }, { "epoch": 7.015661962639693, "grad_norm": 1.2846499681472778, "learning_rate": 1.2382436688071885e-05, "loss": 0.2234, "num_input_tokens_seen": 82115872, "step": 86005 }, { "epoch": 7.016069826250102, "grad_norm": 25.372333526611328, "learning_rate": 1.2379364112063957e-05, "loss": 0.3467, "num_input_tokens_seen": 82119936, "step": 86010 }, { "epoch": 7.016477689860511, "grad_norm": 11.797186851501465, "learning_rate": 1.2376291791870956e-05, "loss": 0.2426, "num_input_tokens_seen": 82124640, "step": 86015 }, { "epoch": 7.016885553470919, "grad_norm": 39.9974479675293, "learning_rate": 1.2373219727555154e-05, "loss": 0.3535, "num_input_tokens_seen": 82128896, "step": 86020 }, { "epoch": 7.017293417081328, "grad_norm": 54.509315490722656, "learning_rate": 1.2370147919178823e-05, "loss": 0.3888, "num_input_tokens_seen": 82133776, "step": 86025 }, { "epoch": 7.017701280691736, "grad_norm": 1.31008780002594, "learning_rate": 1.2367076366804218e-05, "loss": 0.3072, "num_input_tokens_seen": 82138400, "step": 86030 }, { "epoch": 7.018109144302145, "grad_norm": 18.392179489135742, "learning_rate": 1.2364005070493617e-05, "loss": 0.4346, "num_input_tokens_seen": 82142272, "step": 86035 }, { "epoch": 7.018517007912554, "grad_norm": 0.26080116629600525, "learning_rate": 1.2360934030309263e-05, "loss": 0.2987, "num_input_tokens_seen": 82145584, "step": 86040 }, { "epoch": 7.018924871522962, "grad_norm": 0.7988191843032837, "learning_rate": 1.2357863246313406e-05, "loss": 0.3756, "num_input_tokens_seen": 82150736, "step": 86045 }, { "epoch": 7.019332735133371, "grad_norm": 1.5349622964859009, "learning_rate": 1.2354792718568279e-05, "loss": 0.5694, "num_input_tokens_seen": 82154336, "step": 86050 }, { "epoch": 7.01974059874378, "grad_norm": 2.491549491882324, "learning_rate": 1.2351722447136137e-05, "loss": 0.1994, "num_input_tokens_seen": 82159584, "step": 86055 }, { "epoch": 7.0201484623541885, "grad_norm": 21.099830627441406, "learning_rate": 1.234865243207921e-05, "loss": 0.2981, "num_input_tokens_seen": 82163744, "step": 86060 }, { "epoch": 7.0205563259645976, "grad_norm": 31.810331344604492, "learning_rate": 1.2345582673459725e-05, "loss": 0.3687, "num_input_tokens_seen": 82167888, "step": 86065 }, { "epoch": 7.020964189575006, "grad_norm": 1.996788740158081, "learning_rate": 1.2342513171339898e-05, "loss": 0.3025, "num_input_tokens_seen": 82173072, "step": 86070 }, { "epoch": 7.021372053185415, "grad_norm": 7.718031883239746, "learning_rate": 1.233944392578195e-05, "loss": 0.3414, "num_input_tokens_seen": 82177600, "step": 86075 }, { "epoch": 7.021779916795824, "grad_norm": 23.161426544189453, "learning_rate": 1.2336374936848094e-05, "loss": 0.5762, "num_input_tokens_seen": 82183088, "step": 86080 }, { "epoch": 7.022187780406232, "grad_norm": 0.6810378432273865, "learning_rate": 1.233330620460054e-05, "loss": 0.3023, "num_input_tokens_seen": 82187920, "step": 86085 }, { "epoch": 7.022595644016641, "grad_norm": 0.7562113404273987, "learning_rate": 1.2330237729101484e-05, "loss": 0.2822, "num_input_tokens_seen": 82193152, "step": 86090 }, { "epoch": 7.02300350762705, "grad_norm": 3.8286960124969482, "learning_rate": 1.2327169510413117e-05, "loss": 0.4541, "num_input_tokens_seen": 82197072, "step": 86095 }, { "epoch": 7.023411371237458, "grad_norm": 0.8052070736885071, "learning_rate": 1.2324101548597647e-05, "loss": 0.3881, "num_input_tokens_seen": 82200144, "step": 86100 }, { "epoch": 7.023819234847867, "grad_norm": 0.35882312059402466, "learning_rate": 1.2321033843717255e-05, "loss": 0.32, "num_input_tokens_seen": 82204512, "step": 86105 }, { "epoch": 7.024227098458276, "grad_norm": 3.93149995803833, "learning_rate": 1.2317966395834119e-05, "loss": 0.2533, "num_input_tokens_seen": 82209408, "step": 86110 }, { "epoch": 7.024634962068684, "grad_norm": 29.173006057739258, "learning_rate": 1.2314899205010413e-05, "loss": 0.3924, "num_input_tokens_seen": 82214736, "step": 86115 }, { "epoch": 7.025042825679093, "grad_norm": 0.9000003933906555, "learning_rate": 1.2311832271308302e-05, "loss": 0.4646, "num_input_tokens_seen": 82220144, "step": 86120 }, { "epoch": 7.025450689289501, "grad_norm": 0.6204180717468262, "learning_rate": 1.2308765594789967e-05, "loss": 0.2966, "num_input_tokens_seen": 82225360, "step": 86125 }, { "epoch": 7.02585855289991, "grad_norm": 3.098344087600708, "learning_rate": 1.2305699175517563e-05, "loss": 0.2518, "num_input_tokens_seen": 82230912, "step": 86130 }, { "epoch": 7.026266416510319, "grad_norm": 0.29169848561286926, "learning_rate": 1.2302633013553242e-05, "loss": 0.286, "num_input_tokens_seen": 82235280, "step": 86135 }, { "epoch": 7.026674280120727, "grad_norm": 32.702266693115234, "learning_rate": 1.2299567108959145e-05, "loss": 0.4598, "num_input_tokens_seen": 82240096, "step": 86140 }, { "epoch": 7.027082143731136, "grad_norm": 0.5315441489219666, "learning_rate": 1.2296501461797432e-05, "loss": 0.4377, "num_input_tokens_seen": 82245280, "step": 86145 }, { "epoch": 7.027490007341545, "grad_norm": 0.8155116438865662, "learning_rate": 1.2293436072130241e-05, "loss": 0.3546, "num_input_tokens_seen": 82249952, "step": 86150 }, { "epoch": 7.0278978709519535, "grad_norm": 4.166966438293457, "learning_rate": 1.2290370940019699e-05, "loss": 0.3413, "num_input_tokens_seen": 82255008, "step": 86155 }, { "epoch": 7.0283057345623625, "grad_norm": 0.6177582740783691, "learning_rate": 1.2287306065527929e-05, "loss": 0.3429, "num_input_tokens_seen": 82259712, "step": 86160 }, { "epoch": 7.028713598172771, "grad_norm": 0.6639559268951416, "learning_rate": 1.2284241448717073e-05, "loss": 0.3192, "num_input_tokens_seen": 82264944, "step": 86165 }, { "epoch": 7.02912146178318, "grad_norm": 0.2622127830982208, "learning_rate": 1.228117708964924e-05, "loss": 0.2759, "num_input_tokens_seen": 82269632, "step": 86170 }, { "epoch": 7.029529325393589, "grad_norm": 0.9096003174781799, "learning_rate": 1.2278112988386542e-05, "loss": 0.2566, "num_input_tokens_seen": 82274880, "step": 86175 }, { "epoch": 7.029937189003997, "grad_norm": 0.5371572971343994, "learning_rate": 1.2275049144991086e-05, "loss": 0.2886, "num_input_tokens_seen": 82279392, "step": 86180 }, { "epoch": 7.030345052614406, "grad_norm": 0.5205113887786865, "learning_rate": 1.2271985559524968e-05, "loss": 0.281, "num_input_tokens_seen": 82284256, "step": 86185 }, { "epoch": 7.030752916224815, "grad_norm": 0.9004679322242737, "learning_rate": 1.2268922232050306e-05, "loss": 0.4147, "num_input_tokens_seen": 82288512, "step": 86190 }, { "epoch": 7.031160779835223, "grad_norm": 0.2735176682472229, "learning_rate": 1.2265859162629179e-05, "loss": 0.3135, "num_input_tokens_seen": 82293904, "step": 86195 }, { "epoch": 7.031568643445632, "grad_norm": 0.5723971128463745, "learning_rate": 1.2262796351323674e-05, "loss": 0.2715, "num_input_tokens_seen": 82299440, "step": 86200 }, { "epoch": 7.03197650705604, "grad_norm": 28.803003311157227, "learning_rate": 1.2259733798195878e-05, "loss": 0.5854, "num_input_tokens_seen": 82304384, "step": 86205 }, { "epoch": 7.032384370666449, "grad_norm": 7.048911094665527, "learning_rate": 1.2256671503307863e-05, "loss": 0.331, "num_input_tokens_seen": 82308704, "step": 86210 }, { "epoch": 7.032792234276858, "grad_norm": 17.73029899597168, "learning_rate": 1.22536094667217e-05, "loss": 0.3543, "num_input_tokens_seen": 82313984, "step": 86215 }, { "epoch": 7.033200097887266, "grad_norm": 20.358783721923828, "learning_rate": 1.2250547688499458e-05, "loss": 0.3602, "num_input_tokens_seen": 82318848, "step": 86220 }, { "epoch": 7.033607961497675, "grad_norm": 0.5782017111778259, "learning_rate": 1.2247486168703187e-05, "loss": 0.3431, "num_input_tokens_seen": 82322592, "step": 86225 }, { "epoch": 7.034015825108084, "grad_norm": 0.9812934398651123, "learning_rate": 1.2244424907394963e-05, "loss": 0.2162, "num_input_tokens_seen": 82327824, "step": 86230 }, { "epoch": 7.034423688718492, "grad_norm": 6.012195110321045, "learning_rate": 1.2241363904636827e-05, "loss": 0.4223, "num_input_tokens_seen": 82332672, "step": 86235 }, { "epoch": 7.034831552328901, "grad_norm": 69.45789337158203, "learning_rate": 1.2238303160490818e-05, "loss": 0.3752, "num_input_tokens_seen": 82338352, "step": 86240 }, { "epoch": 7.0352394159393095, "grad_norm": 1.709854245185852, "learning_rate": 1.2235242675018988e-05, "loss": 0.3079, "num_input_tokens_seen": 82342576, "step": 86245 }, { "epoch": 7.0356472795497185, "grad_norm": 39.387874603271484, "learning_rate": 1.223218244828335e-05, "loss": 0.3225, "num_input_tokens_seen": 82347200, "step": 86250 }, { "epoch": 7.0360551431601275, "grad_norm": 1.757149577140808, "learning_rate": 1.2229122480345962e-05, "loss": 0.2937, "num_input_tokens_seen": 82351648, "step": 86255 }, { "epoch": 7.036463006770536, "grad_norm": 1.9397646188735962, "learning_rate": 1.222606277126883e-05, "loss": 0.3263, "num_input_tokens_seen": 82355984, "step": 86260 }, { "epoch": 7.036870870380945, "grad_norm": 0.6595641374588013, "learning_rate": 1.2223003321113983e-05, "loss": 0.2281, "num_input_tokens_seen": 82361344, "step": 86265 }, { "epoch": 7.037278733991354, "grad_norm": 1.6155588626861572, "learning_rate": 1.2219944129943426e-05, "loss": 0.303, "num_input_tokens_seen": 82366656, "step": 86270 }, { "epoch": 7.037686597601762, "grad_norm": 2.213348865509033, "learning_rate": 1.2216885197819162e-05, "loss": 0.3145, "num_input_tokens_seen": 82370864, "step": 86275 }, { "epoch": 7.038094461212171, "grad_norm": 0.47379806637763977, "learning_rate": 1.2213826524803212e-05, "loss": 0.426, "num_input_tokens_seen": 82376432, "step": 86280 }, { "epoch": 7.03850232482258, "grad_norm": 0.3961562216281891, "learning_rate": 1.2210768110957566e-05, "loss": 0.2948, "num_input_tokens_seen": 82381968, "step": 86285 }, { "epoch": 7.038910188432988, "grad_norm": 1.3574846982955933, "learning_rate": 1.2207709956344216e-05, "loss": 0.2353, "num_input_tokens_seen": 82385664, "step": 86290 }, { "epoch": 7.039318052043397, "grad_norm": 38.07468032836914, "learning_rate": 1.220465206102514e-05, "loss": 0.4607, "num_input_tokens_seen": 82390560, "step": 86295 }, { "epoch": 7.039725915653805, "grad_norm": 3.2198619842529297, "learning_rate": 1.2201594425062338e-05, "loss": 0.542, "num_input_tokens_seen": 82396304, "step": 86300 }, { "epoch": 7.040133779264214, "grad_norm": 0.72385174036026, "learning_rate": 1.2198537048517781e-05, "loss": 0.4284, "num_input_tokens_seen": 82400656, "step": 86305 }, { "epoch": 7.040541642874623, "grad_norm": 42.734107971191406, "learning_rate": 1.2195479931453436e-05, "loss": 0.3326, "num_input_tokens_seen": 82405664, "step": 86310 }, { "epoch": 7.040949506485031, "grad_norm": 18.831872940063477, "learning_rate": 1.2192423073931272e-05, "loss": 0.2976, "num_input_tokens_seen": 82410224, "step": 86315 }, { "epoch": 7.04135737009544, "grad_norm": 4.106160640716553, "learning_rate": 1.2189366476013242e-05, "loss": 0.2326, "num_input_tokens_seen": 82414240, "step": 86320 }, { "epoch": 7.041765233705849, "grad_norm": 12.401236534118652, "learning_rate": 1.218631013776132e-05, "loss": 0.5079, "num_input_tokens_seen": 82419152, "step": 86325 }, { "epoch": 7.042173097316257, "grad_norm": 0.48748835921287537, "learning_rate": 1.2183254059237446e-05, "loss": 0.3148, "num_input_tokens_seen": 82424240, "step": 86330 }, { "epoch": 7.042580960926666, "grad_norm": 45.799922943115234, "learning_rate": 1.2180198240503562e-05, "loss": 0.4104, "num_input_tokens_seen": 82428528, "step": 86335 }, { "epoch": 7.042988824537074, "grad_norm": 1.7419761419296265, "learning_rate": 1.2177142681621618e-05, "loss": 0.3903, "num_input_tokens_seen": 82433696, "step": 86340 }, { "epoch": 7.043396688147483, "grad_norm": 10.551450729370117, "learning_rate": 1.2174087382653531e-05, "loss": 0.2851, "num_input_tokens_seen": 82439168, "step": 86345 }, { "epoch": 7.043804551757892, "grad_norm": 46.183406829833984, "learning_rate": 1.2171032343661252e-05, "loss": 0.3908, "num_input_tokens_seen": 82444288, "step": 86350 }, { "epoch": 7.044212415368301, "grad_norm": 2.4358274936676025, "learning_rate": 1.2167977564706695e-05, "loss": 0.2917, "num_input_tokens_seen": 82449472, "step": 86355 }, { "epoch": 7.04462027897871, "grad_norm": 22.35620880126953, "learning_rate": 1.2164923045851783e-05, "loss": 0.4151, "num_input_tokens_seen": 82453584, "step": 86360 }, { "epoch": 7.045028142589119, "grad_norm": 10.571795463562012, "learning_rate": 1.2161868787158426e-05, "loss": 0.5136, "num_input_tokens_seen": 82458848, "step": 86365 }, { "epoch": 7.045436006199527, "grad_norm": 2.111574172973633, "learning_rate": 1.2158814788688533e-05, "loss": 0.3348, "num_input_tokens_seen": 82464256, "step": 86370 }, { "epoch": 7.045843869809936, "grad_norm": 1.1764898300170898, "learning_rate": 1.2155761050504005e-05, "loss": 0.3432, "num_input_tokens_seen": 82468928, "step": 86375 }, { "epoch": 7.046251733420344, "grad_norm": 2.6427881717681885, "learning_rate": 1.2152707572666747e-05, "loss": 0.4465, "num_input_tokens_seen": 82474144, "step": 86380 }, { "epoch": 7.046659597030753, "grad_norm": 7.970294952392578, "learning_rate": 1.2149654355238634e-05, "loss": 0.3704, "num_input_tokens_seen": 82478624, "step": 86385 }, { "epoch": 7.047067460641162, "grad_norm": 1.3345228433609009, "learning_rate": 1.214660139828158e-05, "loss": 0.2613, "num_input_tokens_seen": 82482704, "step": 86390 }, { "epoch": 7.04747532425157, "grad_norm": 2.877742052078247, "learning_rate": 1.2143548701857453e-05, "loss": 0.2934, "num_input_tokens_seen": 82487584, "step": 86395 }, { "epoch": 7.047883187861979, "grad_norm": 1.4984434843063354, "learning_rate": 1.2140496266028128e-05, "loss": 0.2528, "num_input_tokens_seen": 82493024, "step": 86400 }, { "epoch": 7.048291051472388, "grad_norm": 1.582885980606079, "learning_rate": 1.2137444090855473e-05, "loss": 0.2204, "num_input_tokens_seen": 82497712, "step": 86405 }, { "epoch": 7.048698915082796, "grad_norm": 1.1238244771957397, "learning_rate": 1.2134392176401369e-05, "loss": 0.373, "num_input_tokens_seen": 82501296, "step": 86410 }, { "epoch": 7.049106778693205, "grad_norm": 0.28861063718795776, "learning_rate": 1.2131340522727669e-05, "loss": 0.3241, "num_input_tokens_seen": 82504880, "step": 86415 }, { "epoch": 7.049514642303613, "grad_norm": 0.2370678335428238, "learning_rate": 1.2128289129896228e-05, "loss": 0.4354, "num_input_tokens_seen": 82509216, "step": 86420 }, { "epoch": 7.049922505914022, "grad_norm": 1.2705590724945068, "learning_rate": 1.2125237997968897e-05, "loss": 0.2618, "num_input_tokens_seen": 82513968, "step": 86425 }, { "epoch": 7.050330369524431, "grad_norm": 0.6441341042518616, "learning_rate": 1.2122187127007512e-05, "loss": 0.3628, "num_input_tokens_seen": 82518864, "step": 86430 }, { "epoch": 7.050738233134839, "grad_norm": 20.224843978881836, "learning_rate": 1.2119136517073932e-05, "loss": 0.2542, "num_input_tokens_seen": 82523840, "step": 86435 }, { "epoch": 7.051146096745248, "grad_norm": 0.671816885471344, "learning_rate": 1.2116086168229978e-05, "loss": 0.3182, "num_input_tokens_seen": 82529648, "step": 86440 }, { "epoch": 7.051553960355657, "grad_norm": 0.7933605909347534, "learning_rate": 1.2113036080537484e-05, "loss": 0.3807, "num_input_tokens_seen": 82533536, "step": 86445 }, { "epoch": 7.0519618239660655, "grad_norm": 30.901039123535156, "learning_rate": 1.2109986254058264e-05, "loss": 0.4644, "num_input_tokens_seen": 82538656, "step": 86450 }, { "epoch": 7.0523696875764745, "grad_norm": 29.24452781677246, "learning_rate": 1.2106936688854157e-05, "loss": 0.2998, "num_input_tokens_seen": 82543408, "step": 86455 }, { "epoch": 7.0527775511868835, "grad_norm": 4.633955478668213, "learning_rate": 1.2103887384986961e-05, "loss": 0.241, "num_input_tokens_seen": 82548272, "step": 86460 }, { "epoch": 7.053185414797292, "grad_norm": 10.812987327575684, "learning_rate": 1.2100838342518489e-05, "loss": 0.4771, "num_input_tokens_seen": 82553248, "step": 86465 }, { "epoch": 7.053593278407701, "grad_norm": 20.71027374267578, "learning_rate": 1.2097789561510543e-05, "loss": 0.42, "num_input_tokens_seen": 82557696, "step": 86470 }, { "epoch": 7.054001142018109, "grad_norm": 41.60735321044922, "learning_rate": 1.209474104202491e-05, "loss": 0.3363, "num_input_tokens_seen": 82562432, "step": 86475 }, { "epoch": 7.054409005628518, "grad_norm": 9.38959789276123, "learning_rate": 1.2091692784123402e-05, "loss": 0.3547, "num_input_tokens_seen": 82567536, "step": 86480 }, { "epoch": 7.054816869238927, "grad_norm": 0.4374713599681854, "learning_rate": 1.2088644787867798e-05, "loss": 0.2985, "num_input_tokens_seen": 82571296, "step": 86485 }, { "epoch": 7.055224732849335, "grad_norm": 19.110864639282227, "learning_rate": 1.2085597053319875e-05, "loss": 0.5638, "num_input_tokens_seen": 82576416, "step": 86490 }, { "epoch": 7.055632596459744, "grad_norm": 7.6469407081604, "learning_rate": 1.2082549580541414e-05, "loss": 0.3958, "num_input_tokens_seen": 82581904, "step": 86495 }, { "epoch": 7.056040460070153, "grad_norm": 18.545005798339844, "learning_rate": 1.2079502369594184e-05, "loss": 0.4087, "num_input_tokens_seen": 82586336, "step": 86500 }, { "epoch": 7.056448323680561, "grad_norm": 0.47552675008773804, "learning_rate": 1.2076455420539948e-05, "loss": 0.3024, "num_input_tokens_seen": 82590432, "step": 86505 }, { "epoch": 7.05685618729097, "grad_norm": 0.44324803352355957, "learning_rate": 1.2073408733440473e-05, "loss": 0.4575, "num_input_tokens_seen": 82594928, "step": 86510 }, { "epoch": 7.057264050901378, "grad_norm": 4.343212127685547, "learning_rate": 1.2070362308357498e-05, "loss": 0.466, "num_input_tokens_seen": 82599104, "step": 86515 }, { "epoch": 7.057671914511787, "grad_norm": 0.9945310950279236, "learning_rate": 1.2067316145352794e-05, "loss": 0.3177, "num_input_tokens_seen": 82603920, "step": 86520 }, { "epoch": 7.058079778122196, "grad_norm": 0.7263138890266418, "learning_rate": 1.2064270244488098e-05, "loss": 0.3224, "num_input_tokens_seen": 82609168, "step": 86525 }, { "epoch": 7.058487641732604, "grad_norm": 9.94871997833252, "learning_rate": 1.2061224605825146e-05, "loss": 0.3378, "num_input_tokens_seen": 82614464, "step": 86530 }, { "epoch": 7.058895505343013, "grad_norm": 9.205126762390137, "learning_rate": 1.2058179229425676e-05, "loss": 0.2655, "num_input_tokens_seen": 82618256, "step": 86535 }, { "epoch": 7.059303368953422, "grad_norm": 1.9484038352966309, "learning_rate": 1.2055134115351402e-05, "loss": 0.4232, "num_input_tokens_seen": 82622976, "step": 86540 }, { "epoch": 7.0597112325638305, "grad_norm": 2.464313268661499, "learning_rate": 1.2052089263664068e-05, "loss": 0.4878, "num_input_tokens_seen": 82627664, "step": 86545 }, { "epoch": 7.0601190961742395, "grad_norm": 18.84864044189453, "learning_rate": 1.2049044674425383e-05, "loss": 0.4165, "num_input_tokens_seen": 82632224, "step": 86550 }, { "epoch": 7.060526959784648, "grad_norm": 1.784224510192871, "learning_rate": 1.2046000347697058e-05, "loss": 0.3801, "num_input_tokens_seen": 82636720, "step": 86555 }, { "epoch": 7.060934823395057, "grad_norm": 30.30502700805664, "learning_rate": 1.2042956283540795e-05, "loss": 0.472, "num_input_tokens_seen": 82641328, "step": 86560 }, { "epoch": 7.061342687005466, "grad_norm": 21.649948120117188, "learning_rate": 1.2039912482018311e-05, "loss": 0.3462, "num_input_tokens_seen": 82645728, "step": 86565 }, { "epoch": 7.061750550615874, "grad_norm": 2.6920900344848633, "learning_rate": 1.2036868943191293e-05, "loss": 0.4749, "num_input_tokens_seen": 82649920, "step": 86570 }, { "epoch": 7.062158414226283, "grad_norm": 1.7051196098327637, "learning_rate": 1.2033825667121435e-05, "loss": 0.3371, "num_input_tokens_seen": 82655184, "step": 86575 }, { "epoch": 7.062566277836692, "grad_norm": 21.306798934936523, "learning_rate": 1.2030782653870418e-05, "loss": 0.373, "num_input_tokens_seen": 82659184, "step": 86580 }, { "epoch": 7.0629741414471, "grad_norm": 1.942050814628601, "learning_rate": 1.2027739903499918e-05, "loss": 0.3311, "num_input_tokens_seen": 82664576, "step": 86585 }, { "epoch": 7.063382005057509, "grad_norm": 1.868741512298584, "learning_rate": 1.2024697416071629e-05, "loss": 0.3626, "num_input_tokens_seen": 82669312, "step": 86590 }, { "epoch": 7.063789868667917, "grad_norm": 0.9666062593460083, "learning_rate": 1.2021655191647207e-05, "loss": 0.3283, "num_input_tokens_seen": 82674000, "step": 86595 }, { "epoch": 7.064197732278326, "grad_norm": 35.9498176574707, "learning_rate": 1.201861323028832e-05, "loss": 0.3539, "num_input_tokens_seen": 82678752, "step": 86600 }, { "epoch": 7.064605595888735, "grad_norm": 1.871602177619934, "learning_rate": 1.2015571532056618e-05, "loss": 0.3182, "num_input_tokens_seen": 82684512, "step": 86605 }, { "epoch": 7.065013459499143, "grad_norm": 6.3156657218933105, "learning_rate": 1.2012530097013771e-05, "loss": 0.3776, "num_input_tokens_seen": 82689872, "step": 86610 }, { "epoch": 7.065421323109552, "grad_norm": 14.599693298339844, "learning_rate": 1.2009488925221423e-05, "loss": 0.3458, "num_input_tokens_seen": 82695520, "step": 86615 }, { "epoch": 7.065829186719961, "grad_norm": 18.547706604003906, "learning_rate": 1.2006448016741215e-05, "loss": 0.3406, "num_input_tokens_seen": 82699712, "step": 86620 }, { "epoch": 7.066237050330369, "grad_norm": 4.059109210968018, "learning_rate": 1.200340737163478e-05, "loss": 0.328, "num_input_tokens_seen": 82703952, "step": 86625 }, { "epoch": 7.066644913940778, "grad_norm": 2.8192808628082275, "learning_rate": 1.200036698996376e-05, "loss": 0.2519, "num_input_tokens_seen": 82708560, "step": 86630 }, { "epoch": 7.067052777551186, "grad_norm": 15.356669425964355, "learning_rate": 1.1997326871789774e-05, "loss": 0.4196, "num_input_tokens_seen": 82713408, "step": 86635 }, { "epoch": 7.0674606411615954, "grad_norm": 5.120311260223389, "learning_rate": 1.199428701717445e-05, "loss": 0.3858, "num_input_tokens_seen": 82718384, "step": 86640 }, { "epoch": 7.0678685047720045, "grad_norm": 3.1771674156188965, "learning_rate": 1.19912474261794e-05, "loss": 0.3162, "num_input_tokens_seen": 82723296, "step": 86645 }, { "epoch": 7.068276368382413, "grad_norm": 15.877523422241211, "learning_rate": 1.1988208098866228e-05, "loss": 0.3621, "num_input_tokens_seen": 82728464, "step": 86650 }, { "epoch": 7.068684231992822, "grad_norm": 8.104517936706543, "learning_rate": 1.1985169035296556e-05, "loss": 0.2747, "num_input_tokens_seen": 82733584, "step": 86655 }, { "epoch": 7.069092095603231, "grad_norm": 1.5120396614074707, "learning_rate": 1.198213023553198e-05, "loss": 0.4165, "num_input_tokens_seen": 82738768, "step": 86660 }, { "epoch": 7.069499959213639, "grad_norm": 7.396370887756348, "learning_rate": 1.197909169963409e-05, "loss": 0.4315, "num_input_tokens_seen": 82744656, "step": 86665 }, { "epoch": 7.069907822824048, "grad_norm": 14.067558288574219, "learning_rate": 1.197605342766448e-05, "loss": 0.3213, "num_input_tokens_seen": 82748416, "step": 86670 }, { "epoch": 7.070315686434457, "grad_norm": 13.460392951965332, "learning_rate": 1.1973015419684725e-05, "loss": 0.5141, "num_input_tokens_seen": 82753808, "step": 86675 }, { "epoch": 7.070723550044865, "grad_norm": 27.489948272705078, "learning_rate": 1.1969977675756418e-05, "loss": 0.3248, "num_input_tokens_seen": 82758208, "step": 86680 }, { "epoch": 7.071131413655274, "grad_norm": 19.26808738708496, "learning_rate": 1.1966940195941128e-05, "loss": 0.2659, "num_input_tokens_seen": 82762256, "step": 86685 }, { "epoch": 7.071539277265682, "grad_norm": 7.2715349197387695, "learning_rate": 1.1963902980300423e-05, "loss": 0.4222, "num_input_tokens_seen": 82766576, "step": 86690 }, { "epoch": 7.071947140876091, "grad_norm": 20.929988861083984, "learning_rate": 1.1960866028895853e-05, "loss": 0.5338, "num_input_tokens_seen": 82770960, "step": 86695 }, { "epoch": 7.0723550044865, "grad_norm": 1.0849615335464478, "learning_rate": 1.1957829341789e-05, "loss": 0.3017, "num_input_tokens_seen": 82776000, "step": 86700 }, { "epoch": 7.072762868096908, "grad_norm": 1.2524051666259766, "learning_rate": 1.1954792919041404e-05, "loss": 0.2573, "num_input_tokens_seen": 82780768, "step": 86705 }, { "epoch": 7.073170731707317, "grad_norm": 15.788077354431152, "learning_rate": 1.1951756760714614e-05, "loss": 0.3274, "num_input_tokens_seen": 82785600, "step": 86710 }, { "epoch": 7.073578595317726, "grad_norm": 39.80496597290039, "learning_rate": 1.1948720866870158e-05, "loss": 0.3426, "num_input_tokens_seen": 82790336, "step": 86715 }, { "epoch": 7.073986458928134, "grad_norm": 19.201622009277344, "learning_rate": 1.1945685237569593e-05, "loss": 0.2572, "num_input_tokens_seen": 82795328, "step": 86720 }, { "epoch": 7.074394322538543, "grad_norm": 2.9870309829711914, "learning_rate": 1.1942649872874442e-05, "loss": 0.4117, "num_input_tokens_seen": 82800176, "step": 86725 }, { "epoch": 7.074802186148951, "grad_norm": 1.576833724975586, "learning_rate": 1.193961477284623e-05, "loss": 0.3161, "num_input_tokens_seen": 82805008, "step": 86730 }, { "epoch": 7.07521004975936, "grad_norm": 2.6949825286865234, "learning_rate": 1.1936579937546477e-05, "loss": 0.3904, "num_input_tokens_seen": 82809392, "step": 86735 }, { "epoch": 7.075617913369769, "grad_norm": 1.5587611198425293, "learning_rate": 1.193354536703669e-05, "loss": 0.314, "num_input_tokens_seen": 82814000, "step": 86740 }, { "epoch": 7.0760257769801775, "grad_norm": 0.9358317852020264, "learning_rate": 1.1930511061378393e-05, "loss": 0.2458, "num_input_tokens_seen": 82818112, "step": 86745 }, { "epoch": 7.0764336405905865, "grad_norm": 0.6820416450500488, "learning_rate": 1.1927477020633085e-05, "loss": 0.3782, "num_input_tokens_seen": 82823184, "step": 86750 }, { "epoch": 7.076841504200996, "grad_norm": 1.1721277236938477, "learning_rate": 1.192444324486226e-05, "loss": 0.3887, "num_input_tokens_seen": 82828448, "step": 86755 }, { "epoch": 7.077249367811404, "grad_norm": 0.8915901184082031, "learning_rate": 1.1921409734127416e-05, "loss": 0.3206, "num_input_tokens_seen": 82833312, "step": 86760 }, { "epoch": 7.077657231421813, "grad_norm": 1.7627373933792114, "learning_rate": 1.191837648849004e-05, "loss": 0.4128, "num_input_tokens_seen": 82838208, "step": 86765 }, { "epoch": 7.078065095032221, "grad_norm": 1.5086792707443237, "learning_rate": 1.1915343508011614e-05, "loss": 0.2904, "num_input_tokens_seen": 82842912, "step": 86770 }, { "epoch": 7.07847295864263, "grad_norm": 1.5218675136566162, "learning_rate": 1.1912310792753614e-05, "loss": 0.3831, "num_input_tokens_seen": 82848016, "step": 86775 }, { "epoch": 7.078880822253039, "grad_norm": 1.2923656702041626, "learning_rate": 1.1909278342777513e-05, "loss": 0.3649, "num_input_tokens_seen": 82852400, "step": 86780 }, { "epoch": 7.079288685863447, "grad_norm": 5.050317764282227, "learning_rate": 1.1906246158144768e-05, "loss": 0.4284, "num_input_tokens_seen": 82857008, "step": 86785 }, { "epoch": 7.079696549473856, "grad_norm": 1.3282170295715332, "learning_rate": 1.190321423891686e-05, "loss": 0.1778, "num_input_tokens_seen": 82860704, "step": 86790 }, { "epoch": 7.080104413084265, "grad_norm": 13.556941986083984, "learning_rate": 1.1900182585155235e-05, "loss": 0.4302, "num_input_tokens_seen": 82866080, "step": 86795 }, { "epoch": 7.080512276694673, "grad_norm": 2.2943716049194336, "learning_rate": 1.1897151196921338e-05, "loss": 0.311, "num_input_tokens_seen": 82870720, "step": 86800 }, { "epoch": 7.080920140305082, "grad_norm": 10.405618667602539, "learning_rate": 1.1894120074276612e-05, "loss": 0.3127, "num_input_tokens_seen": 82875712, "step": 86805 }, { "epoch": 7.081328003915491, "grad_norm": 1.6232956647872925, "learning_rate": 1.1891089217282513e-05, "loss": 0.2283, "num_input_tokens_seen": 82880480, "step": 86810 }, { "epoch": 7.081735867525899, "grad_norm": 5.859878063201904, "learning_rate": 1.1888058626000464e-05, "loss": 0.4595, "num_input_tokens_seen": 82885552, "step": 86815 }, { "epoch": 7.082143731136308, "grad_norm": 13.31287956237793, "learning_rate": 1.18850283004919e-05, "loss": 0.431, "num_input_tokens_seen": 82889264, "step": 86820 }, { "epoch": 7.082551594746716, "grad_norm": 26.722108840942383, "learning_rate": 1.1881998240818232e-05, "loss": 0.415, "num_input_tokens_seen": 82894336, "step": 86825 }, { "epoch": 7.082959458357125, "grad_norm": 1.1515944004058838, "learning_rate": 1.1878968447040881e-05, "loss": 0.3544, "num_input_tokens_seen": 82899344, "step": 86830 }, { "epoch": 7.083367321967534, "grad_norm": 30.7279109954834, "learning_rate": 1.1875938919221272e-05, "loss": 0.3368, "num_input_tokens_seen": 82903584, "step": 86835 }, { "epoch": 7.0837751855779425, "grad_norm": 7.707388877868652, "learning_rate": 1.1872909657420803e-05, "loss": 0.3551, "num_input_tokens_seen": 82908544, "step": 86840 }, { "epoch": 7.0841830491883515, "grad_norm": 16.52808952331543, "learning_rate": 1.186988066170088e-05, "loss": 0.3163, "num_input_tokens_seen": 82913216, "step": 86845 }, { "epoch": 7.0845909127987605, "grad_norm": 0.8778136968612671, "learning_rate": 1.1866851932122886e-05, "loss": 0.3035, "num_input_tokens_seen": 82917424, "step": 86850 }, { "epoch": 7.084998776409169, "grad_norm": 7.766496658325195, "learning_rate": 1.186382346874823e-05, "loss": 0.2594, "num_input_tokens_seen": 82922320, "step": 86855 }, { "epoch": 7.085406640019578, "grad_norm": 21.99680519104004, "learning_rate": 1.1860795271638292e-05, "loss": 0.3237, "num_input_tokens_seen": 82927488, "step": 86860 }, { "epoch": 7.085814503629986, "grad_norm": 4.252613544464111, "learning_rate": 1.185776734085445e-05, "loss": 0.3305, "num_input_tokens_seen": 82932272, "step": 86865 }, { "epoch": 7.086222367240395, "grad_norm": 8.763949394226074, "learning_rate": 1.1854739676458083e-05, "loss": 0.2895, "num_input_tokens_seen": 82937712, "step": 86870 }, { "epoch": 7.086630230850804, "grad_norm": 5.960437774658203, "learning_rate": 1.1851712278510543e-05, "loss": 0.4974, "num_input_tokens_seen": 82942912, "step": 86875 }, { "epoch": 7.087038094461212, "grad_norm": 3.5712196826934814, "learning_rate": 1.1848685147073222e-05, "loss": 0.288, "num_input_tokens_seen": 82947568, "step": 86880 }, { "epoch": 7.087445958071621, "grad_norm": 6.178743362426758, "learning_rate": 1.1845658282207462e-05, "loss": 0.5197, "num_input_tokens_seen": 82952288, "step": 86885 }, { "epoch": 7.08785382168203, "grad_norm": 4.219261169433594, "learning_rate": 1.1842631683974618e-05, "loss": 0.3782, "num_input_tokens_seen": 82957136, "step": 86890 }, { "epoch": 7.088261685292438, "grad_norm": 1.2242021560668945, "learning_rate": 1.183960535243604e-05, "loss": 0.3073, "num_input_tokens_seen": 82962160, "step": 86895 }, { "epoch": 7.088669548902847, "grad_norm": 0.5140316486358643, "learning_rate": 1.1836579287653069e-05, "loss": 0.4292, "num_input_tokens_seen": 82966016, "step": 86900 }, { "epoch": 7.089077412513255, "grad_norm": 17.16573715209961, "learning_rate": 1.1833553489687044e-05, "loss": 0.3814, "num_input_tokens_seen": 82970384, "step": 86905 }, { "epoch": 7.089485276123664, "grad_norm": 22.19679832458496, "learning_rate": 1.1830527958599294e-05, "loss": 0.3426, "num_input_tokens_seen": 82975936, "step": 86910 }, { "epoch": 7.089893139734073, "grad_norm": 0.555019199848175, "learning_rate": 1.1827502694451136e-05, "loss": 0.2575, "num_input_tokens_seen": 82980544, "step": 86915 }, { "epoch": 7.090301003344481, "grad_norm": 2.4678730964660645, "learning_rate": 1.182447769730391e-05, "loss": 0.3598, "num_input_tokens_seen": 82985824, "step": 86920 }, { "epoch": 7.09070886695489, "grad_norm": 0.5748773217201233, "learning_rate": 1.1821452967218924e-05, "loss": 0.353, "num_input_tokens_seen": 82990672, "step": 86925 }, { "epoch": 7.091116730565299, "grad_norm": 1.41206693649292, "learning_rate": 1.1818428504257487e-05, "loss": 0.2985, "num_input_tokens_seen": 82995728, "step": 86930 }, { "epoch": 7.0915245941757075, "grad_norm": 15.788260459899902, "learning_rate": 1.1815404308480902e-05, "loss": 0.4066, "num_input_tokens_seen": 83001024, "step": 86935 }, { "epoch": 7.0919324577861165, "grad_norm": 1.0686606168746948, "learning_rate": 1.1812380379950458e-05, "loss": 0.3255, "num_input_tokens_seen": 83005456, "step": 86940 }, { "epoch": 7.092340321396525, "grad_norm": 5.0819411277771, "learning_rate": 1.1809356718727472e-05, "loss": 0.3475, "num_input_tokens_seen": 83009616, "step": 86945 }, { "epoch": 7.092748185006934, "grad_norm": 1.8832757472991943, "learning_rate": 1.180633332487322e-05, "loss": 0.3505, "num_input_tokens_seen": 83015104, "step": 86950 }, { "epoch": 7.093156048617343, "grad_norm": 1.523135781288147, "learning_rate": 1.1803310198448983e-05, "loss": 0.3595, "num_input_tokens_seen": 83019264, "step": 86955 }, { "epoch": 7.093563912227751, "grad_norm": 1.6539463996887207, "learning_rate": 1.1800287339516034e-05, "loss": 0.3168, "num_input_tokens_seen": 83024000, "step": 86960 }, { "epoch": 7.09397177583816, "grad_norm": 3.822502613067627, "learning_rate": 1.1797264748135658e-05, "loss": 0.3329, "num_input_tokens_seen": 83029104, "step": 86965 }, { "epoch": 7.094379639448569, "grad_norm": 1.34939706325531, "learning_rate": 1.179424242436912e-05, "loss": 0.349, "num_input_tokens_seen": 83034240, "step": 86970 }, { "epoch": 7.094787503058977, "grad_norm": 5.009003639221191, "learning_rate": 1.1791220368277672e-05, "loss": 0.3061, "num_input_tokens_seen": 83039440, "step": 86975 }, { "epoch": 7.095195366669386, "grad_norm": 2.3633925914764404, "learning_rate": 1.1788198579922577e-05, "loss": 0.3435, "num_input_tokens_seen": 83044704, "step": 86980 }, { "epoch": 7.095603230279794, "grad_norm": 0.6473730206489563, "learning_rate": 1.1785177059365074e-05, "loss": 0.4202, "num_input_tokens_seen": 83048960, "step": 86985 }, { "epoch": 7.096011093890203, "grad_norm": 1.0597920417785645, "learning_rate": 1.1782155806666422e-05, "loss": 0.3145, "num_input_tokens_seen": 83053824, "step": 86990 }, { "epoch": 7.096418957500612, "grad_norm": 1.6154645681381226, "learning_rate": 1.1779134821887861e-05, "loss": 0.4059, "num_input_tokens_seen": 83058352, "step": 86995 }, { "epoch": 7.09682682111102, "grad_norm": 2.5229334831237793, "learning_rate": 1.1776114105090616e-05, "loss": 0.2779, "num_input_tokens_seen": 83062608, "step": 87000 }, { "epoch": 7.097234684721429, "grad_norm": 3.8478896617889404, "learning_rate": 1.177309365633591e-05, "loss": 0.3096, "num_input_tokens_seen": 83066368, "step": 87005 }, { "epoch": 7.097642548331838, "grad_norm": 1.175075888633728, "learning_rate": 1.1770073475684984e-05, "loss": 0.3364, "num_input_tokens_seen": 83071488, "step": 87010 }, { "epoch": 7.098050411942246, "grad_norm": 0.9269161224365234, "learning_rate": 1.1767053563199047e-05, "loss": 0.3351, "num_input_tokens_seen": 83076480, "step": 87015 }, { "epoch": 7.098458275552655, "grad_norm": 0.5510665774345398, "learning_rate": 1.176403391893931e-05, "loss": 0.2984, "num_input_tokens_seen": 83080992, "step": 87020 }, { "epoch": 7.098866139163064, "grad_norm": 1.7990635633468628, "learning_rate": 1.1761014542966984e-05, "loss": 0.3776, "num_input_tokens_seen": 83086864, "step": 87025 }, { "epoch": 7.099274002773472, "grad_norm": 2.6241281032562256, "learning_rate": 1.1757995435343266e-05, "loss": 0.3142, "num_input_tokens_seen": 83090976, "step": 87030 }, { "epoch": 7.099681866383881, "grad_norm": 5.019297122955322, "learning_rate": 1.1754976596129352e-05, "loss": 0.2955, "num_input_tokens_seen": 83096032, "step": 87035 }, { "epoch": 7.1000897299942896, "grad_norm": 0.4736765921115875, "learning_rate": 1.1751958025386428e-05, "loss": 0.4905, "num_input_tokens_seen": 83100544, "step": 87040 }, { "epoch": 7.100497593604699, "grad_norm": 5.741905212402344, "learning_rate": 1.1748939723175695e-05, "loss": 0.3521, "num_input_tokens_seen": 83104672, "step": 87045 }, { "epoch": 7.100905457215108, "grad_norm": 3.03967022895813, "learning_rate": 1.174592168955832e-05, "loss": 0.3468, "num_input_tokens_seen": 83108000, "step": 87050 }, { "epoch": 7.101313320825516, "grad_norm": 1.2881560325622559, "learning_rate": 1.1742903924595484e-05, "loss": 0.3549, "num_input_tokens_seen": 83113328, "step": 87055 }, { "epoch": 7.101721184435925, "grad_norm": 6.78366231918335, "learning_rate": 1.1739886428348349e-05, "loss": 0.3886, "num_input_tokens_seen": 83117760, "step": 87060 }, { "epoch": 7.102129048046334, "grad_norm": 1.6421386003494263, "learning_rate": 1.1736869200878081e-05, "loss": 0.3173, "num_input_tokens_seen": 83123232, "step": 87065 }, { "epoch": 7.102536911656742, "grad_norm": 1.2175418138504028, "learning_rate": 1.1733852242245832e-05, "loss": 0.4593, "num_input_tokens_seen": 83128400, "step": 87070 }, { "epoch": 7.102944775267151, "grad_norm": 3.530120611190796, "learning_rate": 1.173083555251277e-05, "loss": 0.299, "num_input_tokens_seen": 83133536, "step": 87075 }, { "epoch": 7.103352638877559, "grad_norm": 0.9196858406066895, "learning_rate": 1.172781913174003e-05, "loss": 0.3426, "num_input_tokens_seen": 83137920, "step": 87080 }, { "epoch": 7.103760502487968, "grad_norm": 2.0895283222198486, "learning_rate": 1.1724802979988758e-05, "loss": 0.3806, "num_input_tokens_seen": 83141840, "step": 87085 }, { "epoch": 7.104168366098377, "grad_norm": 0.5755249261856079, "learning_rate": 1.1721787097320092e-05, "loss": 0.3445, "num_input_tokens_seen": 83146736, "step": 87090 }, { "epoch": 7.104576229708785, "grad_norm": 4.298991680145264, "learning_rate": 1.1718771483795146e-05, "loss": 0.3518, "num_input_tokens_seen": 83152000, "step": 87095 }, { "epoch": 7.104984093319194, "grad_norm": 2.4284133911132812, "learning_rate": 1.1715756139475068e-05, "loss": 0.2971, "num_input_tokens_seen": 83156928, "step": 87100 }, { "epoch": 7.105391956929603, "grad_norm": 4.248530864715576, "learning_rate": 1.171274106442097e-05, "loss": 0.3155, "num_input_tokens_seen": 83161248, "step": 87105 }, { "epoch": 7.105799820540011, "grad_norm": 1.3325563669204712, "learning_rate": 1.1709726258693962e-05, "loss": 0.3582, "num_input_tokens_seen": 83165792, "step": 87110 }, { "epoch": 7.10620768415042, "grad_norm": 0.6755800247192383, "learning_rate": 1.1706711722355148e-05, "loss": 0.2749, "num_input_tokens_seen": 83170416, "step": 87115 }, { "epoch": 7.106615547760828, "grad_norm": 0.9224437475204468, "learning_rate": 1.1703697455465648e-05, "loss": 0.3659, "num_input_tokens_seen": 83175968, "step": 87120 }, { "epoch": 7.107023411371237, "grad_norm": 2.214242458343506, "learning_rate": 1.1700683458086551e-05, "loss": 0.2745, "num_input_tokens_seen": 83180272, "step": 87125 }, { "epoch": 7.107431274981646, "grad_norm": 21.298887252807617, "learning_rate": 1.1697669730278949e-05, "loss": 0.362, "num_input_tokens_seen": 83185296, "step": 87130 }, { "epoch": 7.1078391385920545, "grad_norm": 1.3218954801559448, "learning_rate": 1.1694656272103927e-05, "loss": 0.2842, "num_input_tokens_seen": 83190784, "step": 87135 }, { "epoch": 7.1082470022024635, "grad_norm": 0.5933253169059753, "learning_rate": 1.1691643083622561e-05, "loss": 0.3123, "num_input_tokens_seen": 83195232, "step": 87140 }, { "epoch": 7.1086548658128725, "grad_norm": 1.990956425666809, "learning_rate": 1.1688630164895945e-05, "loss": 0.3364, "num_input_tokens_seen": 83200672, "step": 87145 }, { "epoch": 7.109062729423281, "grad_norm": 4.2220001220703125, "learning_rate": 1.1685617515985136e-05, "loss": 0.4051, "num_input_tokens_seen": 83206096, "step": 87150 }, { "epoch": 7.10947059303369, "grad_norm": 2.2837319374084473, "learning_rate": 1.1682605136951205e-05, "loss": 0.3474, "num_input_tokens_seen": 83210512, "step": 87155 }, { "epoch": 7.109878456644098, "grad_norm": 0.6746271848678589, "learning_rate": 1.1679593027855199e-05, "loss": 0.3317, "num_input_tokens_seen": 83215696, "step": 87160 }, { "epoch": 7.110286320254507, "grad_norm": 29.425325393676758, "learning_rate": 1.1676581188758193e-05, "loss": 0.3584, "num_input_tokens_seen": 83220080, "step": 87165 }, { "epoch": 7.110694183864916, "grad_norm": 4.205387592315674, "learning_rate": 1.1673569619721222e-05, "loss": 0.2935, "num_input_tokens_seen": 83225088, "step": 87170 }, { "epoch": 7.111102047475324, "grad_norm": 1.174543023109436, "learning_rate": 1.1670558320805333e-05, "loss": 0.3536, "num_input_tokens_seen": 83228816, "step": 87175 }, { "epoch": 7.111509911085733, "grad_norm": 1.3803682327270508, "learning_rate": 1.1667547292071563e-05, "loss": 0.3349, "num_input_tokens_seen": 83233392, "step": 87180 }, { "epoch": 7.111917774696142, "grad_norm": 2.593395709991455, "learning_rate": 1.1664536533580944e-05, "loss": 0.2896, "num_input_tokens_seen": 83238992, "step": 87185 }, { "epoch": 7.11232563830655, "grad_norm": 15.688231468200684, "learning_rate": 1.1661526045394503e-05, "loss": 0.2409, "num_input_tokens_seen": 83243408, "step": 87190 }, { "epoch": 7.112733501916959, "grad_norm": 3.8978986740112305, "learning_rate": 1.1658515827573263e-05, "loss": 0.2976, "num_input_tokens_seen": 83248544, "step": 87195 }, { "epoch": 7.113141365527367, "grad_norm": 1.0610438585281372, "learning_rate": 1.1655505880178235e-05, "loss": 0.3527, "num_input_tokens_seen": 83252800, "step": 87200 }, { "epoch": 7.113549229137776, "grad_norm": 0.7669112086296082, "learning_rate": 1.1652496203270424e-05, "loss": 0.2959, "num_input_tokens_seen": 83257136, "step": 87205 }, { "epoch": 7.113957092748185, "grad_norm": 2.3448662757873535, "learning_rate": 1.1649486796910853e-05, "loss": 0.3506, "num_input_tokens_seen": 83260800, "step": 87210 }, { "epoch": 7.114364956358593, "grad_norm": 1.949947714805603, "learning_rate": 1.1646477661160513e-05, "loss": 0.2792, "num_input_tokens_seen": 83265152, "step": 87215 }, { "epoch": 7.114772819969002, "grad_norm": 1.398083209991455, "learning_rate": 1.1643468796080396e-05, "loss": 0.3438, "num_input_tokens_seen": 83268992, "step": 87220 }, { "epoch": 7.115180683579411, "grad_norm": 2.3685684204101562, "learning_rate": 1.1640460201731488e-05, "loss": 0.2702, "num_input_tokens_seen": 83273840, "step": 87225 }, { "epoch": 7.1155885471898195, "grad_norm": 2.5449271202087402, "learning_rate": 1.1637451878174768e-05, "loss": 0.2644, "num_input_tokens_seen": 83279568, "step": 87230 }, { "epoch": 7.1159964108002285, "grad_norm": 0.8095223307609558, "learning_rate": 1.1634443825471232e-05, "loss": 0.3611, "num_input_tokens_seen": 83284400, "step": 87235 }, { "epoch": 7.1164042744106375, "grad_norm": 4.872448444366455, "learning_rate": 1.1631436043681835e-05, "loss": 0.3132, "num_input_tokens_seen": 83289744, "step": 87240 }, { "epoch": 7.116812138021046, "grad_norm": 0.6589099168777466, "learning_rate": 1.1628428532867553e-05, "loss": 0.3512, "num_input_tokens_seen": 83294368, "step": 87245 }, { "epoch": 7.117220001631455, "grad_norm": 41.79479217529297, "learning_rate": 1.162542129308933e-05, "loss": 0.3391, "num_input_tokens_seen": 83299104, "step": 87250 }, { "epoch": 7.117627865241863, "grad_norm": 3.8133246898651123, "learning_rate": 1.1622414324408146e-05, "loss": 0.2935, "num_input_tokens_seen": 83303248, "step": 87255 }, { "epoch": 7.118035728852272, "grad_norm": 2.7667651176452637, "learning_rate": 1.1619407626884937e-05, "loss": 0.3039, "num_input_tokens_seen": 83307456, "step": 87260 }, { "epoch": 7.118443592462681, "grad_norm": 4.54617166519165, "learning_rate": 1.1616401200580652e-05, "loss": 0.3171, "num_input_tokens_seen": 83312624, "step": 87265 }, { "epoch": 7.118851456073089, "grad_norm": 2.0951480865478516, "learning_rate": 1.1613395045556216e-05, "loss": 0.3913, "num_input_tokens_seen": 83318288, "step": 87270 }, { "epoch": 7.119259319683498, "grad_norm": 2.0322914123535156, "learning_rate": 1.1610389161872587e-05, "loss": 0.3426, "num_input_tokens_seen": 83323264, "step": 87275 }, { "epoch": 7.119667183293907, "grad_norm": 27.588464736938477, "learning_rate": 1.1607383549590679e-05, "loss": 0.2551, "num_input_tokens_seen": 83327920, "step": 87280 }, { "epoch": 7.120075046904315, "grad_norm": 1.47503662109375, "learning_rate": 1.1604378208771412e-05, "loss": 0.3234, "num_input_tokens_seen": 83332240, "step": 87285 }, { "epoch": 7.120482910514724, "grad_norm": 4.676126956939697, "learning_rate": 1.160137313947571e-05, "loss": 0.382, "num_input_tokens_seen": 83336944, "step": 87290 }, { "epoch": 7.120890774125132, "grad_norm": 1.4833033084869385, "learning_rate": 1.159836834176447e-05, "loss": 0.2488, "num_input_tokens_seen": 83342144, "step": 87295 }, { "epoch": 7.121298637735541, "grad_norm": 2.090940237045288, "learning_rate": 1.159536381569862e-05, "loss": 0.3995, "num_input_tokens_seen": 83347712, "step": 87300 }, { "epoch": 7.12170650134595, "grad_norm": 1.9855339527130127, "learning_rate": 1.1592359561339048e-05, "loss": 0.2626, "num_input_tokens_seen": 83353056, "step": 87305 }, { "epoch": 7.122114364956358, "grad_norm": 1.0409471988677979, "learning_rate": 1.1589355578746652e-05, "loss": 0.3433, "num_input_tokens_seen": 83357968, "step": 87310 }, { "epoch": 7.122522228566767, "grad_norm": 1.1544471979141235, "learning_rate": 1.1586351867982317e-05, "loss": 0.3712, "num_input_tokens_seen": 83363312, "step": 87315 }, { "epoch": 7.122930092177176, "grad_norm": 0.7120798826217651, "learning_rate": 1.1583348429106933e-05, "loss": 0.399, "num_input_tokens_seen": 83367936, "step": 87320 }, { "epoch": 7.123337955787584, "grad_norm": 2.6241257190704346, "learning_rate": 1.1580345262181374e-05, "loss": 0.3516, "num_input_tokens_seen": 83372160, "step": 87325 }, { "epoch": 7.1237458193979935, "grad_norm": 2.1567275524139404, "learning_rate": 1.1577342367266513e-05, "loss": 0.2915, "num_input_tokens_seen": 83376592, "step": 87330 }, { "epoch": 7.124153683008402, "grad_norm": 6.435102939605713, "learning_rate": 1.157433974442322e-05, "loss": 0.3155, "num_input_tokens_seen": 83381760, "step": 87335 }, { "epoch": 7.124561546618811, "grad_norm": 3.4778125286102295, "learning_rate": 1.1571337393712347e-05, "loss": 0.3739, "num_input_tokens_seen": 83386464, "step": 87340 }, { "epoch": 7.12496941022922, "grad_norm": 12.557353973388672, "learning_rate": 1.1568335315194768e-05, "loss": 0.3338, "num_input_tokens_seen": 83390608, "step": 87345 }, { "epoch": 7.125377273839628, "grad_norm": 20.39507293701172, "learning_rate": 1.1565333508931323e-05, "loss": 0.3574, "num_input_tokens_seen": 83395808, "step": 87350 }, { "epoch": 7.125785137450037, "grad_norm": 1.70181143283844, "learning_rate": 1.156233197498286e-05, "loss": 0.3502, "num_input_tokens_seen": 83400992, "step": 87355 }, { "epoch": 7.126193001060446, "grad_norm": 5.418923854827881, "learning_rate": 1.1559330713410208e-05, "loss": 0.3642, "num_input_tokens_seen": 83405776, "step": 87360 }, { "epoch": 7.126600864670854, "grad_norm": 0.7590888738632202, "learning_rate": 1.155632972427422e-05, "loss": 0.2669, "num_input_tokens_seen": 83411232, "step": 87365 }, { "epoch": 7.127008728281263, "grad_norm": 1.3378612995147705, "learning_rate": 1.1553329007635718e-05, "loss": 0.5564, "num_input_tokens_seen": 83416160, "step": 87370 }, { "epoch": 7.127416591891672, "grad_norm": 1.1587095260620117, "learning_rate": 1.1550328563555523e-05, "loss": 0.315, "num_input_tokens_seen": 83421520, "step": 87375 }, { "epoch": 7.12782445550208, "grad_norm": 48.72060775756836, "learning_rate": 1.154732839209445e-05, "loss": 0.362, "num_input_tokens_seen": 83426928, "step": 87380 }, { "epoch": 7.128232319112489, "grad_norm": 4.484549045562744, "learning_rate": 1.1544328493313309e-05, "loss": 0.3214, "num_input_tokens_seen": 83431104, "step": 87385 }, { "epoch": 7.128640182722897, "grad_norm": 0.6984753012657166, "learning_rate": 1.1541328867272919e-05, "loss": 0.2941, "num_input_tokens_seen": 83435600, "step": 87390 }, { "epoch": 7.129048046333306, "grad_norm": 1.4367215633392334, "learning_rate": 1.1538329514034072e-05, "loss": 0.3972, "num_input_tokens_seen": 83439888, "step": 87395 }, { "epoch": 7.129455909943715, "grad_norm": 0.938789963722229, "learning_rate": 1.1535330433657569e-05, "loss": 0.3627, "num_input_tokens_seen": 83444496, "step": 87400 }, { "epoch": 7.129863773554123, "grad_norm": 1.9035238027572632, "learning_rate": 1.1532331626204185e-05, "loss": 0.3526, "num_input_tokens_seen": 83449424, "step": 87405 }, { "epoch": 7.130271637164532, "grad_norm": 0.901861846446991, "learning_rate": 1.1529333091734725e-05, "loss": 0.2496, "num_input_tokens_seen": 83454720, "step": 87410 }, { "epoch": 7.130679500774941, "grad_norm": 8.038923263549805, "learning_rate": 1.1526334830309959e-05, "loss": 0.2697, "num_input_tokens_seen": 83459024, "step": 87415 }, { "epoch": 7.131087364385349, "grad_norm": 0.9123197197914124, "learning_rate": 1.1523336841990664e-05, "loss": 0.3032, "num_input_tokens_seen": 83464544, "step": 87420 }, { "epoch": 7.131495227995758, "grad_norm": 4.080225944519043, "learning_rate": 1.15203391268376e-05, "loss": 0.2457, "num_input_tokens_seen": 83469616, "step": 87425 }, { "epoch": 7.1319030916061665, "grad_norm": 0.595349133014679, "learning_rate": 1.1517341684911528e-05, "loss": 0.3673, "num_input_tokens_seen": 83473360, "step": 87430 }, { "epoch": 7.1323109552165755, "grad_norm": 21.75796890258789, "learning_rate": 1.151434451627322e-05, "loss": 0.3166, "num_input_tokens_seen": 83478064, "step": 87435 }, { "epoch": 7.1327188188269846, "grad_norm": 0.6162763237953186, "learning_rate": 1.1511347620983418e-05, "loss": 0.2748, "num_input_tokens_seen": 83483024, "step": 87440 }, { "epoch": 7.133126682437393, "grad_norm": 1.892026662826538, "learning_rate": 1.1508350999102868e-05, "loss": 0.3641, "num_input_tokens_seen": 83488368, "step": 87445 }, { "epoch": 7.133534546047802, "grad_norm": 1.7270151376724243, "learning_rate": 1.150535465069231e-05, "loss": 0.3779, "num_input_tokens_seen": 83493504, "step": 87450 }, { "epoch": 7.133942409658211, "grad_norm": 19.41141700744629, "learning_rate": 1.150235857581248e-05, "loss": 0.414, "num_input_tokens_seen": 83497904, "step": 87455 }, { "epoch": 7.134350273268619, "grad_norm": 13.930850982666016, "learning_rate": 1.1499362774524106e-05, "loss": 0.3417, "num_input_tokens_seen": 83502832, "step": 87460 }, { "epoch": 7.134758136879028, "grad_norm": 1.9130780696868896, "learning_rate": 1.149636724688791e-05, "loss": 0.2062, "num_input_tokens_seen": 83507904, "step": 87465 }, { "epoch": 7.135166000489436, "grad_norm": 1.1255697011947632, "learning_rate": 1.1493371992964604e-05, "loss": 0.3233, "num_input_tokens_seen": 83512960, "step": 87470 }, { "epoch": 7.135573864099845, "grad_norm": 0.6554425358772278, "learning_rate": 1.1490377012814919e-05, "loss": 0.3422, "num_input_tokens_seen": 83517472, "step": 87475 }, { "epoch": 7.135981727710254, "grad_norm": 1.136502742767334, "learning_rate": 1.1487382306499546e-05, "loss": 0.3419, "num_input_tokens_seen": 83521616, "step": 87480 }, { "epoch": 7.136389591320662, "grad_norm": 0.6827648282051086, "learning_rate": 1.1484387874079198e-05, "loss": 0.3234, "num_input_tokens_seen": 83526912, "step": 87485 }, { "epoch": 7.136797454931071, "grad_norm": 2.2479872703552246, "learning_rate": 1.1481393715614561e-05, "loss": 0.2274, "num_input_tokens_seen": 83531280, "step": 87490 }, { "epoch": 7.13720531854148, "grad_norm": 25.649335861206055, "learning_rate": 1.147839983116632e-05, "loss": 0.307, "num_input_tokens_seen": 83535200, "step": 87495 }, { "epoch": 7.137613182151888, "grad_norm": 0.5085519552230835, "learning_rate": 1.1475406220795179e-05, "loss": 0.3711, "num_input_tokens_seen": 83540416, "step": 87500 }, { "epoch": 7.138021045762297, "grad_norm": 22.166183471679688, "learning_rate": 1.147241288456181e-05, "loss": 0.2875, "num_input_tokens_seen": 83545968, "step": 87505 }, { "epoch": 7.138428909372705, "grad_norm": 1.1511335372924805, "learning_rate": 1.1469419822526877e-05, "loss": 0.2219, "num_input_tokens_seen": 83551184, "step": 87510 }, { "epoch": 7.138836772983114, "grad_norm": 16.32792091369629, "learning_rate": 1.1466427034751051e-05, "loss": 0.3337, "num_input_tokens_seen": 83555936, "step": 87515 }, { "epoch": 7.139244636593523, "grad_norm": 2.7658162117004395, "learning_rate": 1.1463434521295005e-05, "loss": 0.4786, "num_input_tokens_seen": 83561104, "step": 87520 }, { "epoch": 7.1396525002039315, "grad_norm": 16.256954193115234, "learning_rate": 1.1460442282219389e-05, "loss": 0.311, "num_input_tokens_seen": 83566048, "step": 87525 }, { "epoch": 7.1400603638143405, "grad_norm": 0.44180142879486084, "learning_rate": 1.1457450317584856e-05, "loss": 0.2455, "num_input_tokens_seen": 83569840, "step": 87530 }, { "epoch": 7.1404682274247495, "grad_norm": 1.1820610761642456, "learning_rate": 1.145445862745205e-05, "loss": 0.3885, "num_input_tokens_seen": 83575440, "step": 87535 }, { "epoch": 7.140876091035158, "grad_norm": 0.5382030606269836, "learning_rate": 1.14514672118816e-05, "loss": 0.2959, "num_input_tokens_seen": 83580816, "step": 87540 }, { "epoch": 7.141283954645567, "grad_norm": 5.235941410064697, "learning_rate": 1.1448476070934166e-05, "loss": 0.3076, "num_input_tokens_seen": 83585408, "step": 87545 }, { "epoch": 7.141691818255975, "grad_norm": 0.7057369947433472, "learning_rate": 1.1445485204670361e-05, "loss": 0.3375, "num_input_tokens_seen": 83590592, "step": 87550 }, { "epoch": 7.142099681866384, "grad_norm": 0.5391178727149963, "learning_rate": 1.144249461315081e-05, "loss": 0.4896, "num_input_tokens_seen": 83595632, "step": 87555 }, { "epoch": 7.142507545476793, "grad_norm": 4.851016998291016, "learning_rate": 1.1439504296436126e-05, "loss": 0.3607, "num_input_tokens_seen": 83600720, "step": 87560 }, { "epoch": 7.142915409087201, "grad_norm": 40.827613830566406, "learning_rate": 1.1436514254586936e-05, "loss": 0.4675, "num_input_tokens_seen": 83605616, "step": 87565 }, { "epoch": 7.14332327269761, "grad_norm": 1.0873730182647705, "learning_rate": 1.1433524487663838e-05, "loss": 0.231, "num_input_tokens_seen": 83610224, "step": 87570 }, { "epoch": 7.143731136308019, "grad_norm": 1.3074631690979004, "learning_rate": 1.1430534995727435e-05, "loss": 0.4331, "num_input_tokens_seen": 83614688, "step": 87575 }, { "epoch": 7.144138999918427, "grad_norm": 13.233102798461914, "learning_rate": 1.1427545778838323e-05, "loss": 0.4028, "num_input_tokens_seen": 83619712, "step": 87580 }, { "epoch": 7.144546863528836, "grad_norm": 8.511838912963867, "learning_rate": 1.1424556837057088e-05, "loss": 0.3352, "num_input_tokens_seen": 83624976, "step": 87585 }, { "epoch": 7.144954727139245, "grad_norm": 1.3081859350204468, "learning_rate": 1.1421568170444317e-05, "loss": 0.3347, "num_input_tokens_seen": 83629392, "step": 87590 }, { "epoch": 7.145362590749653, "grad_norm": 2.1619226932525635, "learning_rate": 1.141857977906059e-05, "loss": 0.3667, "num_input_tokens_seen": 83634352, "step": 87595 }, { "epoch": 7.145770454360062, "grad_norm": 1.5531994104385376, "learning_rate": 1.1415591662966482e-05, "loss": 0.2412, "num_input_tokens_seen": 83638528, "step": 87600 }, { "epoch": 7.14617831797047, "grad_norm": 3.9820029735565186, "learning_rate": 1.1412603822222547e-05, "loss": 0.3736, "num_input_tokens_seen": 83643376, "step": 87605 }, { "epoch": 7.146586181580879, "grad_norm": 9.373322486877441, "learning_rate": 1.1409616256889369e-05, "loss": 0.3352, "num_input_tokens_seen": 83647904, "step": 87610 }, { "epoch": 7.146994045191288, "grad_norm": 2.2305262088775635, "learning_rate": 1.1406628967027494e-05, "loss": 0.284, "num_input_tokens_seen": 83652624, "step": 87615 }, { "epoch": 7.1474019088016965, "grad_norm": 3.1269800662994385, "learning_rate": 1.1403641952697474e-05, "loss": 0.3399, "num_input_tokens_seen": 83657344, "step": 87620 }, { "epoch": 7.1478097724121055, "grad_norm": 2.774343967437744, "learning_rate": 1.1400655213959843e-05, "loss": 0.2733, "num_input_tokens_seen": 83661856, "step": 87625 }, { "epoch": 7.1482176360225145, "grad_norm": 4.263905048370361, "learning_rate": 1.1397668750875165e-05, "loss": 0.3258, "num_input_tokens_seen": 83667120, "step": 87630 }, { "epoch": 7.148625499632923, "grad_norm": 3.0234177112579346, "learning_rate": 1.1394682563503959e-05, "loss": 0.4371, "num_input_tokens_seen": 83672448, "step": 87635 }, { "epoch": 7.149033363243332, "grad_norm": 0.9238318800926208, "learning_rate": 1.1391696651906758e-05, "loss": 0.3202, "num_input_tokens_seen": 83678224, "step": 87640 }, { "epoch": 7.14944122685374, "grad_norm": 6.111441612243652, "learning_rate": 1.1388711016144082e-05, "loss": 0.3508, "num_input_tokens_seen": 83683616, "step": 87645 }, { "epoch": 7.149849090464149, "grad_norm": 24.521738052368164, "learning_rate": 1.1385725656276442e-05, "loss": 0.2722, "num_input_tokens_seen": 83688016, "step": 87650 }, { "epoch": 7.150256954074558, "grad_norm": 3.344026803970337, "learning_rate": 1.1382740572364364e-05, "loss": 0.4115, "num_input_tokens_seen": 83693632, "step": 87655 }, { "epoch": 7.150664817684966, "grad_norm": 3.9144985675811768, "learning_rate": 1.137975576446835e-05, "loss": 0.25, "num_input_tokens_seen": 83698208, "step": 87660 }, { "epoch": 7.151072681295375, "grad_norm": 1.620754361152649, "learning_rate": 1.13767712326489e-05, "loss": 0.2276, "num_input_tokens_seen": 83702304, "step": 87665 }, { "epoch": 7.151480544905784, "grad_norm": 6.79788875579834, "learning_rate": 1.13737869769665e-05, "loss": 0.2716, "num_input_tokens_seen": 83707152, "step": 87670 }, { "epoch": 7.151888408516192, "grad_norm": 1.0253561735153198, "learning_rate": 1.1370802997481652e-05, "loss": 0.5058, "num_input_tokens_seen": 83712176, "step": 87675 }, { "epoch": 7.152296272126601, "grad_norm": 0.9743545651435852, "learning_rate": 1.1367819294254841e-05, "loss": 0.2021, "num_input_tokens_seen": 83716416, "step": 87680 }, { "epoch": 7.152704135737009, "grad_norm": 1.2264896631240845, "learning_rate": 1.1364835867346538e-05, "loss": 0.289, "num_input_tokens_seen": 83721440, "step": 87685 }, { "epoch": 7.153111999347418, "grad_norm": 0.5718674063682556, "learning_rate": 1.136185271681722e-05, "loss": 0.4035, "num_input_tokens_seen": 83725024, "step": 87690 }, { "epoch": 7.153519862957827, "grad_norm": 4.496183395385742, "learning_rate": 1.135886984272734e-05, "loss": 0.3131, "num_input_tokens_seen": 83729984, "step": 87695 }, { "epoch": 7.153927726568235, "grad_norm": 5.383639335632324, "learning_rate": 1.1355887245137383e-05, "loss": 0.4041, "num_input_tokens_seen": 83734816, "step": 87700 }, { "epoch": 7.154335590178644, "grad_norm": 3.4170963764190674, "learning_rate": 1.1352904924107794e-05, "loss": 0.3543, "num_input_tokens_seen": 83740048, "step": 87705 }, { "epoch": 7.154743453789053, "grad_norm": 42.009925842285156, "learning_rate": 1.1349922879699023e-05, "loss": 0.2932, "num_input_tokens_seen": 83744448, "step": 87710 }, { "epoch": 7.155151317399461, "grad_norm": 53.66070556640625, "learning_rate": 1.1346941111971515e-05, "loss": 0.8525, "num_input_tokens_seen": 83749424, "step": 87715 }, { "epoch": 7.15555918100987, "grad_norm": 1.5654457807540894, "learning_rate": 1.1343959620985708e-05, "loss": 0.3733, "num_input_tokens_seen": 83753296, "step": 87720 }, { "epoch": 7.1559670446202785, "grad_norm": 0.8063821792602539, "learning_rate": 1.1340978406802039e-05, "loss": 0.3887, "num_input_tokens_seen": 83758240, "step": 87725 }, { "epoch": 7.156374908230688, "grad_norm": 8.427446365356445, "learning_rate": 1.1337997469480929e-05, "loss": 0.3082, "num_input_tokens_seen": 83763680, "step": 87730 }, { "epoch": 7.156782771841097, "grad_norm": 24.14516830444336, "learning_rate": 1.13350168090828e-05, "loss": 0.4217, "num_input_tokens_seen": 83768464, "step": 87735 }, { "epoch": 7.157190635451505, "grad_norm": 0.9168838858604431, "learning_rate": 1.1332036425668082e-05, "loss": 0.4019, "num_input_tokens_seen": 83773616, "step": 87740 }, { "epoch": 7.157598499061914, "grad_norm": 1.4244787693023682, "learning_rate": 1.1329056319297177e-05, "loss": 0.2799, "num_input_tokens_seen": 83778448, "step": 87745 }, { "epoch": 7.158006362672323, "grad_norm": 0.5450664758682251, "learning_rate": 1.1326076490030494e-05, "loss": 0.2625, "num_input_tokens_seen": 83782320, "step": 87750 }, { "epoch": 7.158414226282731, "grad_norm": 22.585954666137695, "learning_rate": 1.1323096937928426e-05, "loss": 0.2444, "num_input_tokens_seen": 83787408, "step": 87755 }, { "epoch": 7.15882208989314, "grad_norm": 0.7424014806747437, "learning_rate": 1.1320117663051364e-05, "loss": 0.2515, "num_input_tokens_seen": 83792032, "step": 87760 }, { "epoch": 7.159229953503548, "grad_norm": 6.184762477874756, "learning_rate": 1.1317138665459715e-05, "loss": 0.3202, "num_input_tokens_seen": 83795872, "step": 87765 }, { "epoch": 7.159637817113957, "grad_norm": 3.5032153129577637, "learning_rate": 1.1314159945213848e-05, "loss": 0.3642, "num_input_tokens_seen": 83800368, "step": 87770 }, { "epoch": 7.160045680724366, "grad_norm": 2.0459237098693848, "learning_rate": 1.1311181502374144e-05, "loss": 0.2605, "num_input_tokens_seen": 83804944, "step": 87775 }, { "epoch": 7.160453544334774, "grad_norm": 1.1999144554138184, "learning_rate": 1.1308203337000975e-05, "loss": 0.4614, "num_input_tokens_seen": 83809488, "step": 87780 }, { "epoch": 7.160861407945183, "grad_norm": 0.6722497344017029, "learning_rate": 1.1305225449154697e-05, "loss": 0.3542, "num_input_tokens_seen": 83813808, "step": 87785 }, { "epoch": 7.161269271555592, "grad_norm": 3.2785565853118896, "learning_rate": 1.1302247838895692e-05, "loss": 0.3414, "num_input_tokens_seen": 83818240, "step": 87790 }, { "epoch": 7.161677135166, "grad_norm": 0.489688515663147, "learning_rate": 1.1299270506284298e-05, "loss": 0.3401, "num_input_tokens_seen": 83823728, "step": 87795 }, { "epoch": 7.162084998776409, "grad_norm": 0.8637523651123047, "learning_rate": 1.129629345138087e-05, "loss": 0.3809, "num_input_tokens_seen": 83828496, "step": 87800 }, { "epoch": 7.162492862386818, "grad_norm": 49.118438720703125, "learning_rate": 1.1293316674245744e-05, "loss": 0.2197, "num_input_tokens_seen": 83833648, "step": 87805 }, { "epoch": 7.162900725997226, "grad_norm": 0.49071204662323, "learning_rate": 1.129034017493927e-05, "loss": 0.3617, "num_input_tokens_seen": 83838352, "step": 87810 }, { "epoch": 7.163308589607635, "grad_norm": 0.3586858808994293, "learning_rate": 1.1287363953521779e-05, "loss": 0.3184, "num_input_tokens_seen": 83842784, "step": 87815 }, { "epoch": 7.1637164532180435, "grad_norm": 7.4968390464782715, "learning_rate": 1.1284388010053592e-05, "loss": 0.3556, "num_input_tokens_seen": 83847280, "step": 87820 }, { "epoch": 7.1641243168284525, "grad_norm": 0.35260194540023804, "learning_rate": 1.1281412344595024e-05, "loss": 0.2599, "num_input_tokens_seen": 83851584, "step": 87825 }, { "epoch": 7.1645321804388615, "grad_norm": 0.9106534123420715, "learning_rate": 1.1278436957206406e-05, "loss": 0.2443, "num_input_tokens_seen": 83857520, "step": 87830 }, { "epoch": 7.16494004404927, "grad_norm": 0.43980613350868225, "learning_rate": 1.127546184794804e-05, "loss": 0.2873, "num_input_tokens_seen": 83861984, "step": 87835 }, { "epoch": 7.165347907659679, "grad_norm": 5.748682022094727, "learning_rate": 1.1272487016880232e-05, "loss": 0.4323, "num_input_tokens_seen": 83866976, "step": 87840 }, { "epoch": 7.165755771270088, "grad_norm": 0.5126810073852539, "learning_rate": 1.1269512464063279e-05, "loss": 0.3899, "num_input_tokens_seen": 83871152, "step": 87845 }, { "epoch": 7.166163634880496, "grad_norm": 1.3522896766662598, "learning_rate": 1.1266538189557475e-05, "loss": 0.2648, "num_input_tokens_seen": 83875808, "step": 87850 }, { "epoch": 7.166571498490905, "grad_norm": 9.808663368225098, "learning_rate": 1.1263564193423092e-05, "loss": 0.2379, "num_input_tokens_seen": 83880928, "step": 87855 }, { "epoch": 7.166979362101313, "grad_norm": 37.20054626464844, "learning_rate": 1.126059047572044e-05, "loss": 0.2743, "num_input_tokens_seen": 83886480, "step": 87860 }, { "epoch": 7.167387225711722, "grad_norm": 21.86212921142578, "learning_rate": 1.125761703650978e-05, "loss": 0.4116, "num_input_tokens_seen": 83891552, "step": 87865 }, { "epoch": 7.167795089322131, "grad_norm": 4.174607276916504, "learning_rate": 1.1254643875851381e-05, "loss": 0.3538, "num_input_tokens_seen": 83896944, "step": 87870 }, { "epoch": 7.168202952932539, "grad_norm": 0.8190648555755615, "learning_rate": 1.1251670993805511e-05, "loss": 0.338, "num_input_tokens_seen": 83901872, "step": 87875 }, { "epoch": 7.168610816542948, "grad_norm": 0.8840436935424805, "learning_rate": 1.1248698390432428e-05, "loss": 0.2619, "num_input_tokens_seen": 83907232, "step": 87880 }, { "epoch": 7.169018680153357, "grad_norm": 25.14182472229004, "learning_rate": 1.1245726065792386e-05, "loss": 0.4002, "num_input_tokens_seen": 83912112, "step": 87885 }, { "epoch": 7.169426543763765, "grad_norm": 0.11019332706928253, "learning_rate": 1.1242754019945631e-05, "loss": 0.4269, "num_input_tokens_seen": 83917888, "step": 87890 }, { "epoch": 7.169834407374174, "grad_norm": 9.040205955505371, "learning_rate": 1.1239782252952396e-05, "loss": 0.3618, "num_input_tokens_seen": 83922624, "step": 87895 }, { "epoch": 7.170242270984582, "grad_norm": 0.41869470477104187, "learning_rate": 1.123681076487294e-05, "loss": 0.1971, "num_input_tokens_seen": 83926832, "step": 87900 }, { "epoch": 7.170650134594991, "grad_norm": 15.032748222351074, "learning_rate": 1.123383955576748e-05, "loss": 0.5679, "num_input_tokens_seen": 83931408, "step": 87905 }, { "epoch": 7.1710579982054, "grad_norm": 1.877374529838562, "learning_rate": 1.1230868625696242e-05, "loss": 0.3343, "num_input_tokens_seen": 83935440, "step": 87910 }, { "epoch": 7.1714658618158085, "grad_norm": 1.6986594200134277, "learning_rate": 1.1227897974719437e-05, "loss": 0.2707, "num_input_tokens_seen": 83939760, "step": 87915 }, { "epoch": 7.1718737254262175, "grad_norm": 0.4907647669315338, "learning_rate": 1.1224927602897298e-05, "loss": 0.3269, "num_input_tokens_seen": 83943952, "step": 87920 }, { "epoch": 7.1722815890366265, "grad_norm": 12.823030471801758, "learning_rate": 1.1221957510290023e-05, "loss": 0.3547, "num_input_tokens_seen": 83949120, "step": 87925 }, { "epoch": 7.172689452647035, "grad_norm": 8.869695663452148, "learning_rate": 1.1218987696957815e-05, "loss": 0.2974, "num_input_tokens_seen": 83953888, "step": 87930 }, { "epoch": 7.173097316257444, "grad_norm": 16.630443572998047, "learning_rate": 1.1216018162960867e-05, "loss": 0.3579, "num_input_tokens_seen": 83958944, "step": 87935 }, { "epoch": 7.173505179867853, "grad_norm": 87.00186157226562, "learning_rate": 1.121304890835937e-05, "loss": 0.5302, "num_input_tokens_seen": 83963136, "step": 87940 }, { "epoch": 7.173913043478261, "grad_norm": 78.3311538696289, "learning_rate": 1.1210079933213519e-05, "loss": 0.349, "num_input_tokens_seen": 83967296, "step": 87945 }, { "epoch": 7.17432090708867, "grad_norm": 76.5074462890625, "learning_rate": 1.1207111237583487e-05, "loss": 0.2929, "num_input_tokens_seen": 83971808, "step": 87950 }, { "epoch": 7.174728770699078, "grad_norm": 1.8901435136795044, "learning_rate": 1.1204142821529451e-05, "loss": 0.4602, "num_input_tokens_seen": 83976688, "step": 87955 }, { "epoch": 7.175136634309487, "grad_norm": 0.7852360606193542, "learning_rate": 1.1201174685111569e-05, "loss": 0.3771, "num_input_tokens_seen": 83981520, "step": 87960 }, { "epoch": 7.175544497919896, "grad_norm": 1.1296204328536987, "learning_rate": 1.119820682839002e-05, "loss": 0.4309, "num_input_tokens_seen": 83986816, "step": 87965 }, { "epoch": 7.175952361530304, "grad_norm": 1.2419092655181885, "learning_rate": 1.1195239251424958e-05, "loss": 0.3113, "num_input_tokens_seen": 83991520, "step": 87970 }, { "epoch": 7.176360225140713, "grad_norm": 0.49396926164627075, "learning_rate": 1.1192271954276526e-05, "loss": 0.3984, "num_input_tokens_seen": 83996912, "step": 87975 }, { "epoch": 7.176768088751122, "grad_norm": 0.8683475852012634, "learning_rate": 1.1189304937004877e-05, "loss": 0.2837, "num_input_tokens_seen": 84001232, "step": 87980 }, { "epoch": 7.17717595236153, "grad_norm": 1.1892749071121216, "learning_rate": 1.1186338199670137e-05, "loss": 0.1663, "num_input_tokens_seen": 84005920, "step": 87985 }, { "epoch": 7.177583815971939, "grad_norm": 0.842410683631897, "learning_rate": 1.1183371742332463e-05, "loss": 0.2728, "num_input_tokens_seen": 84010768, "step": 87990 }, { "epoch": 7.177991679582347, "grad_norm": 1.0351240634918213, "learning_rate": 1.1180405565051972e-05, "loss": 0.4329, "num_input_tokens_seen": 84014656, "step": 87995 }, { "epoch": 7.178399543192756, "grad_norm": 0.5621654391288757, "learning_rate": 1.1177439667888787e-05, "loss": 0.2425, "num_input_tokens_seen": 84019680, "step": 88000 }, { "epoch": 7.178807406803165, "grad_norm": 0.5451735258102417, "learning_rate": 1.1174474050903028e-05, "loss": 0.3772, "num_input_tokens_seen": 84024912, "step": 88005 }, { "epoch": 7.179215270413573, "grad_norm": 0.6882387399673462, "learning_rate": 1.1171508714154803e-05, "loss": 0.3095, "num_input_tokens_seen": 84029200, "step": 88010 }, { "epoch": 7.1796231340239824, "grad_norm": 2.182602882385254, "learning_rate": 1.1168543657704222e-05, "loss": 0.6355, "num_input_tokens_seen": 84033680, "step": 88015 }, { "epoch": 7.1800309976343915, "grad_norm": 1.9095113277435303, "learning_rate": 1.1165578881611382e-05, "loss": 0.2597, "num_input_tokens_seen": 84038304, "step": 88020 }, { "epoch": 7.1804388612448, "grad_norm": 48.33064651489258, "learning_rate": 1.1162614385936373e-05, "loss": 0.3986, "num_input_tokens_seen": 84043024, "step": 88025 }, { "epoch": 7.180846724855209, "grad_norm": 0.9043124318122864, "learning_rate": 1.1159650170739297e-05, "loss": 0.4249, "num_input_tokens_seen": 84047824, "step": 88030 }, { "epoch": 7.181254588465617, "grad_norm": 0.8217117786407471, "learning_rate": 1.1156686236080233e-05, "loss": 0.3392, "num_input_tokens_seen": 84052992, "step": 88035 }, { "epoch": 7.181662452076026, "grad_norm": 1.8831698894500732, "learning_rate": 1.1153722582019255e-05, "loss": 0.3756, "num_input_tokens_seen": 84057936, "step": 88040 }, { "epoch": 7.182070315686435, "grad_norm": 0.9851216673851013, "learning_rate": 1.1150759208616437e-05, "loss": 0.3911, "num_input_tokens_seen": 84062624, "step": 88045 }, { "epoch": 7.182478179296843, "grad_norm": 1.400038242340088, "learning_rate": 1.1147796115931839e-05, "loss": 0.4027, "num_input_tokens_seen": 84068304, "step": 88050 }, { "epoch": 7.182886042907252, "grad_norm": 3.1345221996307373, "learning_rate": 1.1144833304025531e-05, "loss": 0.2479, "num_input_tokens_seen": 84073184, "step": 88055 }, { "epoch": 7.183293906517661, "grad_norm": 0.5890078544616699, "learning_rate": 1.1141870772957568e-05, "loss": 0.3994, "num_input_tokens_seen": 84077968, "step": 88060 }, { "epoch": 7.183701770128069, "grad_norm": 2.2229104042053223, "learning_rate": 1.1138908522787998e-05, "loss": 0.2876, "num_input_tokens_seen": 84082128, "step": 88065 }, { "epoch": 7.184109633738478, "grad_norm": 0.5522345304489136, "learning_rate": 1.1135946553576848e-05, "loss": 0.3502, "num_input_tokens_seen": 84086800, "step": 88070 }, { "epoch": 7.184517497348886, "grad_norm": 10.332292556762695, "learning_rate": 1.1132984865384183e-05, "loss": 0.402, "num_input_tokens_seen": 84091456, "step": 88075 }, { "epoch": 7.184925360959295, "grad_norm": 0.5231015086174011, "learning_rate": 1.113002345827002e-05, "loss": 0.5952, "num_input_tokens_seen": 84095952, "step": 88080 }, { "epoch": 7.185333224569704, "grad_norm": 0.57452392578125, "learning_rate": 1.1127062332294388e-05, "loss": 0.3492, "num_input_tokens_seen": 84101136, "step": 88085 }, { "epoch": 7.185741088180112, "grad_norm": 3.2416088581085205, "learning_rate": 1.1124101487517311e-05, "loss": 0.6231, "num_input_tokens_seen": 84106640, "step": 88090 }, { "epoch": 7.186148951790521, "grad_norm": 0.5751988887786865, "learning_rate": 1.1121140923998788e-05, "loss": 0.2839, "num_input_tokens_seen": 84111296, "step": 88095 }, { "epoch": 7.18655681540093, "grad_norm": 1.507325291633606, "learning_rate": 1.1118180641798851e-05, "loss": 0.2784, "num_input_tokens_seen": 84116816, "step": 88100 }, { "epoch": 7.186964679011338, "grad_norm": 0.8821531534194946, "learning_rate": 1.1115220640977491e-05, "loss": 0.4123, "num_input_tokens_seen": 84121744, "step": 88105 }, { "epoch": 7.187372542621747, "grad_norm": 2.2081987857818604, "learning_rate": 1.1112260921594714e-05, "loss": 0.3046, "num_input_tokens_seen": 84126832, "step": 88110 }, { "epoch": 7.1877804062321555, "grad_norm": 1.894917607307434, "learning_rate": 1.1109301483710494e-05, "loss": 0.2246, "num_input_tokens_seen": 84131712, "step": 88115 }, { "epoch": 7.1881882698425645, "grad_norm": 47.429962158203125, "learning_rate": 1.1106342327384838e-05, "loss": 0.283, "num_input_tokens_seen": 84136416, "step": 88120 }, { "epoch": 7.1885961334529735, "grad_norm": 14.599041938781738, "learning_rate": 1.1103383452677724e-05, "loss": 0.2411, "num_input_tokens_seen": 84140928, "step": 88125 }, { "epoch": 7.189003997063382, "grad_norm": 0.6867473721504211, "learning_rate": 1.1100424859649119e-05, "loss": 0.4166, "num_input_tokens_seen": 84145120, "step": 88130 }, { "epoch": 7.189411860673791, "grad_norm": 7.805976390838623, "learning_rate": 1.1097466548358998e-05, "loss": 0.2274, "num_input_tokens_seen": 84149520, "step": 88135 }, { "epoch": 7.1898197242842, "grad_norm": 30.487396240234375, "learning_rate": 1.1094508518867322e-05, "loss": 0.4186, "num_input_tokens_seen": 84154864, "step": 88140 }, { "epoch": 7.190227587894608, "grad_norm": 1.6456892490386963, "learning_rate": 1.109155077123405e-05, "loss": 0.352, "num_input_tokens_seen": 84159616, "step": 88145 }, { "epoch": 7.190635451505017, "grad_norm": 9.875887870788574, "learning_rate": 1.1088593305519133e-05, "loss": 0.5376, "num_input_tokens_seen": 84164720, "step": 88150 }, { "epoch": 7.191043315115426, "grad_norm": 49.31332778930664, "learning_rate": 1.1085636121782517e-05, "loss": 0.5104, "num_input_tokens_seen": 84170320, "step": 88155 }, { "epoch": 7.191451178725834, "grad_norm": 25.020801544189453, "learning_rate": 1.1082679220084136e-05, "loss": 0.4751, "num_input_tokens_seen": 84175120, "step": 88160 }, { "epoch": 7.191859042336243, "grad_norm": 28.311532974243164, "learning_rate": 1.107972260048394e-05, "loss": 0.2588, "num_input_tokens_seen": 84179632, "step": 88165 }, { "epoch": 7.192266905946651, "grad_norm": 107.32244110107422, "learning_rate": 1.1076766263041858e-05, "loss": 0.6424, "num_input_tokens_seen": 84183776, "step": 88170 }, { "epoch": 7.19267476955706, "grad_norm": 12.364304542541504, "learning_rate": 1.1073810207817804e-05, "loss": 0.3413, "num_input_tokens_seen": 84189120, "step": 88175 }, { "epoch": 7.193082633167469, "grad_norm": 17.58690643310547, "learning_rate": 1.10708544348717e-05, "loss": 0.3592, "num_input_tokens_seen": 84193888, "step": 88180 }, { "epoch": 7.193490496777877, "grad_norm": 0.46769315004348755, "learning_rate": 1.1067898944263447e-05, "loss": 0.2785, "num_input_tokens_seen": 84198512, "step": 88185 }, { "epoch": 7.193898360388286, "grad_norm": 4.991328716278076, "learning_rate": 1.1064943736052972e-05, "loss": 0.3087, "num_input_tokens_seen": 84203248, "step": 88190 }, { "epoch": 7.194306223998695, "grad_norm": 2.4194138050079346, "learning_rate": 1.1061988810300166e-05, "loss": 0.3834, "num_input_tokens_seen": 84208080, "step": 88195 }, { "epoch": 7.194714087609103, "grad_norm": 27.019868850708008, "learning_rate": 1.1059034167064928e-05, "loss": 0.3513, "num_input_tokens_seen": 84212224, "step": 88200 }, { "epoch": 7.195121951219512, "grad_norm": 2.2144083976745605, "learning_rate": 1.105607980640713e-05, "loss": 0.3778, "num_input_tokens_seen": 84216752, "step": 88205 }, { "epoch": 7.1955298148299205, "grad_norm": 55.59596633911133, "learning_rate": 1.1053125728386678e-05, "loss": 0.3691, "num_input_tokens_seen": 84222112, "step": 88210 }, { "epoch": 7.1959376784403295, "grad_norm": 5.536360740661621, "learning_rate": 1.1050171933063445e-05, "loss": 0.3511, "num_input_tokens_seen": 84227376, "step": 88215 }, { "epoch": 7.1963455420507385, "grad_norm": 0.6515796780586243, "learning_rate": 1.1047218420497297e-05, "loss": 0.6065, "num_input_tokens_seen": 84232736, "step": 88220 }, { "epoch": 7.196753405661147, "grad_norm": 1.9038785696029663, "learning_rate": 1.1044265190748094e-05, "loss": 0.3436, "num_input_tokens_seen": 84237072, "step": 88225 }, { "epoch": 7.197161269271556, "grad_norm": 26.687088012695312, "learning_rate": 1.1041312243875712e-05, "loss": 0.4955, "num_input_tokens_seen": 84241488, "step": 88230 }, { "epoch": 7.197569132881965, "grad_norm": 19.194744110107422, "learning_rate": 1.1038359579940002e-05, "loss": 0.2853, "num_input_tokens_seen": 84245520, "step": 88235 }, { "epoch": 7.197976996492373, "grad_norm": 28.415616989135742, "learning_rate": 1.1035407199000811e-05, "loss": 0.3702, "num_input_tokens_seen": 84251008, "step": 88240 }, { "epoch": 7.198384860102782, "grad_norm": 5.616684913635254, "learning_rate": 1.103245510111798e-05, "loss": 0.4091, "num_input_tokens_seen": 84256144, "step": 88245 }, { "epoch": 7.19879272371319, "grad_norm": 5.053414344787598, "learning_rate": 1.1029503286351343e-05, "loss": 0.3422, "num_input_tokens_seen": 84259552, "step": 88250 }, { "epoch": 7.199200587323599, "grad_norm": 2.0242159366607666, "learning_rate": 1.1026551754760744e-05, "loss": 0.3127, "num_input_tokens_seen": 84264384, "step": 88255 }, { "epoch": 7.199608450934008, "grad_norm": 0.5610626935958862, "learning_rate": 1.1023600506406007e-05, "loss": 0.4491, "num_input_tokens_seen": 84269536, "step": 88260 }, { "epoch": 7.200016314544416, "grad_norm": 1.004379153251648, "learning_rate": 1.1020649541346944e-05, "loss": 0.392, "num_input_tokens_seen": 84274880, "step": 88265 }, { "epoch": 7.200424178154825, "grad_norm": 4.637144088745117, "learning_rate": 1.1017698859643378e-05, "loss": 0.3796, "num_input_tokens_seen": 84280176, "step": 88270 }, { "epoch": 7.200832041765234, "grad_norm": 7.867973804473877, "learning_rate": 1.1014748461355112e-05, "loss": 0.3771, "num_input_tokens_seen": 84284672, "step": 88275 }, { "epoch": 7.201239905375642, "grad_norm": 0.44911515712738037, "learning_rate": 1.1011798346541954e-05, "loss": 0.3154, "num_input_tokens_seen": 84289568, "step": 88280 }, { "epoch": 7.201647768986051, "grad_norm": 4.461373329162598, "learning_rate": 1.1008848515263701e-05, "loss": 0.3925, "num_input_tokens_seen": 84294032, "step": 88285 }, { "epoch": 7.20205563259646, "grad_norm": 25.070661544799805, "learning_rate": 1.1005898967580142e-05, "loss": 0.3837, "num_input_tokens_seen": 84298576, "step": 88290 }, { "epoch": 7.202463496206868, "grad_norm": 1.9297997951507568, "learning_rate": 1.1002949703551055e-05, "loss": 0.4399, "num_input_tokens_seen": 84303968, "step": 88295 }, { "epoch": 7.202871359817277, "grad_norm": 0.7782300710678101, "learning_rate": 1.1000000723236237e-05, "loss": 0.2163, "num_input_tokens_seen": 84307952, "step": 88300 }, { "epoch": 7.2032792234276855, "grad_norm": 0.7945834994316101, "learning_rate": 1.0997052026695458e-05, "loss": 0.2989, "num_input_tokens_seen": 84312720, "step": 88305 }, { "epoch": 7.2036870870380945, "grad_norm": 14.936110496520996, "learning_rate": 1.0994103613988483e-05, "loss": 0.3945, "num_input_tokens_seen": 84317200, "step": 88310 }, { "epoch": 7.2040949506485035, "grad_norm": 0.8802111744880676, "learning_rate": 1.0991155485175067e-05, "loss": 0.2802, "num_input_tokens_seen": 84322320, "step": 88315 }, { "epoch": 7.204502814258912, "grad_norm": 0.7222386598587036, "learning_rate": 1.0988207640314987e-05, "loss": 0.3048, "num_input_tokens_seen": 84328016, "step": 88320 }, { "epoch": 7.204910677869321, "grad_norm": 84.38134002685547, "learning_rate": 1.0985260079467984e-05, "loss": 0.3884, "num_input_tokens_seen": 84332240, "step": 88325 }, { "epoch": 7.205318541479729, "grad_norm": 21.81388282775879, "learning_rate": 1.0982312802693803e-05, "loss": 0.4287, "num_input_tokens_seen": 84336416, "step": 88330 }, { "epoch": 7.205726405090138, "grad_norm": 2.844050884246826, "learning_rate": 1.0979365810052186e-05, "loss": 0.254, "num_input_tokens_seen": 84341952, "step": 88335 }, { "epoch": 7.206134268700547, "grad_norm": 7.000749588012695, "learning_rate": 1.0976419101602856e-05, "loss": 0.4972, "num_input_tokens_seen": 84346704, "step": 88340 }, { "epoch": 7.206542132310955, "grad_norm": 8.527604103088379, "learning_rate": 1.0973472677405563e-05, "loss": 0.4455, "num_input_tokens_seen": 84351520, "step": 88345 }, { "epoch": 7.206949995921364, "grad_norm": 1.5953885316848755, "learning_rate": 1.0970526537520018e-05, "loss": 0.3199, "num_input_tokens_seen": 84356032, "step": 88350 }, { "epoch": 7.207357859531773, "grad_norm": 12.100282669067383, "learning_rate": 1.0967580682005937e-05, "loss": 0.3276, "num_input_tokens_seen": 84360848, "step": 88355 }, { "epoch": 7.207765723142181, "grad_norm": 38.501583099365234, "learning_rate": 1.0964635110923024e-05, "loss": 0.3028, "num_input_tokens_seen": 84366192, "step": 88360 }, { "epoch": 7.20817358675259, "grad_norm": 3.0766055583953857, "learning_rate": 1.0961689824331003e-05, "loss": 0.4259, "num_input_tokens_seen": 84370608, "step": 88365 }, { "epoch": 7.208581450362999, "grad_norm": 0.414180189371109, "learning_rate": 1.0958744822289562e-05, "loss": 0.3941, "num_input_tokens_seen": 84375136, "step": 88370 }, { "epoch": 7.208989313973407, "grad_norm": 3.3050410747528076, "learning_rate": 1.0955800104858402e-05, "loss": 0.2991, "num_input_tokens_seen": 84379968, "step": 88375 }, { "epoch": 7.209397177583816, "grad_norm": 0.6669141054153442, "learning_rate": 1.0952855672097193e-05, "loss": 0.353, "num_input_tokens_seen": 84384400, "step": 88380 }, { "epoch": 7.209805041194224, "grad_norm": 2.817460060119629, "learning_rate": 1.0949911524065639e-05, "loss": 0.5049, "num_input_tokens_seen": 84388880, "step": 88385 }, { "epoch": 7.210212904804633, "grad_norm": 19.679147720336914, "learning_rate": 1.0946967660823413e-05, "loss": 0.3524, "num_input_tokens_seen": 84393120, "step": 88390 }, { "epoch": 7.210620768415042, "grad_norm": 4.647326469421387, "learning_rate": 1.0944024082430176e-05, "loss": 0.4638, "num_input_tokens_seen": 84397184, "step": 88395 }, { "epoch": 7.21102863202545, "grad_norm": 3.7386229038238525, "learning_rate": 1.09410807889456e-05, "loss": 0.3018, "num_input_tokens_seen": 84401168, "step": 88400 }, { "epoch": 7.211436495635859, "grad_norm": 9.562830924987793, "learning_rate": 1.0938137780429341e-05, "loss": 0.422, "num_input_tokens_seen": 84405920, "step": 88405 }, { "epoch": 7.211844359246268, "grad_norm": 2.1702628135681152, "learning_rate": 1.0935195056941058e-05, "loss": 0.3097, "num_input_tokens_seen": 84411680, "step": 88410 }, { "epoch": 7.2122522228566766, "grad_norm": 1.6533782482147217, "learning_rate": 1.0932252618540393e-05, "loss": 0.3662, "num_input_tokens_seen": 84416608, "step": 88415 }, { "epoch": 7.212660086467086, "grad_norm": 0.3511499762535095, "learning_rate": 1.0929310465286988e-05, "loss": 0.3676, "num_input_tokens_seen": 84420944, "step": 88420 }, { "epoch": 7.213067950077494, "grad_norm": 6.015135288238525, "learning_rate": 1.0926368597240471e-05, "loss": 0.3041, "num_input_tokens_seen": 84425728, "step": 88425 }, { "epoch": 7.213475813687903, "grad_norm": 0.35602936148643494, "learning_rate": 1.0923427014460494e-05, "loss": 0.3254, "num_input_tokens_seen": 84430576, "step": 88430 }, { "epoch": 7.213883677298312, "grad_norm": 0.8697722554206848, "learning_rate": 1.0920485717006668e-05, "loss": 0.2737, "num_input_tokens_seen": 84436208, "step": 88435 }, { "epoch": 7.21429154090872, "grad_norm": 1.3374488353729248, "learning_rate": 1.0917544704938615e-05, "loss": 0.2712, "num_input_tokens_seen": 84441088, "step": 88440 }, { "epoch": 7.214699404519129, "grad_norm": 0.7795921564102173, "learning_rate": 1.0914603978315946e-05, "loss": 0.2962, "num_input_tokens_seen": 84445344, "step": 88445 }, { "epoch": 7.215107268129538, "grad_norm": 1.0950853824615479, "learning_rate": 1.0911663537198261e-05, "loss": 0.3576, "num_input_tokens_seen": 84450000, "step": 88450 }, { "epoch": 7.215515131739946, "grad_norm": 13.963860511779785, "learning_rate": 1.0908723381645178e-05, "loss": 0.3379, "num_input_tokens_seen": 84454416, "step": 88455 }, { "epoch": 7.215922995350355, "grad_norm": 1.5585260391235352, "learning_rate": 1.0905783511716283e-05, "loss": 0.3009, "num_input_tokens_seen": 84459104, "step": 88460 }, { "epoch": 7.216330858960763, "grad_norm": 21.637754440307617, "learning_rate": 1.0902843927471168e-05, "loss": 0.2637, "num_input_tokens_seen": 84463376, "step": 88465 }, { "epoch": 7.216738722571172, "grad_norm": 1.334149956703186, "learning_rate": 1.0899904628969409e-05, "loss": 0.3847, "num_input_tokens_seen": 84467600, "step": 88470 }, { "epoch": 7.217146586181581, "grad_norm": 1.7693840265274048, "learning_rate": 1.08969656162706e-05, "loss": 0.2396, "num_input_tokens_seen": 84472832, "step": 88475 }, { "epoch": 7.217554449791989, "grad_norm": 1.8615251779556274, "learning_rate": 1.0894026889434302e-05, "loss": 0.3706, "num_input_tokens_seen": 84477872, "step": 88480 }, { "epoch": 7.217962313402398, "grad_norm": 0.46169060468673706, "learning_rate": 1.089108844852009e-05, "loss": 0.2397, "num_input_tokens_seen": 84482448, "step": 88485 }, { "epoch": 7.218370177012807, "grad_norm": 15.912117958068848, "learning_rate": 1.0888150293587518e-05, "loss": 0.2829, "num_input_tokens_seen": 84486832, "step": 88490 }, { "epoch": 7.218778040623215, "grad_norm": 0.30417951941490173, "learning_rate": 1.0885212424696135e-05, "loss": 0.3235, "num_input_tokens_seen": 84491680, "step": 88495 }, { "epoch": 7.219185904233624, "grad_norm": 0.4445512592792511, "learning_rate": 1.0882274841905504e-05, "loss": 0.3289, "num_input_tokens_seen": 84496768, "step": 88500 }, { "epoch": 7.219593767844033, "grad_norm": 1.248943567276001, "learning_rate": 1.0879337545275165e-05, "loss": 0.2946, "num_input_tokens_seen": 84502240, "step": 88505 }, { "epoch": 7.2200016314544415, "grad_norm": 0.8783971667289734, "learning_rate": 1.0876400534864653e-05, "loss": 0.3189, "num_input_tokens_seen": 84506912, "step": 88510 }, { "epoch": 7.2204094950648505, "grad_norm": 0.4723440110683441, "learning_rate": 1.0873463810733495e-05, "loss": 0.3553, "num_input_tokens_seen": 84510992, "step": 88515 }, { "epoch": 7.220817358675259, "grad_norm": 0.42690491676330566, "learning_rate": 1.0870527372941228e-05, "loss": 0.3158, "num_input_tokens_seen": 84516240, "step": 88520 }, { "epoch": 7.221225222285668, "grad_norm": 0.4299461841583252, "learning_rate": 1.0867591221547369e-05, "loss": 0.2497, "num_input_tokens_seen": 84520928, "step": 88525 }, { "epoch": 7.221633085896077, "grad_norm": 0.5180603265762329, "learning_rate": 1.0864655356611431e-05, "loss": 0.3281, "num_input_tokens_seen": 84526480, "step": 88530 }, { "epoch": 7.222040949506485, "grad_norm": 0.43479031324386597, "learning_rate": 1.086171977819292e-05, "loss": 0.2908, "num_input_tokens_seen": 84531856, "step": 88535 }, { "epoch": 7.222448813116894, "grad_norm": 1.3045300245285034, "learning_rate": 1.0858784486351342e-05, "loss": 0.3767, "num_input_tokens_seen": 84535936, "step": 88540 }, { "epoch": 7.222856676727303, "grad_norm": 41.048622131347656, "learning_rate": 1.0855849481146187e-05, "loss": 0.2777, "num_input_tokens_seen": 84541584, "step": 88545 }, { "epoch": 7.223264540337711, "grad_norm": 0.5974483489990234, "learning_rate": 1.0852914762636958e-05, "loss": 0.3157, "num_input_tokens_seen": 84546288, "step": 88550 }, { "epoch": 7.22367240394812, "grad_norm": 1.2628885507583618, "learning_rate": 1.084998033088314e-05, "loss": 0.3673, "num_input_tokens_seen": 84550448, "step": 88555 }, { "epoch": 7.224080267558528, "grad_norm": 0.6263206601142883, "learning_rate": 1.0847046185944207e-05, "loss": 0.4267, "num_input_tokens_seen": 84555312, "step": 88560 }, { "epoch": 7.224488131168937, "grad_norm": 0.6450612545013428, "learning_rate": 1.0844112327879633e-05, "loss": 0.2708, "num_input_tokens_seen": 84560528, "step": 88565 }, { "epoch": 7.224895994779346, "grad_norm": 0.6037328243255615, "learning_rate": 1.0841178756748885e-05, "loss": 0.355, "num_input_tokens_seen": 84564544, "step": 88570 }, { "epoch": 7.225303858389754, "grad_norm": 2.348728656768799, "learning_rate": 1.0838245472611427e-05, "loss": 0.3609, "num_input_tokens_seen": 84569136, "step": 88575 }, { "epoch": 7.225711722000163, "grad_norm": 22.909276962280273, "learning_rate": 1.083531247552671e-05, "loss": 0.3004, "num_input_tokens_seen": 84574592, "step": 88580 }, { "epoch": 7.226119585610572, "grad_norm": 1.4627755880355835, "learning_rate": 1.0832379765554196e-05, "loss": 0.3201, "num_input_tokens_seen": 84578016, "step": 88585 }, { "epoch": 7.22652744922098, "grad_norm": 1.4876736402511597, "learning_rate": 1.0829447342753327e-05, "loss": 0.2682, "num_input_tokens_seen": 84584256, "step": 88590 }, { "epoch": 7.226935312831389, "grad_norm": 0.8798331618309021, "learning_rate": 1.0826515207183535e-05, "loss": 0.3616, "num_input_tokens_seen": 84589664, "step": 88595 }, { "epoch": 7.2273431764417975, "grad_norm": 3.228241205215454, "learning_rate": 1.082358335890426e-05, "loss": 0.2623, "num_input_tokens_seen": 84594224, "step": 88600 }, { "epoch": 7.2277510400522065, "grad_norm": 0.31283506751060486, "learning_rate": 1.0820651797974917e-05, "loss": 0.3664, "num_input_tokens_seen": 84597936, "step": 88605 }, { "epoch": 7.2281589036626155, "grad_norm": 0.3983108699321747, "learning_rate": 1.0817720524454947e-05, "loss": 0.3873, "num_input_tokens_seen": 84601424, "step": 88610 }, { "epoch": 7.228566767273024, "grad_norm": 0.7056750655174255, "learning_rate": 1.0814789538403752e-05, "loss": 0.2463, "num_input_tokens_seen": 84606192, "step": 88615 }, { "epoch": 7.228974630883433, "grad_norm": 0.3103448450565338, "learning_rate": 1.0811858839880747e-05, "loss": 0.2757, "num_input_tokens_seen": 84610800, "step": 88620 }, { "epoch": 7.229382494493842, "grad_norm": 15.462772369384766, "learning_rate": 1.0808928428945329e-05, "loss": 0.3337, "num_input_tokens_seen": 84615072, "step": 88625 }, { "epoch": 7.22979035810425, "grad_norm": 1.075317144393921, "learning_rate": 1.0805998305656905e-05, "loss": 0.3918, "num_input_tokens_seen": 84620384, "step": 88630 }, { "epoch": 7.230198221714659, "grad_norm": 4.489132881164551, "learning_rate": 1.0803068470074867e-05, "loss": 0.3746, "num_input_tokens_seen": 84625312, "step": 88635 }, { "epoch": 7.230606085325067, "grad_norm": 1.6306532621383667, "learning_rate": 1.0800138922258598e-05, "loss": 0.4951, "num_input_tokens_seen": 84629024, "step": 88640 }, { "epoch": 7.231013948935476, "grad_norm": 2.30316162109375, "learning_rate": 1.0797209662267479e-05, "loss": 0.3206, "num_input_tokens_seen": 84633888, "step": 88645 }, { "epoch": 7.231421812545885, "grad_norm": 0.3366912007331848, "learning_rate": 1.0794280690160876e-05, "loss": 0.2434, "num_input_tokens_seen": 84638624, "step": 88650 }, { "epoch": 7.231829676156293, "grad_norm": 1.439616084098816, "learning_rate": 1.0791352005998176e-05, "loss": 0.3356, "num_input_tokens_seen": 84643552, "step": 88655 }, { "epoch": 7.232237539766702, "grad_norm": 0.5392530560493469, "learning_rate": 1.0788423609838732e-05, "loss": 0.276, "num_input_tokens_seen": 84647872, "step": 88660 }, { "epoch": 7.232645403377111, "grad_norm": 35.20497131347656, "learning_rate": 1.0785495501741904e-05, "loss": 0.3484, "num_input_tokens_seen": 84653184, "step": 88665 }, { "epoch": 7.233053266987519, "grad_norm": 1.4765838384628296, "learning_rate": 1.0782567681767033e-05, "loss": 0.3217, "num_input_tokens_seen": 84658256, "step": 88670 }, { "epoch": 7.233461130597928, "grad_norm": 9.796679496765137, "learning_rate": 1.0779640149973486e-05, "loss": 0.4659, "num_input_tokens_seen": 84663008, "step": 88675 }, { "epoch": 7.233868994208336, "grad_norm": 0.5072055459022522, "learning_rate": 1.0776712906420586e-05, "loss": 0.3849, "num_input_tokens_seen": 84668112, "step": 88680 }, { "epoch": 7.234276857818745, "grad_norm": 0.3899340331554413, "learning_rate": 1.0773785951167673e-05, "loss": 0.3543, "num_input_tokens_seen": 84672128, "step": 88685 }, { "epoch": 7.234684721429154, "grad_norm": 0.4285488724708557, "learning_rate": 1.0770859284274074e-05, "loss": 0.3061, "num_input_tokens_seen": 84676192, "step": 88690 }, { "epoch": 7.235092585039562, "grad_norm": 0.804840624332428, "learning_rate": 1.076793290579911e-05, "loss": 0.2783, "num_input_tokens_seen": 84680624, "step": 88695 }, { "epoch": 7.235500448649971, "grad_norm": 3.2510743141174316, "learning_rate": 1.0765006815802098e-05, "loss": 0.3436, "num_input_tokens_seen": 84685328, "step": 88700 }, { "epoch": 7.2359083122603804, "grad_norm": 1.7148054838180542, "learning_rate": 1.0762081014342351e-05, "loss": 0.3898, "num_input_tokens_seen": 84690064, "step": 88705 }, { "epoch": 7.236316175870789, "grad_norm": 0.55058354139328, "learning_rate": 1.075915550147917e-05, "loss": 0.4482, "num_input_tokens_seen": 84695472, "step": 88710 }, { "epoch": 7.236724039481198, "grad_norm": 0.7870596051216125, "learning_rate": 1.0756230277271847e-05, "loss": 0.4725, "num_input_tokens_seen": 84700384, "step": 88715 }, { "epoch": 7.237131903091607, "grad_norm": 15.322770118713379, "learning_rate": 1.0753305341779693e-05, "loss": 0.279, "num_input_tokens_seen": 84705008, "step": 88720 }, { "epoch": 7.237539766702015, "grad_norm": 0.6278648376464844, "learning_rate": 1.0750380695061987e-05, "loss": 0.2979, "num_input_tokens_seen": 84709984, "step": 88725 }, { "epoch": 7.237947630312424, "grad_norm": 12.786971092224121, "learning_rate": 1.0747456337178009e-05, "loss": 0.3843, "num_input_tokens_seen": 84715712, "step": 88730 }, { "epoch": 7.238355493922832, "grad_norm": 31.79309844970703, "learning_rate": 1.0744532268187032e-05, "loss": 0.2977, "num_input_tokens_seen": 84720064, "step": 88735 }, { "epoch": 7.238763357533241, "grad_norm": 1.5892786979675293, "learning_rate": 1.0741608488148322e-05, "loss": 0.3544, "num_input_tokens_seen": 84724848, "step": 88740 }, { "epoch": 7.23917122114365, "grad_norm": 16.065643310546875, "learning_rate": 1.073868499712116e-05, "loss": 0.4017, "num_input_tokens_seen": 84729872, "step": 88745 }, { "epoch": 7.239579084754058, "grad_norm": 2.3035712242126465, "learning_rate": 1.073576179516479e-05, "loss": 0.4152, "num_input_tokens_seen": 84735136, "step": 88750 }, { "epoch": 7.239986948364467, "grad_norm": 45.49519729614258, "learning_rate": 1.0732838882338467e-05, "loss": 0.3841, "num_input_tokens_seen": 84739664, "step": 88755 }, { "epoch": 7.240394811974876, "grad_norm": 1.6475988626480103, "learning_rate": 1.072991625870143e-05, "loss": 0.4022, "num_input_tokens_seen": 84743824, "step": 88760 }, { "epoch": 7.240802675585284, "grad_norm": 2.0911974906921387, "learning_rate": 1.0726993924312937e-05, "loss": 0.3441, "num_input_tokens_seen": 84748528, "step": 88765 }, { "epoch": 7.241210539195693, "grad_norm": 6.906544208526611, "learning_rate": 1.0724071879232209e-05, "loss": 0.2688, "num_input_tokens_seen": 84754176, "step": 88770 }, { "epoch": 7.241618402806101, "grad_norm": 0.5430828928947449, "learning_rate": 1.0721150123518478e-05, "loss": 0.3884, "num_input_tokens_seen": 84759424, "step": 88775 }, { "epoch": 7.24202626641651, "grad_norm": 0.6096800565719604, "learning_rate": 1.0718228657230958e-05, "loss": 0.4196, "num_input_tokens_seen": 84763744, "step": 88780 }, { "epoch": 7.242434130026919, "grad_norm": 0.8983360528945923, "learning_rate": 1.0715307480428885e-05, "loss": 0.3701, "num_input_tokens_seen": 84767696, "step": 88785 }, { "epoch": 7.242841993637327, "grad_norm": 2.0013604164123535, "learning_rate": 1.0712386593171458e-05, "loss": 0.1778, "num_input_tokens_seen": 84772784, "step": 88790 }, { "epoch": 7.243249857247736, "grad_norm": 34.84231948852539, "learning_rate": 1.0709465995517885e-05, "loss": 0.2749, "num_input_tokens_seen": 84776544, "step": 88795 }, { "epoch": 7.243657720858145, "grad_norm": 1.3752833604812622, "learning_rate": 1.0706545687527365e-05, "loss": 0.3936, "num_input_tokens_seen": 84780592, "step": 88800 }, { "epoch": 7.2440655844685535, "grad_norm": 2.113401412963867, "learning_rate": 1.0703625669259077e-05, "loss": 0.2811, "num_input_tokens_seen": 84785104, "step": 88805 }, { "epoch": 7.2444734480789625, "grad_norm": 0.5289232134819031, "learning_rate": 1.0700705940772231e-05, "loss": 0.397, "num_input_tokens_seen": 84790048, "step": 88810 }, { "epoch": 7.244881311689371, "grad_norm": 9.622395515441895, "learning_rate": 1.0697786502126003e-05, "loss": 0.2744, "num_input_tokens_seen": 84795616, "step": 88815 }, { "epoch": 7.24528917529978, "grad_norm": 31.998003005981445, "learning_rate": 1.0694867353379565e-05, "loss": 0.34, "num_input_tokens_seen": 84800304, "step": 88820 }, { "epoch": 7.245697038910189, "grad_norm": 48.7255744934082, "learning_rate": 1.0691948494592085e-05, "loss": 0.3791, "num_input_tokens_seen": 84804880, "step": 88825 }, { "epoch": 7.246104902520597, "grad_norm": 1.0947048664093018, "learning_rate": 1.068902992582273e-05, "loss": 0.411, "num_input_tokens_seen": 84809344, "step": 88830 }, { "epoch": 7.246512766131006, "grad_norm": 1.8795782327651978, "learning_rate": 1.0686111647130655e-05, "loss": 0.2975, "num_input_tokens_seen": 84813616, "step": 88835 }, { "epoch": 7.246920629741415, "grad_norm": 0.39646753668785095, "learning_rate": 1.0683193658575019e-05, "loss": 0.2885, "num_input_tokens_seen": 84818672, "step": 88840 }, { "epoch": 7.247328493351823, "grad_norm": 18.265579223632812, "learning_rate": 1.0680275960214962e-05, "loss": 0.2665, "num_input_tokens_seen": 84822992, "step": 88845 }, { "epoch": 7.247736356962232, "grad_norm": 27.453048706054688, "learning_rate": 1.0677358552109618e-05, "loss": 0.4241, "num_input_tokens_seen": 84827440, "step": 88850 }, { "epoch": 7.248144220572641, "grad_norm": 3.6988465785980225, "learning_rate": 1.0674441434318135e-05, "loss": 0.5223, "num_input_tokens_seen": 84832096, "step": 88855 }, { "epoch": 7.248552084183049, "grad_norm": 14.190557479858398, "learning_rate": 1.067152460689964e-05, "loss": 0.4948, "num_input_tokens_seen": 84837456, "step": 88860 }, { "epoch": 7.248959947793458, "grad_norm": 31.615686416625977, "learning_rate": 1.0668608069913256e-05, "loss": 0.519, "num_input_tokens_seen": 84842416, "step": 88865 }, { "epoch": 7.249367811403866, "grad_norm": 0.9754350781440735, "learning_rate": 1.0665691823418084e-05, "loss": 0.3809, "num_input_tokens_seen": 84847568, "step": 88870 }, { "epoch": 7.249775675014275, "grad_norm": 4.990386009216309, "learning_rate": 1.0662775867473257e-05, "loss": 0.4332, "num_input_tokens_seen": 84852560, "step": 88875 }, { "epoch": 7.250183538624684, "grad_norm": 1.1309704780578613, "learning_rate": 1.0659860202137872e-05, "loss": 0.344, "num_input_tokens_seen": 84856672, "step": 88880 }, { "epoch": 7.250591402235092, "grad_norm": 1.670666217803955, "learning_rate": 1.0656944827471025e-05, "loss": 0.231, "num_input_tokens_seen": 84861152, "step": 88885 }, { "epoch": 7.250999265845501, "grad_norm": 0.3541300892829895, "learning_rate": 1.0654029743531812e-05, "loss": 0.4092, "num_input_tokens_seen": 84865696, "step": 88890 }, { "epoch": 7.2514071294559095, "grad_norm": 1.4478440284729004, "learning_rate": 1.0651114950379312e-05, "loss": 0.2596, "num_input_tokens_seen": 84869280, "step": 88895 }, { "epoch": 7.2518149930663185, "grad_norm": 6.3113179206848145, "learning_rate": 1.0648200448072625e-05, "loss": 0.3134, "num_input_tokens_seen": 84873872, "step": 88900 }, { "epoch": 7.2522228566767275, "grad_norm": 0.5685296058654785, "learning_rate": 1.0645286236670815e-05, "loss": 0.2325, "num_input_tokens_seen": 84878592, "step": 88905 }, { "epoch": 7.252630720287136, "grad_norm": 0.31123870611190796, "learning_rate": 1.0642372316232954e-05, "loss": 0.334, "num_input_tokens_seen": 84882608, "step": 88910 }, { "epoch": 7.253038583897545, "grad_norm": 0.43511515855789185, "learning_rate": 1.0639458686818096e-05, "loss": 0.3278, "num_input_tokens_seen": 84886640, "step": 88915 }, { "epoch": 7.253446447507954, "grad_norm": 22.369943618774414, "learning_rate": 1.0636545348485316e-05, "loss": 0.4527, "num_input_tokens_seen": 84891760, "step": 88920 }, { "epoch": 7.253854311118362, "grad_norm": 12.773449897766113, "learning_rate": 1.0633632301293662e-05, "loss": 0.3613, "num_input_tokens_seen": 84896656, "step": 88925 }, { "epoch": 7.254262174728771, "grad_norm": 0.8793328404426575, "learning_rate": 1.0630719545302173e-05, "loss": 0.3443, "num_input_tokens_seen": 84901824, "step": 88930 }, { "epoch": 7.25467003833918, "grad_norm": 1.2278196811676025, "learning_rate": 1.0627807080569885e-05, "loss": 0.3433, "num_input_tokens_seen": 84906432, "step": 88935 }, { "epoch": 7.255077901949588, "grad_norm": 1.427660346031189, "learning_rate": 1.062489490715585e-05, "loss": 0.4662, "num_input_tokens_seen": 84912272, "step": 88940 }, { "epoch": 7.255485765559997, "grad_norm": 2.1508378982543945, "learning_rate": 1.0621983025119083e-05, "loss": 0.3244, "num_input_tokens_seen": 84917584, "step": 88945 }, { "epoch": 7.255893629170405, "grad_norm": 0.6574717164039612, "learning_rate": 1.0619071434518615e-05, "loss": 0.448, "num_input_tokens_seen": 84922704, "step": 88950 }, { "epoch": 7.256301492780814, "grad_norm": 20.105979919433594, "learning_rate": 1.0616160135413459e-05, "loss": 0.4296, "num_input_tokens_seen": 84927776, "step": 88955 }, { "epoch": 7.256709356391223, "grad_norm": 0.9825154542922974, "learning_rate": 1.0613249127862621e-05, "loss": 0.294, "num_input_tokens_seen": 84932704, "step": 88960 }, { "epoch": 7.257117220001631, "grad_norm": 20.47924041748047, "learning_rate": 1.0610338411925114e-05, "loss": 0.3645, "num_input_tokens_seen": 84937408, "step": 88965 }, { "epoch": 7.25752508361204, "grad_norm": 1.615433931350708, "learning_rate": 1.0607427987659929e-05, "loss": 0.2307, "num_input_tokens_seen": 84941680, "step": 88970 }, { "epoch": 7.257932947222449, "grad_norm": 33.70000457763672, "learning_rate": 1.0604517855126064e-05, "loss": 0.537, "num_input_tokens_seen": 84945344, "step": 88975 }, { "epoch": 7.258340810832857, "grad_norm": 1.3244551420211792, "learning_rate": 1.0601608014382496e-05, "loss": 0.4071, "num_input_tokens_seen": 84949152, "step": 88980 }, { "epoch": 7.258748674443266, "grad_norm": 10.996393203735352, "learning_rate": 1.0598698465488227e-05, "loss": 0.325, "num_input_tokens_seen": 84953920, "step": 88985 }, { "epoch": 7.2591565380536744, "grad_norm": 72.40597534179688, "learning_rate": 1.059578920850222e-05, "loss": 0.304, "num_input_tokens_seen": 84958368, "step": 88990 }, { "epoch": 7.2595644016640835, "grad_norm": 0.6625436544418335, "learning_rate": 1.0592880243483441e-05, "loss": 0.1878, "num_input_tokens_seen": 84963232, "step": 88995 }, { "epoch": 7.2599722652744925, "grad_norm": 0.9366651177406311, "learning_rate": 1.0589971570490862e-05, "loss": 0.3644, "num_input_tokens_seen": 84967936, "step": 89000 }, { "epoch": 7.260380128884901, "grad_norm": 1.9885417222976685, "learning_rate": 1.0587063189583424e-05, "loss": 0.4111, "num_input_tokens_seen": 84973200, "step": 89005 }, { "epoch": 7.26078799249531, "grad_norm": 11.800882339477539, "learning_rate": 1.05841551008201e-05, "loss": 0.2334, "num_input_tokens_seen": 84979056, "step": 89010 }, { "epoch": 7.261195856105719, "grad_norm": 0.36792057752609253, "learning_rate": 1.0581247304259828e-05, "loss": 0.3453, "num_input_tokens_seen": 84983856, "step": 89015 }, { "epoch": 7.261603719716127, "grad_norm": 14.611865997314453, "learning_rate": 1.0578339799961546e-05, "loss": 0.3278, "num_input_tokens_seen": 84988288, "step": 89020 }, { "epoch": 7.262011583326536, "grad_norm": 1.0370848178863525, "learning_rate": 1.0575432587984176e-05, "loss": 0.3655, "num_input_tokens_seen": 84992960, "step": 89025 }, { "epoch": 7.262419446936944, "grad_norm": 1.2953718900680542, "learning_rate": 1.0572525668386671e-05, "loss": 0.2407, "num_input_tokens_seen": 84997344, "step": 89030 }, { "epoch": 7.262827310547353, "grad_norm": 42.05066680908203, "learning_rate": 1.0569619041227941e-05, "loss": 0.3602, "num_input_tokens_seen": 85001968, "step": 89035 }, { "epoch": 7.263235174157762, "grad_norm": 0.4830010235309601, "learning_rate": 1.0566712706566898e-05, "loss": 0.2045, "num_input_tokens_seen": 85006352, "step": 89040 }, { "epoch": 7.26364303776817, "grad_norm": 1.274732232093811, "learning_rate": 1.0563806664462456e-05, "loss": 0.1818, "num_input_tokens_seen": 85011600, "step": 89045 }, { "epoch": 7.264050901378579, "grad_norm": 0.5339292287826538, "learning_rate": 1.056090091497351e-05, "loss": 0.2784, "num_input_tokens_seen": 85017312, "step": 89050 }, { "epoch": 7.264458764988988, "grad_norm": 1.176955223083496, "learning_rate": 1.0557995458158975e-05, "loss": 0.5438, "num_input_tokens_seen": 85022048, "step": 89055 }, { "epoch": 7.264866628599396, "grad_norm": 0.7626945972442627, "learning_rate": 1.0555090294077738e-05, "loss": 0.5852, "num_input_tokens_seen": 85027056, "step": 89060 }, { "epoch": 7.265274492209805, "grad_norm": 33.366416931152344, "learning_rate": 1.055218542278868e-05, "loss": 0.3148, "num_input_tokens_seen": 85031248, "step": 89065 }, { "epoch": 7.265682355820214, "grad_norm": 9.260695457458496, "learning_rate": 1.0549280844350676e-05, "loss": 0.4708, "num_input_tokens_seen": 85036160, "step": 89070 }, { "epoch": 7.266090219430622, "grad_norm": 0.7006109356880188, "learning_rate": 1.0546376558822618e-05, "loss": 0.3231, "num_input_tokens_seen": 85041840, "step": 89075 }, { "epoch": 7.266498083041031, "grad_norm": 1.0889015197753906, "learning_rate": 1.0543472566263365e-05, "loss": 0.3463, "num_input_tokens_seen": 85045904, "step": 89080 }, { "epoch": 7.266905946651439, "grad_norm": 1.2945003509521484, "learning_rate": 1.0540568866731782e-05, "loss": 0.1839, "num_input_tokens_seen": 85050624, "step": 89085 }, { "epoch": 7.267313810261848, "grad_norm": 1.0105575323104858, "learning_rate": 1.0537665460286723e-05, "loss": 0.3436, "num_input_tokens_seen": 85055664, "step": 89090 }, { "epoch": 7.267721673872257, "grad_norm": 0.4398021399974823, "learning_rate": 1.0534762346987037e-05, "loss": 0.4377, "num_input_tokens_seen": 85061248, "step": 89095 }, { "epoch": 7.2681295374826655, "grad_norm": 0.6651552319526672, "learning_rate": 1.0531859526891574e-05, "loss": 0.4804, "num_input_tokens_seen": 85066544, "step": 89100 }, { "epoch": 7.268537401093075, "grad_norm": 18.339679718017578, "learning_rate": 1.052895700005917e-05, "loss": 0.389, "num_input_tokens_seen": 85071568, "step": 89105 }, { "epoch": 7.268945264703484, "grad_norm": 1.6501020193099976, "learning_rate": 1.052605476654866e-05, "loss": 0.4069, "num_input_tokens_seen": 85076048, "step": 89110 }, { "epoch": 7.269353128313892, "grad_norm": 3.061908483505249, "learning_rate": 1.0523152826418859e-05, "loss": 0.3936, "num_input_tokens_seen": 85081408, "step": 89115 }, { "epoch": 7.269760991924301, "grad_norm": 35.70682907104492, "learning_rate": 1.0520251179728607e-05, "loss": 0.3741, "num_input_tokens_seen": 85086048, "step": 89120 }, { "epoch": 7.270168855534709, "grad_norm": 38.45599365234375, "learning_rate": 1.0517349826536712e-05, "loss": 0.3393, "num_input_tokens_seen": 85091040, "step": 89125 }, { "epoch": 7.270576719145118, "grad_norm": 1.0789436101913452, "learning_rate": 1.0514448766901982e-05, "loss": 0.3207, "num_input_tokens_seen": 85095424, "step": 89130 }, { "epoch": 7.270984582755527, "grad_norm": 30.8135929107666, "learning_rate": 1.0511548000883209e-05, "loss": 0.3056, "num_input_tokens_seen": 85099264, "step": 89135 }, { "epoch": 7.271392446365935, "grad_norm": 1.3239164352416992, "learning_rate": 1.0508647528539214e-05, "loss": 0.1954, "num_input_tokens_seen": 85104416, "step": 89140 }, { "epoch": 7.271800309976344, "grad_norm": 1.5081324577331543, "learning_rate": 1.0505747349928774e-05, "loss": 0.4215, "num_input_tokens_seen": 85108832, "step": 89145 }, { "epoch": 7.272208173586753, "grad_norm": 1.0571016073226929, "learning_rate": 1.0502847465110676e-05, "loss": 0.2421, "num_input_tokens_seen": 85112448, "step": 89150 }, { "epoch": 7.272616037197161, "grad_norm": 37.08290481567383, "learning_rate": 1.0499947874143702e-05, "loss": 0.2807, "num_input_tokens_seen": 85116736, "step": 89155 }, { "epoch": 7.27302390080757, "grad_norm": 0.4021691381931305, "learning_rate": 1.049704857708661e-05, "loss": 0.3805, "num_input_tokens_seen": 85122208, "step": 89160 }, { "epoch": 7.273431764417978, "grad_norm": 1.3887993097305298, "learning_rate": 1.0494149573998191e-05, "loss": 0.3988, "num_input_tokens_seen": 85127088, "step": 89165 }, { "epoch": 7.273839628028387, "grad_norm": 6.761157035827637, "learning_rate": 1.04912508649372e-05, "loss": 0.3151, "num_input_tokens_seen": 85132176, "step": 89170 }, { "epoch": 7.274247491638796, "grad_norm": 0.34407269954681396, "learning_rate": 1.0488352449962385e-05, "loss": 0.2222, "num_input_tokens_seen": 85136240, "step": 89175 }, { "epoch": 7.274655355249204, "grad_norm": 25.462940216064453, "learning_rate": 1.048545432913249e-05, "loss": 0.2256, "num_input_tokens_seen": 85141424, "step": 89180 }, { "epoch": 7.275063218859613, "grad_norm": 1.990243911743164, "learning_rate": 1.048255650250628e-05, "loss": 0.2847, "num_input_tokens_seen": 85145392, "step": 89185 }, { "epoch": 7.275471082470022, "grad_norm": 1.4444358348846436, "learning_rate": 1.0479658970142478e-05, "loss": 0.5585, "num_input_tokens_seen": 85150256, "step": 89190 }, { "epoch": 7.2758789460804305, "grad_norm": 0.5737981796264648, "learning_rate": 1.0476761732099822e-05, "loss": 0.2646, "num_input_tokens_seen": 85154656, "step": 89195 }, { "epoch": 7.2762868096908395, "grad_norm": 0.25185421109199524, "learning_rate": 1.0473864788437029e-05, "loss": 0.2195, "num_input_tokens_seen": 85160688, "step": 89200 }, { "epoch": 7.2766946733012485, "grad_norm": 1.2605900764465332, "learning_rate": 1.0470968139212819e-05, "loss": 0.3108, "num_input_tokens_seen": 85165520, "step": 89205 }, { "epoch": 7.277102536911657, "grad_norm": 24.220182418823242, "learning_rate": 1.0468071784485922e-05, "loss": 0.3725, "num_input_tokens_seen": 85171216, "step": 89210 }, { "epoch": 7.277510400522066, "grad_norm": 0.4534662663936615, "learning_rate": 1.0465175724315032e-05, "loss": 0.297, "num_input_tokens_seen": 85176448, "step": 89215 }, { "epoch": 7.277918264132474, "grad_norm": 4.992622375488281, "learning_rate": 1.0462279958758852e-05, "loss": 0.3878, "num_input_tokens_seen": 85181616, "step": 89220 }, { "epoch": 7.278326127742883, "grad_norm": 0.8528611063957214, "learning_rate": 1.0459384487876083e-05, "loss": 0.2717, "num_input_tokens_seen": 85186112, "step": 89225 }, { "epoch": 7.278733991353292, "grad_norm": 35.52592086791992, "learning_rate": 1.0456489311725411e-05, "loss": 0.3914, "num_input_tokens_seen": 85190480, "step": 89230 }, { "epoch": 7.2791418549637, "grad_norm": 19.605091094970703, "learning_rate": 1.045359443036552e-05, "loss": 0.3029, "num_input_tokens_seen": 85195136, "step": 89235 }, { "epoch": 7.279549718574109, "grad_norm": 33.15610885620117, "learning_rate": 1.0450699843855083e-05, "loss": 0.7218, "num_input_tokens_seen": 85199296, "step": 89240 }, { "epoch": 7.279957582184517, "grad_norm": 1.0142642259597778, "learning_rate": 1.0447805552252785e-05, "loss": 0.4539, "num_input_tokens_seen": 85204048, "step": 89245 }, { "epoch": 7.280365445794926, "grad_norm": 34.55604553222656, "learning_rate": 1.0444911555617281e-05, "loss": 0.3396, "num_input_tokens_seen": 85208432, "step": 89250 }, { "epoch": 7.280773309405335, "grad_norm": 0.7882688641548157, "learning_rate": 1.0442017854007241e-05, "loss": 0.3676, "num_input_tokens_seen": 85213648, "step": 89255 }, { "epoch": 7.281181173015743, "grad_norm": 0.5648983716964722, "learning_rate": 1.0439124447481308e-05, "loss": 0.3221, "num_input_tokens_seen": 85218400, "step": 89260 }, { "epoch": 7.281589036626152, "grad_norm": 1.2368443012237549, "learning_rate": 1.0436231336098137e-05, "loss": 0.369, "num_input_tokens_seen": 85223952, "step": 89265 }, { "epoch": 7.281996900236561, "grad_norm": 1.4691389799118042, "learning_rate": 1.0433338519916361e-05, "loss": 0.3827, "num_input_tokens_seen": 85228624, "step": 89270 }, { "epoch": 7.282404763846969, "grad_norm": 0.556060254573822, "learning_rate": 1.0430445998994631e-05, "loss": 0.338, "num_input_tokens_seen": 85232464, "step": 89275 }, { "epoch": 7.282812627457378, "grad_norm": 2.2010905742645264, "learning_rate": 1.0427553773391569e-05, "loss": 0.3881, "num_input_tokens_seen": 85237136, "step": 89280 }, { "epoch": 7.283220491067787, "grad_norm": 11.658316612243652, "learning_rate": 1.04246618431658e-05, "loss": 0.3439, "num_input_tokens_seen": 85242368, "step": 89285 }, { "epoch": 7.2836283546781955, "grad_norm": 5.3752851486206055, "learning_rate": 1.0421770208375945e-05, "loss": 0.3286, "num_input_tokens_seen": 85246544, "step": 89290 }, { "epoch": 7.2840362182886045, "grad_norm": 3.2228219509124756, "learning_rate": 1.0418878869080601e-05, "loss": 0.3007, "num_input_tokens_seen": 85251728, "step": 89295 }, { "epoch": 7.284444081899013, "grad_norm": 0.47284165024757385, "learning_rate": 1.0415987825338395e-05, "loss": 0.3499, "num_input_tokens_seen": 85256528, "step": 89300 }, { "epoch": 7.284851945509422, "grad_norm": 1.451801061630249, "learning_rate": 1.041309707720792e-05, "loss": 0.3443, "num_input_tokens_seen": 85262144, "step": 89305 }, { "epoch": 7.285259809119831, "grad_norm": 1.3469570875167847, "learning_rate": 1.0410206624747767e-05, "loss": 0.3173, "num_input_tokens_seen": 85266864, "step": 89310 }, { "epoch": 7.285667672730239, "grad_norm": 1.1786258220672607, "learning_rate": 1.040731646801652e-05, "loss": 0.2952, "num_input_tokens_seen": 85271760, "step": 89315 }, { "epoch": 7.286075536340648, "grad_norm": 0.6173064708709717, "learning_rate": 1.0404426607072777e-05, "loss": 0.2598, "num_input_tokens_seen": 85276544, "step": 89320 }, { "epoch": 7.286483399951057, "grad_norm": 1.2421733140945435, "learning_rate": 1.0401537041975102e-05, "loss": 0.4524, "num_input_tokens_seen": 85280992, "step": 89325 }, { "epoch": 7.286891263561465, "grad_norm": 34.3387565612793, "learning_rate": 1.0398647772782067e-05, "loss": 0.6662, "num_input_tokens_seen": 85285760, "step": 89330 }, { "epoch": 7.287299127171874, "grad_norm": 9.481894493103027, "learning_rate": 1.039575879955223e-05, "loss": 0.2354, "num_input_tokens_seen": 85289728, "step": 89335 }, { "epoch": 7.287706990782282, "grad_norm": 1.6041052341461182, "learning_rate": 1.0392870122344165e-05, "loss": 0.2297, "num_input_tokens_seen": 85293664, "step": 89340 }, { "epoch": 7.288114854392691, "grad_norm": 0.6127327084541321, "learning_rate": 1.0389981741216418e-05, "loss": 0.3627, "num_input_tokens_seen": 85297840, "step": 89345 }, { "epoch": 7.2885227180031, "grad_norm": 0.8328803181648254, "learning_rate": 1.0387093656227529e-05, "loss": 0.3253, "num_input_tokens_seen": 85302448, "step": 89350 }, { "epoch": 7.288930581613508, "grad_norm": 2.8625075817108154, "learning_rate": 1.0384205867436045e-05, "loss": 0.4233, "num_input_tokens_seen": 85306688, "step": 89355 }, { "epoch": 7.289338445223917, "grad_norm": 0.9731552600860596, "learning_rate": 1.0381318374900487e-05, "loss": 0.2412, "num_input_tokens_seen": 85311408, "step": 89360 }, { "epoch": 7.289746308834326, "grad_norm": 34.05339050292969, "learning_rate": 1.0378431178679405e-05, "loss": 0.3012, "num_input_tokens_seen": 85316160, "step": 89365 }, { "epoch": 7.290154172444734, "grad_norm": 0.548854410648346, "learning_rate": 1.0375544278831306e-05, "loss": 0.3198, "num_input_tokens_seen": 85320992, "step": 89370 }, { "epoch": 7.290562036055143, "grad_norm": 27.987201690673828, "learning_rate": 1.0372657675414713e-05, "loss": 0.4745, "num_input_tokens_seen": 85325376, "step": 89375 }, { "epoch": 7.290969899665551, "grad_norm": 1.9468994140625, "learning_rate": 1.0369771368488132e-05, "loss": 0.3091, "num_input_tokens_seen": 85329920, "step": 89380 }, { "epoch": 7.29137776327596, "grad_norm": 8.745787620544434, "learning_rate": 1.0366885358110071e-05, "loss": 0.2839, "num_input_tokens_seen": 85335136, "step": 89385 }, { "epoch": 7.291785626886369, "grad_norm": 0.7129886746406555, "learning_rate": 1.0363999644339023e-05, "loss": 0.3717, "num_input_tokens_seen": 85340368, "step": 89390 }, { "epoch": 7.292193490496778, "grad_norm": 1.4686847925186157, "learning_rate": 1.0361114227233484e-05, "loss": 0.2556, "num_input_tokens_seen": 85344464, "step": 89395 }, { "epoch": 7.292601354107187, "grad_norm": 0.9502225518226624, "learning_rate": 1.0358229106851939e-05, "loss": 0.3276, "num_input_tokens_seen": 85348480, "step": 89400 }, { "epoch": 7.293009217717596, "grad_norm": 0.5885856747627258, "learning_rate": 1.035534428325286e-05, "loss": 0.2679, "num_input_tokens_seen": 85353984, "step": 89405 }, { "epoch": 7.293417081328004, "grad_norm": 0.720949649810791, "learning_rate": 1.0352459756494736e-05, "loss": 0.304, "num_input_tokens_seen": 85358704, "step": 89410 }, { "epoch": 7.293824944938413, "grad_norm": 0.895117998123169, "learning_rate": 1.0349575526636031e-05, "loss": 0.4379, "num_input_tokens_seen": 85364016, "step": 89415 }, { "epoch": 7.294232808548822, "grad_norm": 7.599299430847168, "learning_rate": 1.0346691593735206e-05, "loss": 0.2159, "num_input_tokens_seen": 85368336, "step": 89420 }, { "epoch": 7.29464067215923, "grad_norm": 1.4817698001861572, "learning_rate": 1.0343807957850705e-05, "loss": 0.4835, "num_input_tokens_seen": 85373680, "step": 89425 }, { "epoch": 7.295048535769639, "grad_norm": 0.7154068350791931, "learning_rate": 1.0340924619040997e-05, "loss": 0.464, "num_input_tokens_seen": 85378384, "step": 89430 }, { "epoch": 7.295456399380047, "grad_norm": 0.5751439929008484, "learning_rate": 1.0338041577364522e-05, "loss": 0.3112, "num_input_tokens_seen": 85382752, "step": 89435 }, { "epoch": 7.295864262990456, "grad_norm": 22.10437774658203, "learning_rate": 1.033515883287971e-05, "loss": 0.2698, "num_input_tokens_seen": 85386448, "step": 89440 }, { "epoch": 7.296272126600865, "grad_norm": 3.7787914276123047, "learning_rate": 1.0332276385644996e-05, "loss": 0.4517, "num_input_tokens_seen": 85391616, "step": 89445 }, { "epoch": 7.296679990211273, "grad_norm": 1.1405109167099, "learning_rate": 1.03293942357188e-05, "loss": 0.2696, "num_input_tokens_seen": 85396608, "step": 89450 }, { "epoch": 7.297087853821682, "grad_norm": 2.1960551738739014, "learning_rate": 1.032651238315956e-05, "loss": 0.2968, "num_input_tokens_seen": 85400704, "step": 89455 }, { "epoch": 7.29749571743209, "grad_norm": 1.270806908607483, "learning_rate": 1.0323630828025676e-05, "loss": 0.5394, "num_input_tokens_seen": 85405328, "step": 89460 }, { "epoch": 7.297903581042499, "grad_norm": 1.8611884117126465, "learning_rate": 1.032074957037556e-05, "loss": 0.2312, "num_input_tokens_seen": 85410208, "step": 89465 }, { "epoch": 7.298311444652908, "grad_norm": 9.017881393432617, "learning_rate": 1.0317868610267603e-05, "loss": 0.3697, "num_input_tokens_seen": 85415296, "step": 89470 }, { "epoch": 7.298719308263316, "grad_norm": 14.61830997467041, "learning_rate": 1.0314987947760218e-05, "loss": 0.2414, "num_input_tokens_seen": 85419664, "step": 89475 }, { "epoch": 7.299127171873725, "grad_norm": 0.869260311126709, "learning_rate": 1.0312107582911789e-05, "loss": 0.5912, "num_input_tokens_seen": 85424288, "step": 89480 }, { "epoch": 7.299535035484134, "grad_norm": 1.6154204607009888, "learning_rate": 1.0309227515780698e-05, "loss": 0.5518, "num_input_tokens_seen": 85429680, "step": 89485 }, { "epoch": 7.2999428990945425, "grad_norm": 1.0490233898162842, "learning_rate": 1.0306347746425315e-05, "loss": 0.3199, "num_input_tokens_seen": 85434736, "step": 89490 }, { "epoch": 7.3003507627049515, "grad_norm": 17.571731567382812, "learning_rate": 1.0303468274904026e-05, "loss": 0.3233, "num_input_tokens_seen": 85440096, "step": 89495 }, { "epoch": 7.3007586263153605, "grad_norm": 0.9569318294525146, "learning_rate": 1.0300589101275194e-05, "loss": 0.2646, "num_input_tokens_seen": 85445056, "step": 89500 }, { "epoch": 7.301166489925769, "grad_norm": 5.146505832672119, "learning_rate": 1.0297710225597172e-05, "loss": 0.555, "num_input_tokens_seen": 85449552, "step": 89505 }, { "epoch": 7.301574353536178, "grad_norm": 3.257289171218872, "learning_rate": 1.0294831647928314e-05, "loss": 0.3599, "num_input_tokens_seen": 85454048, "step": 89510 }, { "epoch": 7.301982217146586, "grad_norm": 0.753259003162384, "learning_rate": 1.0291953368326973e-05, "loss": 0.3417, "num_input_tokens_seen": 85459024, "step": 89515 }, { "epoch": 7.302390080756995, "grad_norm": 30.852100372314453, "learning_rate": 1.0289075386851485e-05, "loss": 0.3326, "num_input_tokens_seen": 85463632, "step": 89520 }, { "epoch": 7.302797944367404, "grad_norm": 7.2696533203125, "learning_rate": 1.0286197703560191e-05, "loss": 0.2916, "num_input_tokens_seen": 85468336, "step": 89525 }, { "epoch": 7.303205807977812, "grad_norm": 0.9869745373725891, "learning_rate": 1.0283320318511417e-05, "loss": 0.532, "num_input_tokens_seen": 85472784, "step": 89530 }, { "epoch": 7.303613671588221, "grad_norm": 19.374731063842773, "learning_rate": 1.0280443231763476e-05, "loss": 0.2622, "num_input_tokens_seen": 85477536, "step": 89535 }, { "epoch": 7.30402153519863, "grad_norm": 28.904966354370117, "learning_rate": 1.0277566443374704e-05, "loss": 0.5041, "num_input_tokens_seen": 85481200, "step": 89540 }, { "epoch": 7.304429398809038, "grad_norm": 0.6737920045852661, "learning_rate": 1.0274689953403407e-05, "loss": 0.3036, "num_input_tokens_seen": 85486224, "step": 89545 }, { "epoch": 7.304837262419447, "grad_norm": 1.7853686809539795, "learning_rate": 1.0271813761907887e-05, "loss": 0.3271, "num_input_tokens_seen": 85491616, "step": 89550 }, { "epoch": 7.305245126029855, "grad_norm": 21.002328872680664, "learning_rate": 1.026893786894644e-05, "loss": 0.2624, "num_input_tokens_seen": 85495904, "step": 89555 }, { "epoch": 7.305652989640264, "grad_norm": 1.260143756866455, "learning_rate": 1.0266062274577357e-05, "loss": 0.2991, "num_input_tokens_seen": 85501504, "step": 89560 }, { "epoch": 7.306060853250673, "grad_norm": 9.247969627380371, "learning_rate": 1.026318697885894e-05, "loss": 0.5252, "num_input_tokens_seen": 85506208, "step": 89565 }, { "epoch": 7.306468716861081, "grad_norm": 1.1115753650665283, "learning_rate": 1.026031198184946e-05, "loss": 0.5194, "num_input_tokens_seen": 85510528, "step": 89570 }, { "epoch": 7.30687658047149, "grad_norm": 6.333432197570801, "learning_rate": 1.0257437283607194e-05, "loss": 0.375, "num_input_tokens_seen": 85514912, "step": 89575 }, { "epoch": 7.307284444081899, "grad_norm": 1.246793508529663, "learning_rate": 1.02545628841904e-05, "loss": 0.4999, "num_input_tokens_seen": 85519104, "step": 89580 }, { "epoch": 7.3076923076923075, "grad_norm": 28.295265197753906, "learning_rate": 1.0251688783657359e-05, "loss": 0.293, "num_input_tokens_seen": 85524064, "step": 89585 }, { "epoch": 7.3081001713027165, "grad_norm": 0.28476110100746155, "learning_rate": 1.024881498206632e-05, "loss": 0.2716, "num_input_tokens_seen": 85528432, "step": 89590 }, { "epoch": 7.308508034913125, "grad_norm": 0.7358438968658447, "learning_rate": 1.0245941479475534e-05, "loss": 0.3811, "num_input_tokens_seen": 85532560, "step": 89595 }, { "epoch": 7.308915898523534, "grad_norm": 3.2959439754486084, "learning_rate": 1.0243068275943244e-05, "loss": 0.6562, "num_input_tokens_seen": 85536896, "step": 89600 }, { "epoch": 7.309323762133943, "grad_norm": 1.0345439910888672, "learning_rate": 1.024019537152768e-05, "loss": 0.3557, "num_input_tokens_seen": 85541904, "step": 89605 }, { "epoch": 7.309731625744351, "grad_norm": 146.36209106445312, "learning_rate": 1.0237322766287093e-05, "loss": 0.3876, "num_input_tokens_seen": 85547536, "step": 89610 }, { "epoch": 7.31013948935476, "grad_norm": 1.195168137550354, "learning_rate": 1.02344504602797e-05, "loss": 0.5909, "num_input_tokens_seen": 85552592, "step": 89615 }, { "epoch": 7.310547352965169, "grad_norm": 0.4477856755256653, "learning_rate": 1.0231578453563726e-05, "loss": 0.2838, "num_input_tokens_seen": 85557984, "step": 89620 }, { "epoch": 7.310955216575577, "grad_norm": 7.4771409034729, "learning_rate": 1.0228706746197367e-05, "loss": 0.4605, "num_input_tokens_seen": 85562528, "step": 89625 }, { "epoch": 7.311363080185986, "grad_norm": 0.8641887307167053, "learning_rate": 1.0225835338238857e-05, "loss": 0.4067, "num_input_tokens_seen": 85567008, "step": 89630 }, { "epoch": 7.311770943796395, "grad_norm": 0.9961604475975037, "learning_rate": 1.0222964229746388e-05, "loss": 0.3197, "num_input_tokens_seen": 85572160, "step": 89635 }, { "epoch": 7.312178807406803, "grad_norm": 1.968631625175476, "learning_rate": 1.0220093420778151e-05, "loss": 0.1926, "num_input_tokens_seen": 85576608, "step": 89640 }, { "epoch": 7.312586671017212, "grad_norm": 13.473402976989746, "learning_rate": 1.0217222911392343e-05, "loss": 0.4888, "num_input_tokens_seen": 85581392, "step": 89645 }, { "epoch": 7.31299453462762, "grad_norm": 1.354413628578186, "learning_rate": 1.0214352701647145e-05, "loss": 0.2821, "num_input_tokens_seen": 85586416, "step": 89650 }, { "epoch": 7.313402398238029, "grad_norm": 0.6054708361625671, "learning_rate": 1.0211482791600736e-05, "loss": 0.6269, "num_input_tokens_seen": 85590048, "step": 89655 }, { "epoch": 7.313810261848438, "grad_norm": 0.4373200237751007, "learning_rate": 1.0208613181311283e-05, "loss": 0.2471, "num_input_tokens_seen": 85594544, "step": 89660 }, { "epoch": 7.314218125458846, "grad_norm": 4.013500213623047, "learning_rate": 1.0205743870836955e-05, "loss": 0.2683, "num_input_tokens_seen": 85599232, "step": 89665 }, { "epoch": 7.314625989069255, "grad_norm": 2.1284353733062744, "learning_rate": 1.0202874860235908e-05, "loss": 0.3108, "num_input_tokens_seen": 85603696, "step": 89670 }, { "epoch": 7.315033852679664, "grad_norm": 0.4053824245929718, "learning_rate": 1.0200006149566307e-05, "loss": 0.3498, "num_input_tokens_seen": 85608928, "step": 89675 }, { "epoch": 7.3154417162900724, "grad_norm": 6.157281875610352, "learning_rate": 1.0197137738886292e-05, "loss": 0.2742, "num_input_tokens_seen": 85613456, "step": 89680 }, { "epoch": 7.3158495799004815, "grad_norm": 0.43239206075668335, "learning_rate": 1.0194269628254005e-05, "loss": 0.3557, "num_input_tokens_seen": 85618400, "step": 89685 }, { "epoch": 7.31625744351089, "grad_norm": 2.248246908187866, "learning_rate": 1.0191401817727572e-05, "loss": 0.3092, "num_input_tokens_seen": 85623712, "step": 89690 }, { "epoch": 7.316665307121299, "grad_norm": 1.1945013999938965, "learning_rate": 1.0188534307365141e-05, "loss": 0.3745, "num_input_tokens_seen": 85628480, "step": 89695 }, { "epoch": 7.317073170731708, "grad_norm": 1.2275944948196411, "learning_rate": 1.0185667097224826e-05, "loss": 0.4536, "num_input_tokens_seen": 85632976, "step": 89700 }, { "epoch": 7.317481034342116, "grad_norm": 8.268436431884766, "learning_rate": 1.0182800187364747e-05, "loss": 0.2915, "num_input_tokens_seen": 85637136, "step": 89705 }, { "epoch": 7.317888897952525, "grad_norm": 8.799899101257324, "learning_rate": 1.0179933577843006e-05, "loss": 0.2119, "num_input_tokens_seen": 85642192, "step": 89710 }, { "epoch": 7.318296761562934, "grad_norm": 0.607925295829773, "learning_rate": 1.017706726871771e-05, "loss": 0.2857, "num_input_tokens_seen": 85647040, "step": 89715 }, { "epoch": 7.318704625173342, "grad_norm": 1.3116261959075928, "learning_rate": 1.0174201260046968e-05, "loss": 0.4127, "num_input_tokens_seen": 85651472, "step": 89720 }, { "epoch": 7.319112488783751, "grad_norm": 4.056443691253662, "learning_rate": 1.0171335551888866e-05, "loss": 0.2576, "num_input_tokens_seen": 85655728, "step": 89725 }, { "epoch": 7.319520352394159, "grad_norm": 1.88600492477417, "learning_rate": 1.0168470144301492e-05, "loss": 0.5214, "num_input_tokens_seen": 85660752, "step": 89730 }, { "epoch": 7.319928216004568, "grad_norm": 1.9700413942337036, "learning_rate": 1.0165605037342918e-05, "loss": 0.3779, "num_input_tokens_seen": 85664880, "step": 89735 }, { "epoch": 7.320336079614977, "grad_norm": 3.6189210414886475, "learning_rate": 1.0162740231071235e-05, "loss": 0.3456, "num_input_tokens_seen": 85669824, "step": 89740 }, { "epoch": 7.320743943225385, "grad_norm": 1.8116828203201294, "learning_rate": 1.0159875725544505e-05, "loss": 0.3908, "num_input_tokens_seen": 85673888, "step": 89745 }, { "epoch": 7.321151806835794, "grad_norm": 22.51289939880371, "learning_rate": 1.0157011520820784e-05, "loss": 0.2844, "num_input_tokens_seen": 85679072, "step": 89750 }, { "epoch": 7.321559670446203, "grad_norm": 1.2982580661773682, "learning_rate": 1.0154147616958135e-05, "loss": 0.322, "num_input_tokens_seen": 85684304, "step": 89755 }, { "epoch": 7.321967534056611, "grad_norm": 14.92467975616455, "learning_rate": 1.0151284014014596e-05, "loss": 0.2541, "num_input_tokens_seen": 85689312, "step": 89760 }, { "epoch": 7.32237539766702, "grad_norm": 3.2365288734436035, "learning_rate": 1.0148420712048226e-05, "loss": 0.2863, "num_input_tokens_seen": 85692928, "step": 89765 }, { "epoch": 7.322783261277429, "grad_norm": 27.68419075012207, "learning_rate": 1.0145557711117062e-05, "loss": 0.2867, "num_input_tokens_seen": 85697328, "step": 89770 }, { "epoch": 7.323191124887837, "grad_norm": 56.02259063720703, "learning_rate": 1.014269501127913e-05, "loss": 0.2858, "num_input_tokens_seen": 85701808, "step": 89775 }, { "epoch": 7.323598988498246, "grad_norm": 1.5650757551193237, "learning_rate": 1.0139832612592454e-05, "loss": 0.3468, "num_input_tokens_seen": 85706864, "step": 89780 }, { "epoch": 7.3240068521086545, "grad_norm": 4.663955211639404, "learning_rate": 1.013697051511506e-05, "loss": 0.3961, "num_input_tokens_seen": 85711600, "step": 89785 }, { "epoch": 7.3244147157190636, "grad_norm": 1.67792546749115, "learning_rate": 1.0134108718904955e-05, "loss": 0.3058, "num_input_tokens_seen": 85716288, "step": 89790 }, { "epoch": 7.324822579329473, "grad_norm": 0.9618801474571228, "learning_rate": 1.0131247224020152e-05, "loss": 0.3131, "num_input_tokens_seen": 85719920, "step": 89795 }, { "epoch": 7.325230442939881, "grad_norm": 11.266817092895508, "learning_rate": 1.0128386030518649e-05, "loss": 0.2107, "num_input_tokens_seen": 85724656, "step": 89800 }, { "epoch": 7.32563830655029, "grad_norm": 1.1842353343963623, "learning_rate": 1.0125525138458433e-05, "loss": 0.3144, "num_input_tokens_seen": 85729632, "step": 89805 }, { "epoch": 7.326046170160698, "grad_norm": 3.8713526725769043, "learning_rate": 1.012266454789751e-05, "loss": 0.3769, "num_input_tokens_seen": 85734960, "step": 89810 }, { "epoch": 7.326454033771107, "grad_norm": 25.816612243652344, "learning_rate": 1.0119804258893855e-05, "loss": 0.3347, "num_input_tokens_seen": 85739632, "step": 89815 }, { "epoch": 7.326861897381516, "grad_norm": 3.3701424598693848, "learning_rate": 1.0116944271505444e-05, "loss": 0.7568, "num_input_tokens_seen": 85743568, "step": 89820 }, { "epoch": 7.327269760991924, "grad_norm": 1.2881617546081543, "learning_rate": 1.011408458579024e-05, "loss": 0.3366, "num_input_tokens_seen": 85748848, "step": 89825 }, { "epoch": 7.327677624602333, "grad_norm": 0.5144095420837402, "learning_rate": 1.0111225201806227e-05, "loss": 0.1916, "num_input_tokens_seen": 85753360, "step": 89830 }, { "epoch": 7.328085488212742, "grad_norm": 1.0392142534255981, "learning_rate": 1.010836611961135e-05, "loss": 0.1595, "num_input_tokens_seen": 85758608, "step": 89835 }, { "epoch": 7.32849335182315, "grad_norm": 1.832602620124817, "learning_rate": 1.0105507339263564e-05, "loss": 0.2061, "num_input_tokens_seen": 85763392, "step": 89840 }, { "epoch": 7.328901215433559, "grad_norm": 4.745311260223389, "learning_rate": 1.0102648860820818e-05, "loss": 0.3568, "num_input_tokens_seen": 85768480, "step": 89845 }, { "epoch": 7.329309079043968, "grad_norm": 0.5653929710388184, "learning_rate": 1.0099790684341037e-05, "loss": 0.2241, "num_input_tokens_seen": 85773280, "step": 89850 }, { "epoch": 7.329716942654376, "grad_norm": 0.44961467385292053, "learning_rate": 1.0096932809882178e-05, "loss": 0.1553, "num_input_tokens_seen": 85778192, "step": 89855 }, { "epoch": 7.330124806264785, "grad_norm": 0.9321066737174988, "learning_rate": 1.0094075237502157e-05, "loss": 0.2312, "num_input_tokens_seen": 85783072, "step": 89860 }, { "epoch": 7.330532669875193, "grad_norm": 1.4446635246276855, "learning_rate": 1.0091217967258899e-05, "loss": 0.3377, "num_input_tokens_seen": 85787376, "step": 89865 }, { "epoch": 7.330940533485602, "grad_norm": 4.812520503997803, "learning_rate": 1.0088360999210308e-05, "loss": 0.4431, "num_input_tokens_seen": 85793152, "step": 89870 }, { "epoch": 7.331348397096011, "grad_norm": 0.6601043343544006, "learning_rate": 1.008550433341431e-05, "loss": 0.3413, "num_input_tokens_seen": 85798208, "step": 89875 }, { "epoch": 7.3317562607064195, "grad_norm": 2.9380767345428467, "learning_rate": 1.0082647969928804e-05, "loss": 0.3691, "num_input_tokens_seen": 85803312, "step": 89880 }, { "epoch": 7.3321641243168285, "grad_norm": 0.7728389501571655, "learning_rate": 1.0079791908811683e-05, "loss": 0.1154, "num_input_tokens_seen": 85807136, "step": 89885 }, { "epoch": 7.3325719879272375, "grad_norm": 54.59953689575195, "learning_rate": 1.0076936150120835e-05, "loss": 0.3655, "num_input_tokens_seen": 85811776, "step": 89890 }, { "epoch": 7.332979851537646, "grad_norm": 0.8326742053031921, "learning_rate": 1.0074080693914157e-05, "loss": 0.3315, "num_input_tokens_seen": 85816880, "step": 89895 }, { "epoch": 7.333387715148055, "grad_norm": 34.87828826904297, "learning_rate": 1.0071225540249517e-05, "loss": 0.4356, "num_input_tokens_seen": 85822080, "step": 89900 }, { "epoch": 7.333795578758463, "grad_norm": 0.7987156510353088, "learning_rate": 1.0068370689184798e-05, "loss": 0.2703, "num_input_tokens_seen": 85826976, "step": 89905 }, { "epoch": 7.334203442368872, "grad_norm": 10.370447158813477, "learning_rate": 1.0065516140777856e-05, "loss": 0.3706, "num_input_tokens_seen": 85831632, "step": 89910 }, { "epoch": 7.334611305979281, "grad_norm": 14.792192459106445, "learning_rate": 1.0062661895086559e-05, "loss": 0.5372, "num_input_tokens_seen": 85836128, "step": 89915 }, { "epoch": 7.335019169589689, "grad_norm": 0.8572118282318115, "learning_rate": 1.0059807952168754e-05, "loss": 0.3861, "num_input_tokens_seen": 85840480, "step": 89920 }, { "epoch": 7.335427033200098, "grad_norm": 0.2562749981880188, "learning_rate": 1.0056954312082296e-05, "loss": 0.3365, "num_input_tokens_seen": 85845168, "step": 89925 }, { "epoch": 7.335834896810507, "grad_norm": 1.7561705112457275, "learning_rate": 1.0054100974885024e-05, "loss": 0.3899, "num_input_tokens_seen": 85850096, "step": 89930 }, { "epoch": 7.336242760420915, "grad_norm": 1.337564468383789, "learning_rate": 1.0051247940634767e-05, "loss": 0.4501, "num_input_tokens_seen": 85854912, "step": 89935 }, { "epoch": 7.336650624031324, "grad_norm": 1.8343479633331299, "learning_rate": 1.0048395209389368e-05, "loss": 0.3916, "num_input_tokens_seen": 85858992, "step": 89940 }, { "epoch": 7.337058487641732, "grad_norm": 1.4336210489273071, "learning_rate": 1.0045542781206646e-05, "loss": 0.204, "num_input_tokens_seen": 85863152, "step": 89945 }, { "epoch": 7.337466351252141, "grad_norm": 4.091008186340332, "learning_rate": 1.0042690656144419e-05, "loss": 0.2926, "num_input_tokens_seen": 85867744, "step": 89950 }, { "epoch": 7.33787421486255, "grad_norm": 5.0456671714782715, "learning_rate": 1.0039838834260492e-05, "loss": 0.608, "num_input_tokens_seen": 85874192, "step": 89955 }, { "epoch": 7.338282078472958, "grad_norm": 35.77069091796875, "learning_rate": 1.003698731561267e-05, "loss": 0.5615, "num_input_tokens_seen": 85879616, "step": 89960 }, { "epoch": 7.338689942083367, "grad_norm": 4.187093257904053, "learning_rate": 1.0034136100258765e-05, "loss": 0.6177, "num_input_tokens_seen": 85884624, "step": 89965 }, { "epoch": 7.339097805693776, "grad_norm": 4.824526309967041, "learning_rate": 1.0031285188256561e-05, "loss": 0.2669, "num_input_tokens_seen": 85890080, "step": 89970 }, { "epoch": 7.3395056693041845, "grad_norm": 2.5223989486694336, "learning_rate": 1.0028434579663848e-05, "loss": 0.4116, "num_input_tokens_seen": 85894384, "step": 89975 }, { "epoch": 7.3399135329145935, "grad_norm": 11.4124174118042, "learning_rate": 1.0025584274538393e-05, "loss": 0.2711, "num_input_tokens_seen": 85899296, "step": 89980 }, { "epoch": 7.3403213965250025, "grad_norm": 1.1234545707702637, "learning_rate": 1.0022734272937991e-05, "loss": 0.3793, "num_input_tokens_seen": 85904304, "step": 89985 }, { "epoch": 7.340729260135411, "grad_norm": 2.082165002822876, "learning_rate": 1.0019884574920404e-05, "loss": 0.2673, "num_input_tokens_seen": 85910224, "step": 89990 }, { "epoch": 7.34113712374582, "grad_norm": 7.8246049880981445, "learning_rate": 1.0017035180543389e-05, "loss": 0.2566, "num_input_tokens_seen": 85914720, "step": 89995 }, { "epoch": 7.341544987356228, "grad_norm": 0.8245742321014404, "learning_rate": 1.0014186089864702e-05, "loss": 0.2792, "num_input_tokens_seen": 85919536, "step": 90000 }, { "epoch": 7.341952850966637, "grad_norm": 0.7179188132286072, "learning_rate": 1.0011337302942089e-05, "loss": 0.3294, "num_input_tokens_seen": 85925152, "step": 90005 }, { "epoch": 7.342360714577046, "grad_norm": 22.852142333984375, "learning_rate": 1.000848881983331e-05, "loss": 0.3756, "num_input_tokens_seen": 85930144, "step": 90010 }, { "epoch": 7.342768578187454, "grad_norm": 1.668700098991394, "learning_rate": 1.0005640640596089e-05, "loss": 0.3481, "num_input_tokens_seen": 85935712, "step": 90015 }, { "epoch": 7.343176441797863, "grad_norm": 81.73220825195312, "learning_rate": 1.000279276528816e-05, "loss": 0.3212, "num_input_tokens_seen": 85941216, "step": 90020 }, { "epoch": 7.343584305408271, "grad_norm": 0.7225087881088257, "learning_rate": 9.999945193967245e-06, "loss": 0.413, "num_input_tokens_seen": 85945376, "step": 90025 }, { "epoch": 7.34399216901868, "grad_norm": 12.192193984985352, "learning_rate": 9.997097926691071e-06, "loss": 0.2542, "num_input_tokens_seen": 85950384, "step": 90030 }, { "epoch": 7.344400032629089, "grad_norm": 0.4096616208553314, "learning_rate": 9.994250963517349e-06, "loss": 0.175, "num_input_tokens_seen": 85956112, "step": 90035 }, { "epoch": 7.344807896239497, "grad_norm": 0.8363901972770691, "learning_rate": 9.991404304503784e-06, "loss": 0.3307, "num_input_tokens_seen": 85960512, "step": 90040 }, { "epoch": 7.345215759849906, "grad_norm": 17.01426887512207, "learning_rate": 9.988557949708074e-06, "loss": 0.1929, "num_input_tokens_seen": 85964336, "step": 90045 }, { "epoch": 7.345623623460315, "grad_norm": 2.5552618503570557, "learning_rate": 9.985711899187919e-06, "loss": 0.2803, "num_input_tokens_seen": 85969104, "step": 90050 }, { "epoch": 7.346031487070723, "grad_norm": 1.167349100112915, "learning_rate": 9.982866153000994e-06, "loss": 0.3672, "num_input_tokens_seen": 85974176, "step": 90055 }, { "epoch": 7.346439350681132, "grad_norm": 0.4018446207046509, "learning_rate": 9.980020711205e-06, "loss": 0.234, "num_input_tokens_seen": 85978112, "step": 90060 }, { "epoch": 7.346847214291541, "grad_norm": 26.99216079711914, "learning_rate": 9.9771755738576e-06, "loss": 0.4489, "num_input_tokens_seen": 85983120, "step": 90065 }, { "epoch": 7.347255077901949, "grad_norm": 28.66765594482422, "learning_rate": 9.974330741016471e-06, "loss": 0.3298, "num_input_tokens_seen": 85988528, "step": 90070 }, { "epoch": 7.347662941512358, "grad_norm": 15.659721374511719, "learning_rate": 9.971486212739273e-06, "loss": 0.304, "num_input_tokens_seen": 85993488, "step": 90075 }, { "epoch": 7.348070805122767, "grad_norm": 24.610279083251953, "learning_rate": 9.968641989083663e-06, "loss": 0.3753, "num_input_tokens_seen": 85997648, "step": 90080 }, { "epoch": 7.348478668733176, "grad_norm": 19.882991790771484, "learning_rate": 9.965798070107294e-06, "loss": 0.2277, "num_input_tokens_seen": 86001632, "step": 90085 }, { "epoch": 7.348886532343585, "grad_norm": 0.9687944650650024, "learning_rate": 9.962954455867798e-06, "loss": 0.5049, "num_input_tokens_seen": 86007008, "step": 90090 }, { "epoch": 7.349294395953993, "grad_norm": 1.5731693506240845, "learning_rate": 9.960111146422835e-06, "loss": 0.3314, "num_input_tokens_seen": 86012160, "step": 90095 }, { "epoch": 7.349702259564402, "grad_norm": 0.6826485991477966, "learning_rate": 9.957268141830032e-06, "loss": 0.2749, "num_input_tokens_seen": 86017872, "step": 90100 }, { "epoch": 7.350110123174811, "grad_norm": 24.6173152923584, "learning_rate": 9.954425442147006e-06, "loss": 0.5426, "num_input_tokens_seen": 86023024, "step": 90105 }, { "epoch": 7.350517986785219, "grad_norm": 1.9399480819702148, "learning_rate": 9.951583047431387e-06, "loss": 0.2531, "num_input_tokens_seen": 86028064, "step": 90110 }, { "epoch": 7.350925850395628, "grad_norm": 25.932510375976562, "learning_rate": 9.948740957740773e-06, "loss": 0.3013, "num_input_tokens_seen": 86033248, "step": 90115 }, { "epoch": 7.351333714006036, "grad_norm": 1.6000889539718628, "learning_rate": 9.945899173132797e-06, "loss": 0.4805, "num_input_tokens_seen": 86038848, "step": 90120 }, { "epoch": 7.351741577616445, "grad_norm": 0.8123868107795715, "learning_rate": 9.943057693665046e-06, "loss": 0.3114, "num_input_tokens_seen": 86043280, "step": 90125 }, { "epoch": 7.352149441226854, "grad_norm": 0.593201756477356, "learning_rate": 9.940216519395118e-06, "loss": 0.1866, "num_input_tokens_seen": 86048832, "step": 90130 }, { "epoch": 7.352557304837262, "grad_norm": 4.743459701538086, "learning_rate": 9.937375650380593e-06, "loss": 0.2455, "num_input_tokens_seen": 86053872, "step": 90135 }, { "epoch": 7.352965168447671, "grad_norm": 17.152938842773438, "learning_rate": 9.93453508667907e-06, "loss": 0.4775, "num_input_tokens_seen": 86058816, "step": 90140 }, { "epoch": 7.35337303205808, "grad_norm": 55.65955352783203, "learning_rate": 9.93169482834812e-06, "loss": 0.432, "num_input_tokens_seen": 86063008, "step": 90145 }, { "epoch": 7.353780895668488, "grad_norm": 0.676190197467804, "learning_rate": 9.928854875445316e-06, "loss": 0.4287, "num_input_tokens_seen": 86068384, "step": 90150 }, { "epoch": 7.354188759278897, "grad_norm": 0.6882506608963013, "learning_rate": 9.926015228028216e-06, "loss": 0.4634, "num_input_tokens_seen": 86073520, "step": 90155 }, { "epoch": 7.354596622889305, "grad_norm": 32.261077880859375, "learning_rate": 9.923175886154374e-06, "loss": 0.3353, "num_input_tokens_seen": 86078640, "step": 90160 }, { "epoch": 7.355004486499714, "grad_norm": 2.4424052238464355, "learning_rate": 9.920336849881359e-06, "loss": 0.2667, "num_input_tokens_seen": 86082192, "step": 90165 }, { "epoch": 7.355412350110123, "grad_norm": 10.820544242858887, "learning_rate": 9.917498119266707e-06, "loss": 0.3087, "num_input_tokens_seen": 86086976, "step": 90170 }, { "epoch": 7.3558202137205315, "grad_norm": 1.5937564373016357, "learning_rate": 9.914659694367962e-06, "loss": 0.2806, "num_input_tokens_seen": 86090912, "step": 90175 }, { "epoch": 7.3562280773309405, "grad_norm": 39.41350173950195, "learning_rate": 9.911821575242644e-06, "loss": 0.4529, "num_input_tokens_seen": 86095312, "step": 90180 }, { "epoch": 7.3566359409413495, "grad_norm": 55.276458740234375, "learning_rate": 9.908983761948301e-06, "loss": 0.3787, "num_input_tokens_seen": 86100944, "step": 90185 }, { "epoch": 7.357043804551758, "grad_norm": 46.23064041137695, "learning_rate": 9.906146254542444e-06, "loss": 0.4286, "num_input_tokens_seen": 86105600, "step": 90190 }, { "epoch": 7.357451668162167, "grad_norm": 42.2498779296875, "learning_rate": 9.903309053082591e-06, "loss": 0.3336, "num_input_tokens_seen": 86110592, "step": 90195 }, { "epoch": 7.357859531772576, "grad_norm": 2.616431474685669, "learning_rate": 9.900472157626245e-06, "loss": 0.3413, "num_input_tokens_seen": 86115200, "step": 90200 }, { "epoch": 7.358267395382984, "grad_norm": 0.7789286971092224, "learning_rate": 9.897635568230915e-06, "loss": 0.2568, "num_input_tokens_seen": 86120560, "step": 90205 }, { "epoch": 7.358675258993393, "grad_norm": 10.934930801391602, "learning_rate": 9.894799284954092e-06, "loss": 0.2714, "num_input_tokens_seen": 86125200, "step": 90210 }, { "epoch": 7.359083122603801, "grad_norm": 0.4380720555782318, "learning_rate": 9.891963307853273e-06, "loss": 0.3645, "num_input_tokens_seen": 86130016, "step": 90215 }, { "epoch": 7.35949098621421, "grad_norm": 0.6781414151191711, "learning_rate": 9.889127636985934e-06, "loss": 0.1667, "num_input_tokens_seen": 86133808, "step": 90220 }, { "epoch": 7.359898849824619, "grad_norm": 0.8167148232460022, "learning_rate": 9.88629227240955e-06, "loss": 0.3521, "num_input_tokens_seen": 86138800, "step": 90225 }, { "epoch": 7.360306713435027, "grad_norm": 96.44693756103516, "learning_rate": 9.883457214181607e-06, "loss": 0.4465, "num_input_tokens_seen": 86143424, "step": 90230 }, { "epoch": 7.360714577045436, "grad_norm": 1.1283180713653564, "learning_rate": 9.880622462359562e-06, "loss": 0.3804, "num_input_tokens_seen": 86148016, "step": 90235 }, { "epoch": 7.361122440655845, "grad_norm": 1.2984806299209595, "learning_rate": 9.877788017000877e-06, "loss": 0.2516, "num_input_tokens_seen": 86152304, "step": 90240 }, { "epoch": 7.361530304266253, "grad_norm": 2.865034818649292, "learning_rate": 9.874953878162996e-06, "loss": 0.3909, "num_input_tokens_seen": 86158000, "step": 90245 }, { "epoch": 7.361938167876662, "grad_norm": 0.815270185470581, "learning_rate": 9.87212004590338e-06, "loss": 0.5754, "num_input_tokens_seen": 86163104, "step": 90250 }, { "epoch": 7.36234603148707, "grad_norm": 0.58136385679245, "learning_rate": 9.869286520279461e-06, "loss": 0.5396, "num_input_tokens_seen": 86167616, "step": 90255 }, { "epoch": 7.362753895097479, "grad_norm": 2.4374606609344482, "learning_rate": 9.866453301348674e-06, "loss": 0.3525, "num_input_tokens_seen": 86172432, "step": 90260 }, { "epoch": 7.363161758707888, "grad_norm": 10.460383415222168, "learning_rate": 9.863620389168452e-06, "loss": 0.3478, "num_input_tokens_seen": 86176944, "step": 90265 }, { "epoch": 7.3635696223182965, "grad_norm": 0.9859727621078491, "learning_rate": 9.8607877837962e-06, "loss": 0.7121, "num_input_tokens_seen": 86181328, "step": 90270 }, { "epoch": 7.3639774859287055, "grad_norm": 0.4000513553619385, "learning_rate": 9.857955485289358e-06, "loss": 0.2355, "num_input_tokens_seen": 86185824, "step": 90275 }, { "epoch": 7.3643853495391145, "grad_norm": 0.6027957797050476, "learning_rate": 9.855123493705323e-06, "loss": 0.3039, "num_input_tokens_seen": 86191168, "step": 90280 }, { "epoch": 7.364793213149523, "grad_norm": 3.684464454650879, "learning_rate": 9.8522918091015e-06, "loss": 0.3043, "num_input_tokens_seen": 86196144, "step": 90285 }, { "epoch": 7.365201076759932, "grad_norm": 1.744780421257019, "learning_rate": 9.849460431535277e-06, "loss": 0.2108, "num_input_tokens_seen": 86201040, "step": 90290 }, { "epoch": 7.36560894037034, "grad_norm": 6.6116132736206055, "learning_rate": 9.846629361064061e-06, "loss": 0.3197, "num_input_tokens_seen": 86205888, "step": 90295 }, { "epoch": 7.366016803980749, "grad_norm": 28.893611907958984, "learning_rate": 9.843798597745227e-06, "loss": 0.4295, "num_input_tokens_seen": 86210448, "step": 90300 }, { "epoch": 7.366424667591158, "grad_norm": 0.8644488453865051, "learning_rate": 9.840968141636159e-06, "loss": 0.5198, "num_input_tokens_seen": 86215488, "step": 90305 }, { "epoch": 7.366832531201566, "grad_norm": 12.752199172973633, "learning_rate": 9.838137992794222e-06, "loss": 0.4075, "num_input_tokens_seen": 86220256, "step": 90310 }, { "epoch": 7.367240394811975, "grad_norm": 5.1973161697387695, "learning_rate": 9.835308151276778e-06, "loss": 0.3015, "num_input_tokens_seen": 86224336, "step": 90315 }, { "epoch": 7.367648258422384, "grad_norm": 14.041523933410645, "learning_rate": 9.832478617141203e-06, "loss": 0.4664, "num_input_tokens_seen": 86229824, "step": 90320 }, { "epoch": 7.368056122032792, "grad_norm": 65.55024719238281, "learning_rate": 9.82964939044484e-06, "loss": 0.1901, "num_input_tokens_seen": 86234752, "step": 90325 }, { "epoch": 7.368463985643201, "grad_norm": 5.4453043937683105, "learning_rate": 9.826820471245037e-06, "loss": 0.3228, "num_input_tokens_seen": 86240032, "step": 90330 }, { "epoch": 7.36887184925361, "grad_norm": 0.5889990925788879, "learning_rate": 9.823991859599138e-06, "loss": 0.3258, "num_input_tokens_seen": 86244320, "step": 90335 }, { "epoch": 7.369279712864018, "grad_norm": 1.0026229619979858, "learning_rate": 9.821163555564476e-06, "loss": 0.3739, "num_input_tokens_seen": 86248848, "step": 90340 }, { "epoch": 7.369687576474427, "grad_norm": 35.9190559387207, "learning_rate": 9.818335559198376e-06, "loss": 0.4481, "num_input_tokens_seen": 86253616, "step": 90345 }, { "epoch": 7.370095440084835, "grad_norm": 2.666893482208252, "learning_rate": 9.815507870558163e-06, "loss": 0.4822, "num_input_tokens_seen": 86258688, "step": 90350 }, { "epoch": 7.370503303695244, "grad_norm": 35.71880340576172, "learning_rate": 9.812680489701157e-06, "loss": 0.3934, "num_input_tokens_seen": 86263712, "step": 90355 }, { "epoch": 7.370911167305653, "grad_norm": 3.1341774463653564, "learning_rate": 9.809853416684654e-06, "loss": 0.6748, "num_input_tokens_seen": 86268208, "step": 90360 }, { "epoch": 7.371319030916061, "grad_norm": 2.8765101432800293, "learning_rate": 9.807026651565974e-06, "loss": 0.1751, "num_input_tokens_seen": 86272928, "step": 90365 }, { "epoch": 7.3717268945264705, "grad_norm": 10.080643653869629, "learning_rate": 9.804200194402408e-06, "loss": 0.3162, "num_input_tokens_seen": 86277232, "step": 90370 }, { "epoch": 7.372134758136879, "grad_norm": 7.202945709228516, "learning_rate": 9.801374045251246e-06, "loss": 0.3429, "num_input_tokens_seen": 86282288, "step": 90375 }, { "epoch": 7.372542621747288, "grad_norm": 0.8051270842552185, "learning_rate": 9.79854820416977e-06, "loss": 0.2706, "num_input_tokens_seen": 86287344, "step": 90380 }, { "epoch": 7.372950485357697, "grad_norm": 0.9305030107498169, "learning_rate": 9.795722671215266e-06, "loss": 0.3106, "num_input_tokens_seen": 86291584, "step": 90385 }, { "epoch": 7.373358348968105, "grad_norm": 1.0514355897903442, "learning_rate": 9.792897446445008e-06, "loss": 0.1901, "num_input_tokens_seen": 86295392, "step": 90390 }, { "epoch": 7.373766212578514, "grad_norm": 13.416471481323242, "learning_rate": 9.790072529916253e-06, "loss": 0.4294, "num_input_tokens_seen": 86300256, "step": 90395 }, { "epoch": 7.374174076188923, "grad_norm": 18.322994232177734, "learning_rate": 9.787247921686262e-06, "loss": 0.3406, "num_input_tokens_seen": 86305472, "step": 90400 }, { "epoch": 7.374581939799331, "grad_norm": 81.31822204589844, "learning_rate": 9.78442362181229e-06, "loss": 0.3769, "num_input_tokens_seen": 86309808, "step": 90405 }, { "epoch": 7.37498980340974, "grad_norm": 24.318222045898438, "learning_rate": 9.781599630351588e-06, "loss": 0.4867, "num_input_tokens_seen": 86313888, "step": 90410 }, { "epoch": 7.375397667020149, "grad_norm": 0.44151008129119873, "learning_rate": 9.778775947361398e-06, "loss": 0.451, "num_input_tokens_seen": 86318976, "step": 90415 }, { "epoch": 7.375805530630557, "grad_norm": 1.5512148141860962, "learning_rate": 9.77595257289895e-06, "loss": 0.6776, "num_input_tokens_seen": 86324272, "step": 90420 }, { "epoch": 7.376213394240966, "grad_norm": 6.624149322509766, "learning_rate": 9.773129507021466e-06, "loss": 0.3028, "num_input_tokens_seen": 86328560, "step": 90425 }, { "epoch": 7.376621257851374, "grad_norm": 1.387696385383606, "learning_rate": 9.770306749786187e-06, "loss": 0.2618, "num_input_tokens_seen": 86333744, "step": 90430 }, { "epoch": 7.377029121461783, "grad_norm": 31.39737892150879, "learning_rate": 9.767484301250317e-06, "loss": 0.3695, "num_input_tokens_seen": 86338560, "step": 90435 }, { "epoch": 7.377436985072192, "grad_norm": 15.373868942260742, "learning_rate": 9.764662161471069e-06, "loss": 0.3865, "num_input_tokens_seen": 86343456, "step": 90440 }, { "epoch": 7.3778448486826, "grad_norm": 3.025376558303833, "learning_rate": 9.761840330505637e-06, "loss": 0.4032, "num_input_tokens_seen": 86347184, "step": 90445 }, { "epoch": 7.378252712293009, "grad_norm": 1.3041443824768066, "learning_rate": 9.759018808411233e-06, "loss": 0.3503, "num_input_tokens_seen": 86351504, "step": 90450 }, { "epoch": 7.378660575903418, "grad_norm": 1.1877039670944214, "learning_rate": 9.756197595245043e-06, "loss": 0.4406, "num_input_tokens_seen": 86356672, "step": 90455 }, { "epoch": 7.379068439513826, "grad_norm": 1.3662388324737549, "learning_rate": 9.753376691064247e-06, "loss": 0.3439, "num_input_tokens_seen": 86361472, "step": 90460 }, { "epoch": 7.379476303124235, "grad_norm": 1.765675663948059, "learning_rate": 9.750556095926031e-06, "loss": 0.292, "num_input_tokens_seen": 86366688, "step": 90465 }, { "epoch": 7.3798841667346435, "grad_norm": 0.3813454508781433, "learning_rate": 9.74773580988756e-06, "loss": 0.2982, "num_input_tokens_seen": 86371328, "step": 90470 }, { "epoch": 7.3802920303450525, "grad_norm": 21.778024673461914, "learning_rate": 9.744915833006005e-06, "loss": 0.2697, "num_input_tokens_seen": 86376352, "step": 90475 }, { "epoch": 7.3806998939554616, "grad_norm": 10.219866752624512, "learning_rate": 9.74209616533852e-06, "loss": 0.2724, "num_input_tokens_seen": 86382096, "step": 90480 }, { "epoch": 7.38110775756587, "grad_norm": 1.3858788013458252, "learning_rate": 9.739276806942269e-06, "loss": 0.2915, "num_input_tokens_seen": 86387152, "step": 90485 }, { "epoch": 7.381515621176279, "grad_norm": 1.3267602920532227, "learning_rate": 9.73645775787438e-06, "loss": 0.4304, "num_input_tokens_seen": 86391840, "step": 90490 }, { "epoch": 7.381923484786688, "grad_norm": 40.75156784057617, "learning_rate": 9.733639018192014e-06, "loss": 0.4107, "num_input_tokens_seen": 86397168, "step": 90495 }, { "epoch": 7.382331348397096, "grad_norm": 2.6828863620758057, "learning_rate": 9.730820587952299e-06, "loss": 0.4965, "num_input_tokens_seen": 86401344, "step": 90500 }, { "epoch": 7.382739212007505, "grad_norm": 1.254729151725769, "learning_rate": 9.728002467212363e-06, "loss": 0.3076, "num_input_tokens_seen": 86406928, "step": 90505 }, { "epoch": 7.383147075617913, "grad_norm": 5.537779808044434, "learning_rate": 9.725184656029326e-06, "loss": 0.2376, "num_input_tokens_seen": 86411744, "step": 90510 }, { "epoch": 7.383554939228322, "grad_norm": 71.75226593017578, "learning_rate": 9.722367154460298e-06, "loss": 0.5239, "num_input_tokens_seen": 86415824, "step": 90515 }, { "epoch": 7.383962802838731, "grad_norm": 1.9447063207626343, "learning_rate": 9.719549962562405e-06, "loss": 0.1967, "num_input_tokens_seen": 86421264, "step": 90520 }, { "epoch": 7.384370666449139, "grad_norm": 0.5176811218261719, "learning_rate": 9.71673308039274e-06, "loss": 0.2504, "num_input_tokens_seen": 86425792, "step": 90525 }, { "epoch": 7.384778530059548, "grad_norm": 12.449360847473145, "learning_rate": 9.713916508008402e-06, "loss": 0.2757, "num_input_tokens_seen": 86430752, "step": 90530 }, { "epoch": 7.385186393669957, "grad_norm": 2.5218610763549805, "learning_rate": 9.71110024546647e-06, "loss": 0.4138, "num_input_tokens_seen": 86436000, "step": 90535 }, { "epoch": 7.385594257280365, "grad_norm": 1.8699352741241455, "learning_rate": 9.708284292824051e-06, "loss": 0.3336, "num_input_tokens_seen": 86439888, "step": 90540 }, { "epoch": 7.386002120890774, "grad_norm": 0.8032630681991577, "learning_rate": 9.70546865013821e-06, "loss": 0.3968, "num_input_tokens_seen": 86442832, "step": 90545 }, { "epoch": 7.386409984501183, "grad_norm": 1.2057697772979736, "learning_rate": 9.702653317466023e-06, "loss": 0.2517, "num_input_tokens_seen": 86447312, "step": 90550 }, { "epoch": 7.386817848111591, "grad_norm": 26.69868278503418, "learning_rate": 9.69983829486455e-06, "loss": 0.3805, "num_input_tokens_seen": 86451344, "step": 90555 }, { "epoch": 7.387225711722, "grad_norm": 1.1971814632415771, "learning_rate": 9.697023582390845e-06, "loss": 0.2404, "num_input_tokens_seen": 86455168, "step": 90560 }, { "epoch": 7.3876335753324085, "grad_norm": 32.80366516113281, "learning_rate": 9.694209180101979e-06, "loss": 0.3084, "num_input_tokens_seen": 86460000, "step": 90565 }, { "epoch": 7.3880414389428175, "grad_norm": 0.7565449476242065, "learning_rate": 9.691395088054989e-06, "loss": 0.3469, "num_input_tokens_seen": 86464784, "step": 90570 }, { "epoch": 7.3884493025532265, "grad_norm": 29.679046630859375, "learning_rate": 9.688581306306916e-06, "loss": 0.4123, "num_input_tokens_seen": 86470432, "step": 90575 }, { "epoch": 7.388857166163635, "grad_norm": 1.169446349143982, "learning_rate": 9.685767834914783e-06, "loss": 0.3773, "num_input_tokens_seen": 86475888, "step": 90580 }, { "epoch": 7.389265029774044, "grad_norm": 0.4732336103916168, "learning_rate": 9.68295467393564e-06, "loss": 0.2368, "num_input_tokens_seen": 86480944, "step": 90585 }, { "epoch": 7.389672893384452, "grad_norm": 2.5187642574310303, "learning_rate": 9.680141823426494e-06, "loss": 0.3474, "num_input_tokens_seen": 86485984, "step": 90590 }, { "epoch": 7.390080756994861, "grad_norm": 0.5012020468711853, "learning_rate": 9.677329283444367e-06, "loss": 0.2614, "num_input_tokens_seen": 86490224, "step": 90595 }, { "epoch": 7.39048862060527, "grad_norm": 28.93604278564453, "learning_rate": 9.674517054046262e-06, "loss": 0.3601, "num_input_tokens_seen": 86494912, "step": 90600 }, { "epoch": 7.390896484215678, "grad_norm": 0.8675101399421692, "learning_rate": 9.671705135289186e-06, "loss": 0.4435, "num_input_tokens_seen": 86499904, "step": 90605 }, { "epoch": 7.391304347826087, "grad_norm": 33.51310729980469, "learning_rate": 9.668893527230133e-06, "loss": 0.5336, "num_input_tokens_seen": 86504576, "step": 90610 }, { "epoch": 7.391712211436496, "grad_norm": 0.6074522733688354, "learning_rate": 9.66608222992609e-06, "loss": 0.195, "num_input_tokens_seen": 86509904, "step": 90615 }, { "epoch": 7.392120075046904, "grad_norm": 2.7267074584960938, "learning_rate": 9.663271243434049e-06, "loss": 0.259, "num_input_tokens_seen": 86514688, "step": 90620 }, { "epoch": 7.392527938657313, "grad_norm": 1.997198462486267, "learning_rate": 9.660460567810973e-06, "loss": 0.5033, "num_input_tokens_seen": 86519264, "step": 90625 }, { "epoch": 7.392935802267722, "grad_norm": 1.372743010520935, "learning_rate": 9.65765020311385e-06, "loss": 0.3463, "num_input_tokens_seen": 86523168, "step": 90630 }, { "epoch": 7.39334366587813, "grad_norm": 9.881394386291504, "learning_rate": 9.654840149399641e-06, "loss": 0.2322, "num_input_tokens_seen": 86527392, "step": 90635 }, { "epoch": 7.393751529488539, "grad_norm": 2.0347907543182373, "learning_rate": 9.652030406725298e-06, "loss": 0.3036, "num_input_tokens_seen": 86532080, "step": 90640 }, { "epoch": 7.394159393098947, "grad_norm": 0.5883376002311707, "learning_rate": 9.649220975147772e-06, "loss": 0.22, "num_input_tokens_seen": 86537840, "step": 90645 }, { "epoch": 7.394567256709356, "grad_norm": 2.9805142879486084, "learning_rate": 9.64641185472402e-06, "loss": 0.4322, "num_input_tokens_seen": 86543040, "step": 90650 }, { "epoch": 7.394975120319765, "grad_norm": 0.8488717675209045, "learning_rate": 9.643603045510977e-06, "loss": 0.4802, "num_input_tokens_seen": 86548112, "step": 90655 }, { "epoch": 7.3953829839301735, "grad_norm": 0.9268636107444763, "learning_rate": 9.640794547565573e-06, "loss": 0.3237, "num_input_tokens_seen": 86552752, "step": 90660 }, { "epoch": 7.3957908475405825, "grad_norm": 0.5476598143577576, "learning_rate": 9.63798636094474e-06, "loss": 0.3064, "num_input_tokens_seen": 86557424, "step": 90665 }, { "epoch": 7.3961987111509915, "grad_norm": 0.8615862131118774, "learning_rate": 9.635178485705384e-06, "loss": 0.4762, "num_input_tokens_seen": 86562368, "step": 90670 }, { "epoch": 7.3966065747614, "grad_norm": 46.71392822265625, "learning_rate": 9.632370921904443e-06, "loss": 0.435, "num_input_tokens_seen": 86566624, "step": 90675 }, { "epoch": 7.397014438371809, "grad_norm": 2.3007030487060547, "learning_rate": 9.62956366959881e-06, "loss": 0.3353, "num_input_tokens_seen": 86571152, "step": 90680 }, { "epoch": 7.397422301982218, "grad_norm": 0.46796929836273193, "learning_rate": 9.626756728845394e-06, "loss": 0.4483, "num_input_tokens_seen": 86575776, "step": 90685 }, { "epoch": 7.397830165592626, "grad_norm": 1.192087173461914, "learning_rate": 9.623950099701073e-06, "loss": 0.2608, "num_input_tokens_seen": 86581424, "step": 90690 }, { "epoch": 7.398238029203035, "grad_norm": 0.6868820190429688, "learning_rate": 9.621143782222763e-06, "loss": 0.4111, "num_input_tokens_seen": 86586144, "step": 90695 }, { "epoch": 7.398645892813443, "grad_norm": 135.5572509765625, "learning_rate": 9.618337776467332e-06, "loss": 0.3809, "num_input_tokens_seen": 86590528, "step": 90700 }, { "epoch": 7.399053756423852, "grad_norm": 0.9409264326095581, "learning_rate": 9.615532082491657e-06, "loss": 0.2836, "num_input_tokens_seen": 86595088, "step": 90705 }, { "epoch": 7.399461620034261, "grad_norm": 1.2013684511184692, "learning_rate": 9.612726700352612e-06, "loss": 0.2841, "num_input_tokens_seen": 86599968, "step": 90710 }, { "epoch": 7.399869483644669, "grad_norm": 0.2560253143310547, "learning_rate": 9.609921630107047e-06, "loss": 0.2967, "num_input_tokens_seen": 86604016, "step": 90715 }, { "epoch": 7.400277347255078, "grad_norm": 1.368564248085022, "learning_rate": 9.607116871811841e-06, "loss": 0.4297, "num_input_tokens_seen": 86609168, "step": 90720 }, { "epoch": 7.400685210865486, "grad_norm": 0.7460176944732666, "learning_rate": 9.604312425523835e-06, "loss": 0.2956, "num_input_tokens_seen": 86614192, "step": 90725 }, { "epoch": 7.401093074475895, "grad_norm": 0.4749423563480377, "learning_rate": 9.601508291299873e-06, "loss": 0.2815, "num_input_tokens_seen": 86618880, "step": 90730 }, { "epoch": 7.401500938086304, "grad_norm": 1.8733422756195068, "learning_rate": 9.598704469196796e-06, "loss": 0.5827, "num_input_tokens_seen": 86624336, "step": 90735 }, { "epoch": 7.401908801696712, "grad_norm": 0.38237130641937256, "learning_rate": 9.595900959271434e-06, "loss": 0.2549, "num_input_tokens_seen": 86630064, "step": 90740 }, { "epoch": 7.402316665307121, "grad_norm": 1.0918073654174805, "learning_rate": 9.593097761580608e-06, "loss": 0.2876, "num_input_tokens_seen": 86635024, "step": 90745 }, { "epoch": 7.40272452891753, "grad_norm": 0.4684220254421234, "learning_rate": 9.590294876181149e-06, "loss": 0.3145, "num_input_tokens_seen": 86639984, "step": 90750 }, { "epoch": 7.403132392527938, "grad_norm": 0.35535165667533875, "learning_rate": 9.587492303129867e-06, "loss": 0.4751, "num_input_tokens_seen": 86644560, "step": 90755 }, { "epoch": 7.403540256138347, "grad_norm": 2.910555839538574, "learning_rate": 9.584690042483565e-06, "loss": 0.2138, "num_input_tokens_seen": 86649408, "step": 90760 }, { "epoch": 7.403948119748756, "grad_norm": 27.32048797607422, "learning_rate": 9.581888094299046e-06, "loss": 0.3754, "num_input_tokens_seen": 86654480, "step": 90765 }, { "epoch": 7.404355983359165, "grad_norm": 1.3624918460845947, "learning_rate": 9.579086458633104e-06, "loss": 0.2763, "num_input_tokens_seen": 86658384, "step": 90770 }, { "epoch": 7.404763846969574, "grad_norm": 1.388312578201294, "learning_rate": 9.576285135542528e-06, "loss": 0.5081, "num_input_tokens_seen": 86662672, "step": 90775 }, { "epoch": 7.405171710579982, "grad_norm": 3.2196264266967773, "learning_rate": 9.573484125084086e-06, "loss": 0.2287, "num_input_tokens_seen": 86667088, "step": 90780 }, { "epoch": 7.405579574190391, "grad_norm": 38.39320755004883, "learning_rate": 9.570683427314578e-06, "loss": 0.5111, "num_input_tokens_seen": 86671904, "step": 90785 }, { "epoch": 7.4059874378008, "grad_norm": 24.197433471679688, "learning_rate": 9.56788304229076e-06, "loss": 0.3259, "num_input_tokens_seen": 86676752, "step": 90790 }, { "epoch": 7.406395301411208, "grad_norm": 35.82030487060547, "learning_rate": 9.565082970069394e-06, "loss": 0.6586, "num_input_tokens_seen": 86681488, "step": 90795 }, { "epoch": 7.406803165021617, "grad_norm": 9.408308982849121, "learning_rate": 9.562283210707232e-06, "loss": 0.2672, "num_input_tokens_seen": 86686688, "step": 90800 }, { "epoch": 7.407211028632026, "grad_norm": 0.438822478055954, "learning_rate": 9.559483764261034e-06, "loss": 0.4309, "num_input_tokens_seen": 86691680, "step": 90805 }, { "epoch": 7.407618892242434, "grad_norm": 0.32766860723495483, "learning_rate": 9.55668463078754e-06, "loss": 0.2519, "num_input_tokens_seen": 86696288, "step": 90810 }, { "epoch": 7.408026755852843, "grad_norm": 0.6099432706832886, "learning_rate": 9.553885810343489e-06, "loss": 0.2234, "num_input_tokens_seen": 86701136, "step": 90815 }, { "epoch": 7.408434619463251, "grad_norm": 1.4540433883666992, "learning_rate": 9.551087302985606e-06, "loss": 0.3582, "num_input_tokens_seen": 86706032, "step": 90820 }, { "epoch": 7.40884248307366, "grad_norm": 2.8610382080078125, "learning_rate": 9.54828910877061e-06, "loss": 0.2569, "num_input_tokens_seen": 86710416, "step": 90825 }, { "epoch": 7.409250346684069, "grad_norm": 0.591883659362793, "learning_rate": 9.54549122775524e-06, "loss": 0.3577, "num_input_tokens_seen": 86715248, "step": 90830 }, { "epoch": 7.409658210294477, "grad_norm": 9.31222915649414, "learning_rate": 9.542693659996194e-06, "loss": 0.5818, "num_input_tokens_seen": 86720128, "step": 90835 }, { "epoch": 7.410066073904886, "grad_norm": 33.03944396972656, "learning_rate": 9.539896405550177e-06, "loss": 0.2866, "num_input_tokens_seen": 86725072, "step": 90840 }, { "epoch": 7.410473937515295, "grad_norm": 1.680048942565918, "learning_rate": 9.537099464473887e-06, "loss": 0.2883, "num_input_tokens_seen": 86729648, "step": 90845 }, { "epoch": 7.410881801125703, "grad_norm": 2.944119930267334, "learning_rate": 9.534302836824025e-06, "loss": 0.4898, "num_input_tokens_seen": 86734208, "step": 90850 }, { "epoch": 7.411289664736112, "grad_norm": 0.6877440810203552, "learning_rate": 9.531506522657274e-06, "loss": 0.3743, "num_input_tokens_seen": 86738656, "step": 90855 }, { "epoch": 7.4116975283465205, "grad_norm": 29.910751342773438, "learning_rate": 9.528710522030312e-06, "loss": 0.3776, "num_input_tokens_seen": 86743072, "step": 90860 }, { "epoch": 7.4121053919569295, "grad_norm": 16.849185943603516, "learning_rate": 9.525914834999814e-06, "loss": 0.646, "num_input_tokens_seen": 86748112, "step": 90865 }, { "epoch": 7.4125132555673385, "grad_norm": 2.773986577987671, "learning_rate": 9.523119461622437e-06, "loss": 0.2681, "num_input_tokens_seen": 86753424, "step": 90870 }, { "epoch": 7.412921119177747, "grad_norm": 1.920931100845337, "learning_rate": 9.52032440195486e-06, "loss": 0.329, "num_input_tokens_seen": 86758096, "step": 90875 }, { "epoch": 7.413328982788156, "grad_norm": 20.588523864746094, "learning_rate": 9.517529656053734e-06, "loss": 0.298, "num_input_tokens_seen": 86763552, "step": 90880 }, { "epoch": 7.413736846398565, "grad_norm": 3.5189530849456787, "learning_rate": 9.514735223975699e-06, "loss": 0.4985, "num_input_tokens_seen": 86768752, "step": 90885 }, { "epoch": 7.414144710008973, "grad_norm": 13.987208366394043, "learning_rate": 9.511941105777403e-06, "loss": 0.335, "num_input_tokens_seen": 86772720, "step": 90890 }, { "epoch": 7.414552573619382, "grad_norm": 1.8801900148391724, "learning_rate": 9.509147301515478e-06, "loss": 0.3892, "num_input_tokens_seen": 86777216, "step": 90895 }, { "epoch": 7.414960437229791, "grad_norm": 6.482282638549805, "learning_rate": 9.506353811246556e-06, "loss": 0.5297, "num_input_tokens_seen": 86782144, "step": 90900 }, { "epoch": 7.415368300840199, "grad_norm": 22.47157859802246, "learning_rate": 9.503560635027259e-06, "loss": 0.3276, "num_input_tokens_seen": 86787360, "step": 90905 }, { "epoch": 7.415776164450608, "grad_norm": 1.2975431680679321, "learning_rate": 9.500767772914202e-06, "loss": 0.228, "num_input_tokens_seen": 86792096, "step": 90910 }, { "epoch": 7.416184028061016, "grad_norm": 1.372090458869934, "learning_rate": 9.497975224963986e-06, "loss": 0.3172, "num_input_tokens_seen": 86797408, "step": 90915 }, { "epoch": 7.416591891671425, "grad_norm": 0.4267176687717438, "learning_rate": 9.495182991233236e-06, "loss": 0.2031, "num_input_tokens_seen": 86802112, "step": 90920 }, { "epoch": 7.416999755281834, "grad_norm": 2.4706499576568604, "learning_rate": 9.492391071778539e-06, "loss": 0.2804, "num_input_tokens_seen": 86807168, "step": 90925 }, { "epoch": 7.417407618892242, "grad_norm": 0.41164153814315796, "learning_rate": 9.489599466656485e-06, "loss": 0.3513, "num_input_tokens_seen": 86812000, "step": 90930 }, { "epoch": 7.417815482502651, "grad_norm": 1.97575044631958, "learning_rate": 9.48680817592365e-06, "loss": 0.232, "num_input_tokens_seen": 86816016, "step": 90935 }, { "epoch": 7.418223346113059, "grad_norm": 1.0962226390838623, "learning_rate": 9.48401719963663e-06, "loss": 0.3391, "num_input_tokens_seen": 86821696, "step": 90940 }, { "epoch": 7.418631209723468, "grad_norm": 1.4341013431549072, "learning_rate": 9.481226537851987e-06, "loss": 0.3652, "num_input_tokens_seen": 86826912, "step": 90945 }, { "epoch": 7.419039073333877, "grad_norm": 2.5745978355407715, "learning_rate": 9.478436190626288e-06, "loss": 0.3019, "num_input_tokens_seen": 86830928, "step": 90950 }, { "epoch": 7.4194469369442855, "grad_norm": 0.553800106048584, "learning_rate": 9.475646158016094e-06, "loss": 0.2624, "num_input_tokens_seen": 86835632, "step": 90955 }, { "epoch": 7.4198548005546945, "grad_norm": 9.971015930175781, "learning_rate": 9.472856440077946e-06, "loss": 0.3897, "num_input_tokens_seen": 86839696, "step": 90960 }, { "epoch": 7.4202626641651035, "grad_norm": 1.05711829662323, "learning_rate": 9.470067036868407e-06, "loss": 0.2866, "num_input_tokens_seen": 86844032, "step": 90965 }, { "epoch": 7.420670527775512, "grad_norm": 21.923891067504883, "learning_rate": 9.467277948444011e-06, "loss": 0.3431, "num_input_tokens_seen": 86848864, "step": 90970 }, { "epoch": 7.421078391385921, "grad_norm": 3.6798927783966064, "learning_rate": 9.46448917486129e-06, "loss": 0.355, "num_input_tokens_seen": 86853872, "step": 90975 }, { "epoch": 7.42148625499633, "grad_norm": 2.1563973426818848, "learning_rate": 9.461700716176766e-06, "loss": 0.1929, "num_input_tokens_seen": 86859232, "step": 90980 }, { "epoch": 7.421894118606738, "grad_norm": 0.46788179874420166, "learning_rate": 9.458912572446973e-06, "loss": 0.2683, "num_input_tokens_seen": 86863152, "step": 90985 }, { "epoch": 7.422301982217147, "grad_norm": 1.2354044914245605, "learning_rate": 9.456124743728417e-06, "loss": 0.2194, "num_input_tokens_seen": 86867776, "step": 90990 }, { "epoch": 7.422709845827555, "grad_norm": 0.7317225933074951, "learning_rate": 9.45333723007761e-06, "loss": 0.3987, "num_input_tokens_seen": 86872512, "step": 90995 }, { "epoch": 7.423117709437964, "grad_norm": 0.9244425892829895, "learning_rate": 9.450550031551043e-06, "loss": 0.2942, "num_input_tokens_seen": 86877536, "step": 91000 }, { "epoch": 7.423525573048373, "grad_norm": 0.5431424975395203, "learning_rate": 9.447763148205224e-06, "loss": 0.3021, "num_input_tokens_seen": 86883200, "step": 91005 }, { "epoch": 7.423933436658781, "grad_norm": 2.8493154048919678, "learning_rate": 9.44497658009664e-06, "loss": 0.2351, "num_input_tokens_seen": 86888240, "step": 91010 }, { "epoch": 7.42434130026919, "grad_norm": 0.44178879261016846, "learning_rate": 9.442190327281772e-06, "loss": 0.2389, "num_input_tokens_seen": 86893168, "step": 91015 }, { "epoch": 7.424749163879599, "grad_norm": 1.156923770904541, "learning_rate": 9.439404389817094e-06, "loss": 0.15, "num_input_tokens_seen": 86897968, "step": 91020 }, { "epoch": 7.425157027490007, "grad_norm": 6.827583312988281, "learning_rate": 9.436618767759076e-06, "loss": 0.4199, "num_input_tokens_seen": 86902128, "step": 91025 }, { "epoch": 7.425564891100416, "grad_norm": 37.55772399902344, "learning_rate": 9.433833461164182e-06, "loss": 0.2315, "num_input_tokens_seen": 86907200, "step": 91030 }, { "epoch": 7.425972754710824, "grad_norm": 0.45109254121780396, "learning_rate": 9.43104847008887e-06, "loss": 0.3468, "num_input_tokens_seen": 86912240, "step": 91035 }, { "epoch": 7.426380618321233, "grad_norm": 69.09870147705078, "learning_rate": 9.428263794589593e-06, "loss": 0.4476, "num_input_tokens_seen": 86917760, "step": 91040 }, { "epoch": 7.426788481931642, "grad_norm": 0.791183352470398, "learning_rate": 9.425479434722778e-06, "loss": 0.3568, "num_input_tokens_seen": 86921728, "step": 91045 }, { "epoch": 7.42719634554205, "grad_norm": 17.528980255126953, "learning_rate": 9.42269539054489e-06, "loss": 0.4137, "num_input_tokens_seen": 86926576, "step": 91050 }, { "epoch": 7.4276042091524594, "grad_norm": 0.39507627487182617, "learning_rate": 9.419911662112346e-06, "loss": 0.232, "num_input_tokens_seen": 86931504, "step": 91055 }, { "epoch": 7.4280120727628685, "grad_norm": 0.51209956407547, "learning_rate": 9.41712824948157e-06, "loss": 0.3805, "num_input_tokens_seen": 86936080, "step": 91060 }, { "epoch": 7.428419936373277, "grad_norm": 0.9134146571159363, "learning_rate": 9.414345152708987e-06, "loss": 0.1963, "num_input_tokens_seen": 86941360, "step": 91065 }, { "epoch": 7.428827799983686, "grad_norm": 33.906917572021484, "learning_rate": 9.411562371850993e-06, "loss": 0.6954, "num_input_tokens_seen": 86946752, "step": 91070 }, { "epoch": 7.429235663594094, "grad_norm": 3.2938568592071533, "learning_rate": 9.408779906964016e-06, "loss": 0.4765, "num_input_tokens_seen": 86951728, "step": 91075 }, { "epoch": 7.429643527204503, "grad_norm": 62.60436248779297, "learning_rate": 9.405997758104443e-06, "loss": 0.3224, "num_input_tokens_seen": 86956992, "step": 91080 }, { "epoch": 7.430051390814912, "grad_norm": 9.161361694335938, "learning_rate": 9.403215925328671e-06, "loss": 0.1006, "num_input_tokens_seen": 86962288, "step": 91085 }, { "epoch": 7.43045925442532, "grad_norm": 1.0885766744613647, "learning_rate": 9.400434408693076e-06, "loss": 0.2672, "num_input_tokens_seen": 86967248, "step": 91090 }, { "epoch": 7.430867118035729, "grad_norm": 6.267099857330322, "learning_rate": 9.397653208254054e-06, "loss": 0.4158, "num_input_tokens_seen": 86972384, "step": 91095 }, { "epoch": 7.431274981646138, "grad_norm": 2.0783469676971436, "learning_rate": 9.394872324067974e-06, "loss": 0.3477, "num_input_tokens_seen": 86977600, "step": 91100 }, { "epoch": 7.431682845256546, "grad_norm": 126.27159118652344, "learning_rate": 9.3920917561912e-06, "loss": 0.4689, "num_input_tokens_seen": 86982496, "step": 91105 }, { "epoch": 7.432090708866955, "grad_norm": 0.3030793368816376, "learning_rate": 9.389311504680092e-06, "loss": 0.3685, "num_input_tokens_seen": 86987648, "step": 91110 }, { "epoch": 7.432498572477364, "grad_norm": 0.9350234270095825, "learning_rate": 9.386531569591e-06, "loss": 0.3036, "num_input_tokens_seen": 86991792, "step": 91115 }, { "epoch": 7.432906436087772, "grad_norm": 0.6409348249435425, "learning_rate": 9.383751950980288e-06, "loss": 0.2545, "num_input_tokens_seen": 86996576, "step": 91120 }, { "epoch": 7.433314299698181, "grad_norm": 17.118078231811523, "learning_rate": 9.380972648904288e-06, "loss": 0.2716, "num_input_tokens_seen": 87000192, "step": 91125 }, { "epoch": 7.433722163308589, "grad_norm": 1.9353312253952026, "learning_rate": 9.378193663419332e-06, "loss": 0.3137, "num_input_tokens_seen": 87004672, "step": 91130 }, { "epoch": 7.434130026918998, "grad_norm": 41.34653854370117, "learning_rate": 9.375414994581746e-06, "loss": 0.5884, "num_input_tokens_seen": 87008912, "step": 91135 }, { "epoch": 7.434537890529407, "grad_norm": 1.0119494199752808, "learning_rate": 9.372636642447869e-06, "loss": 0.1972, "num_input_tokens_seen": 87013136, "step": 91140 }, { "epoch": 7.434945754139815, "grad_norm": 1.5846927165985107, "learning_rate": 9.369858607074005e-06, "loss": 0.24, "num_input_tokens_seen": 87018208, "step": 91145 }, { "epoch": 7.435353617750224, "grad_norm": 1.9442793130874634, "learning_rate": 9.367080888516466e-06, "loss": 0.2425, "num_input_tokens_seen": 87023040, "step": 91150 }, { "epoch": 7.435761481360633, "grad_norm": 2.797975540161133, "learning_rate": 9.364303486831552e-06, "loss": 0.4928, "num_input_tokens_seen": 87027456, "step": 91155 }, { "epoch": 7.4361693449710415, "grad_norm": 0.5509242415428162, "learning_rate": 9.361526402075565e-06, "loss": 0.3219, "num_input_tokens_seen": 87032720, "step": 91160 }, { "epoch": 7.4365772085814505, "grad_norm": 32.59797286987305, "learning_rate": 9.358749634304791e-06, "loss": 0.5153, "num_input_tokens_seen": 87037392, "step": 91165 }, { "epoch": 7.436985072191859, "grad_norm": 0.5033907294273376, "learning_rate": 9.35597318357552e-06, "loss": 0.2623, "num_input_tokens_seen": 87042512, "step": 91170 }, { "epoch": 7.437392935802268, "grad_norm": 1.2596224546432495, "learning_rate": 9.353197049944021e-06, "loss": 0.4428, "num_input_tokens_seen": 87046656, "step": 91175 }, { "epoch": 7.437800799412677, "grad_norm": 1.231660008430481, "learning_rate": 9.350421233466562e-06, "loss": 0.3863, "num_input_tokens_seen": 87051792, "step": 91180 }, { "epoch": 7.438208663023085, "grad_norm": 0.384126216173172, "learning_rate": 9.34764573419942e-06, "loss": 0.2425, "num_input_tokens_seen": 87057280, "step": 91185 }, { "epoch": 7.438616526633494, "grad_norm": 0.3750905990600586, "learning_rate": 9.344870552198851e-06, "loss": 0.2951, "num_input_tokens_seen": 87062480, "step": 91190 }, { "epoch": 7.439024390243903, "grad_norm": 28.845701217651367, "learning_rate": 9.342095687521105e-06, "loss": 0.4034, "num_input_tokens_seen": 87066656, "step": 91195 }, { "epoch": 7.439432253854311, "grad_norm": 1.286863088607788, "learning_rate": 9.339321140222414e-06, "loss": 0.2513, "num_input_tokens_seen": 87071904, "step": 91200 }, { "epoch": 7.43984011746472, "grad_norm": 0.39146339893341064, "learning_rate": 9.33654691035904e-06, "loss": 0.2528, "num_input_tokens_seen": 87077568, "step": 91205 }, { "epoch": 7.440247981075128, "grad_norm": 0.46958908438682556, "learning_rate": 9.333772997987202e-06, "loss": 0.2508, "num_input_tokens_seen": 87082064, "step": 91210 }, { "epoch": 7.440655844685537, "grad_norm": 28.611167907714844, "learning_rate": 9.330999403163129e-06, "loss": 0.4137, "num_input_tokens_seen": 87086960, "step": 91215 }, { "epoch": 7.441063708295946, "grad_norm": 0.7725069522857666, "learning_rate": 9.32822612594304e-06, "loss": 0.5867, "num_input_tokens_seen": 87090976, "step": 91220 }, { "epoch": 7.441471571906354, "grad_norm": 4.392200946807861, "learning_rate": 9.325453166383141e-06, "loss": 0.5009, "num_input_tokens_seen": 87096080, "step": 91225 }, { "epoch": 7.441879435516763, "grad_norm": 0.3934921622276306, "learning_rate": 9.322680524539654e-06, "loss": 0.221, "num_input_tokens_seen": 87100192, "step": 91230 }, { "epoch": 7.442287299127172, "grad_norm": 30.50386619567871, "learning_rate": 9.319908200468769e-06, "loss": 0.3713, "num_input_tokens_seen": 87104768, "step": 91235 }, { "epoch": 7.44269516273758, "grad_norm": 1.029107689857483, "learning_rate": 9.317136194226684e-06, "loss": 0.2915, "num_input_tokens_seen": 87110320, "step": 91240 }, { "epoch": 7.443103026347989, "grad_norm": 18.922189712524414, "learning_rate": 9.314364505869574e-06, "loss": 0.4709, "num_input_tokens_seen": 87115824, "step": 91245 }, { "epoch": 7.443510889958398, "grad_norm": 3.4137537479400635, "learning_rate": 9.311593135453641e-06, "loss": 0.3087, "num_input_tokens_seen": 87119984, "step": 91250 }, { "epoch": 7.4439187535688065, "grad_norm": 1.6274175643920898, "learning_rate": 9.308822083035046e-06, "loss": 0.1811, "num_input_tokens_seen": 87125376, "step": 91255 }, { "epoch": 7.4443266171792155, "grad_norm": 3.1330008506774902, "learning_rate": 9.30605134866996e-06, "loss": 0.3845, "num_input_tokens_seen": 87130480, "step": 91260 }, { "epoch": 7.444734480789624, "grad_norm": 0.7242058515548706, "learning_rate": 9.30328093241455e-06, "loss": 0.3501, "num_input_tokens_seen": 87134800, "step": 91265 }, { "epoch": 7.445142344400033, "grad_norm": 0.3785724341869354, "learning_rate": 9.300510834324952e-06, "loss": 0.3701, "num_input_tokens_seen": 87139200, "step": 91270 }, { "epoch": 7.445550208010442, "grad_norm": 9.422741889953613, "learning_rate": 9.297741054457337e-06, "loss": 0.438, "num_input_tokens_seen": 87143936, "step": 91275 }, { "epoch": 7.44595807162085, "grad_norm": 0.645138144493103, "learning_rate": 9.294971592867837e-06, "loss": 0.3047, "num_input_tokens_seen": 87148192, "step": 91280 }, { "epoch": 7.446365935231259, "grad_norm": 35.13657760620117, "learning_rate": 9.292202449612594e-06, "loss": 0.2481, "num_input_tokens_seen": 87152928, "step": 91285 }, { "epoch": 7.446773798841667, "grad_norm": 1.1018942594528198, "learning_rate": 9.289433624747729e-06, "loss": 0.5259, "num_input_tokens_seen": 87158160, "step": 91290 }, { "epoch": 7.447181662452076, "grad_norm": 3.1504642963409424, "learning_rate": 9.28666511832937e-06, "loss": 0.2648, "num_input_tokens_seen": 87163136, "step": 91295 }, { "epoch": 7.447589526062485, "grad_norm": 9.865631103515625, "learning_rate": 9.283896930413632e-06, "loss": 0.25, "num_input_tokens_seen": 87168096, "step": 91300 }, { "epoch": 7.447997389672893, "grad_norm": 0.36843517422676086, "learning_rate": 9.281129061056623e-06, "loss": 0.2709, "num_input_tokens_seen": 87172144, "step": 91305 }, { "epoch": 7.448405253283302, "grad_norm": 13.633560180664062, "learning_rate": 9.278361510314451e-06, "loss": 0.5639, "num_input_tokens_seen": 87177776, "step": 91310 }, { "epoch": 7.448813116893711, "grad_norm": 26.979488372802734, "learning_rate": 9.275594278243201e-06, "loss": 0.6233, "num_input_tokens_seen": 87182256, "step": 91315 }, { "epoch": 7.449220980504119, "grad_norm": 1.6707018613815308, "learning_rate": 9.272827364898984e-06, "loss": 0.341, "num_input_tokens_seen": 87187184, "step": 91320 }, { "epoch": 7.449628844114528, "grad_norm": 1.6346428394317627, "learning_rate": 9.270060770337871e-06, "loss": 0.3913, "num_input_tokens_seen": 87191264, "step": 91325 }, { "epoch": 7.450036707724937, "grad_norm": 40.917503356933594, "learning_rate": 9.267294494615944e-06, "loss": 0.2694, "num_input_tokens_seen": 87196016, "step": 91330 }, { "epoch": 7.450444571335345, "grad_norm": 4.426854133605957, "learning_rate": 9.264528537789262e-06, "loss": 0.491, "num_input_tokens_seen": 87201568, "step": 91335 }, { "epoch": 7.450852434945754, "grad_norm": 0.8029530644416809, "learning_rate": 9.261762899913908e-06, "loss": 0.2711, "num_input_tokens_seen": 87206000, "step": 91340 }, { "epoch": 7.4512602985561625, "grad_norm": 3.051332950592041, "learning_rate": 9.258997581045936e-06, "loss": 0.4192, "num_input_tokens_seen": 87211056, "step": 91345 }, { "epoch": 7.4516681621665715, "grad_norm": 3.8438541889190674, "learning_rate": 9.256232581241392e-06, "loss": 0.3645, "num_input_tokens_seen": 87216496, "step": 91350 }, { "epoch": 7.4520760257769805, "grad_norm": 41.17184066772461, "learning_rate": 9.253467900556314e-06, "loss": 0.2294, "num_input_tokens_seen": 87221552, "step": 91355 }, { "epoch": 7.452483889387389, "grad_norm": 26.67057991027832, "learning_rate": 9.250703539046763e-06, "loss": 0.5781, "num_input_tokens_seen": 87225904, "step": 91360 }, { "epoch": 7.452891752997798, "grad_norm": 0.27257388830184937, "learning_rate": 9.247939496768754e-06, "loss": 0.3454, "num_input_tokens_seen": 87230480, "step": 91365 }, { "epoch": 7.453299616608207, "grad_norm": 1.0083274841308594, "learning_rate": 9.24517577377832e-06, "loss": 0.2794, "num_input_tokens_seen": 87235984, "step": 91370 }, { "epoch": 7.453707480218615, "grad_norm": 3.5238616466522217, "learning_rate": 9.242412370131478e-06, "loss": 0.4084, "num_input_tokens_seen": 87240976, "step": 91375 }, { "epoch": 7.454115343829024, "grad_norm": 0.8187643885612488, "learning_rate": 9.239649285884236e-06, "loss": 0.3828, "num_input_tokens_seen": 87246176, "step": 91380 }, { "epoch": 7.454523207439432, "grad_norm": 0.497237890958786, "learning_rate": 9.23688652109261e-06, "loss": 0.2871, "num_input_tokens_seen": 87250816, "step": 91385 }, { "epoch": 7.454931071049841, "grad_norm": 5.6170783042907715, "learning_rate": 9.2341240758126e-06, "loss": 0.3134, "num_input_tokens_seen": 87255520, "step": 91390 }, { "epoch": 7.45533893466025, "grad_norm": 20.535470962524414, "learning_rate": 9.231361950100192e-06, "loss": 0.3358, "num_input_tokens_seen": 87260528, "step": 91395 }, { "epoch": 7.455746798270658, "grad_norm": 58.47701644897461, "learning_rate": 9.228600144011371e-06, "loss": 0.3057, "num_input_tokens_seen": 87265168, "step": 91400 }, { "epoch": 7.456154661881067, "grad_norm": 25.875408172607422, "learning_rate": 9.225838657602131e-06, "loss": 0.2857, "num_input_tokens_seen": 87269648, "step": 91405 }, { "epoch": 7.456562525491476, "grad_norm": 0.3169727921485901, "learning_rate": 9.22307749092844e-06, "loss": 0.4184, "num_input_tokens_seen": 87274416, "step": 91410 }, { "epoch": 7.456970389101884, "grad_norm": 2.16695499420166, "learning_rate": 9.220316644046261e-06, "loss": 0.2655, "num_input_tokens_seen": 87278960, "step": 91415 }, { "epoch": 7.457378252712293, "grad_norm": 4.2855305671691895, "learning_rate": 9.217556117011562e-06, "loss": 0.4764, "num_input_tokens_seen": 87283552, "step": 91420 }, { "epoch": 7.457786116322701, "grad_norm": 14.576120376586914, "learning_rate": 9.214795909880291e-06, "loss": 0.2819, "num_input_tokens_seen": 87287536, "step": 91425 }, { "epoch": 7.45819397993311, "grad_norm": 14.902286529541016, "learning_rate": 9.2120360227084e-06, "loss": 0.2866, "num_input_tokens_seen": 87292048, "step": 91430 }, { "epoch": 7.458601843543519, "grad_norm": 55.40265655517578, "learning_rate": 9.209276455551832e-06, "loss": 0.5675, "num_input_tokens_seen": 87296944, "step": 91435 }, { "epoch": 7.459009707153927, "grad_norm": 4.339159965515137, "learning_rate": 9.206517208466511e-06, "loss": 0.3274, "num_input_tokens_seen": 87302064, "step": 91440 }, { "epoch": 7.459417570764336, "grad_norm": 1.419671654701233, "learning_rate": 9.203758281508381e-06, "loss": 0.3186, "num_input_tokens_seen": 87307072, "step": 91445 }, { "epoch": 7.459825434374745, "grad_norm": 2.7241342067718506, "learning_rate": 9.200999674733362e-06, "loss": 0.2414, "num_input_tokens_seen": 87312160, "step": 91450 }, { "epoch": 7.4602332979851536, "grad_norm": 0.6500258445739746, "learning_rate": 9.198241388197365e-06, "loss": 0.1769, "num_input_tokens_seen": 87316800, "step": 91455 }, { "epoch": 7.460641161595563, "grad_norm": 0.5626968145370483, "learning_rate": 9.195483421956302e-06, "loss": 0.3012, "num_input_tokens_seen": 87321568, "step": 91460 }, { "epoch": 7.461049025205972, "grad_norm": 0.6144492030143738, "learning_rate": 9.192725776066072e-06, "loss": 0.4646, "num_input_tokens_seen": 87326432, "step": 91465 }, { "epoch": 7.46145688881638, "grad_norm": 1.5515705347061157, "learning_rate": 9.189968450582565e-06, "loss": 0.3066, "num_input_tokens_seen": 87331664, "step": 91470 }, { "epoch": 7.461864752426789, "grad_norm": 1.3867098093032837, "learning_rate": 9.18721144556169e-06, "loss": 0.3486, "num_input_tokens_seen": 87336592, "step": 91475 }, { "epoch": 7.462272616037197, "grad_norm": 2.3240997791290283, "learning_rate": 9.184454761059319e-06, "loss": 0.2371, "num_input_tokens_seen": 87340960, "step": 91480 }, { "epoch": 7.462680479647606, "grad_norm": 30.303606033325195, "learning_rate": 9.181698397131328e-06, "loss": 0.3751, "num_input_tokens_seen": 87345552, "step": 91485 }, { "epoch": 7.463088343258015, "grad_norm": 6.77018404006958, "learning_rate": 9.178942353833581e-06, "loss": 0.1995, "num_input_tokens_seen": 87349920, "step": 91490 }, { "epoch": 7.463496206868423, "grad_norm": 0.22253243625164032, "learning_rate": 9.176186631221958e-06, "loss": 0.4042, "num_input_tokens_seen": 87354624, "step": 91495 }, { "epoch": 7.463904070478832, "grad_norm": 0.43141689896583557, "learning_rate": 9.173431229352308e-06, "loss": 0.5418, "num_input_tokens_seen": 87359696, "step": 91500 }, { "epoch": 7.46431193408924, "grad_norm": 2.0723066329956055, "learning_rate": 9.170676148280486e-06, "loss": 0.4225, "num_input_tokens_seen": 87364176, "step": 91505 }, { "epoch": 7.464719797699649, "grad_norm": 15.619879722595215, "learning_rate": 9.167921388062329e-06, "loss": 0.3066, "num_input_tokens_seen": 87369104, "step": 91510 }, { "epoch": 7.465127661310058, "grad_norm": 1.5230170488357544, "learning_rate": 9.165166948753668e-06, "loss": 0.5171, "num_input_tokens_seen": 87372688, "step": 91515 }, { "epoch": 7.465535524920466, "grad_norm": 0.5690388083457947, "learning_rate": 9.162412830410353e-06, "loss": 0.5306, "num_input_tokens_seen": 87377232, "step": 91520 }, { "epoch": 7.465943388530875, "grad_norm": 15.846076965332031, "learning_rate": 9.159659033088203e-06, "loss": 0.3266, "num_input_tokens_seen": 87382864, "step": 91525 }, { "epoch": 7.466351252141284, "grad_norm": 0.5195409655570984, "learning_rate": 9.15690555684303e-06, "loss": 0.3326, "num_input_tokens_seen": 87387264, "step": 91530 }, { "epoch": 7.466759115751692, "grad_norm": 3.5442049503326416, "learning_rate": 9.154152401730642e-06, "loss": 0.3653, "num_input_tokens_seen": 87391968, "step": 91535 }, { "epoch": 7.467166979362101, "grad_norm": 0.4745776057243347, "learning_rate": 9.151399567806862e-06, "loss": 0.306, "num_input_tokens_seen": 87396336, "step": 91540 }, { "epoch": 7.46757484297251, "grad_norm": 1.402161717414856, "learning_rate": 9.148647055127477e-06, "loss": 0.3599, "num_input_tokens_seen": 87401520, "step": 91545 }, { "epoch": 7.4679827065829185, "grad_norm": 4.086452007293701, "learning_rate": 9.145894863748277e-06, "loss": 0.2125, "num_input_tokens_seen": 87406224, "step": 91550 }, { "epoch": 7.4683905701933275, "grad_norm": 4.431843280792236, "learning_rate": 9.143142993725056e-06, "loss": 0.4784, "num_input_tokens_seen": 87411088, "step": 91555 }, { "epoch": 7.468798433803736, "grad_norm": 3.320978879928589, "learning_rate": 9.140391445113578e-06, "loss": 0.2072, "num_input_tokens_seen": 87416944, "step": 91560 }, { "epoch": 7.469206297414145, "grad_norm": 1.5226846933364868, "learning_rate": 9.137640217969638e-06, "loss": 0.1731, "num_input_tokens_seen": 87421632, "step": 91565 }, { "epoch": 7.469614161024554, "grad_norm": 13.848700523376465, "learning_rate": 9.134889312348984e-06, "loss": 0.3715, "num_input_tokens_seen": 87426640, "step": 91570 }, { "epoch": 7.470022024634962, "grad_norm": 77.49129486083984, "learning_rate": 9.132138728307387e-06, "loss": 0.6663, "num_input_tokens_seen": 87431136, "step": 91575 }, { "epoch": 7.470429888245371, "grad_norm": 0.7883756756782532, "learning_rate": 9.129388465900596e-06, "loss": 0.2792, "num_input_tokens_seen": 87436064, "step": 91580 }, { "epoch": 7.47083775185578, "grad_norm": 0.6514481902122498, "learning_rate": 9.126638525184353e-06, "loss": 0.2905, "num_input_tokens_seen": 87440464, "step": 91585 }, { "epoch": 7.471245615466188, "grad_norm": 1.1683801412582397, "learning_rate": 9.123888906214405e-06, "loss": 0.4049, "num_input_tokens_seen": 87445536, "step": 91590 }, { "epoch": 7.471653479076597, "grad_norm": 0.5437849760055542, "learning_rate": 9.121139609046484e-06, "loss": 0.274, "num_input_tokens_seen": 87449776, "step": 91595 }, { "epoch": 7.472061342687005, "grad_norm": 0.5386927723884583, "learning_rate": 9.118390633736304e-06, "loss": 0.2328, "num_input_tokens_seen": 87454992, "step": 91600 }, { "epoch": 7.472469206297414, "grad_norm": 0.5739753246307373, "learning_rate": 9.115641980339606e-06, "loss": 0.282, "num_input_tokens_seen": 87459424, "step": 91605 }, { "epoch": 7.472877069907823, "grad_norm": 0.5418801307678223, "learning_rate": 9.112893648912096e-06, "loss": 0.1895, "num_input_tokens_seen": 87464320, "step": 91610 }, { "epoch": 7.473284933518231, "grad_norm": 0.8523703813552856, "learning_rate": 9.110145639509481e-06, "loss": 0.33, "num_input_tokens_seen": 87469168, "step": 91615 }, { "epoch": 7.47369279712864, "grad_norm": 2.968613386154175, "learning_rate": 9.107397952187463e-06, "loss": 0.2941, "num_input_tokens_seen": 87473584, "step": 91620 }, { "epoch": 7.474100660739049, "grad_norm": 0.3655546307563782, "learning_rate": 9.104650587001726e-06, "loss": 0.2876, "num_input_tokens_seen": 87479264, "step": 91625 }, { "epoch": 7.474508524349457, "grad_norm": 31.796220779418945, "learning_rate": 9.101903544007976e-06, "loss": 0.417, "num_input_tokens_seen": 87484272, "step": 91630 }, { "epoch": 7.474916387959866, "grad_norm": 1.080746054649353, "learning_rate": 9.099156823261886e-06, "loss": 0.3318, "num_input_tokens_seen": 87489280, "step": 91635 }, { "epoch": 7.4753242515702745, "grad_norm": 6.217341899871826, "learning_rate": 9.09641042481913e-06, "loss": 0.6091, "num_input_tokens_seen": 87494928, "step": 91640 }, { "epoch": 7.4757321151806835, "grad_norm": 3.5377352237701416, "learning_rate": 9.093664348735367e-06, "loss": 0.2761, "num_input_tokens_seen": 87499072, "step": 91645 }, { "epoch": 7.4761399787910925, "grad_norm": 6.270910739898682, "learning_rate": 9.09091859506628e-06, "loss": 0.3621, "num_input_tokens_seen": 87504016, "step": 91650 }, { "epoch": 7.476547842401501, "grad_norm": 1.0118064880371094, "learning_rate": 9.08817316386751e-06, "loss": 0.2309, "num_input_tokens_seen": 87509248, "step": 91655 }, { "epoch": 7.47695570601191, "grad_norm": 1.5757827758789062, "learning_rate": 9.085428055194711e-06, "loss": 0.3841, "num_input_tokens_seen": 87513520, "step": 91660 }, { "epoch": 7.477363569622319, "grad_norm": 6.222409248352051, "learning_rate": 9.082683269103523e-06, "loss": 0.4522, "num_input_tokens_seen": 87518224, "step": 91665 }, { "epoch": 7.477771433232727, "grad_norm": 44.20328140258789, "learning_rate": 9.07993880564957e-06, "loss": 0.3779, "num_input_tokens_seen": 87523248, "step": 91670 }, { "epoch": 7.478179296843136, "grad_norm": 10.240720748901367, "learning_rate": 9.077194664888502e-06, "loss": 0.37, "num_input_tokens_seen": 87527808, "step": 91675 }, { "epoch": 7.478587160453545, "grad_norm": 0.5299599766731262, "learning_rate": 9.074450846875934e-06, "loss": 0.1995, "num_input_tokens_seen": 87533344, "step": 91680 }, { "epoch": 7.478995024063953, "grad_norm": 2.929753303527832, "learning_rate": 9.07170735166748e-06, "loss": 0.233, "num_input_tokens_seen": 87538736, "step": 91685 }, { "epoch": 7.479402887674362, "grad_norm": 0.723320484161377, "learning_rate": 9.068964179318739e-06, "loss": 0.2958, "num_input_tokens_seen": 87543360, "step": 91690 }, { "epoch": 7.47981075128477, "grad_norm": 0.9011256694793701, "learning_rate": 9.066221329885335e-06, "loss": 0.4035, "num_input_tokens_seen": 87548448, "step": 91695 }, { "epoch": 7.480218614895179, "grad_norm": 11.225236892700195, "learning_rate": 9.063478803422853e-06, "loss": 0.4926, "num_input_tokens_seen": 87552672, "step": 91700 }, { "epoch": 7.480626478505588, "grad_norm": 0.43177008628845215, "learning_rate": 9.060736599986886e-06, "loss": 0.3207, "num_input_tokens_seen": 87558128, "step": 91705 }, { "epoch": 7.481034342115996, "grad_norm": 0.735614538192749, "learning_rate": 9.057994719633011e-06, "loss": 0.3508, "num_input_tokens_seen": 87562208, "step": 91710 }, { "epoch": 7.481442205726405, "grad_norm": 0.6638978123664856, "learning_rate": 9.055253162416814e-06, "loss": 0.3645, "num_input_tokens_seen": 87566896, "step": 91715 }, { "epoch": 7.481850069336814, "grad_norm": 19.10232162475586, "learning_rate": 9.052511928393855e-06, "loss": 0.3208, "num_input_tokens_seen": 87571568, "step": 91720 }, { "epoch": 7.482257932947222, "grad_norm": 2.4341471195220947, "learning_rate": 9.049771017619705e-06, "loss": 0.2737, "num_input_tokens_seen": 87576208, "step": 91725 }, { "epoch": 7.482665796557631, "grad_norm": 0.9870989918708801, "learning_rate": 9.04703043014992e-06, "loss": 0.2777, "num_input_tokens_seen": 87581264, "step": 91730 }, { "epoch": 7.483073660168039, "grad_norm": 4.715447902679443, "learning_rate": 9.04429016604004e-06, "loss": 0.3053, "num_input_tokens_seen": 87585968, "step": 91735 }, { "epoch": 7.483481523778448, "grad_norm": 1.6843171119689941, "learning_rate": 9.041550225345624e-06, "loss": 0.4301, "num_input_tokens_seen": 87590656, "step": 91740 }, { "epoch": 7.4838893873888575, "grad_norm": 1.4744802713394165, "learning_rate": 9.038810608122205e-06, "loss": 0.3417, "num_input_tokens_seen": 87595712, "step": 91745 }, { "epoch": 7.484297250999266, "grad_norm": 0.693634569644928, "learning_rate": 9.036071314425315e-06, "loss": 0.3566, "num_input_tokens_seen": 87600672, "step": 91750 }, { "epoch": 7.484705114609675, "grad_norm": 0.5103480815887451, "learning_rate": 9.033332344310466e-06, "loss": 0.3299, "num_input_tokens_seen": 87604928, "step": 91755 }, { "epoch": 7.485112978220084, "grad_norm": 0.5000353455543518, "learning_rate": 9.030593697833195e-06, "loss": 0.3674, "num_input_tokens_seen": 87609680, "step": 91760 }, { "epoch": 7.485520841830492, "grad_norm": 3.7449560165405273, "learning_rate": 9.027855375049005e-06, "loss": 0.3863, "num_input_tokens_seen": 87614944, "step": 91765 }, { "epoch": 7.485928705440901, "grad_norm": 0.4628910422325134, "learning_rate": 9.025117376013398e-06, "loss": 0.6238, "num_input_tokens_seen": 87620976, "step": 91770 }, { "epoch": 7.486336569051309, "grad_norm": 7.419365882873535, "learning_rate": 9.022379700781871e-06, "loss": 0.4425, "num_input_tokens_seen": 87625520, "step": 91775 }, { "epoch": 7.486744432661718, "grad_norm": 0.8948659896850586, "learning_rate": 9.019642349409916e-06, "loss": 0.3516, "num_input_tokens_seen": 87630016, "step": 91780 }, { "epoch": 7.487152296272127, "grad_norm": 1.5130542516708374, "learning_rate": 9.016905321953022e-06, "loss": 0.2945, "num_input_tokens_seen": 87634064, "step": 91785 }, { "epoch": 7.487560159882535, "grad_norm": 0.4827924370765686, "learning_rate": 9.01416861846667e-06, "loss": 0.2467, "num_input_tokens_seen": 87639120, "step": 91790 }, { "epoch": 7.487968023492944, "grad_norm": 1.2525416612625122, "learning_rate": 9.011432239006327e-06, "loss": 0.3008, "num_input_tokens_seen": 87644208, "step": 91795 }, { "epoch": 7.488375887103353, "grad_norm": 0.3108280599117279, "learning_rate": 9.008696183627446e-06, "loss": 0.3493, "num_input_tokens_seen": 87649088, "step": 91800 }, { "epoch": 7.488783750713761, "grad_norm": 0.8623848557472229, "learning_rate": 9.005960452385512e-06, "loss": 0.3651, "num_input_tokens_seen": 87654160, "step": 91805 }, { "epoch": 7.48919161432417, "grad_norm": 0.4103298783302307, "learning_rate": 9.003225045335962e-06, "loss": 0.4897, "num_input_tokens_seen": 87659344, "step": 91810 }, { "epoch": 7.489599477934579, "grad_norm": 0.3905392289161682, "learning_rate": 9.00048996253424e-06, "loss": 0.3506, "num_input_tokens_seen": 87663680, "step": 91815 }, { "epoch": 7.490007341544987, "grad_norm": 0.4497835636138916, "learning_rate": 8.99775520403579e-06, "loss": 0.3549, "num_input_tokens_seen": 87669280, "step": 91820 }, { "epoch": 7.490415205155396, "grad_norm": 0.3939189016819, "learning_rate": 8.995020769896031e-06, "loss": 0.3276, "num_input_tokens_seen": 87674640, "step": 91825 }, { "epoch": 7.490823068765804, "grad_norm": 1.2310116291046143, "learning_rate": 8.992286660170412e-06, "loss": 0.3842, "num_input_tokens_seen": 87679024, "step": 91830 }, { "epoch": 7.491230932376213, "grad_norm": 0.3387817144393921, "learning_rate": 8.98955287491434e-06, "loss": 0.3956, "num_input_tokens_seen": 87683936, "step": 91835 }, { "epoch": 7.491638795986622, "grad_norm": 0.4651423990726471, "learning_rate": 8.986819414183228e-06, "loss": 0.2747, "num_input_tokens_seen": 87689200, "step": 91840 }, { "epoch": 7.4920466595970305, "grad_norm": 1.2492632865905762, "learning_rate": 8.98408627803248e-06, "loss": 0.2826, "num_input_tokens_seen": 87693952, "step": 91845 }, { "epoch": 7.4924545232074395, "grad_norm": 0.7197639346122742, "learning_rate": 8.9813534665175e-06, "loss": 0.3355, "num_input_tokens_seen": 87699728, "step": 91850 }, { "epoch": 7.492862386817848, "grad_norm": 0.33358752727508545, "learning_rate": 8.978620979693677e-06, "loss": 0.3626, "num_input_tokens_seen": 87704144, "step": 91855 }, { "epoch": 7.493270250428257, "grad_norm": 1.6738793849945068, "learning_rate": 8.975888817616398e-06, "loss": 0.2922, "num_input_tokens_seen": 87708160, "step": 91860 }, { "epoch": 7.493678114038666, "grad_norm": 2.3121232986450195, "learning_rate": 8.973156980341046e-06, "loss": 0.2821, "num_input_tokens_seen": 87713168, "step": 91865 }, { "epoch": 7.494085977649074, "grad_norm": 0.6123195886611938, "learning_rate": 8.970425467922982e-06, "loss": 0.2368, "num_input_tokens_seen": 87718720, "step": 91870 }, { "epoch": 7.494493841259483, "grad_norm": 1.8555727005004883, "learning_rate": 8.96769428041759e-06, "loss": 0.3443, "num_input_tokens_seen": 87723664, "step": 91875 }, { "epoch": 7.494901704869892, "grad_norm": 3.2741341590881348, "learning_rate": 8.964963417880227e-06, "loss": 0.3415, "num_input_tokens_seen": 87728384, "step": 91880 }, { "epoch": 7.4953095684803, "grad_norm": 0.4950801730155945, "learning_rate": 8.962232880366236e-06, "loss": 0.2602, "num_input_tokens_seen": 87733552, "step": 91885 }, { "epoch": 7.495717432090709, "grad_norm": 0.9822801351547241, "learning_rate": 8.959502667930963e-06, "loss": 0.3017, "num_input_tokens_seen": 87737584, "step": 91890 }, { "epoch": 7.496125295701118, "grad_norm": 4.066037654876709, "learning_rate": 8.956772780629763e-06, "loss": 0.3586, "num_input_tokens_seen": 87742016, "step": 91895 }, { "epoch": 7.496533159311526, "grad_norm": 6.52775239944458, "learning_rate": 8.954043218517963e-06, "loss": 0.2911, "num_input_tokens_seen": 87747216, "step": 91900 }, { "epoch": 7.496941022921935, "grad_norm": 17.638784408569336, "learning_rate": 8.951313981650888e-06, "loss": 0.3051, "num_input_tokens_seen": 87751776, "step": 91905 }, { "epoch": 7.497348886532343, "grad_norm": 0.3680724501609802, "learning_rate": 8.948585070083852e-06, "loss": 0.4396, "num_input_tokens_seen": 87756864, "step": 91910 }, { "epoch": 7.497756750142752, "grad_norm": 1.7621746063232422, "learning_rate": 8.94585648387218e-06, "loss": 0.4141, "num_input_tokens_seen": 87761408, "step": 91915 }, { "epoch": 7.498164613753161, "grad_norm": 0.41299811005592346, "learning_rate": 8.94312822307118e-06, "loss": 0.4965, "num_input_tokens_seen": 87766544, "step": 91920 }, { "epoch": 7.498572477363569, "grad_norm": 0.438425749540329, "learning_rate": 8.940400287736147e-06, "loss": 0.3334, "num_input_tokens_seen": 87770880, "step": 91925 }, { "epoch": 7.498980340973978, "grad_norm": 0.4408642053604126, "learning_rate": 8.937672677922376e-06, "loss": 0.3202, "num_input_tokens_seen": 87776048, "step": 91930 }, { "epoch": 7.499388204584387, "grad_norm": 0.8531621098518372, "learning_rate": 8.934945393685145e-06, "loss": 0.4753, "num_input_tokens_seen": 87781344, "step": 91935 }, { "epoch": 7.4997960681947955, "grad_norm": 0.5172780752182007, "learning_rate": 8.932218435079754e-06, "loss": 0.5057, "num_input_tokens_seen": 87786160, "step": 91940 }, { "epoch": 7.5002039318052045, "grad_norm": 5.035933494567871, "learning_rate": 8.929491802161465e-06, "loss": 0.381, "num_input_tokens_seen": 87791104, "step": 91945 }, { "epoch": 7.5006117954156135, "grad_norm": 24.379297256469727, "learning_rate": 8.926765494985551e-06, "loss": 0.5113, "num_input_tokens_seen": 87796256, "step": 91950 }, { "epoch": 7.5006117954156135, "eval_loss": 0.3346007168292999, "eval_runtime": 570.9789, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 87796256, "step": 91950 }, { "epoch": 7.501019659026022, "grad_norm": 0.8435405492782593, "learning_rate": 8.92403951360726e-06, "loss": 0.3247, "num_input_tokens_seen": 87800688, "step": 91955 }, { "epoch": 7.501427522636431, "grad_norm": 0.5862576365470886, "learning_rate": 8.921313858081868e-06, "loss": 0.3226, "num_input_tokens_seen": 87805584, "step": 91960 }, { "epoch": 7.501835386246839, "grad_norm": 0.8022240996360779, "learning_rate": 8.918588528464611e-06, "loss": 0.4549, "num_input_tokens_seen": 87810432, "step": 91965 }, { "epoch": 7.502243249857248, "grad_norm": 5.870099067687988, "learning_rate": 8.91586352481073e-06, "loss": 0.2641, "num_input_tokens_seen": 87815168, "step": 91970 }, { "epoch": 7.502651113467657, "grad_norm": 1.0050079822540283, "learning_rate": 8.913138847175462e-06, "loss": 0.3433, "num_input_tokens_seen": 87819840, "step": 91975 }, { "epoch": 7.503058977078065, "grad_norm": 0.30497056245803833, "learning_rate": 8.910414495614033e-06, "loss": 0.2735, "num_input_tokens_seen": 87824016, "step": 91980 }, { "epoch": 7.503466840688474, "grad_norm": 0.4695036709308624, "learning_rate": 8.907690470181666e-06, "loss": 0.2875, "num_input_tokens_seen": 87828624, "step": 91985 }, { "epoch": 7.503874704298882, "grad_norm": 0.4932272136211395, "learning_rate": 8.904966770933577e-06, "loss": 0.3334, "num_input_tokens_seen": 87832608, "step": 91990 }, { "epoch": 7.504282567909291, "grad_norm": 3.7108609676361084, "learning_rate": 8.902243397924967e-06, "loss": 0.3207, "num_input_tokens_seen": 87836720, "step": 91995 }, { "epoch": 7.5046904315197, "grad_norm": 1.4900544881820679, "learning_rate": 8.899520351211037e-06, "loss": 0.3212, "num_input_tokens_seen": 87841216, "step": 92000 }, { "epoch": 7.505098295130108, "grad_norm": 0.3330386281013489, "learning_rate": 8.896797630846998e-06, "loss": 0.3217, "num_input_tokens_seen": 87845808, "step": 92005 }, { "epoch": 7.505506158740517, "grad_norm": 2.357491970062256, "learning_rate": 8.894075236888027e-06, "loss": 0.3074, "num_input_tokens_seen": 87850496, "step": 92010 }, { "epoch": 7.505914022350926, "grad_norm": 1.3727598190307617, "learning_rate": 8.891353169389307e-06, "loss": 0.2515, "num_input_tokens_seen": 87855792, "step": 92015 }, { "epoch": 7.506321885961334, "grad_norm": 1.843715786933899, "learning_rate": 8.888631428406017e-06, "loss": 0.3407, "num_input_tokens_seen": 87861312, "step": 92020 }, { "epoch": 7.506729749571743, "grad_norm": 3.038543462753296, "learning_rate": 8.88591001399331e-06, "loss": 0.3477, "num_input_tokens_seen": 87865600, "step": 92025 }, { "epoch": 7.507137613182152, "grad_norm": 9.593128204345703, "learning_rate": 8.883188926206367e-06, "loss": 0.2166, "num_input_tokens_seen": 87871392, "step": 92030 }, { "epoch": 7.5075454767925605, "grad_norm": 0.6071098446846008, "learning_rate": 8.880468165100337e-06, "loss": 0.29, "num_input_tokens_seen": 87876112, "step": 92035 }, { "epoch": 7.5079533404029695, "grad_norm": 9.710390090942383, "learning_rate": 8.877747730730368e-06, "loss": 0.2791, "num_input_tokens_seen": 87880912, "step": 92040 }, { "epoch": 7.508361204013378, "grad_norm": 2.327738046646118, "learning_rate": 8.875027623151593e-06, "loss": 0.3741, "num_input_tokens_seen": 87884944, "step": 92045 }, { "epoch": 7.508769067623787, "grad_norm": 3.4983770847320557, "learning_rate": 8.872307842419166e-06, "loss": 0.421, "num_input_tokens_seen": 87890128, "step": 92050 }, { "epoch": 7.509176931234196, "grad_norm": 36.93891143798828, "learning_rate": 8.869588388588204e-06, "loss": 0.5046, "num_input_tokens_seen": 87894128, "step": 92055 }, { "epoch": 7.509584794844604, "grad_norm": 0.9811142683029175, "learning_rate": 8.866869261713834e-06, "loss": 0.4706, "num_input_tokens_seen": 87898800, "step": 92060 }, { "epoch": 7.509992658455013, "grad_norm": 51.14839553833008, "learning_rate": 8.864150461851168e-06, "loss": 0.4769, "num_input_tokens_seen": 87904080, "step": 92065 }, { "epoch": 7.510400522065421, "grad_norm": 0.7777583599090576, "learning_rate": 8.861431989055305e-06, "loss": 0.2625, "num_input_tokens_seen": 87909360, "step": 92070 }, { "epoch": 7.51080838567583, "grad_norm": 41.9913215637207, "learning_rate": 8.858713843381367e-06, "loss": 0.3068, "num_input_tokens_seen": 87913904, "step": 92075 }, { "epoch": 7.511216249286239, "grad_norm": 1.3735857009887695, "learning_rate": 8.85599602488444e-06, "loss": 0.336, "num_input_tokens_seen": 87918784, "step": 92080 }, { "epoch": 7.511624112896647, "grad_norm": 9.930681228637695, "learning_rate": 8.853278533619613e-06, "loss": 0.3392, "num_input_tokens_seen": 87924608, "step": 92085 }, { "epoch": 7.512031976507056, "grad_norm": 61.50996017456055, "learning_rate": 8.850561369641962e-06, "loss": 0.4536, "num_input_tokens_seen": 87929248, "step": 92090 }, { "epoch": 7.512439840117465, "grad_norm": 0.409628301858902, "learning_rate": 8.84784453300658e-06, "loss": 0.2356, "num_input_tokens_seen": 87934336, "step": 92095 }, { "epoch": 7.512847703727873, "grad_norm": 0.8136794567108154, "learning_rate": 8.845128023768523e-06, "loss": 0.266, "num_input_tokens_seen": 87939520, "step": 92100 }, { "epoch": 7.513255567338282, "grad_norm": 0.49571189284324646, "learning_rate": 8.84241184198286e-06, "loss": 0.335, "num_input_tokens_seen": 87944480, "step": 92105 }, { "epoch": 7.513663430948691, "grad_norm": 9.578998565673828, "learning_rate": 8.839695987704638e-06, "loss": 0.3775, "num_input_tokens_seen": 87949088, "step": 92110 }, { "epoch": 7.514071294559099, "grad_norm": 2.081362724304199, "learning_rate": 8.836980460988914e-06, "loss": 0.4858, "num_input_tokens_seen": 87953520, "step": 92115 }, { "epoch": 7.514479158169508, "grad_norm": 5.825798511505127, "learning_rate": 8.83426526189073e-06, "loss": 0.2174, "num_input_tokens_seen": 87958688, "step": 92120 }, { "epoch": 7.514887021779916, "grad_norm": 1.2186050415039062, "learning_rate": 8.831550390465116e-06, "loss": 0.2571, "num_input_tokens_seen": 87963136, "step": 92125 }, { "epoch": 7.515294885390325, "grad_norm": 0.5616098642349243, "learning_rate": 8.828835846767106e-06, "loss": 0.2617, "num_input_tokens_seen": 87967792, "step": 92130 }, { "epoch": 7.515702749000734, "grad_norm": 19.8314266204834, "learning_rate": 8.826121630851714e-06, "loss": 0.5504, "num_input_tokens_seen": 87971872, "step": 92135 }, { "epoch": 7.5161106126111425, "grad_norm": 1.5254024267196655, "learning_rate": 8.823407742773973e-06, "loss": 0.3563, "num_input_tokens_seen": 87976624, "step": 92140 }, { "epoch": 7.516518476221552, "grad_norm": 2.060131072998047, "learning_rate": 8.820694182588884e-06, "loss": 0.3033, "num_input_tokens_seen": 87981136, "step": 92145 }, { "epoch": 7.516926339831961, "grad_norm": 25.30003547668457, "learning_rate": 8.817980950351449e-06, "loss": 0.2676, "num_input_tokens_seen": 87985616, "step": 92150 }, { "epoch": 7.517334203442369, "grad_norm": 0.4167267680168152, "learning_rate": 8.815268046116654e-06, "loss": 0.2963, "num_input_tokens_seen": 87990384, "step": 92155 }, { "epoch": 7.517742067052778, "grad_norm": 27.550561904907227, "learning_rate": 8.812555469939505e-06, "loss": 0.6659, "num_input_tokens_seen": 87994768, "step": 92160 }, { "epoch": 7.518149930663187, "grad_norm": 1.541734218597412, "learning_rate": 8.80984322187498e-06, "loss": 0.4091, "num_input_tokens_seen": 87999184, "step": 92165 }, { "epoch": 7.518557794273595, "grad_norm": 0.4176373779773712, "learning_rate": 8.807131301978056e-06, "loss": 0.3523, "num_input_tokens_seen": 88003792, "step": 92170 }, { "epoch": 7.518965657884004, "grad_norm": 7.726508617401123, "learning_rate": 8.804419710303697e-06, "loss": 0.2745, "num_input_tokens_seen": 88009136, "step": 92175 }, { "epoch": 7.519373521494412, "grad_norm": 4.635723114013672, "learning_rate": 8.80170844690686e-06, "loss": 0.3355, "num_input_tokens_seen": 88014016, "step": 92180 }, { "epoch": 7.519781385104821, "grad_norm": 2.149794578552246, "learning_rate": 8.798997511842518e-06, "loss": 0.282, "num_input_tokens_seen": 88018752, "step": 92185 }, { "epoch": 7.52018924871523, "grad_norm": 0.3906882107257843, "learning_rate": 8.796286905165612e-06, "loss": 0.3472, "num_input_tokens_seen": 88023408, "step": 92190 }, { "epoch": 7.520597112325638, "grad_norm": 0.41214969754219055, "learning_rate": 8.793576626931086e-06, "loss": 0.3973, "num_input_tokens_seen": 88027888, "step": 92195 }, { "epoch": 7.521004975936047, "grad_norm": 1.371292233467102, "learning_rate": 8.790866677193867e-06, "loss": 0.3459, "num_input_tokens_seen": 88031856, "step": 92200 }, { "epoch": 7.521412839546455, "grad_norm": 0.9197198748588562, "learning_rate": 8.7881570560089e-06, "loss": 0.3288, "num_input_tokens_seen": 88036768, "step": 92205 }, { "epoch": 7.521820703156864, "grad_norm": 9.389800071716309, "learning_rate": 8.785447763431101e-06, "loss": 0.279, "num_input_tokens_seen": 88041024, "step": 92210 }, { "epoch": 7.522228566767273, "grad_norm": 2.7125773429870605, "learning_rate": 8.782738799515389e-06, "loss": 0.5269, "num_input_tokens_seen": 88046832, "step": 92215 }, { "epoch": 7.522636430377681, "grad_norm": 0.5998468399047852, "learning_rate": 8.780030164316666e-06, "loss": 0.3431, "num_input_tokens_seen": 88051440, "step": 92220 }, { "epoch": 7.52304429398809, "grad_norm": 1.1085938215255737, "learning_rate": 8.777321857889833e-06, "loss": 0.2464, "num_input_tokens_seen": 88056528, "step": 92225 }, { "epoch": 7.523452157598499, "grad_norm": 0.35790976881980896, "learning_rate": 8.7746138802898e-06, "loss": 0.2189, "num_input_tokens_seen": 88060576, "step": 92230 }, { "epoch": 7.5238600212089075, "grad_norm": 49.20569610595703, "learning_rate": 8.77190623157145e-06, "loss": 0.3978, "num_input_tokens_seen": 88065536, "step": 92235 }, { "epoch": 7.5242678848193165, "grad_norm": 34.416751861572266, "learning_rate": 8.769198911789663e-06, "loss": 0.4995, "num_input_tokens_seen": 88070048, "step": 92240 }, { "epoch": 7.5246757484297255, "grad_norm": 2.698279619216919, "learning_rate": 8.76649192099932e-06, "loss": 0.4987, "num_input_tokens_seen": 88075056, "step": 92245 }, { "epoch": 7.525083612040134, "grad_norm": 3.5510663986206055, "learning_rate": 8.763785259255285e-06, "loss": 0.3881, "num_input_tokens_seen": 88079936, "step": 92250 }, { "epoch": 7.525491475650543, "grad_norm": 0.8517365455627441, "learning_rate": 8.761078926612417e-06, "loss": 0.3634, "num_input_tokens_seen": 88084128, "step": 92255 }, { "epoch": 7.525899339260951, "grad_norm": 124.6382827758789, "learning_rate": 8.758372923125587e-06, "loss": 0.5939, "num_input_tokens_seen": 88089232, "step": 92260 }, { "epoch": 7.52630720287136, "grad_norm": 0.5839376449584961, "learning_rate": 8.755667248849638e-06, "loss": 0.581, "num_input_tokens_seen": 88093504, "step": 92265 }, { "epoch": 7.526715066481769, "grad_norm": 1.046705961227417, "learning_rate": 8.75296190383941e-06, "loss": 0.327, "num_input_tokens_seen": 88099040, "step": 92270 }, { "epoch": 7.527122930092177, "grad_norm": 0.5352306962013245, "learning_rate": 8.750256888149739e-06, "loss": 0.3099, "num_input_tokens_seen": 88102912, "step": 92275 }, { "epoch": 7.527530793702586, "grad_norm": 2.3569741249084473, "learning_rate": 8.747552201835457e-06, "loss": 0.348, "num_input_tokens_seen": 88108080, "step": 92280 }, { "epoch": 7.527938657312994, "grad_norm": 1.1096596717834473, "learning_rate": 8.744847844951384e-06, "loss": 0.3782, "num_input_tokens_seen": 88112656, "step": 92285 }, { "epoch": 7.528346520923403, "grad_norm": 0.5292778611183167, "learning_rate": 8.742143817552335e-06, "loss": 0.3973, "num_input_tokens_seen": 88117504, "step": 92290 }, { "epoch": 7.528754384533812, "grad_norm": 5.003251075744629, "learning_rate": 8.739440119693124e-06, "loss": 0.2104, "num_input_tokens_seen": 88122656, "step": 92295 }, { "epoch": 7.52916224814422, "grad_norm": 0.8432673811912537, "learning_rate": 8.736736751428556e-06, "loss": 0.3284, "num_input_tokens_seen": 88126000, "step": 92300 }, { "epoch": 7.529570111754629, "grad_norm": 3.2515461444854736, "learning_rate": 8.734033712813423e-06, "loss": 0.3896, "num_input_tokens_seen": 88130816, "step": 92305 }, { "epoch": 7.529977975365038, "grad_norm": 1.2760186195373535, "learning_rate": 8.731331003902506e-06, "loss": 0.2917, "num_input_tokens_seen": 88135952, "step": 92310 }, { "epoch": 7.530385838975446, "grad_norm": 0.5874670743942261, "learning_rate": 8.728628624750604e-06, "loss": 0.2817, "num_input_tokens_seen": 88140432, "step": 92315 }, { "epoch": 7.530793702585855, "grad_norm": 1.200299620628357, "learning_rate": 8.725926575412488e-06, "loss": 0.2748, "num_input_tokens_seen": 88145104, "step": 92320 }, { "epoch": 7.531201566196264, "grad_norm": 27.909767150878906, "learning_rate": 8.723224855942925e-06, "loss": 0.4566, "num_input_tokens_seen": 88149616, "step": 92325 }, { "epoch": 7.5316094298066725, "grad_norm": 1.0896328687667847, "learning_rate": 8.720523466396673e-06, "loss": 0.3918, "num_input_tokens_seen": 88154704, "step": 92330 }, { "epoch": 7.5320172934170815, "grad_norm": 47.34252166748047, "learning_rate": 8.717822406828491e-06, "loss": 0.3326, "num_input_tokens_seen": 88159408, "step": 92335 }, { "epoch": 7.53242515702749, "grad_norm": 25.206472396850586, "learning_rate": 8.715121677293133e-06, "loss": 0.3467, "num_input_tokens_seen": 88164144, "step": 92340 }, { "epoch": 7.532833020637899, "grad_norm": 0.5803186893463135, "learning_rate": 8.712421277845342e-06, "loss": 0.2355, "num_input_tokens_seen": 88169120, "step": 92345 }, { "epoch": 7.533240884248308, "grad_norm": 0.438748836517334, "learning_rate": 8.70972120853985e-06, "loss": 0.3447, "num_input_tokens_seen": 88174080, "step": 92350 }, { "epoch": 7.533648747858716, "grad_norm": 0.4774312973022461, "learning_rate": 8.707021469431379e-06, "loss": 0.2483, "num_input_tokens_seen": 88178912, "step": 92355 }, { "epoch": 7.534056611469125, "grad_norm": 0.5871837139129639, "learning_rate": 8.704322060574666e-06, "loss": 0.2283, "num_input_tokens_seen": 88183120, "step": 92360 }, { "epoch": 7.534464475079534, "grad_norm": 0.44625723361968994, "learning_rate": 8.70162298202442e-06, "loss": 0.3845, "num_input_tokens_seen": 88188032, "step": 92365 }, { "epoch": 7.534872338689942, "grad_norm": 15.956864356994629, "learning_rate": 8.698924233835354e-06, "loss": 0.3972, "num_input_tokens_seen": 88193488, "step": 92370 }, { "epoch": 7.535280202300351, "grad_norm": 0.44800055027008057, "learning_rate": 8.696225816062165e-06, "loss": 0.1995, "num_input_tokens_seen": 88198848, "step": 92375 }, { "epoch": 7.53568806591076, "grad_norm": 0.9845051169395447, "learning_rate": 8.693527728759543e-06, "loss": 0.3275, "num_input_tokens_seen": 88203600, "step": 92380 }, { "epoch": 7.536095929521168, "grad_norm": 1.462931513786316, "learning_rate": 8.690829971982192e-06, "loss": 0.366, "num_input_tokens_seen": 88207744, "step": 92385 }, { "epoch": 7.536503793131577, "grad_norm": 0.3959624469280243, "learning_rate": 8.688132545784788e-06, "loss": 0.4335, "num_input_tokens_seen": 88212816, "step": 92390 }, { "epoch": 7.536911656741985, "grad_norm": 0.6849522590637207, "learning_rate": 8.685435450222007e-06, "loss": 0.3428, "num_input_tokens_seen": 88217232, "step": 92395 }, { "epoch": 7.537319520352394, "grad_norm": 8.850101470947266, "learning_rate": 8.682738685348515e-06, "loss": 0.3472, "num_input_tokens_seen": 88221952, "step": 92400 }, { "epoch": 7.537727383962803, "grad_norm": 0.41777104139328003, "learning_rate": 8.680042251218977e-06, "loss": 0.198, "num_input_tokens_seen": 88226320, "step": 92405 }, { "epoch": 7.538135247573211, "grad_norm": 7.81785249710083, "learning_rate": 8.67734614788805e-06, "loss": 0.3794, "num_input_tokens_seen": 88231904, "step": 92410 }, { "epoch": 7.53854311118362, "grad_norm": 101.36676025390625, "learning_rate": 8.67465037541038e-06, "loss": 0.3773, "num_input_tokens_seen": 88236944, "step": 92415 }, { "epoch": 7.538950974794028, "grad_norm": 8.895000457763672, "learning_rate": 8.671954933840606e-06, "loss": 0.3908, "num_input_tokens_seen": 88241104, "step": 92420 }, { "epoch": 7.539358838404437, "grad_norm": 0.408735066652298, "learning_rate": 8.669259823233364e-06, "loss": 0.2619, "num_input_tokens_seen": 88245648, "step": 92425 }, { "epoch": 7.5397667020148464, "grad_norm": 2.6729297637939453, "learning_rate": 8.66656504364329e-06, "loss": 0.3601, "num_input_tokens_seen": 88250704, "step": 92430 }, { "epoch": 7.540174565625255, "grad_norm": 28.910903930664062, "learning_rate": 8.663870595125003e-06, "loss": 0.2969, "num_input_tokens_seen": 88254992, "step": 92435 }, { "epoch": 7.540582429235664, "grad_norm": 3.098015785217285, "learning_rate": 8.661176477733122e-06, "loss": 0.3537, "num_input_tokens_seen": 88259840, "step": 92440 }, { "epoch": 7.540990292846073, "grad_norm": 0.29032576084136963, "learning_rate": 8.658482691522238e-06, "loss": 0.5262, "num_input_tokens_seen": 88264560, "step": 92445 }, { "epoch": 7.541398156456481, "grad_norm": 0.7005641460418701, "learning_rate": 8.655789236546974e-06, "loss": 0.4356, "num_input_tokens_seen": 88269872, "step": 92450 }, { "epoch": 7.54180602006689, "grad_norm": 2.1012861728668213, "learning_rate": 8.65309611286192e-06, "loss": 0.4052, "num_input_tokens_seen": 88274784, "step": 92455 }, { "epoch": 7.542213883677299, "grad_norm": 1.1706336736679077, "learning_rate": 8.650403320521658e-06, "loss": 0.2394, "num_input_tokens_seen": 88280832, "step": 92460 }, { "epoch": 7.542621747287707, "grad_norm": 0.8835551738739014, "learning_rate": 8.647710859580777e-06, "loss": 0.3238, "num_input_tokens_seen": 88285216, "step": 92465 }, { "epoch": 7.543029610898116, "grad_norm": 60.91712951660156, "learning_rate": 8.645018730093837e-06, "loss": 0.2641, "num_input_tokens_seen": 88289728, "step": 92470 }, { "epoch": 7.543437474508524, "grad_norm": 3.0462229251861572, "learning_rate": 8.642326932115428e-06, "loss": 0.3251, "num_input_tokens_seen": 88294288, "step": 92475 }, { "epoch": 7.543845338118933, "grad_norm": 111.9703598022461, "learning_rate": 8.639635465700099e-06, "loss": 0.4005, "num_input_tokens_seen": 88299472, "step": 92480 }, { "epoch": 7.544253201729342, "grad_norm": 26.080577850341797, "learning_rate": 8.63694433090241e-06, "loss": 0.4998, "num_input_tokens_seen": 88304352, "step": 92485 }, { "epoch": 7.54466106533975, "grad_norm": 46.131492614746094, "learning_rate": 8.634253527776898e-06, "loss": 0.3005, "num_input_tokens_seen": 88309136, "step": 92490 }, { "epoch": 7.545068928950159, "grad_norm": 6.247004985809326, "learning_rate": 8.63156305637812e-06, "loss": 0.3556, "num_input_tokens_seen": 88313456, "step": 92495 }, { "epoch": 7.545476792560567, "grad_norm": 21.493183135986328, "learning_rate": 8.628872916760603e-06, "loss": 0.4856, "num_input_tokens_seen": 88318560, "step": 92500 }, { "epoch": 7.545884656170976, "grad_norm": 0.8153456449508667, "learning_rate": 8.626183108978875e-06, "loss": 0.3162, "num_input_tokens_seen": 88323744, "step": 92505 }, { "epoch": 7.546292519781385, "grad_norm": 0.6624220013618469, "learning_rate": 8.623493633087452e-06, "loss": 0.2835, "num_input_tokens_seen": 88328640, "step": 92510 }, { "epoch": 7.546700383391794, "grad_norm": 4.581149578094482, "learning_rate": 8.620804489140861e-06, "loss": 0.3212, "num_input_tokens_seen": 88332832, "step": 92515 }, { "epoch": 7.547108247002202, "grad_norm": 77.08038330078125, "learning_rate": 8.6181156771936e-06, "loss": 0.6297, "num_input_tokens_seen": 88338688, "step": 92520 }, { "epoch": 7.547516110612611, "grad_norm": 0.48450767993927, "learning_rate": 8.615427197300177e-06, "loss": 0.3212, "num_input_tokens_seen": 88343744, "step": 92525 }, { "epoch": 7.5479239742230195, "grad_norm": 12.9577054977417, "learning_rate": 8.612739049515081e-06, "loss": 0.4055, "num_input_tokens_seen": 88348272, "step": 92530 }, { "epoch": 7.5483318378334285, "grad_norm": 1.0114647150039673, "learning_rate": 8.610051233892802e-06, "loss": 0.3228, "num_input_tokens_seen": 88352240, "step": 92535 }, { "epoch": 7.5487397014438375, "grad_norm": 36.56771469116211, "learning_rate": 8.60736375048782e-06, "loss": 0.4849, "num_input_tokens_seen": 88357408, "step": 92540 }, { "epoch": 7.549147565054246, "grad_norm": 2.066263198852539, "learning_rate": 8.604676599354608e-06, "loss": 0.3221, "num_input_tokens_seen": 88362464, "step": 92545 }, { "epoch": 7.549555428664655, "grad_norm": 3.5654468536376953, "learning_rate": 8.601989780547637e-06, "loss": 0.4077, "num_input_tokens_seen": 88367312, "step": 92550 }, { "epoch": 7.549963292275063, "grad_norm": 19.56500816345215, "learning_rate": 8.599303294121353e-06, "loss": 0.3115, "num_input_tokens_seen": 88372304, "step": 92555 }, { "epoch": 7.550371155885472, "grad_norm": 1.8154127597808838, "learning_rate": 8.596617140130234e-06, "loss": 0.4122, "num_input_tokens_seen": 88377360, "step": 92560 }, { "epoch": 7.550779019495881, "grad_norm": 11.395435333251953, "learning_rate": 8.593931318628715e-06, "loss": 0.2603, "num_input_tokens_seen": 88381904, "step": 92565 }, { "epoch": 7.551186883106289, "grad_norm": 3.1065011024475098, "learning_rate": 8.591245829671235e-06, "loss": 0.3169, "num_input_tokens_seen": 88387072, "step": 92570 }, { "epoch": 7.551594746716698, "grad_norm": 0.900729775428772, "learning_rate": 8.58856067331223e-06, "loss": 0.3506, "num_input_tokens_seen": 88391648, "step": 92575 }, { "epoch": 7.552002610327107, "grad_norm": 86.8926773071289, "learning_rate": 8.585875849606118e-06, "loss": 0.44, "num_input_tokens_seen": 88396656, "step": 92580 }, { "epoch": 7.552410473937515, "grad_norm": 4.263976573944092, "learning_rate": 8.583191358607337e-06, "loss": 0.3173, "num_input_tokens_seen": 88401904, "step": 92585 }, { "epoch": 7.552818337547924, "grad_norm": 25.570236206054688, "learning_rate": 8.580507200370292e-06, "loss": 0.2879, "num_input_tokens_seen": 88406672, "step": 92590 }, { "epoch": 7.553226201158333, "grad_norm": 1.0613172054290771, "learning_rate": 8.577823374949387e-06, "loss": 0.2674, "num_input_tokens_seen": 88411792, "step": 92595 }, { "epoch": 7.553634064768741, "grad_norm": 1.158422589302063, "learning_rate": 8.575139882399016e-06, "loss": 0.3989, "num_input_tokens_seen": 88416816, "step": 92600 }, { "epoch": 7.55404192837915, "grad_norm": 0.6716288328170776, "learning_rate": 8.57245672277359e-06, "loss": 0.3043, "num_input_tokens_seen": 88421872, "step": 92605 }, { "epoch": 7.554449791989558, "grad_norm": 1.6457182168960571, "learning_rate": 8.569773896127483e-06, "loss": 0.3296, "num_input_tokens_seen": 88427120, "step": 92610 }, { "epoch": 7.554857655599967, "grad_norm": 1.1614292860031128, "learning_rate": 8.56709140251508e-06, "loss": 0.3781, "num_input_tokens_seen": 88431168, "step": 92615 }, { "epoch": 7.555265519210376, "grad_norm": 2.2557337284088135, "learning_rate": 8.56440924199075e-06, "loss": 0.414, "num_input_tokens_seen": 88435680, "step": 92620 }, { "epoch": 7.5556733828207845, "grad_norm": 0.6563211679458618, "learning_rate": 8.561727414608853e-06, "loss": 0.3809, "num_input_tokens_seen": 88439264, "step": 92625 }, { "epoch": 7.5560812464311935, "grad_norm": 56.40935134887695, "learning_rate": 8.559045920423763e-06, "loss": 0.2641, "num_input_tokens_seen": 88444752, "step": 92630 }, { "epoch": 7.556489110041602, "grad_norm": 1.639538049697876, "learning_rate": 8.556364759489829e-06, "loss": 0.3683, "num_input_tokens_seen": 88449552, "step": 92635 }, { "epoch": 7.556896973652011, "grad_norm": 1.0311648845672607, "learning_rate": 8.553683931861395e-06, "loss": 0.3188, "num_input_tokens_seen": 88455040, "step": 92640 }, { "epoch": 7.55730483726242, "grad_norm": 28.404109954833984, "learning_rate": 8.551003437592785e-06, "loss": 0.3416, "num_input_tokens_seen": 88460160, "step": 92645 }, { "epoch": 7.557712700872828, "grad_norm": 0.3949609696865082, "learning_rate": 8.54832327673836e-06, "loss": 0.3415, "num_input_tokens_seen": 88465552, "step": 92650 }, { "epoch": 7.558120564483237, "grad_norm": 0.4237569570541382, "learning_rate": 8.545643449352431e-06, "loss": 0.405, "num_input_tokens_seen": 88470112, "step": 92655 }, { "epoch": 7.558528428093646, "grad_norm": 1.46077561378479, "learning_rate": 8.542963955489314e-06, "loss": 0.2633, "num_input_tokens_seen": 88475072, "step": 92660 }, { "epoch": 7.558936291704054, "grad_norm": 2.2352147102355957, "learning_rate": 8.540284795203326e-06, "loss": 0.2542, "num_input_tokens_seen": 88479808, "step": 92665 }, { "epoch": 7.559344155314463, "grad_norm": 1.0415676832199097, "learning_rate": 8.53760596854877e-06, "loss": 0.3487, "num_input_tokens_seen": 88484608, "step": 92670 }, { "epoch": 7.559752018924872, "grad_norm": 26.881797790527344, "learning_rate": 8.534927475579949e-06, "loss": 0.2775, "num_input_tokens_seen": 88490192, "step": 92675 }, { "epoch": 7.56015988253528, "grad_norm": 1.026428461074829, "learning_rate": 8.53224931635115e-06, "loss": 0.3416, "num_input_tokens_seen": 88495520, "step": 92680 }, { "epoch": 7.560567746145689, "grad_norm": 2.3150386810302734, "learning_rate": 8.529571490916657e-06, "loss": 0.2798, "num_input_tokens_seen": 88500544, "step": 92685 }, { "epoch": 7.560975609756097, "grad_norm": 1.330925464630127, "learning_rate": 8.526893999330746e-06, "loss": 0.529, "num_input_tokens_seen": 88504400, "step": 92690 }, { "epoch": 7.561383473366506, "grad_norm": 0.7897728085517883, "learning_rate": 8.524216841647702e-06, "loss": 0.375, "num_input_tokens_seen": 88509296, "step": 92695 }, { "epoch": 7.561791336976915, "grad_norm": 2.434650421142578, "learning_rate": 8.521540017921781e-06, "loss": 0.3632, "num_input_tokens_seen": 88514208, "step": 92700 }, { "epoch": 7.562199200587323, "grad_norm": 76.43400573730469, "learning_rate": 8.518863528207242e-06, "loss": 0.3354, "num_input_tokens_seen": 88519600, "step": 92705 }, { "epoch": 7.562607064197732, "grad_norm": 4.202363967895508, "learning_rate": 8.516187372558327e-06, "loss": 0.201, "num_input_tokens_seen": 88524896, "step": 92710 }, { "epoch": 7.563014927808141, "grad_norm": 61.90960693359375, "learning_rate": 8.513511551029299e-06, "loss": 0.296, "num_input_tokens_seen": 88529520, "step": 92715 }, { "epoch": 7.5634227914185495, "grad_norm": 0.7865017056465149, "learning_rate": 8.510836063674386e-06, "loss": 0.3108, "num_input_tokens_seen": 88534896, "step": 92720 }, { "epoch": 7.5638306550289585, "grad_norm": 1.674564003944397, "learning_rate": 8.50816091054782e-06, "loss": 0.3059, "num_input_tokens_seen": 88539968, "step": 92725 }, { "epoch": 7.5642385186393675, "grad_norm": 0.9686866998672485, "learning_rate": 8.50548609170382e-06, "loss": 0.2453, "num_input_tokens_seen": 88545536, "step": 92730 }, { "epoch": 7.564646382249776, "grad_norm": 0.4068771004676819, "learning_rate": 8.502811607196603e-06, "loss": 0.2451, "num_input_tokens_seen": 88551088, "step": 92735 }, { "epoch": 7.565054245860185, "grad_norm": 12.632863998413086, "learning_rate": 8.500137457080396e-06, "loss": 0.4, "num_input_tokens_seen": 88555408, "step": 92740 }, { "epoch": 7.565462109470593, "grad_norm": 0.3901008665561676, "learning_rate": 8.497463641409387e-06, "loss": 0.3196, "num_input_tokens_seen": 88560480, "step": 92745 }, { "epoch": 7.565869973081002, "grad_norm": 52.65818405151367, "learning_rate": 8.494790160237778e-06, "loss": 0.2867, "num_input_tokens_seen": 88564928, "step": 92750 }, { "epoch": 7.566277836691411, "grad_norm": 4.151028633117676, "learning_rate": 8.492117013619752e-06, "loss": 0.4402, "num_input_tokens_seen": 88569728, "step": 92755 }, { "epoch": 7.566685700301819, "grad_norm": 18.07267951965332, "learning_rate": 8.489444201609509e-06, "loss": 0.4008, "num_input_tokens_seen": 88574176, "step": 92760 }, { "epoch": 7.567093563912228, "grad_norm": 44.79582595825195, "learning_rate": 8.486771724261214e-06, "loss": 0.4423, "num_input_tokens_seen": 88579040, "step": 92765 }, { "epoch": 7.567501427522636, "grad_norm": 11.716951370239258, "learning_rate": 8.48409958162904e-06, "loss": 0.3518, "num_input_tokens_seen": 88584560, "step": 92770 }, { "epoch": 7.567909291133045, "grad_norm": 1.1682027578353882, "learning_rate": 8.48142777376715e-06, "loss": 0.335, "num_input_tokens_seen": 88589632, "step": 92775 }, { "epoch": 7.568317154743454, "grad_norm": 1.8930227756500244, "learning_rate": 8.478756300729692e-06, "loss": 0.3003, "num_input_tokens_seen": 88593872, "step": 92780 }, { "epoch": 7.568725018353862, "grad_norm": 0.5192172527313232, "learning_rate": 8.476085162570833e-06, "loss": 0.3797, "num_input_tokens_seen": 88597888, "step": 92785 }, { "epoch": 7.569132881964271, "grad_norm": 1.825109839439392, "learning_rate": 8.473414359344703e-06, "loss": 0.3218, "num_input_tokens_seen": 88602384, "step": 92790 }, { "epoch": 7.56954074557468, "grad_norm": 5.577886581420898, "learning_rate": 8.470743891105443e-06, "loss": 0.2331, "num_input_tokens_seen": 88607136, "step": 92795 }, { "epoch": 7.569948609185088, "grad_norm": 0.5233178734779358, "learning_rate": 8.46807375790718e-06, "loss": 0.3534, "num_input_tokens_seen": 88612352, "step": 92800 }, { "epoch": 7.570356472795497, "grad_norm": 7.4464335441589355, "learning_rate": 8.465403959804036e-06, "loss": 0.2564, "num_input_tokens_seen": 88617136, "step": 92805 }, { "epoch": 7.570764336405906, "grad_norm": 4.462393760681152, "learning_rate": 8.46273449685013e-06, "loss": 0.3975, "num_input_tokens_seen": 88620688, "step": 92810 }, { "epoch": 7.571172200016314, "grad_norm": 3.4684159755706787, "learning_rate": 8.460065369099568e-06, "loss": 0.3626, "num_input_tokens_seen": 88625680, "step": 92815 }, { "epoch": 7.571580063626723, "grad_norm": 14.474239349365234, "learning_rate": 8.457396576606453e-06, "loss": 0.2337, "num_input_tokens_seen": 88629824, "step": 92820 }, { "epoch": 7.5719879272371315, "grad_norm": 0.9861079454421997, "learning_rate": 8.454728119424871e-06, "loss": 0.406, "num_input_tokens_seen": 88633488, "step": 92825 }, { "epoch": 7.5723957908475406, "grad_norm": 0.543465793132782, "learning_rate": 8.452059997608926e-06, "loss": 0.3445, "num_input_tokens_seen": 88637728, "step": 92830 }, { "epoch": 7.57280365445795, "grad_norm": 3.5269715785980225, "learning_rate": 8.449392211212697e-06, "loss": 0.3143, "num_input_tokens_seen": 88641152, "step": 92835 }, { "epoch": 7.573211518068358, "grad_norm": 1.1655972003936768, "learning_rate": 8.446724760290254e-06, "loss": 0.3111, "num_input_tokens_seen": 88645616, "step": 92840 }, { "epoch": 7.573619381678767, "grad_norm": 0.7522294521331787, "learning_rate": 8.444057644895656e-06, "loss": 0.3178, "num_input_tokens_seen": 88650656, "step": 92845 }, { "epoch": 7.574027245289175, "grad_norm": 4.324301242828369, "learning_rate": 8.441390865082985e-06, "loss": 0.2435, "num_input_tokens_seen": 88656320, "step": 92850 }, { "epoch": 7.574435108899584, "grad_norm": 47.58440399169922, "learning_rate": 8.438724420906282e-06, "loss": 0.3503, "num_input_tokens_seen": 88661456, "step": 92855 }, { "epoch": 7.574842972509993, "grad_norm": 4.270881652832031, "learning_rate": 8.4360583124196e-06, "loss": 0.3622, "num_input_tokens_seen": 88666064, "step": 92860 }, { "epoch": 7.575250836120401, "grad_norm": 3.0270185470581055, "learning_rate": 8.43339253967697e-06, "loss": 0.368, "num_input_tokens_seen": 88670896, "step": 92865 }, { "epoch": 7.57565869973081, "grad_norm": 0.40046006441116333, "learning_rate": 8.430727102732441e-06, "loss": 0.3565, "num_input_tokens_seen": 88675184, "step": 92870 }, { "epoch": 7.576066563341219, "grad_norm": 0.6794564127922058, "learning_rate": 8.428062001640033e-06, "loss": 0.3806, "num_input_tokens_seen": 88680080, "step": 92875 }, { "epoch": 7.576474426951627, "grad_norm": 0.36357253789901733, "learning_rate": 8.425397236453766e-06, "loss": 0.4623, "num_input_tokens_seen": 88684912, "step": 92880 }, { "epoch": 7.576882290562036, "grad_norm": 2.5380966663360596, "learning_rate": 8.422732807227654e-06, "loss": 0.5216, "num_input_tokens_seen": 88689440, "step": 92885 }, { "epoch": 7.577290154172445, "grad_norm": 2.8143367767333984, "learning_rate": 8.420068714015696e-06, "loss": 0.3314, "num_input_tokens_seen": 88694784, "step": 92890 }, { "epoch": 7.577698017782853, "grad_norm": 0.4592786431312561, "learning_rate": 8.417404956871908e-06, "loss": 0.4007, "num_input_tokens_seen": 88699472, "step": 92895 }, { "epoch": 7.578105881393262, "grad_norm": 26.80556869506836, "learning_rate": 8.414741535850276e-06, "loss": 0.3372, "num_input_tokens_seen": 88703856, "step": 92900 }, { "epoch": 7.57851374500367, "grad_norm": 1.9407655000686646, "learning_rate": 8.412078451004785e-06, "loss": 0.3678, "num_input_tokens_seen": 88708480, "step": 92905 }, { "epoch": 7.578921608614079, "grad_norm": 20.21055793762207, "learning_rate": 8.409415702389406e-06, "loss": 0.4246, "num_input_tokens_seen": 88713296, "step": 92910 }, { "epoch": 7.579329472224488, "grad_norm": 2.2411446571350098, "learning_rate": 8.40675329005813e-06, "loss": 0.4101, "num_input_tokens_seen": 88718208, "step": 92915 }, { "epoch": 7.5797373358348965, "grad_norm": 14.358977317810059, "learning_rate": 8.404091214064913e-06, "loss": 0.4145, "num_input_tokens_seen": 88722016, "step": 92920 }, { "epoch": 7.5801451994453055, "grad_norm": 2.82718825340271, "learning_rate": 8.401429474463718e-06, "loss": 0.2896, "num_input_tokens_seen": 88726448, "step": 92925 }, { "epoch": 7.5805530630557145, "grad_norm": 0.7368757128715515, "learning_rate": 8.398768071308491e-06, "loss": 0.4385, "num_input_tokens_seen": 88731600, "step": 92930 }, { "epoch": 7.580960926666123, "grad_norm": 51.16629409790039, "learning_rate": 8.39610700465318e-06, "loss": 0.2985, "num_input_tokens_seen": 88737040, "step": 92935 }, { "epoch": 7.581368790276532, "grad_norm": 1.5683537721633911, "learning_rate": 8.393446274551728e-06, "loss": 0.2475, "num_input_tokens_seen": 88742048, "step": 92940 }, { "epoch": 7.581776653886941, "grad_norm": 2.7968027591705322, "learning_rate": 8.390785881058064e-06, "loss": 0.3553, "num_input_tokens_seen": 88746768, "step": 92945 }, { "epoch": 7.582184517497349, "grad_norm": 52.442108154296875, "learning_rate": 8.388125824226101e-06, "loss": 0.3693, "num_input_tokens_seen": 88750544, "step": 92950 }, { "epoch": 7.582592381107758, "grad_norm": 3.5646824836730957, "learning_rate": 8.38546610410978e-06, "loss": 0.295, "num_input_tokens_seen": 88754544, "step": 92955 }, { "epoch": 7.583000244718166, "grad_norm": 0.5819301009178162, "learning_rate": 8.382806720762998e-06, "loss": 0.3456, "num_input_tokens_seen": 88758816, "step": 92960 }, { "epoch": 7.583408108328575, "grad_norm": 122.47692108154297, "learning_rate": 8.380147674239664e-06, "loss": 0.4143, "num_input_tokens_seen": 88763776, "step": 92965 }, { "epoch": 7.583815971938984, "grad_norm": 1.0051389932632446, "learning_rate": 8.377488964593674e-06, "loss": 0.4418, "num_input_tokens_seen": 88768960, "step": 92970 }, { "epoch": 7.584223835549392, "grad_norm": 0.6241465210914612, "learning_rate": 8.37483059187892e-06, "loss": 0.258, "num_input_tokens_seen": 88774112, "step": 92975 }, { "epoch": 7.584631699159801, "grad_norm": 1.7624874114990234, "learning_rate": 8.37217255614928e-06, "loss": 0.3554, "num_input_tokens_seen": 88778608, "step": 92980 }, { "epoch": 7.585039562770209, "grad_norm": 5.813647270202637, "learning_rate": 8.36951485745864e-06, "loss": 0.4665, "num_input_tokens_seen": 88782512, "step": 92985 }, { "epoch": 7.585447426380618, "grad_norm": 1.4366376399993896, "learning_rate": 8.36685749586087e-06, "loss": 0.4097, "num_input_tokens_seen": 88787440, "step": 92990 }, { "epoch": 7.585855289991027, "grad_norm": 1.3579570055007935, "learning_rate": 8.364200471409831e-06, "loss": 0.2879, "num_input_tokens_seen": 88792576, "step": 92995 }, { "epoch": 7.586263153601435, "grad_norm": 8.3670654296875, "learning_rate": 8.361543784159371e-06, "loss": 0.3107, "num_input_tokens_seen": 88797184, "step": 93000 }, { "epoch": 7.586671017211844, "grad_norm": 1.6657665967941284, "learning_rate": 8.358887434163354e-06, "loss": 0.2923, "num_input_tokens_seen": 88802208, "step": 93005 }, { "epoch": 7.587078880822253, "grad_norm": 4.059002876281738, "learning_rate": 8.356231421475618e-06, "loss": 0.3324, "num_input_tokens_seen": 88806720, "step": 93010 }, { "epoch": 7.5874867444326615, "grad_norm": 9.408772468566895, "learning_rate": 8.353575746150003e-06, "loss": 0.2082, "num_input_tokens_seen": 88811280, "step": 93015 }, { "epoch": 7.5878946080430705, "grad_norm": 2.6507556438446045, "learning_rate": 8.35092040824033e-06, "loss": 0.3258, "num_input_tokens_seen": 88815152, "step": 93020 }, { "epoch": 7.5883024716534795, "grad_norm": 4.780832767486572, "learning_rate": 8.348265407800418e-06, "loss": 0.5087, "num_input_tokens_seen": 88819696, "step": 93025 }, { "epoch": 7.588710335263888, "grad_norm": 0.5316338539123535, "learning_rate": 8.3456107448841e-06, "loss": 0.3608, "num_input_tokens_seen": 88824144, "step": 93030 }, { "epoch": 7.589118198874297, "grad_norm": 0.6996116638183594, "learning_rate": 8.342956419545176e-06, "loss": 0.3136, "num_input_tokens_seen": 88828464, "step": 93035 }, { "epoch": 7.589526062484705, "grad_norm": 0.23202461004257202, "learning_rate": 8.340302431837447e-06, "loss": 0.2491, "num_input_tokens_seen": 88833472, "step": 93040 }, { "epoch": 7.589933926095114, "grad_norm": 2.65323805809021, "learning_rate": 8.337648781814698e-06, "loss": 0.3148, "num_input_tokens_seen": 88837744, "step": 93045 }, { "epoch": 7.590341789705523, "grad_norm": 12.146315574645996, "learning_rate": 8.334995469530738e-06, "loss": 0.3236, "num_input_tokens_seen": 88842864, "step": 93050 }, { "epoch": 7.590749653315931, "grad_norm": 0.4418698251247406, "learning_rate": 8.332342495039336e-06, "loss": 0.4061, "num_input_tokens_seen": 88848048, "step": 93055 }, { "epoch": 7.59115751692634, "grad_norm": 0.650581419467926, "learning_rate": 8.32968985839427e-06, "loss": 0.2358, "num_input_tokens_seen": 88853456, "step": 93060 }, { "epoch": 7.591565380536749, "grad_norm": 0.8760775327682495, "learning_rate": 8.32703755964931e-06, "loss": 0.3589, "num_input_tokens_seen": 88858160, "step": 93065 }, { "epoch": 7.591973244147157, "grad_norm": 5.193299293518066, "learning_rate": 8.324385598858203e-06, "loss": 0.2902, "num_input_tokens_seen": 88862992, "step": 93070 }, { "epoch": 7.592381107757566, "grad_norm": 0.8055769205093384, "learning_rate": 8.321733976074724e-06, "loss": 0.3058, "num_input_tokens_seen": 88867360, "step": 93075 }, { "epoch": 7.592788971367975, "grad_norm": 2.617572546005249, "learning_rate": 8.319082691352608e-06, "loss": 0.4067, "num_input_tokens_seen": 88871488, "step": 93080 }, { "epoch": 7.593196834978383, "grad_norm": 0.8326299786567688, "learning_rate": 8.316431744745601e-06, "loss": 0.2126, "num_input_tokens_seen": 88876096, "step": 93085 }, { "epoch": 7.593604698588792, "grad_norm": 24.70396614074707, "learning_rate": 8.31378113630743e-06, "loss": 0.3957, "num_input_tokens_seen": 88880752, "step": 93090 }, { "epoch": 7.5940125621992, "grad_norm": 0.5139403939247131, "learning_rate": 8.311130866091827e-06, "loss": 0.2785, "num_input_tokens_seen": 88885520, "step": 93095 }, { "epoch": 7.594420425809609, "grad_norm": 0.24616432189941406, "learning_rate": 8.308480934152507e-06, "loss": 0.3593, "num_input_tokens_seen": 88889536, "step": 93100 }, { "epoch": 7.594828289420018, "grad_norm": 25.23598289489746, "learning_rate": 8.305831340543189e-06, "loss": 0.3006, "num_input_tokens_seen": 88894736, "step": 93105 }, { "epoch": 7.595236153030426, "grad_norm": 2.317749261856079, "learning_rate": 8.303182085317566e-06, "loss": 0.1999, "num_input_tokens_seen": 88899936, "step": 93110 }, { "epoch": 7.595644016640835, "grad_norm": 0.6012011766433716, "learning_rate": 8.300533168529356e-06, "loss": 0.4346, "num_input_tokens_seen": 88904048, "step": 93115 }, { "epoch": 7.596051880251244, "grad_norm": 1.3422709703445435, "learning_rate": 8.297884590232245e-06, "loss": 0.2827, "num_input_tokens_seen": 88909264, "step": 93120 }, { "epoch": 7.596459743861653, "grad_norm": 0.40670982003211975, "learning_rate": 8.295236350479913e-06, "loss": 0.3553, "num_input_tokens_seen": 88914576, "step": 93125 }, { "epoch": 7.596867607472062, "grad_norm": 1.0509449243545532, "learning_rate": 8.292588449326042e-06, "loss": 0.2615, "num_input_tokens_seen": 88919328, "step": 93130 }, { "epoch": 7.59727547108247, "grad_norm": 2.8983843326568604, "learning_rate": 8.289940886824296e-06, "loss": 0.4778, "num_input_tokens_seen": 88923936, "step": 93135 }, { "epoch": 7.597683334692879, "grad_norm": 3.5902647972106934, "learning_rate": 8.287293663028357e-06, "loss": 0.2456, "num_input_tokens_seen": 88929088, "step": 93140 }, { "epoch": 7.598091198303288, "grad_norm": 1.6066707372665405, "learning_rate": 8.284646777991872e-06, "loss": 0.4104, "num_input_tokens_seen": 88933472, "step": 93145 }, { "epoch": 7.598499061913696, "grad_norm": 0.5288256406784058, "learning_rate": 8.282000231768494e-06, "loss": 0.2773, "num_input_tokens_seen": 88938880, "step": 93150 }, { "epoch": 7.598906925524105, "grad_norm": 1.8081512451171875, "learning_rate": 8.279354024411859e-06, "loss": 0.2303, "num_input_tokens_seen": 88943920, "step": 93155 }, { "epoch": 7.599314789134514, "grad_norm": 19.5687313079834, "learning_rate": 8.276708155975623e-06, "loss": 0.3408, "num_input_tokens_seen": 88948544, "step": 93160 }, { "epoch": 7.599722652744922, "grad_norm": 23.78473472595215, "learning_rate": 8.274062626513407e-06, "loss": 0.3384, "num_input_tokens_seen": 88953632, "step": 93165 }, { "epoch": 7.600130516355331, "grad_norm": 2.3909952640533447, "learning_rate": 8.271417436078833e-06, "loss": 0.4511, "num_input_tokens_seen": 88958304, "step": 93170 }, { "epoch": 7.600538379965739, "grad_norm": 17.352436065673828, "learning_rate": 8.268772584725518e-06, "loss": 0.2319, "num_input_tokens_seen": 88963232, "step": 93175 }, { "epoch": 7.600946243576148, "grad_norm": 9.68797492980957, "learning_rate": 8.266128072507066e-06, "loss": 0.2949, "num_input_tokens_seen": 88968416, "step": 93180 }, { "epoch": 7.601354107186557, "grad_norm": 55.36408996582031, "learning_rate": 8.263483899477095e-06, "loss": 0.5572, "num_input_tokens_seen": 88973600, "step": 93185 }, { "epoch": 7.601761970796965, "grad_norm": 3.34269642829895, "learning_rate": 8.260840065689196e-06, "loss": 0.405, "num_input_tokens_seen": 88977856, "step": 93190 }, { "epoch": 7.602169834407374, "grad_norm": 0.9415244460105896, "learning_rate": 8.258196571196952e-06, "loss": 0.3342, "num_input_tokens_seen": 88983040, "step": 93195 }, { "epoch": 7.602577698017782, "grad_norm": 4.703978538513184, "learning_rate": 8.255553416053946e-06, "loss": 0.3338, "num_input_tokens_seen": 88988320, "step": 93200 }, { "epoch": 7.602985561628191, "grad_norm": 2.9964451789855957, "learning_rate": 8.252910600313762e-06, "loss": 0.2126, "num_input_tokens_seen": 88992880, "step": 93205 }, { "epoch": 7.6033934252386, "grad_norm": 7.627467155456543, "learning_rate": 8.250268124029964e-06, "loss": 0.2469, "num_input_tokens_seen": 88996960, "step": 93210 }, { "epoch": 7.6038012888490085, "grad_norm": 0.6392951011657715, "learning_rate": 8.247625987256113e-06, "loss": 0.421, "num_input_tokens_seen": 89001968, "step": 93215 }, { "epoch": 7.6042091524594175, "grad_norm": 26.702054977416992, "learning_rate": 8.244984190045768e-06, "loss": 0.3437, "num_input_tokens_seen": 89006016, "step": 93220 }, { "epoch": 7.6046170160698265, "grad_norm": 0.27128469944000244, "learning_rate": 8.242342732452472e-06, "loss": 0.4434, "num_input_tokens_seen": 89011280, "step": 93225 }, { "epoch": 7.605024879680235, "grad_norm": 2.949909210205078, "learning_rate": 8.239701614529768e-06, "loss": 0.3337, "num_input_tokens_seen": 89016304, "step": 93230 }, { "epoch": 7.605432743290644, "grad_norm": 0.6244311332702637, "learning_rate": 8.237060836331189e-06, "loss": 0.4577, "num_input_tokens_seen": 89021664, "step": 93235 }, { "epoch": 7.605840606901053, "grad_norm": 0.770576536655426, "learning_rate": 8.234420397910266e-06, "loss": 0.3016, "num_input_tokens_seen": 89025904, "step": 93240 }, { "epoch": 7.606248470511461, "grad_norm": 0.516739547252655, "learning_rate": 8.231780299320507e-06, "loss": 0.3074, "num_input_tokens_seen": 89030656, "step": 93245 }, { "epoch": 7.60665633412187, "grad_norm": 0.392844021320343, "learning_rate": 8.229140540615446e-06, "loss": 0.2976, "num_input_tokens_seen": 89034720, "step": 93250 }, { "epoch": 7.607064197732278, "grad_norm": 2.5937721729278564, "learning_rate": 8.226501121848582e-06, "loss": 0.2891, "num_input_tokens_seen": 89039472, "step": 93255 }, { "epoch": 7.607472061342687, "grad_norm": 63.32010269165039, "learning_rate": 8.22386204307341e-06, "loss": 0.3937, "num_input_tokens_seen": 89044400, "step": 93260 }, { "epoch": 7.607879924953096, "grad_norm": 7.460061073303223, "learning_rate": 8.221223304343417e-06, "loss": 0.3367, "num_input_tokens_seen": 89049920, "step": 93265 }, { "epoch": 7.608287788563504, "grad_norm": 61.06814193725586, "learning_rate": 8.218584905712108e-06, "loss": 0.3959, "num_input_tokens_seen": 89054256, "step": 93270 }, { "epoch": 7.608695652173913, "grad_norm": 0.5765277147293091, "learning_rate": 8.21594684723295e-06, "loss": 0.3199, "num_input_tokens_seen": 89058384, "step": 93275 }, { "epoch": 7.609103515784322, "grad_norm": 13.862459182739258, "learning_rate": 8.213309128959417e-06, "loss": 0.4541, "num_input_tokens_seen": 89063216, "step": 93280 }, { "epoch": 7.60951137939473, "grad_norm": 1.165515661239624, "learning_rate": 8.210671750944979e-06, "loss": 0.2332, "num_input_tokens_seen": 89069104, "step": 93285 }, { "epoch": 7.609919243005139, "grad_norm": 2.4108314514160156, "learning_rate": 8.20803471324308e-06, "loss": 0.3166, "num_input_tokens_seen": 89074352, "step": 93290 }, { "epoch": 7.610327106615548, "grad_norm": 0.6488144993782043, "learning_rate": 8.20539801590719e-06, "loss": 0.3589, "num_input_tokens_seen": 89079616, "step": 93295 }, { "epoch": 7.610734970225956, "grad_norm": 0.8563317656517029, "learning_rate": 8.202761658990742e-06, "loss": 0.4801, "num_input_tokens_seen": 89084288, "step": 93300 }, { "epoch": 7.611142833836365, "grad_norm": 0.5266008377075195, "learning_rate": 8.200125642547183e-06, "loss": 0.3241, "num_input_tokens_seen": 89088880, "step": 93305 }, { "epoch": 7.6115506974467735, "grad_norm": 2.0147271156311035, "learning_rate": 8.19748996662993e-06, "loss": 0.2802, "num_input_tokens_seen": 89092576, "step": 93310 }, { "epoch": 7.6119585610571825, "grad_norm": 24.145341873168945, "learning_rate": 8.19485463129242e-06, "loss": 0.4248, "num_input_tokens_seen": 89097376, "step": 93315 }, { "epoch": 7.6123664246675915, "grad_norm": 2.194607973098755, "learning_rate": 8.19221963658807e-06, "loss": 0.3164, "num_input_tokens_seen": 89102320, "step": 93320 }, { "epoch": 7.612774288278, "grad_norm": 1.4707719087600708, "learning_rate": 8.189584982570284e-06, "loss": 0.3969, "num_input_tokens_seen": 89107408, "step": 93325 }, { "epoch": 7.613182151888409, "grad_norm": 0.5076121687889099, "learning_rate": 8.186950669292465e-06, "loss": 0.5638, "num_input_tokens_seen": 89111568, "step": 93330 }, { "epoch": 7.613590015498817, "grad_norm": 0.6642536520957947, "learning_rate": 8.184316696808006e-06, "loss": 0.257, "num_input_tokens_seen": 89116592, "step": 93335 }, { "epoch": 7.613997879109226, "grad_norm": 24.841094970703125, "learning_rate": 8.181683065170309e-06, "loss": 0.4672, "num_input_tokens_seen": 89121024, "step": 93340 }, { "epoch": 7.614405742719635, "grad_norm": 0.5202680826187134, "learning_rate": 8.17904977443275e-06, "loss": 0.4442, "num_input_tokens_seen": 89125408, "step": 93345 }, { "epoch": 7.614813606330043, "grad_norm": 0.9817371368408203, "learning_rate": 8.176416824648706e-06, "loss": 0.3151, "num_input_tokens_seen": 89130208, "step": 93350 }, { "epoch": 7.615221469940452, "grad_norm": 1.4105629920959473, "learning_rate": 8.173784215871543e-06, "loss": 0.3323, "num_input_tokens_seen": 89134704, "step": 93355 }, { "epoch": 7.615629333550861, "grad_norm": 9.26734733581543, "learning_rate": 8.171151948154625e-06, "loss": 0.6481, "num_input_tokens_seen": 89140464, "step": 93360 }, { "epoch": 7.616037197161269, "grad_norm": 5.424785137176514, "learning_rate": 8.168520021551304e-06, "loss": 0.3256, "num_input_tokens_seen": 89145712, "step": 93365 }, { "epoch": 7.616445060771678, "grad_norm": 0.8095365762710571, "learning_rate": 8.165888436114933e-06, "loss": 0.3224, "num_input_tokens_seen": 89150336, "step": 93370 }, { "epoch": 7.616852924382087, "grad_norm": 2.1794610023498535, "learning_rate": 8.163257191898849e-06, "loss": 0.2944, "num_input_tokens_seen": 89154768, "step": 93375 }, { "epoch": 7.617260787992495, "grad_norm": 5.997701644897461, "learning_rate": 8.160626288956378e-06, "loss": 0.3835, "num_input_tokens_seen": 89159952, "step": 93380 }, { "epoch": 7.617668651602904, "grad_norm": 0.8975918889045715, "learning_rate": 8.157995727340865e-06, "loss": 0.3156, "num_input_tokens_seen": 89164688, "step": 93385 }, { "epoch": 7.618076515213312, "grad_norm": 41.2396240234375, "learning_rate": 8.155365507105623e-06, "loss": 0.2571, "num_input_tokens_seen": 89169152, "step": 93390 }, { "epoch": 7.618484378823721, "grad_norm": 0.7677475810050964, "learning_rate": 8.152735628303965e-06, "loss": 0.3977, "num_input_tokens_seen": 89173488, "step": 93395 }, { "epoch": 7.61889224243413, "grad_norm": 0.5355474352836609, "learning_rate": 8.150106090989185e-06, "loss": 0.5289, "num_input_tokens_seen": 89178032, "step": 93400 }, { "epoch": 7.6193001060445384, "grad_norm": 44.618858337402344, "learning_rate": 8.147476895214606e-06, "loss": 0.3098, "num_input_tokens_seen": 89182032, "step": 93405 }, { "epoch": 7.6197079696549475, "grad_norm": 0.6222739219665527, "learning_rate": 8.14484804103351e-06, "loss": 0.305, "num_input_tokens_seen": 89186128, "step": 93410 }, { "epoch": 7.620115833265356, "grad_norm": 1.9703476428985596, "learning_rate": 8.142219528499179e-06, "loss": 0.2966, "num_input_tokens_seen": 89190720, "step": 93415 }, { "epoch": 7.620523696875765, "grad_norm": 5.668281555175781, "learning_rate": 8.13959135766489e-06, "loss": 0.249, "num_input_tokens_seen": 89195216, "step": 93420 }, { "epoch": 7.620931560486174, "grad_norm": 0.6112138032913208, "learning_rate": 8.136963528583927e-06, "loss": 0.2532, "num_input_tokens_seen": 89200768, "step": 93425 }, { "epoch": 7.621339424096582, "grad_norm": 11.164172172546387, "learning_rate": 8.134336041309546e-06, "loss": 0.5679, "num_input_tokens_seen": 89205712, "step": 93430 }, { "epoch": 7.621747287706991, "grad_norm": 0.3236723244190216, "learning_rate": 8.131708895895007e-06, "loss": 0.4209, "num_input_tokens_seen": 89210256, "step": 93435 }, { "epoch": 7.6221551513174, "grad_norm": 10.057259559631348, "learning_rate": 8.129082092393562e-06, "loss": 0.3603, "num_input_tokens_seen": 89215440, "step": 93440 }, { "epoch": 7.622563014927808, "grad_norm": 1.9370293617248535, "learning_rate": 8.126455630858446e-06, "loss": 0.2522, "num_input_tokens_seen": 89220368, "step": 93445 }, { "epoch": 7.622970878538217, "grad_norm": 5.7650465965271, "learning_rate": 8.123829511342912e-06, "loss": 0.2924, "num_input_tokens_seen": 89223968, "step": 93450 }, { "epoch": 7.623378742148626, "grad_norm": 1.9837088584899902, "learning_rate": 8.121203733900182e-06, "loss": 0.3963, "num_input_tokens_seen": 89228272, "step": 93455 }, { "epoch": 7.623786605759034, "grad_norm": 101.18978118896484, "learning_rate": 8.118578298583482e-06, "loss": 0.4674, "num_input_tokens_seen": 89232528, "step": 93460 }, { "epoch": 7.624194469369443, "grad_norm": 2.5893092155456543, "learning_rate": 8.115953205446016e-06, "loss": 0.4332, "num_input_tokens_seen": 89237392, "step": 93465 }, { "epoch": 7.624602332979851, "grad_norm": 1.602663278579712, "learning_rate": 8.113328454541014e-06, "loss": 0.3707, "num_input_tokens_seen": 89243184, "step": 93470 }, { "epoch": 7.62501019659026, "grad_norm": 18.97483253479004, "learning_rate": 8.110704045921669e-06, "loss": 0.6193, "num_input_tokens_seen": 89248560, "step": 93475 }, { "epoch": 7.625418060200669, "grad_norm": 0.40525883436203003, "learning_rate": 8.108079979641175e-06, "loss": 0.2662, "num_input_tokens_seen": 89253264, "step": 93480 }, { "epoch": 7.625825923811077, "grad_norm": 1.477419376373291, "learning_rate": 8.105456255752722e-06, "loss": 0.3811, "num_input_tokens_seen": 89258944, "step": 93485 }, { "epoch": 7.626233787421486, "grad_norm": 2.866806745529175, "learning_rate": 8.102832874309494e-06, "loss": 0.2995, "num_input_tokens_seen": 89263648, "step": 93490 }, { "epoch": 7.626641651031895, "grad_norm": 0.7717509865760803, "learning_rate": 8.10020983536466e-06, "loss": 0.278, "num_input_tokens_seen": 89267456, "step": 93495 }, { "epoch": 7.627049514642303, "grad_norm": 2.0215580463409424, "learning_rate": 8.097587138971394e-06, "loss": 0.1792, "num_input_tokens_seen": 89271680, "step": 93500 }, { "epoch": 7.627457378252712, "grad_norm": 0.4011470675468445, "learning_rate": 8.094964785182855e-06, "loss": 0.2976, "num_input_tokens_seen": 89276240, "step": 93505 }, { "epoch": 7.627865241863121, "grad_norm": 36.525882720947266, "learning_rate": 8.092342774052186e-06, "loss": 0.3441, "num_input_tokens_seen": 89281936, "step": 93510 }, { "epoch": 7.6282731054735295, "grad_norm": 9.008395195007324, "learning_rate": 8.089721105632556e-06, "loss": 0.3038, "num_input_tokens_seen": 89286464, "step": 93515 }, { "epoch": 7.628680969083939, "grad_norm": 0.3370642066001892, "learning_rate": 8.087099779977089e-06, "loss": 0.3211, "num_input_tokens_seen": 89291408, "step": 93520 }, { "epoch": 7.629088832694347, "grad_norm": 1.3831377029418945, "learning_rate": 8.084478797138926e-06, "loss": 0.3693, "num_input_tokens_seen": 89295344, "step": 93525 }, { "epoch": 7.629496696304756, "grad_norm": 1.5254113674163818, "learning_rate": 8.081858157171188e-06, "loss": 0.2469, "num_input_tokens_seen": 89299376, "step": 93530 }, { "epoch": 7.629904559915165, "grad_norm": 0.5485551357269287, "learning_rate": 8.079237860126989e-06, "loss": 0.3299, "num_input_tokens_seen": 89304208, "step": 93535 }, { "epoch": 7.630312423525573, "grad_norm": 0.5943014621734619, "learning_rate": 8.076617906059458e-06, "loss": 0.2906, "num_input_tokens_seen": 89307936, "step": 93540 }, { "epoch": 7.630720287135982, "grad_norm": 0.6079673171043396, "learning_rate": 8.07399829502169e-06, "loss": 0.3216, "num_input_tokens_seen": 89312432, "step": 93545 }, { "epoch": 7.63112815074639, "grad_norm": 1.207467794418335, "learning_rate": 8.071379027066783e-06, "loss": 0.3878, "num_input_tokens_seen": 89317584, "step": 93550 }, { "epoch": 7.631536014356799, "grad_norm": 0.48142439126968384, "learning_rate": 8.06876010224782e-06, "loss": 0.2595, "num_input_tokens_seen": 89322880, "step": 93555 }, { "epoch": 7.631943877967208, "grad_norm": 1.5846877098083496, "learning_rate": 8.066141520617906e-06, "loss": 0.4926, "num_input_tokens_seen": 89328592, "step": 93560 }, { "epoch": 7.632351741577616, "grad_norm": 30.417583465576172, "learning_rate": 8.063523282230109e-06, "loss": 0.4896, "num_input_tokens_seen": 89333904, "step": 93565 }, { "epoch": 7.632759605188025, "grad_norm": 1.0006706714630127, "learning_rate": 8.060905387137496e-06, "loss": 0.3785, "num_input_tokens_seen": 89338528, "step": 93570 }, { "epoch": 7.633167468798434, "grad_norm": 0.7478241920471191, "learning_rate": 8.058287835393132e-06, "loss": 0.2678, "num_input_tokens_seen": 89342800, "step": 93575 }, { "epoch": 7.633575332408842, "grad_norm": 0.9706024527549744, "learning_rate": 8.05567062705007e-06, "loss": 0.31, "num_input_tokens_seen": 89347872, "step": 93580 }, { "epoch": 7.633983196019251, "grad_norm": 1.4051251411437988, "learning_rate": 8.053053762161371e-06, "loss": 0.2968, "num_input_tokens_seen": 89352544, "step": 93585 }, { "epoch": 7.63439105962966, "grad_norm": 0.5527690649032593, "learning_rate": 8.05043724078007e-06, "loss": 0.3853, "num_input_tokens_seen": 89358096, "step": 93590 }, { "epoch": 7.634798923240068, "grad_norm": 0.2903071343898773, "learning_rate": 8.047821062959204e-06, "loss": 0.285, "num_input_tokens_seen": 89363520, "step": 93595 }, { "epoch": 7.635206786850477, "grad_norm": 2.336498498916626, "learning_rate": 8.045205228751796e-06, "loss": 0.3006, "num_input_tokens_seen": 89368880, "step": 93600 }, { "epoch": 7.6356146504608855, "grad_norm": 1.1187366247177124, "learning_rate": 8.042589738210879e-06, "loss": 0.3076, "num_input_tokens_seen": 89373648, "step": 93605 }, { "epoch": 7.6360225140712945, "grad_norm": 2.481154441833496, "learning_rate": 8.039974591389462e-06, "loss": 0.2643, "num_input_tokens_seen": 89377616, "step": 93610 }, { "epoch": 7.6364303776817035, "grad_norm": 1.3337169885635376, "learning_rate": 8.037359788340554e-06, "loss": 0.301, "num_input_tokens_seen": 89382928, "step": 93615 }, { "epoch": 7.636838241292112, "grad_norm": 14.93911075592041, "learning_rate": 8.034745329117158e-06, "loss": 0.7471, "num_input_tokens_seen": 89387888, "step": 93620 }, { "epoch": 7.637246104902521, "grad_norm": 104.21674346923828, "learning_rate": 8.032131213772261e-06, "loss": 0.3731, "num_input_tokens_seen": 89392160, "step": 93625 }, { "epoch": 7.63765396851293, "grad_norm": 1.2299396991729736, "learning_rate": 8.029517442358855e-06, "loss": 0.4514, "num_input_tokens_seen": 89396976, "step": 93630 }, { "epoch": 7.638061832123338, "grad_norm": 0.9222869873046875, "learning_rate": 8.02690401492992e-06, "loss": 0.3471, "num_input_tokens_seen": 89401728, "step": 93635 }, { "epoch": 7.638469695733747, "grad_norm": 1.3847696781158447, "learning_rate": 8.024290931538422e-06, "loss": 0.3698, "num_input_tokens_seen": 89406800, "step": 93640 }, { "epoch": 7.638877559344156, "grad_norm": 1.0793579816818237, "learning_rate": 8.021678192237339e-06, "loss": 0.1887, "num_input_tokens_seen": 89412096, "step": 93645 }, { "epoch": 7.639285422954564, "grad_norm": 0.9512411952018738, "learning_rate": 8.019065797079625e-06, "loss": 0.3255, "num_input_tokens_seen": 89416368, "step": 93650 }, { "epoch": 7.639693286564973, "grad_norm": 0.3024277091026306, "learning_rate": 8.016453746118233e-06, "loss": 0.3523, "num_input_tokens_seen": 89421376, "step": 93655 }, { "epoch": 7.640101150175381, "grad_norm": 0.8978280425071716, "learning_rate": 8.013842039406105e-06, "loss": 0.2618, "num_input_tokens_seen": 89425488, "step": 93660 }, { "epoch": 7.64050901378579, "grad_norm": 0.3287277817726135, "learning_rate": 8.011230676996173e-06, "loss": 0.4004, "num_input_tokens_seen": 89429504, "step": 93665 }, { "epoch": 7.640916877396199, "grad_norm": 25.690853118896484, "learning_rate": 8.008619658941385e-06, "loss": 0.4755, "num_input_tokens_seen": 89434816, "step": 93670 }, { "epoch": 7.641324741006607, "grad_norm": 2.6786317825317383, "learning_rate": 8.006008985294658e-06, "loss": 0.3593, "num_input_tokens_seen": 89439136, "step": 93675 }, { "epoch": 7.641732604617016, "grad_norm": 2.409085273742676, "learning_rate": 8.003398656108907e-06, "loss": 0.2366, "num_input_tokens_seen": 89444032, "step": 93680 }, { "epoch": 7.642140468227424, "grad_norm": 0.4872899055480957, "learning_rate": 8.00078867143704e-06, "loss": 0.2705, "num_input_tokens_seen": 89447824, "step": 93685 }, { "epoch": 7.642548331837833, "grad_norm": 7.7845234870910645, "learning_rate": 7.998179031331959e-06, "loss": 0.2559, "num_input_tokens_seen": 89452288, "step": 93690 }, { "epoch": 7.642956195448242, "grad_norm": 111.09580993652344, "learning_rate": 7.99556973584657e-06, "loss": 0.5394, "num_input_tokens_seen": 89456528, "step": 93695 }, { "epoch": 7.6433640590586505, "grad_norm": 1.1440424919128418, "learning_rate": 7.99296078503376e-06, "loss": 0.277, "num_input_tokens_seen": 89461808, "step": 93700 }, { "epoch": 7.6437719226690595, "grad_norm": 12.924115180969238, "learning_rate": 7.990352178946402e-06, "loss": 0.5573, "num_input_tokens_seen": 89467072, "step": 93705 }, { "epoch": 7.6441797862794685, "grad_norm": 0.3455617129802704, "learning_rate": 7.987743917637374e-06, "loss": 0.3809, "num_input_tokens_seen": 89471360, "step": 93710 }, { "epoch": 7.644587649889877, "grad_norm": 13.600703239440918, "learning_rate": 7.985136001159557e-06, "loss": 0.285, "num_input_tokens_seen": 89476592, "step": 93715 }, { "epoch": 7.644995513500286, "grad_norm": 1.4873026609420776, "learning_rate": 7.9825284295658e-06, "loss": 0.3129, "num_input_tokens_seen": 89482448, "step": 93720 }, { "epoch": 7.645403377110695, "grad_norm": 0.8492153882980347, "learning_rate": 7.979921202908964e-06, "loss": 0.4012, "num_input_tokens_seen": 89487392, "step": 93725 }, { "epoch": 7.645811240721103, "grad_norm": 7.9496026039123535, "learning_rate": 7.977314321241891e-06, "loss": 0.3242, "num_input_tokens_seen": 89491952, "step": 93730 }, { "epoch": 7.646219104331512, "grad_norm": 0.4935292899608612, "learning_rate": 7.974707784617413e-06, "loss": 0.2744, "num_input_tokens_seen": 89496592, "step": 93735 }, { "epoch": 7.64662696794192, "grad_norm": 37.76596450805664, "learning_rate": 7.972101593088386e-06, "loss": 0.2323, "num_input_tokens_seen": 89501024, "step": 93740 }, { "epoch": 7.647034831552329, "grad_norm": 0.3718458414077759, "learning_rate": 7.969495746707622e-06, "loss": 0.2868, "num_input_tokens_seen": 89505936, "step": 93745 }, { "epoch": 7.647442695162738, "grad_norm": 9.896434783935547, "learning_rate": 7.966890245527942e-06, "loss": 0.5135, "num_input_tokens_seen": 89511104, "step": 93750 }, { "epoch": 7.647850558773146, "grad_norm": 0.6477553248405457, "learning_rate": 7.96428508960216e-06, "loss": 0.3328, "num_input_tokens_seen": 89515504, "step": 93755 }, { "epoch": 7.648258422383555, "grad_norm": 13.242713928222656, "learning_rate": 7.961680278983072e-06, "loss": 0.333, "num_input_tokens_seen": 89518912, "step": 93760 }, { "epoch": 7.648666285993963, "grad_norm": 0.9225502014160156, "learning_rate": 7.95907581372349e-06, "loss": 0.4327, "num_input_tokens_seen": 89523872, "step": 93765 }, { "epoch": 7.649074149604372, "grad_norm": 75.09174346923828, "learning_rate": 7.956471693876205e-06, "loss": 0.3947, "num_input_tokens_seen": 89529088, "step": 93770 }, { "epoch": 7.649482013214781, "grad_norm": 23.97209358215332, "learning_rate": 7.953867919493993e-06, "loss": 0.3896, "num_input_tokens_seen": 89533776, "step": 93775 }, { "epoch": 7.649889876825189, "grad_norm": 0.5940617918968201, "learning_rate": 7.951264490629636e-06, "loss": 0.3737, "num_input_tokens_seen": 89538544, "step": 93780 }, { "epoch": 7.650297740435598, "grad_norm": 24.96586799621582, "learning_rate": 7.948661407335905e-06, "loss": 0.2424, "num_input_tokens_seen": 89543344, "step": 93785 }, { "epoch": 7.650705604046007, "grad_norm": 30.21624755859375, "learning_rate": 7.94605866966556e-06, "loss": 0.6109, "num_input_tokens_seen": 89547536, "step": 93790 }, { "epoch": 7.651113467656415, "grad_norm": 1.7192037105560303, "learning_rate": 7.94345627767136e-06, "loss": 0.4332, "num_input_tokens_seen": 89552704, "step": 93795 }, { "epoch": 7.651521331266824, "grad_norm": 1.5156350135803223, "learning_rate": 7.940854231406045e-06, "loss": 0.4949, "num_input_tokens_seen": 89557504, "step": 93800 }, { "epoch": 7.651929194877233, "grad_norm": 5.524563312530518, "learning_rate": 7.938252530922371e-06, "loss": 0.3396, "num_input_tokens_seen": 89561152, "step": 93805 }, { "epoch": 7.652337058487642, "grad_norm": 0.459049254655838, "learning_rate": 7.935651176273074e-06, "loss": 0.3225, "num_input_tokens_seen": 89565648, "step": 93810 }, { "epoch": 7.652744922098051, "grad_norm": 55.25678253173828, "learning_rate": 7.933050167510872e-06, "loss": 0.2779, "num_input_tokens_seen": 89570608, "step": 93815 }, { "epoch": 7.653152785708459, "grad_norm": 0.8663849234580994, "learning_rate": 7.930449504688483e-06, "loss": 0.4665, "num_input_tokens_seen": 89575248, "step": 93820 }, { "epoch": 7.653560649318868, "grad_norm": 38.364784240722656, "learning_rate": 7.927849187858641e-06, "loss": 0.4662, "num_input_tokens_seen": 89580352, "step": 93825 }, { "epoch": 7.653968512929277, "grad_norm": 0.6102341413497925, "learning_rate": 7.925249217074038e-06, "loss": 0.205, "num_input_tokens_seen": 89584720, "step": 93830 }, { "epoch": 7.654376376539685, "grad_norm": 31.641401290893555, "learning_rate": 7.92264959238738e-06, "loss": 0.2605, "num_input_tokens_seen": 89589408, "step": 93835 }, { "epoch": 7.654784240150094, "grad_norm": 1.2203458547592163, "learning_rate": 7.920050313851357e-06, "loss": 0.2, "num_input_tokens_seen": 89594160, "step": 93840 }, { "epoch": 7.655192103760503, "grad_norm": 0.5225489139556885, "learning_rate": 7.917451381518648e-06, "loss": 0.4535, "num_input_tokens_seen": 89598480, "step": 93845 }, { "epoch": 7.655599967370911, "grad_norm": 1.7858076095581055, "learning_rate": 7.914852795441949e-06, "loss": 0.3213, "num_input_tokens_seen": 89602880, "step": 93850 }, { "epoch": 7.65600783098132, "grad_norm": 59.86102294921875, "learning_rate": 7.912254555673927e-06, "loss": 0.3517, "num_input_tokens_seen": 89608208, "step": 93855 }, { "epoch": 7.656415694591729, "grad_norm": 0.7929157018661499, "learning_rate": 7.90965666226724e-06, "loss": 0.4053, "num_input_tokens_seen": 89612896, "step": 93860 }, { "epoch": 7.656823558202137, "grad_norm": 34.642372131347656, "learning_rate": 7.907059115274542e-06, "loss": 0.349, "num_input_tokens_seen": 89617504, "step": 93865 }, { "epoch": 7.657231421812546, "grad_norm": 74.97467803955078, "learning_rate": 7.904461914748504e-06, "loss": 0.4819, "num_input_tokens_seen": 89622688, "step": 93870 }, { "epoch": 7.657639285422954, "grad_norm": 43.38051986694336, "learning_rate": 7.901865060741756e-06, "loss": 0.3263, "num_input_tokens_seen": 89628048, "step": 93875 }, { "epoch": 7.658047149033363, "grad_norm": 12.748576164245605, "learning_rate": 7.899268553306937e-06, "loss": 0.2302, "num_input_tokens_seen": 89632832, "step": 93880 }, { "epoch": 7.658455012643772, "grad_norm": 45.89181900024414, "learning_rate": 7.896672392496682e-06, "loss": 0.3185, "num_input_tokens_seen": 89637648, "step": 93885 }, { "epoch": 7.65886287625418, "grad_norm": 0.43749451637268066, "learning_rate": 7.894076578363599e-06, "loss": 0.3345, "num_input_tokens_seen": 89642464, "step": 93890 }, { "epoch": 7.659270739864589, "grad_norm": 7.10611629486084, "learning_rate": 7.891481110960322e-06, "loss": 0.3149, "num_input_tokens_seen": 89646880, "step": 93895 }, { "epoch": 7.6596786034749975, "grad_norm": 0.428072452545166, "learning_rate": 7.888885990339457e-06, "loss": 0.426, "num_input_tokens_seen": 89651568, "step": 93900 }, { "epoch": 7.6600864670854065, "grad_norm": 1.1129039525985718, "learning_rate": 7.886291216553598e-06, "loss": 0.43, "num_input_tokens_seen": 89656704, "step": 93905 }, { "epoch": 7.6604943306958155, "grad_norm": 1.1852431297302246, "learning_rate": 7.883696789655345e-06, "loss": 0.3012, "num_input_tokens_seen": 89662032, "step": 93910 }, { "epoch": 7.660902194306224, "grad_norm": 4.5379815101623535, "learning_rate": 7.881102709697285e-06, "loss": 0.3589, "num_input_tokens_seen": 89666896, "step": 93915 }, { "epoch": 7.661310057916633, "grad_norm": 5.80983304977417, "learning_rate": 7.878508976731999e-06, "loss": 0.3769, "num_input_tokens_seen": 89671600, "step": 93920 }, { "epoch": 7.661717921527042, "grad_norm": 0.6133542060852051, "learning_rate": 7.875915590812063e-06, "loss": 0.4784, "num_input_tokens_seen": 89676176, "step": 93925 }, { "epoch": 7.66212578513745, "grad_norm": 23.534282684326172, "learning_rate": 7.87332255199004e-06, "loss": 0.2236, "num_input_tokens_seen": 89680656, "step": 93930 }, { "epoch": 7.662533648747859, "grad_norm": 2.944855213165283, "learning_rate": 7.870729860318481e-06, "loss": 0.3293, "num_input_tokens_seen": 89685776, "step": 93935 }, { "epoch": 7.662941512358268, "grad_norm": 11.033573150634766, "learning_rate": 7.868137515849961e-06, "loss": 0.2991, "num_input_tokens_seen": 89690352, "step": 93940 }, { "epoch": 7.663349375968676, "grad_norm": 0.9958063364028931, "learning_rate": 7.86554551863701e-06, "loss": 0.3056, "num_input_tokens_seen": 89694816, "step": 93945 }, { "epoch": 7.663757239579085, "grad_norm": 1.1511263847351074, "learning_rate": 7.862953868732173e-06, "loss": 0.4694, "num_input_tokens_seen": 89698880, "step": 93950 }, { "epoch": 7.664165103189493, "grad_norm": 7.805654048919678, "learning_rate": 7.86036256618797e-06, "loss": 0.4545, "num_input_tokens_seen": 89704400, "step": 93955 }, { "epoch": 7.664572966799902, "grad_norm": 1.1559083461761475, "learning_rate": 7.857771611056941e-06, "loss": 0.4071, "num_input_tokens_seen": 89709136, "step": 93960 }, { "epoch": 7.664980830410311, "grad_norm": 5.7521772384643555, "learning_rate": 7.855181003391602e-06, "loss": 0.4302, "num_input_tokens_seen": 89713664, "step": 93965 }, { "epoch": 7.665388694020719, "grad_norm": 3.553612470626831, "learning_rate": 7.852590743244453e-06, "loss": 0.4059, "num_input_tokens_seen": 89718192, "step": 93970 }, { "epoch": 7.665796557631128, "grad_norm": 0.35696494579315186, "learning_rate": 7.850000830668e-06, "loss": 0.4137, "num_input_tokens_seen": 89722992, "step": 93975 }, { "epoch": 7.666204421241536, "grad_norm": 0.9878347516059875, "learning_rate": 7.84741126571475e-06, "loss": 0.5635, "num_input_tokens_seen": 89727264, "step": 93980 }, { "epoch": 7.666612284851945, "grad_norm": 18.195720672607422, "learning_rate": 7.844822048437184e-06, "loss": 0.3648, "num_input_tokens_seen": 89732336, "step": 93985 }, { "epoch": 7.667020148462354, "grad_norm": 2.0562918186187744, "learning_rate": 7.842233178887784e-06, "loss": 0.2963, "num_input_tokens_seen": 89736592, "step": 93990 }, { "epoch": 7.667428012072763, "grad_norm": 0.7221970558166504, "learning_rate": 7.839644657119027e-06, "loss": 0.3483, "num_input_tokens_seen": 89741504, "step": 93995 }, { "epoch": 7.6678358756831715, "grad_norm": 0.7755146026611328, "learning_rate": 7.837056483183375e-06, "loss": 0.3268, "num_input_tokens_seen": 89746448, "step": 94000 }, { "epoch": 7.6682437392935805, "grad_norm": 1.3409173488616943, "learning_rate": 7.8344686571333e-06, "loss": 0.2014, "num_input_tokens_seen": 89751408, "step": 94005 }, { "epoch": 7.668651602903989, "grad_norm": 1.3739570379257202, "learning_rate": 7.831881179021255e-06, "loss": 0.3711, "num_input_tokens_seen": 89755456, "step": 94010 }, { "epoch": 7.669059466514398, "grad_norm": 0.26003560423851013, "learning_rate": 7.829294048899683e-06, "loss": 0.392, "num_input_tokens_seen": 89760544, "step": 94015 }, { "epoch": 7.669467330124807, "grad_norm": 13.866982460021973, "learning_rate": 7.826707266821014e-06, "loss": 0.3445, "num_input_tokens_seen": 89765072, "step": 94020 }, { "epoch": 7.669875193735215, "grad_norm": 0.522049605846405, "learning_rate": 7.8241208328377e-06, "loss": 0.2519, "num_input_tokens_seen": 89769728, "step": 94025 }, { "epoch": 7.670283057345624, "grad_norm": 2.642965316772461, "learning_rate": 7.82153474700216e-06, "loss": 0.3743, "num_input_tokens_seen": 89774160, "step": 94030 }, { "epoch": 7.670690920956032, "grad_norm": 1.6248142719268799, "learning_rate": 7.818949009366808e-06, "loss": 0.5162, "num_input_tokens_seen": 89777968, "step": 94035 }, { "epoch": 7.671098784566441, "grad_norm": 9.717617988586426, "learning_rate": 7.81636361998406e-06, "loss": 0.2709, "num_input_tokens_seen": 89783040, "step": 94040 }, { "epoch": 7.67150664817685, "grad_norm": 30.01380729675293, "learning_rate": 7.813778578906322e-06, "loss": 0.4893, "num_input_tokens_seen": 89787760, "step": 94045 }, { "epoch": 7.671914511787258, "grad_norm": 78.84073638916016, "learning_rate": 7.811193886185988e-06, "loss": 0.4534, "num_input_tokens_seen": 89792544, "step": 94050 }, { "epoch": 7.672322375397667, "grad_norm": 86.84211730957031, "learning_rate": 7.80860954187545e-06, "loss": 0.6223, "num_input_tokens_seen": 89796912, "step": 94055 }, { "epoch": 7.672730239008076, "grad_norm": 0.8732646703720093, "learning_rate": 7.80602554602709e-06, "loss": 0.2435, "num_input_tokens_seen": 89802560, "step": 94060 }, { "epoch": 7.673138102618484, "grad_norm": 0.5353595018386841, "learning_rate": 7.803441898693278e-06, "loss": 0.3974, "num_input_tokens_seen": 89807872, "step": 94065 }, { "epoch": 7.673545966228893, "grad_norm": 1.7016384601593018, "learning_rate": 7.800858599926403e-06, "loss": 0.2524, "num_input_tokens_seen": 89812752, "step": 94070 }, { "epoch": 7.673953829839302, "grad_norm": 5.557322025299072, "learning_rate": 7.79827564977881e-06, "loss": 0.2841, "num_input_tokens_seen": 89818128, "step": 94075 }, { "epoch": 7.67436169344971, "grad_norm": 0.3114902079105377, "learning_rate": 7.795693048302865e-06, "loss": 0.3242, "num_input_tokens_seen": 89822480, "step": 94080 }, { "epoch": 7.674769557060119, "grad_norm": 22.66667366027832, "learning_rate": 7.793110795550909e-06, "loss": 0.2784, "num_input_tokens_seen": 89827248, "step": 94085 }, { "epoch": 7.675177420670527, "grad_norm": 1.3657665252685547, "learning_rate": 7.79052889157528e-06, "loss": 0.3529, "num_input_tokens_seen": 89832192, "step": 94090 }, { "epoch": 7.6755852842809364, "grad_norm": 1.3853765726089478, "learning_rate": 7.787947336428324e-06, "loss": 0.3792, "num_input_tokens_seen": 89836912, "step": 94095 }, { "epoch": 7.6759931478913455, "grad_norm": 1.185613989830017, "learning_rate": 7.78536613016236e-06, "loss": 0.4048, "num_input_tokens_seen": 89842576, "step": 94100 }, { "epoch": 7.676401011501754, "grad_norm": 0.5518466830253601, "learning_rate": 7.782785272829712e-06, "loss": 0.2703, "num_input_tokens_seen": 89847824, "step": 94105 }, { "epoch": 7.676808875112163, "grad_norm": 13.266157150268555, "learning_rate": 7.780204764482681e-06, "loss": 0.2107, "num_input_tokens_seen": 89852560, "step": 94110 }, { "epoch": 7.677216738722571, "grad_norm": 0.39938342571258545, "learning_rate": 7.777624605173591e-06, "loss": 0.4116, "num_input_tokens_seen": 89856576, "step": 94115 }, { "epoch": 7.67762460233298, "grad_norm": 4.965251922607422, "learning_rate": 7.775044794954734e-06, "loss": 0.2675, "num_input_tokens_seen": 89862096, "step": 94120 }, { "epoch": 7.678032465943389, "grad_norm": 2.3183579444885254, "learning_rate": 7.772465333878399e-06, "loss": 0.4022, "num_input_tokens_seen": 89867056, "step": 94125 }, { "epoch": 7.678440329553797, "grad_norm": 0.5222903490066528, "learning_rate": 7.76988622199687e-06, "loss": 0.3893, "num_input_tokens_seen": 89872096, "step": 94130 }, { "epoch": 7.678848193164206, "grad_norm": 45.766204833984375, "learning_rate": 7.767307459362417e-06, "loss": 0.4278, "num_input_tokens_seen": 89875984, "step": 94135 }, { "epoch": 7.679256056774615, "grad_norm": 5.224176406860352, "learning_rate": 7.764729046027328e-06, "loss": 0.2837, "num_input_tokens_seen": 89881488, "step": 94140 }, { "epoch": 7.679663920385023, "grad_norm": 42.83370590209961, "learning_rate": 7.762150982043856e-06, "loss": 0.6681, "num_input_tokens_seen": 89885648, "step": 94145 }, { "epoch": 7.680071783995432, "grad_norm": 49.67593002319336, "learning_rate": 7.75957326746426e-06, "loss": 0.3939, "num_input_tokens_seen": 89889824, "step": 94150 }, { "epoch": 7.680479647605841, "grad_norm": 8.002735137939453, "learning_rate": 7.75699590234078e-06, "loss": 0.2963, "num_input_tokens_seen": 89894288, "step": 94155 }, { "epoch": 7.680887511216249, "grad_norm": 4.996227264404297, "learning_rate": 7.754418886725676e-06, "loss": 0.4642, "num_input_tokens_seen": 89898912, "step": 94160 }, { "epoch": 7.681295374826658, "grad_norm": 1.264675498008728, "learning_rate": 7.75184222067117e-06, "loss": 0.4255, "num_input_tokens_seen": 89902848, "step": 94165 }, { "epoch": 7.681703238437066, "grad_norm": 0.46315544843673706, "learning_rate": 7.749265904229494e-06, "loss": 0.4487, "num_input_tokens_seen": 89908704, "step": 94170 }, { "epoch": 7.682111102047475, "grad_norm": 29.31016731262207, "learning_rate": 7.746689937452869e-06, "loss": 0.4796, "num_input_tokens_seen": 89914688, "step": 94175 }, { "epoch": 7.682518965657884, "grad_norm": 2.7697396278381348, "learning_rate": 7.744114320393506e-06, "loss": 0.3257, "num_input_tokens_seen": 89918880, "step": 94180 }, { "epoch": 7.682926829268292, "grad_norm": 27.270139694213867, "learning_rate": 7.741539053103613e-06, "loss": 0.393, "num_input_tokens_seen": 89924080, "step": 94185 }, { "epoch": 7.683334692878701, "grad_norm": 2.5078542232513428, "learning_rate": 7.738964135635392e-06, "loss": 0.5642, "num_input_tokens_seen": 89928640, "step": 94190 }, { "epoch": 7.68374255648911, "grad_norm": 1.2343900203704834, "learning_rate": 7.736389568041032e-06, "loss": 0.2928, "num_input_tokens_seen": 89932624, "step": 94195 }, { "epoch": 7.6841504200995185, "grad_norm": 3.9901084899902344, "learning_rate": 7.733815350372714e-06, "loss": 0.2708, "num_input_tokens_seen": 89936992, "step": 94200 }, { "epoch": 7.6845582837099276, "grad_norm": 7.112312316894531, "learning_rate": 7.731241482682628e-06, "loss": 0.2471, "num_input_tokens_seen": 89941968, "step": 94205 }, { "epoch": 7.684966147320337, "grad_norm": 4.277328014373779, "learning_rate": 7.728667965022937e-06, "loss": 0.283, "num_input_tokens_seen": 89946784, "step": 94210 }, { "epoch": 7.685374010930745, "grad_norm": 1.266864538192749, "learning_rate": 7.72609479744581e-06, "loss": 0.2586, "num_input_tokens_seen": 89951600, "step": 94215 }, { "epoch": 7.685781874541154, "grad_norm": 0.1737838089466095, "learning_rate": 7.723521980003392e-06, "loss": 0.4333, "num_input_tokens_seen": 89955600, "step": 94220 }, { "epoch": 7.686189738151562, "grad_norm": 3.0493693351745605, "learning_rate": 7.720949512747852e-06, "loss": 0.3497, "num_input_tokens_seen": 89960480, "step": 94225 }, { "epoch": 7.686597601761971, "grad_norm": 21.198373794555664, "learning_rate": 7.718377395731322e-06, "loss": 0.3667, "num_input_tokens_seen": 89964832, "step": 94230 }, { "epoch": 7.68700546537238, "grad_norm": 1.0562793016433716, "learning_rate": 7.71580562900594e-06, "loss": 0.3109, "num_input_tokens_seen": 89969840, "step": 94235 }, { "epoch": 7.687413328982788, "grad_norm": 8.583832740783691, "learning_rate": 7.713234212623832e-06, "loss": 0.2525, "num_input_tokens_seen": 89975136, "step": 94240 }, { "epoch": 7.687821192593197, "grad_norm": 30.19634246826172, "learning_rate": 7.710663146637112e-06, "loss": 0.2472, "num_input_tokens_seen": 89979008, "step": 94245 }, { "epoch": 7.688229056203605, "grad_norm": 21.820362091064453, "learning_rate": 7.708092431097913e-06, "loss": 0.671, "num_input_tokens_seen": 89983760, "step": 94250 }, { "epoch": 7.688636919814014, "grad_norm": 2.0751235485076904, "learning_rate": 7.705522066058332e-06, "loss": 0.5883, "num_input_tokens_seen": 89987840, "step": 94255 }, { "epoch": 7.689044783424423, "grad_norm": 0.8650686144828796, "learning_rate": 7.70295205157047e-06, "loss": 0.4251, "num_input_tokens_seen": 89992416, "step": 94260 }, { "epoch": 7.689452647034831, "grad_norm": 0.7720374464988708, "learning_rate": 7.700382387686412e-06, "loss": 0.5675, "num_input_tokens_seen": 89997408, "step": 94265 }, { "epoch": 7.68986051064524, "grad_norm": 0.5520207285881042, "learning_rate": 7.69781307445826e-06, "loss": 0.3559, "num_input_tokens_seen": 90002144, "step": 94270 }, { "epoch": 7.690268374255649, "grad_norm": 0.825280487537384, "learning_rate": 7.695244111938085e-06, "loss": 0.26, "num_input_tokens_seen": 90007248, "step": 94275 }, { "epoch": 7.690676237866057, "grad_norm": 100.15142059326172, "learning_rate": 7.692675500177956e-06, "loss": 0.2647, "num_input_tokens_seen": 90012416, "step": 94280 }, { "epoch": 7.691084101476466, "grad_norm": 26.770307540893555, "learning_rate": 7.690107239229943e-06, "loss": 0.6007, "num_input_tokens_seen": 90017152, "step": 94285 }, { "epoch": 7.691491965086875, "grad_norm": 68.56507873535156, "learning_rate": 7.687539329146092e-06, "loss": 0.3269, "num_input_tokens_seen": 90021136, "step": 94290 }, { "epoch": 7.6918998286972835, "grad_norm": 0.9663946628570557, "learning_rate": 7.684971769978471e-06, "loss": 0.2864, "num_input_tokens_seen": 90025840, "step": 94295 }, { "epoch": 7.6923076923076925, "grad_norm": 0.8597044348716736, "learning_rate": 7.682404561779116e-06, "loss": 0.3151, "num_input_tokens_seen": 90030976, "step": 94300 }, { "epoch": 7.692715555918101, "grad_norm": 0.523087739944458, "learning_rate": 7.67983770460006e-06, "loss": 0.2346, "num_input_tokens_seen": 90035216, "step": 94305 }, { "epoch": 7.69312341952851, "grad_norm": 6.990381240844727, "learning_rate": 7.677271198493336e-06, "loss": 0.2888, "num_input_tokens_seen": 90040160, "step": 94310 }, { "epoch": 7.693531283138919, "grad_norm": 43.02734375, "learning_rate": 7.674705043510961e-06, "loss": 0.3176, "num_input_tokens_seen": 90044928, "step": 94315 }, { "epoch": 7.693939146749327, "grad_norm": 4.302680015563965, "learning_rate": 7.672139239704953e-06, "loss": 0.4195, "num_input_tokens_seen": 90048912, "step": 94320 }, { "epoch": 7.694347010359736, "grad_norm": 15.734931945800781, "learning_rate": 7.669573787127321e-06, "loss": 0.3767, "num_input_tokens_seen": 90053712, "step": 94325 }, { "epoch": 7.694754873970144, "grad_norm": 0.557003915309906, "learning_rate": 7.667008685830055e-06, "loss": 0.2593, "num_input_tokens_seen": 90058000, "step": 94330 }, { "epoch": 7.695162737580553, "grad_norm": 3.471653938293457, "learning_rate": 7.664443935865168e-06, "loss": 0.4931, "num_input_tokens_seen": 90063280, "step": 94335 }, { "epoch": 7.695570601190962, "grad_norm": 3.53078031539917, "learning_rate": 7.661879537284633e-06, "loss": 0.2156, "num_input_tokens_seen": 90067296, "step": 94340 }, { "epoch": 7.69597846480137, "grad_norm": 6.371942520141602, "learning_rate": 7.659315490140434e-06, "loss": 0.4395, "num_input_tokens_seen": 90072304, "step": 94345 }, { "epoch": 7.696386328411779, "grad_norm": 40.25695037841797, "learning_rate": 7.65675179448454e-06, "loss": 0.3103, "num_input_tokens_seen": 90077072, "step": 94350 }, { "epoch": 7.696794192022188, "grad_norm": 0.4345790147781372, "learning_rate": 7.654188450368913e-06, "loss": 0.296, "num_input_tokens_seen": 90082368, "step": 94355 }, { "epoch": 7.697202055632596, "grad_norm": 28.15086555480957, "learning_rate": 7.651625457845521e-06, "loss": 0.5312, "num_input_tokens_seen": 90086512, "step": 94360 }, { "epoch": 7.697609919243005, "grad_norm": 35.142616271972656, "learning_rate": 7.64906281696631e-06, "loss": 0.3108, "num_input_tokens_seen": 90090944, "step": 94365 }, { "epoch": 7.698017782853414, "grad_norm": 32.136634826660156, "learning_rate": 7.646500527783221e-06, "loss": 0.3314, "num_input_tokens_seen": 90096560, "step": 94370 }, { "epoch": 7.698425646463822, "grad_norm": 1.0190269947052002, "learning_rate": 7.643938590348185e-06, "loss": 0.4252, "num_input_tokens_seen": 90100496, "step": 94375 }, { "epoch": 7.698833510074231, "grad_norm": 0.8849431872367859, "learning_rate": 7.641377004713149e-06, "loss": 0.3467, "num_input_tokens_seen": 90104896, "step": 94380 }, { "epoch": 7.6992413736846395, "grad_norm": 21.010543823242188, "learning_rate": 7.638815770930022e-06, "loss": 0.4544, "num_input_tokens_seen": 90109968, "step": 94385 }, { "epoch": 7.6996492372950485, "grad_norm": 7.900833606719971, "learning_rate": 7.636254889050722e-06, "loss": 0.3254, "num_input_tokens_seen": 90114752, "step": 94390 }, { "epoch": 7.7000571009054575, "grad_norm": 1.4616330862045288, "learning_rate": 7.633694359127161e-06, "loss": 0.3879, "num_input_tokens_seen": 90119408, "step": 94395 }, { "epoch": 7.700464964515866, "grad_norm": 1.6668449640274048, "learning_rate": 7.631134181211225e-06, "loss": 0.4043, "num_input_tokens_seen": 90123552, "step": 94400 }, { "epoch": 7.700872828126275, "grad_norm": 5.279109954833984, "learning_rate": 7.628574355354828e-06, "loss": 0.3574, "num_input_tokens_seen": 90127664, "step": 94405 }, { "epoch": 7.701280691736684, "grad_norm": 43.01335906982422, "learning_rate": 7.626014881609847e-06, "loss": 0.6129, "num_input_tokens_seen": 90131792, "step": 94410 }, { "epoch": 7.701688555347092, "grad_norm": 36.100406646728516, "learning_rate": 7.623455760028162e-06, "loss": 0.2806, "num_input_tokens_seen": 90136480, "step": 94415 }, { "epoch": 7.702096418957501, "grad_norm": 4.604873180389404, "learning_rate": 7.620896990661636e-06, "loss": 0.2653, "num_input_tokens_seen": 90140976, "step": 94420 }, { "epoch": 7.70250428256791, "grad_norm": 2.404855728149414, "learning_rate": 7.618338573562151e-06, "loss": 0.2648, "num_input_tokens_seen": 90145392, "step": 94425 }, { "epoch": 7.702912146178318, "grad_norm": 1.0561338663101196, "learning_rate": 7.615780508781556e-06, "loss": 0.4008, "num_input_tokens_seen": 90149680, "step": 94430 }, { "epoch": 7.703320009788727, "grad_norm": 0.43749791383743286, "learning_rate": 7.6132227963717055e-06, "loss": 0.1923, "num_input_tokens_seen": 90154560, "step": 94435 }, { "epoch": 7.703727873399135, "grad_norm": 56.740291595458984, "learning_rate": 7.61066543638444e-06, "loss": 0.4008, "num_input_tokens_seen": 90159376, "step": 94440 }, { "epoch": 7.704135737009544, "grad_norm": 0.9949781894683838, "learning_rate": 7.608108428871597e-06, "loss": 0.3538, "num_input_tokens_seen": 90164288, "step": 94445 }, { "epoch": 7.704543600619953, "grad_norm": 0.6001942157745361, "learning_rate": 7.605551773885005e-06, "loss": 0.4577, "num_input_tokens_seen": 90169888, "step": 94450 }, { "epoch": 7.704951464230361, "grad_norm": 5.2175421714782715, "learning_rate": 7.602995471476479e-06, "loss": 0.3407, "num_input_tokens_seen": 90174768, "step": 94455 }, { "epoch": 7.70535932784077, "grad_norm": 4.4691033363342285, "learning_rate": 7.6004395216978525e-06, "loss": 0.3116, "num_input_tokens_seen": 90178864, "step": 94460 }, { "epoch": 7.705767191451178, "grad_norm": 2.5328822135925293, "learning_rate": 7.597883924600921e-06, "loss": 0.3042, "num_input_tokens_seen": 90184048, "step": 94465 }, { "epoch": 7.706175055061587, "grad_norm": 16.105213165283203, "learning_rate": 7.59532868023749e-06, "loss": 0.5411, "num_input_tokens_seen": 90189120, "step": 94470 }, { "epoch": 7.706582918671996, "grad_norm": 4.119406700134277, "learning_rate": 7.592773788659349e-06, "loss": 0.2106, "num_input_tokens_seen": 90193968, "step": 94475 }, { "epoch": 7.706990782282404, "grad_norm": 1.2295751571655273, "learning_rate": 7.5902192499182875e-06, "loss": 0.483, "num_input_tokens_seen": 90198144, "step": 94480 }, { "epoch": 7.707398645892813, "grad_norm": 0.7935289144515991, "learning_rate": 7.587665064066085e-06, "loss": 0.2989, "num_input_tokens_seen": 90202832, "step": 94485 }, { "epoch": 7.707806509503222, "grad_norm": 4.798008441925049, "learning_rate": 7.585111231154504e-06, "loss": 0.4245, "num_input_tokens_seen": 90207232, "step": 94490 }, { "epoch": 7.708214373113631, "grad_norm": 1.7472076416015625, "learning_rate": 7.582557751235325e-06, "loss": 0.2112, "num_input_tokens_seen": 90211824, "step": 94495 }, { "epoch": 7.70862223672404, "grad_norm": 1.5768029689788818, "learning_rate": 7.580004624360301e-06, "loss": 0.2148, "num_input_tokens_seen": 90217696, "step": 94500 }, { "epoch": 7.709030100334449, "grad_norm": 0.29481086134910583, "learning_rate": 7.57745185058118e-06, "loss": 0.3263, "num_input_tokens_seen": 90222736, "step": 94505 }, { "epoch": 7.709437963944857, "grad_norm": 37.21021270751953, "learning_rate": 7.574899429949697e-06, "loss": 0.4215, "num_input_tokens_seen": 90227456, "step": 94510 }, { "epoch": 7.709845827555266, "grad_norm": 0.30504870414733887, "learning_rate": 7.572347362517609e-06, "loss": 0.276, "num_input_tokens_seen": 90232384, "step": 94515 }, { "epoch": 7.710253691165674, "grad_norm": 6.065404415130615, "learning_rate": 7.569795648336633e-06, "loss": 0.6443, "num_input_tokens_seen": 90236864, "step": 94520 }, { "epoch": 7.710661554776083, "grad_norm": 11.40975570678711, "learning_rate": 7.567244287458494e-06, "loss": 0.3665, "num_input_tokens_seen": 90241472, "step": 94525 }, { "epoch": 7.711069418386492, "grad_norm": 1.0916715860366821, "learning_rate": 7.564693279934898e-06, "loss": 0.6143, "num_input_tokens_seen": 90246768, "step": 94530 }, { "epoch": 7.7114772819969, "grad_norm": 8.323223114013672, "learning_rate": 7.562142625817567e-06, "loss": 0.4778, "num_input_tokens_seen": 90251584, "step": 94535 }, { "epoch": 7.711885145607309, "grad_norm": 0.5121048092842102, "learning_rate": 7.5595923251581965e-06, "loss": 0.2635, "num_input_tokens_seen": 90256656, "step": 94540 }, { "epoch": 7.712293009217717, "grad_norm": 1.8129390478134155, "learning_rate": 7.557042378008478e-06, "loss": 0.3133, "num_input_tokens_seen": 90261024, "step": 94545 }, { "epoch": 7.712700872828126, "grad_norm": 1.5151340961456299, "learning_rate": 7.554492784420097e-06, "loss": 0.3266, "num_input_tokens_seen": 90265120, "step": 94550 }, { "epoch": 7.713108736438535, "grad_norm": 11.087873458862305, "learning_rate": 7.551943544444729e-06, "loss": 0.3608, "num_input_tokens_seen": 90270320, "step": 94555 }, { "epoch": 7.713516600048944, "grad_norm": 2.5764033794403076, "learning_rate": 7.5493946581340565e-06, "loss": 0.2709, "num_input_tokens_seen": 90275072, "step": 94560 }, { "epoch": 7.713924463659352, "grad_norm": 1.6395471096038818, "learning_rate": 7.5468461255397385e-06, "loss": 0.3081, "num_input_tokens_seen": 90279424, "step": 94565 }, { "epoch": 7.714332327269761, "grad_norm": 29.98788070678711, "learning_rate": 7.544297946713435e-06, "loss": 0.2609, "num_input_tokens_seen": 90284576, "step": 94570 }, { "epoch": 7.714740190880169, "grad_norm": 2.704915761947632, "learning_rate": 7.541750121706789e-06, "loss": 0.2209, "num_input_tokens_seen": 90289744, "step": 94575 }, { "epoch": 7.715148054490578, "grad_norm": 0.3576776087284088, "learning_rate": 7.539202650571456e-06, "loss": 0.4082, "num_input_tokens_seen": 90294496, "step": 94580 }, { "epoch": 7.715555918100987, "grad_norm": 0.8415789008140564, "learning_rate": 7.536655533359063e-06, "loss": 0.4261, "num_input_tokens_seen": 90299520, "step": 94585 }, { "epoch": 7.7159637817113955, "grad_norm": 0.6334450840950012, "learning_rate": 7.534108770121245e-06, "loss": 0.3952, "num_input_tokens_seen": 90304288, "step": 94590 }, { "epoch": 7.7163716453218045, "grad_norm": 3.942610740661621, "learning_rate": 7.531562360909619e-06, "loss": 0.2883, "num_input_tokens_seen": 90309200, "step": 94595 }, { "epoch": 7.716779508932213, "grad_norm": 0.5022586584091187, "learning_rate": 7.529016305775802e-06, "loss": 0.2111, "num_input_tokens_seen": 90313424, "step": 94600 }, { "epoch": 7.717187372542622, "grad_norm": 1.6411142349243164, "learning_rate": 7.526470604771399e-06, "loss": 0.2691, "num_input_tokens_seen": 90319152, "step": 94605 }, { "epoch": 7.717595236153031, "grad_norm": 25.51888084411621, "learning_rate": 7.523925257948014e-06, "loss": 0.5329, "num_input_tokens_seen": 90324320, "step": 94610 }, { "epoch": 7.718003099763439, "grad_norm": 4.496219158172607, "learning_rate": 7.5213802653572366e-06, "loss": 0.4349, "num_input_tokens_seen": 90329056, "step": 94615 }, { "epoch": 7.718410963373848, "grad_norm": 16.036914825439453, "learning_rate": 7.518835627050646e-06, "loss": 0.4629, "num_input_tokens_seen": 90333712, "step": 94620 }, { "epoch": 7.718818826984257, "grad_norm": 19.61969566345215, "learning_rate": 7.516291343079837e-06, "loss": 0.4027, "num_input_tokens_seen": 90338272, "step": 94625 }, { "epoch": 7.719226690594665, "grad_norm": 3.255032777786255, "learning_rate": 7.5137474134963745e-06, "loss": 0.5646, "num_input_tokens_seen": 90343712, "step": 94630 }, { "epoch": 7.719634554205074, "grad_norm": 47.16712188720703, "learning_rate": 7.511203838351821e-06, "loss": 0.5566, "num_input_tokens_seen": 90348368, "step": 94635 }, { "epoch": 7.720042417815483, "grad_norm": 0.8496927618980408, "learning_rate": 7.508660617697735e-06, "loss": 0.2008, "num_input_tokens_seen": 90353264, "step": 94640 }, { "epoch": 7.720450281425891, "grad_norm": 2.0346927642822266, "learning_rate": 7.506117751585656e-06, "loss": 0.2968, "num_input_tokens_seen": 90358176, "step": 94645 }, { "epoch": 7.7208581450363, "grad_norm": 0.996294379234314, "learning_rate": 7.503575240067146e-06, "loss": 0.3622, "num_input_tokens_seen": 90362336, "step": 94650 }, { "epoch": 7.721266008646708, "grad_norm": 1.4970993995666504, "learning_rate": 7.501033083193731e-06, "loss": 0.3797, "num_input_tokens_seen": 90367312, "step": 94655 }, { "epoch": 7.721673872257117, "grad_norm": 1.3787919282913208, "learning_rate": 7.498491281016939e-06, "loss": 0.6752, "num_input_tokens_seen": 90373280, "step": 94660 }, { "epoch": 7.722081735867526, "grad_norm": 0.7455982565879822, "learning_rate": 7.495949833588284e-06, "loss": 0.4191, "num_input_tokens_seen": 90377856, "step": 94665 }, { "epoch": 7.722489599477934, "grad_norm": 2.4773831367492676, "learning_rate": 7.493408740959295e-06, "loss": 0.5038, "num_input_tokens_seen": 90382432, "step": 94670 }, { "epoch": 7.722897463088343, "grad_norm": 0.5873743891716003, "learning_rate": 7.490868003181472e-06, "loss": 0.356, "num_input_tokens_seen": 90387760, "step": 94675 }, { "epoch": 7.7233053266987515, "grad_norm": 22.3420467376709, "learning_rate": 7.488327620306318e-06, "loss": 0.4845, "num_input_tokens_seen": 90392816, "step": 94680 }, { "epoch": 7.7237131903091605, "grad_norm": 0.5815807580947876, "learning_rate": 7.485787592385318e-06, "loss": 0.4703, "num_input_tokens_seen": 90398560, "step": 94685 }, { "epoch": 7.7241210539195695, "grad_norm": 10.034482955932617, "learning_rate": 7.483247919469955e-06, "loss": 0.3187, "num_input_tokens_seen": 90403280, "step": 94690 }, { "epoch": 7.724528917529978, "grad_norm": 17.43991470336914, "learning_rate": 7.480708601611722e-06, "loss": 0.3311, "num_input_tokens_seen": 90408320, "step": 94695 }, { "epoch": 7.724936781140387, "grad_norm": 0.5446916818618774, "learning_rate": 7.478169638862079e-06, "loss": 0.2477, "num_input_tokens_seen": 90412832, "step": 94700 }, { "epoch": 7.725344644750796, "grad_norm": 0.5321701169013977, "learning_rate": 7.475631031272495e-06, "loss": 0.6475, "num_input_tokens_seen": 90417424, "step": 94705 }, { "epoch": 7.725752508361204, "grad_norm": 6.693048000335693, "learning_rate": 7.473092778894414e-06, "loss": 0.2539, "num_input_tokens_seen": 90422224, "step": 94710 }, { "epoch": 7.726160371971613, "grad_norm": 0.568878173828125, "learning_rate": 7.470554881779301e-06, "loss": 0.4938, "num_input_tokens_seen": 90426576, "step": 94715 }, { "epoch": 7.726568235582022, "grad_norm": 6.635030746459961, "learning_rate": 7.468017339978592e-06, "loss": 0.5318, "num_input_tokens_seen": 90431232, "step": 94720 }, { "epoch": 7.72697609919243, "grad_norm": 3.6118602752685547, "learning_rate": 7.465480153543724e-06, "loss": 0.2358, "num_input_tokens_seen": 90435936, "step": 94725 }, { "epoch": 7.727383962802839, "grad_norm": 0.6649695038795471, "learning_rate": 7.462943322526122e-06, "loss": 0.3054, "num_input_tokens_seen": 90441360, "step": 94730 }, { "epoch": 7.727791826413247, "grad_norm": 7.675782680511475, "learning_rate": 7.460406846977205e-06, "loss": 0.3933, "num_input_tokens_seen": 90446592, "step": 94735 }, { "epoch": 7.728199690023656, "grad_norm": 0.4936997890472412, "learning_rate": 7.457870726948391e-06, "loss": 0.3643, "num_input_tokens_seen": 90451024, "step": 94740 }, { "epoch": 7.728607553634065, "grad_norm": 0.7245617508888245, "learning_rate": 7.455334962491081e-06, "loss": 0.3493, "num_input_tokens_seen": 90456240, "step": 94745 }, { "epoch": 7.729015417244473, "grad_norm": 11.845913887023926, "learning_rate": 7.452799553656678e-06, "loss": 0.4094, "num_input_tokens_seen": 90461584, "step": 94750 }, { "epoch": 7.729423280854882, "grad_norm": 46.73728561401367, "learning_rate": 7.450264500496562e-06, "loss": 0.3545, "num_input_tokens_seen": 90467360, "step": 94755 }, { "epoch": 7.729831144465291, "grad_norm": 1.1143193244934082, "learning_rate": 7.447729803062137e-06, "loss": 0.4847, "num_input_tokens_seen": 90471632, "step": 94760 }, { "epoch": 7.730239008075699, "grad_norm": 0.4103878140449524, "learning_rate": 7.445195461404766e-06, "loss": 0.2043, "num_input_tokens_seen": 90476192, "step": 94765 }, { "epoch": 7.730646871686108, "grad_norm": 0.329055517911911, "learning_rate": 7.442661475575829e-06, "loss": 0.4094, "num_input_tokens_seen": 90481040, "step": 94770 }, { "epoch": 7.731054735296517, "grad_norm": 7.089996337890625, "learning_rate": 7.440127845626673e-06, "loss": 0.3716, "num_input_tokens_seen": 90485712, "step": 94775 }, { "epoch": 7.731462598906925, "grad_norm": 0.4896107316017151, "learning_rate": 7.4375945716086724e-06, "loss": 0.3393, "num_input_tokens_seen": 90490288, "step": 94780 }, { "epoch": 7.7318704625173345, "grad_norm": 43.72872543334961, "learning_rate": 7.435061653573167e-06, "loss": 0.2582, "num_input_tokens_seen": 90495488, "step": 94785 }, { "epoch": 7.732278326127743, "grad_norm": 0.32710009813308716, "learning_rate": 7.432529091571497e-06, "loss": 0.4638, "num_input_tokens_seen": 90499744, "step": 94790 }, { "epoch": 7.732686189738152, "grad_norm": 14.317888259887695, "learning_rate": 7.429996885654999e-06, "loss": 0.3989, "num_input_tokens_seen": 90504752, "step": 94795 }, { "epoch": 7.733094053348561, "grad_norm": 0.7113361954689026, "learning_rate": 7.427465035874989e-06, "loss": 0.2978, "num_input_tokens_seen": 90510080, "step": 94800 }, { "epoch": 7.733501916958969, "grad_norm": 32.15462875366211, "learning_rate": 7.424933542282805e-06, "loss": 0.3975, "num_input_tokens_seen": 90515280, "step": 94805 }, { "epoch": 7.733909780569378, "grad_norm": 8.59798526763916, "learning_rate": 7.422402404929748e-06, "loss": 0.3677, "num_input_tokens_seen": 90520624, "step": 94810 }, { "epoch": 7.734317644179786, "grad_norm": 3.89827299118042, "learning_rate": 7.419871623867128e-06, "loss": 0.4508, "num_input_tokens_seen": 90526160, "step": 94815 }, { "epoch": 7.734725507790195, "grad_norm": 4.4380269050598145, "learning_rate": 7.41734119914623e-06, "loss": 0.4183, "num_input_tokens_seen": 90529952, "step": 94820 }, { "epoch": 7.735133371400604, "grad_norm": 0.47601065039634705, "learning_rate": 7.414811130818361e-06, "loss": 0.3573, "num_input_tokens_seen": 90534832, "step": 94825 }, { "epoch": 7.735541235011012, "grad_norm": 0.501257061958313, "learning_rate": 7.412281418934802e-06, "loss": 0.3042, "num_input_tokens_seen": 90539136, "step": 94830 }, { "epoch": 7.735949098621421, "grad_norm": 4.731654644012451, "learning_rate": 7.409752063546821e-06, "loss": 0.278, "num_input_tokens_seen": 90543728, "step": 94835 }, { "epoch": 7.73635696223183, "grad_norm": 3.4979989528656006, "learning_rate": 7.407223064705693e-06, "loss": 0.3679, "num_input_tokens_seen": 90547952, "step": 94840 }, { "epoch": 7.736764825842238, "grad_norm": 2.80432391166687, "learning_rate": 7.404694422462671e-06, "loss": 0.2442, "num_input_tokens_seen": 90552032, "step": 94845 }, { "epoch": 7.737172689452647, "grad_norm": 49.563377380371094, "learning_rate": 7.402166136869024e-06, "loss": 0.5242, "num_input_tokens_seen": 90557328, "step": 94850 }, { "epoch": 7.737580553063056, "grad_norm": 0.7458672523498535, "learning_rate": 7.39963820797599e-06, "loss": 0.264, "num_input_tokens_seen": 90561968, "step": 94855 }, { "epoch": 7.737988416673464, "grad_norm": 0.8772397041320801, "learning_rate": 7.3971106358348115e-06, "loss": 0.3384, "num_input_tokens_seen": 90566800, "step": 94860 }, { "epoch": 7.738396280283873, "grad_norm": 14.949437141418457, "learning_rate": 7.39458342049672e-06, "loss": 0.3804, "num_input_tokens_seen": 90571584, "step": 94865 }, { "epoch": 7.738804143894281, "grad_norm": 0.7832328081130981, "learning_rate": 7.392056562012945e-06, "loss": 0.2662, "num_input_tokens_seen": 90576416, "step": 94870 }, { "epoch": 7.73921200750469, "grad_norm": 1.3721216917037964, "learning_rate": 7.389530060434696e-06, "loss": 0.4937, "num_input_tokens_seen": 90580848, "step": 94875 }, { "epoch": 7.739619871115099, "grad_norm": 1.093112826347351, "learning_rate": 7.387003915813193e-06, "loss": 0.3758, "num_input_tokens_seen": 90585456, "step": 94880 }, { "epoch": 7.7400277347255075, "grad_norm": 1.9198963642120361, "learning_rate": 7.384478128199634e-06, "loss": 0.2573, "num_input_tokens_seen": 90589664, "step": 94885 }, { "epoch": 7.7404355983359165, "grad_norm": 0.3458685576915741, "learning_rate": 7.381952697645214e-06, "loss": 0.2889, "num_input_tokens_seen": 90594848, "step": 94890 }, { "epoch": 7.740843461946325, "grad_norm": 1.4684118032455444, "learning_rate": 7.3794276242011325e-06, "loss": 0.3164, "num_input_tokens_seen": 90599360, "step": 94895 }, { "epoch": 7.741251325556734, "grad_norm": 1.630272626876831, "learning_rate": 7.376902907918565e-06, "loss": 0.3704, "num_input_tokens_seen": 90604800, "step": 94900 }, { "epoch": 7.741659189167143, "grad_norm": 27.705921173095703, "learning_rate": 7.374378548848687e-06, "loss": 0.3728, "num_input_tokens_seen": 90609136, "step": 94905 }, { "epoch": 7.742067052777551, "grad_norm": 1.4495166540145874, "learning_rate": 7.371854547042658e-06, "loss": 0.2809, "num_input_tokens_seen": 90614624, "step": 94910 }, { "epoch": 7.74247491638796, "grad_norm": 17.853628158569336, "learning_rate": 7.369330902551655e-06, "loss": 0.3656, "num_input_tokens_seen": 90619152, "step": 94915 }, { "epoch": 7.742882779998369, "grad_norm": 3.0571014881134033, "learning_rate": 7.366807615426824e-06, "loss": 0.2892, "num_input_tokens_seen": 90623776, "step": 94920 }, { "epoch": 7.743290643608777, "grad_norm": 7.56815767288208, "learning_rate": 7.364284685719308e-06, "loss": 0.3366, "num_input_tokens_seen": 90628512, "step": 94925 }, { "epoch": 7.743698507219186, "grad_norm": 28.278732299804688, "learning_rate": 7.3617621134802385e-06, "loss": 0.3116, "num_input_tokens_seen": 90632848, "step": 94930 }, { "epoch": 7.744106370829595, "grad_norm": 35.55810546875, "learning_rate": 7.359239898760767e-06, "loss": 0.3401, "num_input_tokens_seen": 90637440, "step": 94935 }, { "epoch": 7.744514234440003, "grad_norm": 0.7490891218185425, "learning_rate": 7.356718041612007e-06, "loss": 0.3727, "num_input_tokens_seen": 90642640, "step": 94940 }, { "epoch": 7.744922098050412, "grad_norm": 3.763592481613159, "learning_rate": 7.354196542085074e-06, "loss": 0.3658, "num_input_tokens_seen": 90647120, "step": 94945 }, { "epoch": 7.74532996166082, "grad_norm": 23.607013702392578, "learning_rate": 7.351675400231078e-06, "loss": 0.3152, "num_input_tokens_seen": 90651040, "step": 94950 }, { "epoch": 7.745737825271229, "grad_norm": 1.8077492713928223, "learning_rate": 7.349154616101114e-06, "loss": 0.3357, "num_input_tokens_seen": 90655520, "step": 94955 }, { "epoch": 7.746145688881638, "grad_norm": 2.8225619792938232, "learning_rate": 7.346634189746296e-06, "loss": 0.2154, "num_input_tokens_seen": 90659664, "step": 94960 }, { "epoch": 7.746553552492046, "grad_norm": 39.12510299682617, "learning_rate": 7.3441141212177014e-06, "loss": 0.2826, "num_input_tokens_seen": 90665072, "step": 94965 }, { "epoch": 7.746961416102455, "grad_norm": 2.4278032779693604, "learning_rate": 7.34159441056641e-06, "loss": 0.4283, "num_input_tokens_seen": 90669424, "step": 94970 }, { "epoch": 7.747369279712864, "grad_norm": 1.0074505805969238, "learning_rate": 7.339075057843489e-06, "loss": 0.2885, "num_input_tokens_seen": 90673728, "step": 94975 }, { "epoch": 7.7477771433232725, "grad_norm": 42.30397415161133, "learning_rate": 7.33655606310002e-06, "loss": 0.3948, "num_input_tokens_seen": 90678160, "step": 94980 }, { "epoch": 7.7481850069336815, "grad_norm": 1.0064133405685425, "learning_rate": 7.334037426387053e-06, "loss": 0.389, "num_input_tokens_seen": 90683488, "step": 94985 }, { "epoch": 7.7485928705440905, "grad_norm": 3.2836673259735107, "learning_rate": 7.331519147755642e-06, "loss": 0.3173, "num_input_tokens_seen": 90687856, "step": 94990 }, { "epoch": 7.749000734154499, "grad_norm": 3.481450080871582, "learning_rate": 7.329001227256826e-06, "loss": 0.191, "num_input_tokens_seen": 90693072, "step": 94995 }, { "epoch": 7.749408597764908, "grad_norm": 0.6759541630744934, "learning_rate": 7.32648366494165e-06, "loss": 0.3228, "num_input_tokens_seen": 90697424, "step": 95000 }, { "epoch": 7.749816461375316, "grad_norm": 0.5421538949012756, "learning_rate": 7.32396646086114e-06, "loss": 0.3029, "num_input_tokens_seen": 90702960, "step": 95005 }, { "epoch": 7.750224324985725, "grad_norm": 0.47912701964378357, "learning_rate": 7.32144961506632e-06, "loss": 0.4574, "num_input_tokens_seen": 90708336, "step": 95010 }, { "epoch": 7.750632188596134, "grad_norm": 0.3281872272491455, "learning_rate": 7.318933127608202e-06, "loss": 0.2909, "num_input_tokens_seen": 90713616, "step": 95015 }, { "epoch": 7.751040052206542, "grad_norm": 2.134378671646118, "learning_rate": 7.31641699853779e-06, "loss": 0.4847, "num_input_tokens_seen": 90717024, "step": 95020 }, { "epoch": 7.751447915816951, "grad_norm": 0.5141361355781555, "learning_rate": 7.313901227906097e-06, "loss": 0.2057, "num_input_tokens_seen": 90721424, "step": 95025 }, { "epoch": 7.751855779427359, "grad_norm": 0.49273020029067993, "learning_rate": 7.3113858157641125e-06, "loss": 0.2058, "num_input_tokens_seen": 90726384, "step": 95030 }, { "epoch": 7.752263643037768, "grad_norm": 2.1836729049682617, "learning_rate": 7.30887076216282e-06, "loss": 0.3066, "num_input_tokens_seen": 90731328, "step": 95035 }, { "epoch": 7.752671506648177, "grad_norm": 28.626644134521484, "learning_rate": 7.306356067153203e-06, "loss": 0.4632, "num_input_tokens_seen": 90736192, "step": 95040 }, { "epoch": 7.753079370258585, "grad_norm": 1.4052534103393555, "learning_rate": 7.3038417307862175e-06, "loss": 0.3131, "num_input_tokens_seen": 90741120, "step": 95045 }, { "epoch": 7.753487233868994, "grad_norm": 0.2841688096523285, "learning_rate": 7.301327753112852e-06, "loss": 0.2914, "num_input_tokens_seen": 90745872, "step": 95050 }, { "epoch": 7.753895097479403, "grad_norm": 0.4268519878387451, "learning_rate": 7.298814134184051e-06, "loss": 0.352, "num_input_tokens_seen": 90750992, "step": 95055 }, { "epoch": 7.754302961089811, "grad_norm": 16.65772819519043, "learning_rate": 7.2963008740507656e-06, "loss": 0.2682, "num_input_tokens_seen": 90755360, "step": 95060 }, { "epoch": 7.75471082470022, "grad_norm": 0.8343732357025146, "learning_rate": 7.29378797276393e-06, "loss": 0.3042, "num_input_tokens_seen": 90760176, "step": 95065 }, { "epoch": 7.755118688310629, "grad_norm": 0.6026554107666016, "learning_rate": 7.291275430374497e-06, "loss": 0.5398, "num_input_tokens_seen": 90764608, "step": 95070 }, { "epoch": 7.7555265519210375, "grad_norm": 26.9088191986084, "learning_rate": 7.288763246933386e-06, "loss": 0.4643, "num_input_tokens_seen": 90768752, "step": 95075 }, { "epoch": 7.7559344155314465, "grad_norm": 32.8784065246582, "learning_rate": 7.286251422491519e-06, "loss": 0.2831, "num_input_tokens_seen": 90773424, "step": 95080 }, { "epoch": 7.756342279141855, "grad_norm": 0.675993025302887, "learning_rate": 7.2837399570998e-06, "loss": 0.2366, "num_input_tokens_seen": 90778528, "step": 95085 }, { "epoch": 7.756750142752264, "grad_norm": 1.290653109550476, "learning_rate": 7.281228850809152e-06, "loss": 0.3754, "num_input_tokens_seen": 90782560, "step": 95090 }, { "epoch": 7.757158006362673, "grad_norm": 37.349578857421875, "learning_rate": 7.278718103670468e-06, "loss": 0.3849, "num_input_tokens_seen": 90787680, "step": 95095 }, { "epoch": 7.757565869973081, "grad_norm": 5.326014518737793, "learning_rate": 7.276207715734634e-06, "loss": 0.319, "num_input_tokens_seen": 90792720, "step": 95100 }, { "epoch": 7.75797373358349, "grad_norm": 4.353456497192383, "learning_rate": 7.2736976870525416e-06, "loss": 0.1918, "num_input_tokens_seen": 90797360, "step": 95105 }, { "epoch": 7.758381597193899, "grad_norm": 1.4811761379241943, "learning_rate": 7.271188017675054e-06, "loss": 0.3934, "num_input_tokens_seen": 90801056, "step": 95110 }, { "epoch": 7.758789460804307, "grad_norm": 1.9390106201171875, "learning_rate": 7.268678707653062e-06, "loss": 0.2955, "num_input_tokens_seen": 90805408, "step": 95115 }, { "epoch": 7.759197324414716, "grad_norm": 0.8218262195587158, "learning_rate": 7.266169757037416e-06, "loss": 0.5247, "num_input_tokens_seen": 90810480, "step": 95120 }, { "epoch": 7.759605188025125, "grad_norm": 8.334880828857422, "learning_rate": 7.263661165878976e-06, "loss": 0.4684, "num_input_tokens_seen": 90815024, "step": 95125 }, { "epoch": 7.760013051635533, "grad_norm": 1.4155917167663574, "learning_rate": 7.261152934228588e-06, "loss": 0.2975, "num_input_tokens_seen": 90819888, "step": 95130 }, { "epoch": 7.760420915245942, "grad_norm": 1.4541147947311401, "learning_rate": 7.25864506213709e-06, "loss": 0.3079, "num_input_tokens_seen": 90824624, "step": 95135 }, { "epoch": 7.76082877885635, "grad_norm": 1.1455496549606323, "learning_rate": 7.256137549655318e-06, "loss": 0.3678, "num_input_tokens_seen": 90829024, "step": 95140 }, { "epoch": 7.761236642466759, "grad_norm": 1.1518847942352295, "learning_rate": 7.253630396834099e-06, "loss": 0.3735, "num_input_tokens_seen": 90833008, "step": 95145 }, { "epoch": 7.761644506077168, "grad_norm": 21.44944190979004, "learning_rate": 7.251123603724241e-06, "loss": 0.4189, "num_input_tokens_seen": 90838032, "step": 95150 }, { "epoch": 7.762052369687576, "grad_norm": 41.85934066772461, "learning_rate": 7.248617170376576e-06, "loss": 0.3568, "num_input_tokens_seen": 90843568, "step": 95155 }, { "epoch": 7.762460233297985, "grad_norm": 20.050416946411133, "learning_rate": 7.246111096841899e-06, "loss": 0.459, "num_input_tokens_seen": 90848576, "step": 95160 }, { "epoch": 7.762868096908393, "grad_norm": 3.1207993030548096, "learning_rate": 7.243605383171001e-06, "loss": 0.3956, "num_input_tokens_seen": 90852784, "step": 95165 }, { "epoch": 7.763275960518802, "grad_norm": 0.8250707387924194, "learning_rate": 7.241100029414679e-06, "loss": 0.5092, "num_input_tokens_seen": 90857808, "step": 95170 }, { "epoch": 7.763683824129211, "grad_norm": 0.49354836344718933, "learning_rate": 7.238595035623708e-06, "loss": 0.6289, "num_input_tokens_seen": 90863536, "step": 95175 }, { "epoch": 7.7640916877396196, "grad_norm": 1.5741536617279053, "learning_rate": 7.236090401848875e-06, "loss": 0.5733, "num_input_tokens_seen": 90868432, "step": 95180 }, { "epoch": 7.764499551350029, "grad_norm": 0.6835154294967651, "learning_rate": 7.23358612814094e-06, "loss": 0.2534, "num_input_tokens_seen": 90873232, "step": 95185 }, { "epoch": 7.764907414960438, "grad_norm": 2.9391984939575195, "learning_rate": 7.231082214550663e-06, "loss": 0.3502, "num_input_tokens_seen": 90878208, "step": 95190 }, { "epoch": 7.765315278570846, "grad_norm": 0.3355081081390381, "learning_rate": 7.228578661128804e-06, "loss": 0.3018, "num_input_tokens_seen": 90883072, "step": 95195 }, { "epoch": 7.765723142181255, "grad_norm": 0.9414946436882019, "learning_rate": 7.2260754679260935e-06, "loss": 0.3086, "num_input_tokens_seen": 90886832, "step": 95200 }, { "epoch": 7.766131005791664, "grad_norm": 1.901759386062622, "learning_rate": 7.223572634993286e-06, "loss": 0.3122, "num_input_tokens_seen": 90892064, "step": 95205 }, { "epoch": 7.766538869402072, "grad_norm": 0.29787343740463257, "learning_rate": 7.221070162381111e-06, "loss": 0.3631, "num_input_tokens_seen": 90897488, "step": 95210 }, { "epoch": 7.766946733012481, "grad_norm": 22.162616729736328, "learning_rate": 7.2185680501402896e-06, "loss": 0.4499, "num_input_tokens_seen": 90901840, "step": 95215 }, { "epoch": 7.767354596622889, "grad_norm": 0.9118527770042419, "learning_rate": 7.2160662983215275e-06, "loss": 0.3235, "num_input_tokens_seen": 90906192, "step": 95220 }, { "epoch": 7.767762460233298, "grad_norm": 23.290008544921875, "learning_rate": 7.2135649069755535e-06, "loss": 0.436, "num_input_tokens_seen": 90911520, "step": 95225 }, { "epoch": 7.768170323843707, "grad_norm": 1.5513747930526733, "learning_rate": 7.211063876153062e-06, "loss": 0.2672, "num_input_tokens_seen": 90916544, "step": 95230 }, { "epoch": 7.768578187454115, "grad_norm": 0.4454050660133362, "learning_rate": 7.208563205904745e-06, "loss": 0.2524, "num_input_tokens_seen": 90921840, "step": 95235 }, { "epoch": 7.768986051064524, "grad_norm": 10.47128963470459, "learning_rate": 7.206062896281293e-06, "loss": 0.4562, "num_input_tokens_seen": 90927216, "step": 95240 }, { "epoch": 7.769393914674932, "grad_norm": 3.740138530731201, "learning_rate": 7.2035629473333755e-06, "loss": 0.2433, "num_input_tokens_seen": 90931712, "step": 95245 }, { "epoch": 7.769801778285341, "grad_norm": 36.79109191894531, "learning_rate": 7.201063359111684e-06, "loss": 0.4868, "num_input_tokens_seen": 90937088, "step": 95250 }, { "epoch": 7.77020964189575, "grad_norm": 0.743044376373291, "learning_rate": 7.198564131666877e-06, "loss": 0.3018, "num_input_tokens_seen": 90942448, "step": 95255 }, { "epoch": 7.770617505506158, "grad_norm": 57.98003387451172, "learning_rate": 7.196065265049609e-06, "loss": 0.3622, "num_input_tokens_seen": 90946992, "step": 95260 }, { "epoch": 7.771025369116567, "grad_norm": 0.4101150929927826, "learning_rate": 7.193566759310533e-06, "loss": 0.4037, "num_input_tokens_seen": 90952368, "step": 95265 }, { "epoch": 7.771433232726976, "grad_norm": 0.6537954807281494, "learning_rate": 7.191068614500282e-06, "loss": 0.3883, "num_input_tokens_seen": 90957440, "step": 95270 }, { "epoch": 7.7718410963373845, "grad_norm": 0.6093027591705322, "learning_rate": 7.188570830669514e-06, "loss": 0.2513, "num_input_tokens_seen": 90962592, "step": 95275 }, { "epoch": 7.7722489599477935, "grad_norm": 0.47724997997283936, "learning_rate": 7.186073407868846e-06, "loss": 0.3965, "num_input_tokens_seen": 90967520, "step": 95280 }, { "epoch": 7.7726568235582025, "grad_norm": 1.8281770944595337, "learning_rate": 7.183576346148899e-06, "loss": 0.3544, "num_input_tokens_seen": 90973056, "step": 95285 }, { "epoch": 7.773064687168611, "grad_norm": 5.020719528198242, "learning_rate": 7.181079645560287e-06, "loss": 0.1708, "num_input_tokens_seen": 90978288, "step": 95290 }, { "epoch": 7.77347255077902, "grad_norm": 38.044639587402344, "learning_rate": 7.178583306153622e-06, "loss": 0.3654, "num_input_tokens_seen": 90982368, "step": 95295 }, { "epoch": 7.773880414389428, "grad_norm": 1.1629879474639893, "learning_rate": 7.1760873279795e-06, "loss": 0.3246, "num_input_tokens_seen": 90986656, "step": 95300 }, { "epoch": 7.774288277999837, "grad_norm": 1.191997766494751, "learning_rate": 7.1735917110885144e-06, "loss": 0.3421, "num_input_tokens_seen": 90991552, "step": 95305 }, { "epoch": 7.774696141610246, "grad_norm": 50.830650329589844, "learning_rate": 7.171096455531242e-06, "loss": 0.2845, "num_input_tokens_seen": 90996288, "step": 95310 }, { "epoch": 7.775104005220654, "grad_norm": 16.663969039916992, "learning_rate": 7.168601561358277e-06, "loss": 0.3081, "num_input_tokens_seen": 91001808, "step": 95315 }, { "epoch": 7.775511868831063, "grad_norm": 0.7094606757164001, "learning_rate": 7.166107028620181e-06, "loss": 0.386, "num_input_tokens_seen": 91006416, "step": 95320 }, { "epoch": 7.775919732441472, "grad_norm": 2.5327062606811523, "learning_rate": 7.163612857367516e-06, "loss": 0.3138, "num_input_tokens_seen": 91011952, "step": 95325 }, { "epoch": 7.77632759605188, "grad_norm": 0.652935266494751, "learning_rate": 7.161119047650833e-06, "loss": 0.3525, "num_input_tokens_seen": 91016768, "step": 95330 }, { "epoch": 7.776735459662289, "grad_norm": 0.6781337857246399, "learning_rate": 7.158625599520694e-06, "loss": 0.2636, "num_input_tokens_seen": 91021568, "step": 95335 }, { "epoch": 7.777143323272698, "grad_norm": 11.002896308898926, "learning_rate": 7.15613251302763e-06, "loss": 0.3208, "num_input_tokens_seen": 91026112, "step": 95340 }, { "epoch": 7.777551186883106, "grad_norm": 3.173158645629883, "learning_rate": 7.153639788222183e-06, "loss": 0.4218, "num_input_tokens_seen": 91031296, "step": 95345 }, { "epoch": 7.777959050493515, "grad_norm": 2.540571928024292, "learning_rate": 7.151147425154869e-06, "loss": 0.3933, "num_input_tokens_seen": 91036256, "step": 95350 }, { "epoch": 7.778366914103923, "grad_norm": 30.345745086669922, "learning_rate": 7.148655423876208e-06, "loss": 0.3647, "num_input_tokens_seen": 91040400, "step": 95355 }, { "epoch": 7.778774777714332, "grad_norm": 30.14581298828125, "learning_rate": 7.146163784436721e-06, "loss": 0.3598, "num_input_tokens_seen": 91045648, "step": 95360 }, { "epoch": 7.779182641324741, "grad_norm": 5.588501930236816, "learning_rate": 7.143672506886911e-06, "loss": 0.5365, "num_input_tokens_seen": 91050080, "step": 95365 }, { "epoch": 7.7795905049351495, "grad_norm": 18.51710319519043, "learning_rate": 7.141181591277269e-06, "loss": 0.3531, "num_input_tokens_seen": 91055104, "step": 95370 }, { "epoch": 7.7799983685455585, "grad_norm": 0.6749864220619202, "learning_rate": 7.138691037658282e-06, "loss": 0.3835, "num_input_tokens_seen": 91059648, "step": 95375 }, { "epoch": 7.780406232155967, "grad_norm": 1.9737697839736938, "learning_rate": 7.136200846080443e-06, "loss": 0.3956, "num_input_tokens_seen": 91063952, "step": 95380 }, { "epoch": 7.780814095766376, "grad_norm": 8.33678913116455, "learning_rate": 7.133711016594224e-06, "loss": 0.3738, "num_input_tokens_seen": 91068432, "step": 95385 }, { "epoch": 7.781221959376785, "grad_norm": 2.763193130493164, "learning_rate": 7.131221549250089e-06, "loss": 0.3241, "num_input_tokens_seen": 91073568, "step": 95390 }, { "epoch": 7.781629822987193, "grad_norm": 0.4486488401889801, "learning_rate": 7.128732444098502e-06, "loss": 0.249, "num_input_tokens_seen": 91079104, "step": 95395 }, { "epoch": 7.782037686597602, "grad_norm": 40.61046600341797, "learning_rate": 7.126243701189905e-06, "loss": 0.4748, "num_input_tokens_seen": 91083616, "step": 95400 }, { "epoch": 7.782445550208011, "grad_norm": 4.675052642822266, "learning_rate": 7.123755320574763e-06, "loss": 0.4152, "num_input_tokens_seen": 91088096, "step": 95405 }, { "epoch": 7.782853413818419, "grad_norm": 0.7251792550086975, "learning_rate": 7.1212673023035055e-06, "loss": 0.3637, "num_input_tokens_seen": 91092976, "step": 95410 }, { "epoch": 7.783261277428828, "grad_norm": 2.6349399089813232, "learning_rate": 7.11877964642656e-06, "loss": 0.285, "num_input_tokens_seen": 91097344, "step": 95415 }, { "epoch": 7.783669141039237, "grad_norm": 3.343125581741333, "learning_rate": 7.1162923529943525e-06, "loss": 0.3942, "num_input_tokens_seen": 91101600, "step": 95420 }, { "epoch": 7.784077004649645, "grad_norm": 0.45804181694984436, "learning_rate": 7.113805422057302e-06, "loss": 0.3507, "num_input_tokens_seen": 91106592, "step": 95425 }, { "epoch": 7.784484868260054, "grad_norm": 2.485687255859375, "learning_rate": 7.111318853665813e-06, "loss": 0.4428, "num_input_tokens_seen": 91111856, "step": 95430 }, { "epoch": 7.784892731870462, "grad_norm": 8.491116523742676, "learning_rate": 7.1088326478702924e-06, "loss": 0.3341, "num_input_tokens_seen": 91116816, "step": 95435 }, { "epoch": 7.785300595480871, "grad_norm": 21.055723190307617, "learning_rate": 7.106346804721128e-06, "loss": 0.4879, "num_input_tokens_seen": 91121952, "step": 95440 }, { "epoch": 7.78570845909128, "grad_norm": 10.474976539611816, "learning_rate": 7.103861324268704e-06, "loss": 0.3128, "num_input_tokens_seen": 91127024, "step": 95445 }, { "epoch": 7.786116322701688, "grad_norm": 25.355735778808594, "learning_rate": 7.101376206563415e-06, "loss": 0.4409, "num_input_tokens_seen": 91131232, "step": 95450 }, { "epoch": 7.786524186312097, "grad_norm": 0.38934049010276794, "learning_rate": 7.098891451655626e-06, "loss": 0.2698, "num_input_tokens_seen": 91136752, "step": 95455 }, { "epoch": 7.786932049922505, "grad_norm": 4.695300102233887, "learning_rate": 7.0964070595957e-06, "loss": 0.2089, "num_input_tokens_seen": 91140848, "step": 95460 }, { "epoch": 7.787339913532914, "grad_norm": 21.93110466003418, "learning_rate": 7.093923030433986e-06, "loss": 0.1503, "num_input_tokens_seen": 91145824, "step": 95465 }, { "epoch": 7.7877477771433234, "grad_norm": 32.53445053100586, "learning_rate": 7.091439364220851e-06, "loss": 0.3617, "num_input_tokens_seen": 91150320, "step": 95470 }, { "epoch": 7.788155640753732, "grad_norm": 0.4910179078578949, "learning_rate": 7.08895606100663e-06, "loss": 0.2342, "num_input_tokens_seen": 91154816, "step": 95475 }, { "epoch": 7.788563504364141, "grad_norm": 60.314117431640625, "learning_rate": 7.086473120841661e-06, "loss": 0.3805, "num_input_tokens_seen": 91160048, "step": 95480 }, { "epoch": 7.78897136797455, "grad_norm": 2.387510061264038, "learning_rate": 7.083990543776261e-06, "loss": 0.4031, "num_input_tokens_seen": 91164624, "step": 95485 }, { "epoch": 7.789379231584958, "grad_norm": 8.862271308898926, "learning_rate": 7.081508329860767e-06, "loss": 0.3118, "num_input_tokens_seen": 91169664, "step": 95490 }, { "epoch": 7.789787095195367, "grad_norm": 22.57264518737793, "learning_rate": 7.079026479145484e-06, "loss": 0.3534, "num_input_tokens_seen": 91174032, "step": 95495 }, { "epoch": 7.790194958805776, "grad_norm": 14.806560516357422, "learning_rate": 7.0765449916807224e-06, "loss": 0.4593, "num_input_tokens_seen": 91178672, "step": 95500 }, { "epoch": 7.790602822416184, "grad_norm": 39.97243881225586, "learning_rate": 7.074063867516776e-06, "loss": 0.3836, "num_input_tokens_seen": 91184608, "step": 95505 }, { "epoch": 7.791010686026593, "grad_norm": 0.7229748964309692, "learning_rate": 7.07158310670393e-06, "loss": 0.3289, "num_input_tokens_seen": 91189232, "step": 95510 }, { "epoch": 7.791418549637001, "grad_norm": 2.4282727241516113, "learning_rate": 7.069102709292482e-06, "loss": 0.5257, "num_input_tokens_seen": 91193968, "step": 95515 }, { "epoch": 7.79182641324741, "grad_norm": 1.4408754110336304, "learning_rate": 7.066622675332707e-06, "loss": 0.3932, "num_input_tokens_seen": 91199120, "step": 95520 }, { "epoch": 7.792234276857819, "grad_norm": 0.5288490653038025, "learning_rate": 7.064143004874868e-06, "loss": 0.348, "num_input_tokens_seen": 91203488, "step": 95525 }, { "epoch": 7.792642140468227, "grad_norm": 21.336210250854492, "learning_rate": 7.06166369796922e-06, "loss": 0.3478, "num_input_tokens_seen": 91208240, "step": 95530 }, { "epoch": 7.793050004078636, "grad_norm": 0.9014872908592224, "learning_rate": 7.059184754666034e-06, "loss": 0.42, "num_input_tokens_seen": 91212224, "step": 95535 }, { "epoch": 7.793457867689045, "grad_norm": 43.68153762817383, "learning_rate": 7.056706175015551e-06, "loss": 0.454, "num_input_tokens_seen": 91217328, "step": 95540 }, { "epoch": 7.793865731299453, "grad_norm": 3.440420627593994, "learning_rate": 7.054227959068008e-06, "loss": 0.2908, "num_input_tokens_seen": 91222224, "step": 95545 }, { "epoch": 7.794273594909862, "grad_norm": 6.551606178283691, "learning_rate": 7.051750106873639e-06, "loss": 0.502, "num_input_tokens_seen": 91226704, "step": 95550 }, { "epoch": 7.794681458520271, "grad_norm": 0.5867618322372437, "learning_rate": 7.049272618482669e-06, "loss": 0.2961, "num_input_tokens_seen": 91231776, "step": 95555 }, { "epoch": 7.795089322130679, "grad_norm": 0.8201238512992859, "learning_rate": 7.0467954939453155e-06, "loss": 0.2915, "num_input_tokens_seen": 91236608, "step": 95560 }, { "epoch": 7.795497185741088, "grad_norm": 23.78268051147461, "learning_rate": 7.044318733311789e-06, "loss": 0.3233, "num_input_tokens_seen": 91241264, "step": 95565 }, { "epoch": 7.7959050493514965, "grad_norm": 0.3780255615711212, "learning_rate": 7.041842336632293e-06, "loss": 0.2774, "num_input_tokens_seen": 91245520, "step": 95570 }, { "epoch": 7.7963129129619055, "grad_norm": 0.3732721507549286, "learning_rate": 7.039366303957013e-06, "loss": 0.4909, "num_input_tokens_seen": 91250928, "step": 95575 }, { "epoch": 7.7967207765723145, "grad_norm": 7.547421455383301, "learning_rate": 7.036890635336152e-06, "loss": 0.3786, "num_input_tokens_seen": 91254704, "step": 95580 }, { "epoch": 7.797128640182723, "grad_norm": 5.93510103225708, "learning_rate": 7.034415330819885e-06, "loss": 0.4094, "num_input_tokens_seen": 91259888, "step": 95585 }, { "epoch": 7.797536503793132, "grad_norm": 41.75783157348633, "learning_rate": 7.0319403904583885e-06, "loss": 0.2955, "num_input_tokens_seen": 91265104, "step": 95590 }, { "epoch": 7.79794436740354, "grad_norm": 34.41702651977539, "learning_rate": 7.0294658143018225e-06, "loss": 0.3761, "num_input_tokens_seen": 91270160, "step": 95595 }, { "epoch": 7.798352231013949, "grad_norm": 0.5680014491081238, "learning_rate": 7.0269916024003404e-06, "loss": 0.4604, "num_input_tokens_seen": 91275104, "step": 95600 }, { "epoch": 7.798760094624358, "grad_norm": 1.3112761974334717, "learning_rate": 7.0245177548041105e-06, "loss": 0.3558, "num_input_tokens_seen": 91280032, "step": 95605 }, { "epoch": 7.799167958234766, "grad_norm": 3.9712066650390625, "learning_rate": 7.022044271563266e-06, "loss": 0.4165, "num_input_tokens_seen": 91284656, "step": 95610 }, { "epoch": 7.799575821845175, "grad_norm": 19.613082885742188, "learning_rate": 7.019571152727947e-06, "loss": 0.4132, "num_input_tokens_seen": 91288304, "step": 95615 }, { "epoch": 7.799983685455584, "grad_norm": 2.3169000148773193, "learning_rate": 7.017098398348271e-06, "loss": 0.3429, "num_input_tokens_seen": 91292416, "step": 95620 }, { "epoch": 7.800391549065992, "grad_norm": 2.4244611263275146, "learning_rate": 7.014626008474376e-06, "loss": 0.4354, "num_input_tokens_seen": 91297184, "step": 95625 }, { "epoch": 7.800799412676401, "grad_norm": 17.849815368652344, "learning_rate": 7.0121539831563725e-06, "loss": 0.2806, "num_input_tokens_seen": 91301648, "step": 95630 }, { "epoch": 7.80120727628681, "grad_norm": 18.367399215698242, "learning_rate": 7.009682322444361e-06, "loss": 0.436, "num_input_tokens_seen": 91307088, "step": 95635 }, { "epoch": 7.801615139897218, "grad_norm": 0.4724229872226715, "learning_rate": 7.007211026388439e-06, "loss": 0.4392, "num_input_tokens_seen": 91312800, "step": 95640 }, { "epoch": 7.802023003507627, "grad_norm": 0.5759483575820923, "learning_rate": 7.004740095038709e-06, "loss": 0.2343, "num_input_tokens_seen": 91317872, "step": 95645 }, { "epoch": 7.802430867118035, "grad_norm": 3.6302311420440674, "learning_rate": 7.002269528445252e-06, "loss": 0.3761, "num_input_tokens_seen": 91323504, "step": 95650 }, { "epoch": 7.802838730728444, "grad_norm": 1.920666217803955, "learning_rate": 6.9997993266581435e-06, "loss": 0.3055, "num_input_tokens_seen": 91328768, "step": 95655 }, { "epoch": 7.803246594338853, "grad_norm": 0.45929816365242004, "learning_rate": 6.997329489727455e-06, "loss": 0.3278, "num_input_tokens_seen": 91333648, "step": 95660 }, { "epoch": 7.8036544579492615, "grad_norm": 0.7555562257766724, "learning_rate": 6.994860017703239e-06, "loss": 0.2786, "num_input_tokens_seen": 91339344, "step": 95665 }, { "epoch": 7.8040623215596705, "grad_norm": 0.6719849109649658, "learning_rate": 6.992390910635568e-06, "loss": 0.4122, "num_input_tokens_seen": 91343856, "step": 95670 }, { "epoch": 7.8044701851700795, "grad_norm": 2.8984832763671875, "learning_rate": 6.989922168574484e-06, "loss": 0.4277, "num_input_tokens_seen": 91349152, "step": 95675 }, { "epoch": 7.804878048780488, "grad_norm": 1.7862122058868408, "learning_rate": 6.98745379157002e-06, "loss": 0.2278, "num_input_tokens_seen": 91354128, "step": 95680 }, { "epoch": 7.805285912390897, "grad_norm": 9.918614387512207, "learning_rate": 6.984985779672215e-06, "loss": 0.2531, "num_input_tokens_seen": 91359488, "step": 95685 }, { "epoch": 7.805693776001306, "grad_norm": 26.11661148071289, "learning_rate": 6.982518132931096e-06, "loss": 0.4464, "num_input_tokens_seen": 91363760, "step": 95690 }, { "epoch": 7.806101639611714, "grad_norm": 1.7045037746429443, "learning_rate": 6.980050851396675e-06, "loss": 0.4559, "num_input_tokens_seen": 91368304, "step": 95695 }, { "epoch": 7.806509503222123, "grad_norm": 0.8431783318519592, "learning_rate": 6.977583935118967e-06, "loss": 0.2844, "num_input_tokens_seen": 91372976, "step": 95700 }, { "epoch": 7.806917366832531, "grad_norm": 9.811775207519531, "learning_rate": 6.975117384147975e-06, "loss": 0.3312, "num_input_tokens_seen": 91377872, "step": 95705 }, { "epoch": 7.80732523044294, "grad_norm": 1.1513018608093262, "learning_rate": 6.972651198533686e-06, "loss": 0.3541, "num_input_tokens_seen": 91382768, "step": 95710 }, { "epoch": 7.807733094053349, "grad_norm": 0.8217533230781555, "learning_rate": 6.970185378326105e-06, "loss": 0.3216, "num_input_tokens_seen": 91386848, "step": 95715 }, { "epoch": 7.808140957663757, "grad_norm": 0.8613181710243225, "learning_rate": 6.9677199235752054e-06, "loss": 0.2803, "num_input_tokens_seen": 91392144, "step": 95720 }, { "epoch": 7.808548821274166, "grad_norm": 0.3475557863712311, "learning_rate": 6.9652548343309615e-06, "loss": 0.3534, "num_input_tokens_seen": 91396304, "step": 95725 }, { "epoch": 7.808956684884574, "grad_norm": 1.061104655265808, "learning_rate": 6.962790110643327e-06, "loss": 0.2905, "num_input_tokens_seen": 91401824, "step": 95730 }, { "epoch": 7.809364548494983, "grad_norm": 13.030609130859375, "learning_rate": 6.960325752562285e-06, "loss": 0.2975, "num_input_tokens_seen": 91406688, "step": 95735 }, { "epoch": 7.809772412105392, "grad_norm": 0.6259801387786865, "learning_rate": 6.957861760137771e-06, "loss": 0.3045, "num_input_tokens_seen": 91410688, "step": 95740 }, { "epoch": 7.8101802757158, "grad_norm": 19.2085018157959, "learning_rate": 6.9553981334197356e-06, "loss": 0.2902, "num_input_tokens_seen": 91415488, "step": 95745 }, { "epoch": 7.810588139326209, "grad_norm": 0.6545859575271606, "learning_rate": 6.952934872458111e-06, "loss": 0.3044, "num_input_tokens_seen": 91419888, "step": 95750 }, { "epoch": 7.810996002936618, "grad_norm": 44.24134826660156, "learning_rate": 6.9504719773028194e-06, "loss": 0.2847, "num_input_tokens_seen": 91425632, "step": 95755 }, { "epoch": 7.8114038665470265, "grad_norm": 19.93092155456543, "learning_rate": 6.948009448003798e-06, "loss": 0.4272, "num_input_tokens_seen": 91430896, "step": 95760 }, { "epoch": 7.8118117301574355, "grad_norm": 0.5680469274520874, "learning_rate": 6.945547284610956e-06, "loss": 0.2802, "num_input_tokens_seen": 91435968, "step": 95765 }, { "epoch": 7.8122195937678445, "grad_norm": 40.44344711303711, "learning_rate": 6.9430854871742e-06, "loss": 0.3766, "num_input_tokens_seen": 91441744, "step": 95770 }, { "epoch": 7.812627457378253, "grad_norm": 0.604301929473877, "learning_rate": 6.9406240557434194e-06, "loss": 0.221, "num_input_tokens_seen": 91446272, "step": 95775 }, { "epoch": 7.813035320988662, "grad_norm": 40.082481384277344, "learning_rate": 6.938162990368522e-06, "loss": 0.3613, "num_input_tokens_seen": 91451264, "step": 95780 }, { "epoch": 7.81344318459907, "grad_norm": 0.5359853506088257, "learning_rate": 6.935702291099386e-06, "loss": 0.1836, "num_input_tokens_seen": 91456016, "step": 95785 }, { "epoch": 7.813851048209479, "grad_norm": 40.89910888671875, "learning_rate": 6.933241957985889e-06, "loss": 0.3178, "num_input_tokens_seen": 91460576, "step": 95790 }, { "epoch": 7.814258911819888, "grad_norm": 19.561946868896484, "learning_rate": 6.9307819910779e-06, "loss": 0.5202, "num_input_tokens_seen": 91466016, "step": 95795 }, { "epoch": 7.814666775430296, "grad_norm": 85.47425079345703, "learning_rate": 6.928322390425276e-06, "loss": 0.2743, "num_input_tokens_seen": 91470880, "step": 95800 }, { "epoch": 7.815074639040705, "grad_norm": 70.90384674072266, "learning_rate": 6.925863156077883e-06, "loss": 0.5152, "num_input_tokens_seen": 91474992, "step": 95805 }, { "epoch": 7.815482502651113, "grad_norm": 17.145204544067383, "learning_rate": 6.923404288085566e-06, "loss": 0.3415, "num_input_tokens_seen": 91479152, "step": 95810 }, { "epoch": 7.815890366261522, "grad_norm": 11.340926170349121, "learning_rate": 6.920945786498162e-06, "loss": 0.3396, "num_input_tokens_seen": 91484256, "step": 95815 }, { "epoch": 7.816298229871931, "grad_norm": 0.4137629270553589, "learning_rate": 6.918487651365504e-06, "loss": 0.2899, "num_input_tokens_seen": 91488736, "step": 95820 }, { "epoch": 7.816706093482339, "grad_norm": 7.305912494659424, "learning_rate": 6.916029882737421e-06, "loss": 0.2883, "num_input_tokens_seen": 91492784, "step": 95825 }, { "epoch": 7.817113957092748, "grad_norm": 9.116135597229004, "learning_rate": 6.913572480663724e-06, "loss": 0.637, "num_input_tokens_seen": 91497936, "step": 95830 }, { "epoch": 7.817521820703157, "grad_norm": 16.64603042602539, "learning_rate": 6.91111544519423e-06, "loss": 0.4154, "num_input_tokens_seen": 91502880, "step": 95835 }, { "epoch": 7.817929684313565, "grad_norm": 6.4658637046813965, "learning_rate": 6.908658776378729e-06, "loss": 0.473, "num_input_tokens_seen": 91507648, "step": 95840 }, { "epoch": 7.818337547923974, "grad_norm": 52.64369201660156, "learning_rate": 6.906202474267038e-06, "loss": 0.2816, "num_input_tokens_seen": 91512592, "step": 95845 }, { "epoch": 7.818745411534383, "grad_norm": 1.1341006755828857, "learning_rate": 6.903746538908934e-06, "loss": 0.4973, "num_input_tokens_seen": 91516768, "step": 95850 }, { "epoch": 7.819153275144791, "grad_norm": 2.005110502243042, "learning_rate": 6.9012909703541965e-06, "loss": 0.4761, "num_input_tokens_seen": 91521680, "step": 95855 }, { "epoch": 7.8195611387552, "grad_norm": 1.3499754667282104, "learning_rate": 6.898835768652601e-06, "loss": 0.313, "num_input_tokens_seen": 91526240, "step": 95860 }, { "epoch": 7.8199690023656085, "grad_norm": 0.5307111740112305, "learning_rate": 6.896380933853905e-06, "loss": 0.4613, "num_input_tokens_seen": 91531120, "step": 95865 }, { "epoch": 7.8203768659760176, "grad_norm": 1.4226478338241577, "learning_rate": 6.893926466007883e-06, "loss": 0.3768, "num_input_tokens_seen": 91535968, "step": 95870 }, { "epoch": 7.820784729586427, "grad_norm": 45.95333480834961, "learning_rate": 6.891472365164278e-06, "loss": 0.366, "num_input_tokens_seen": 91541328, "step": 95875 }, { "epoch": 7.821192593196835, "grad_norm": 8.848402976989746, "learning_rate": 6.889018631372832e-06, "loss": 0.418, "num_input_tokens_seen": 91546176, "step": 95880 }, { "epoch": 7.821600456807244, "grad_norm": 0.2602604329586029, "learning_rate": 6.886565264683276e-06, "loss": 0.308, "num_input_tokens_seen": 91550336, "step": 95885 }, { "epoch": 7.822008320417653, "grad_norm": 0.8349879384040833, "learning_rate": 6.884112265145351e-06, "loss": 0.3817, "num_input_tokens_seen": 91555968, "step": 95890 }, { "epoch": 7.822416184028061, "grad_norm": 1.003234624862671, "learning_rate": 6.8816596328087765e-06, "loss": 0.4599, "num_input_tokens_seen": 91560480, "step": 95895 }, { "epoch": 7.82282404763847, "grad_norm": 0.7641655206680298, "learning_rate": 6.879207367723259e-06, "loss": 0.3631, "num_input_tokens_seen": 91565472, "step": 95900 }, { "epoch": 7.823231911248879, "grad_norm": 46.27322006225586, "learning_rate": 6.876755469938509e-06, "loss": 0.4008, "num_input_tokens_seen": 91569968, "step": 95905 }, { "epoch": 7.823639774859287, "grad_norm": 0.5379180908203125, "learning_rate": 6.874303939504217e-06, "loss": 0.3544, "num_input_tokens_seen": 91574672, "step": 95910 }, { "epoch": 7.824047638469696, "grad_norm": 5.462811470031738, "learning_rate": 6.871852776470089e-06, "loss": 0.3019, "num_input_tokens_seen": 91578800, "step": 95915 }, { "epoch": 7.824455502080104, "grad_norm": 1.267629623413086, "learning_rate": 6.869401980885803e-06, "loss": 0.2803, "num_input_tokens_seen": 91583184, "step": 95920 }, { "epoch": 7.824863365690513, "grad_norm": 3.4907732009887695, "learning_rate": 6.866951552801035e-06, "loss": 0.2327, "num_input_tokens_seen": 91587392, "step": 95925 }, { "epoch": 7.825271229300922, "grad_norm": 28.824352264404297, "learning_rate": 6.864501492265446e-06, "loss": 0.3879, "num_input_tokens_seen": 91592832, "step": 95930 }, { "epoch": 7.82567909291133, "grad_norm": 0.7706685662269592, "learning_rate": 6.8620517993287115e-06, "loss": 0.3889, "num_input_tokens_seen": 91597536, "step": 95935 }, { "epoch": 7.826086956521739, "grad_norm": 0.9737359285354614, "learning_rate": 6.85960247404048e-06, "loss": 0.3753, "num_input_tokens_seen": 91602240, "step": 95940 }, { "epoch": 7.826494820132147, "grad_norm": 0.6838791370391846, "learning_rate": 6.857153516450401e-06, "loss": 0.2971, "num_input_tokens_seen": 91607824, "step": 95945 }, { "epoch": 7.826902683742556, "grad_norm": 0.6880478858947754, "learning_rate": 6.854704926608108e-06, "loss": 0.4432, "num_input_tokens_seen": 91612176, "step": 95950 }, { "epoch": 7.827310547352965, "grad_norm": 26.34723472595215, "learning_rate": 6.852256704563237e-06, "loss": 0.3104, "num_input_tokens_seen": 91617232, "step": 95955 }, { "epoch": 7.8277184109633735, "grad_norm": 2.280271530151367, "learning_rate": 6.849808850365402e-06, "loss": 0.2454, "num_input_tokens_seen": 91621936, "step": 95960 }, { "epoch": 7.8281262745737825, "grad_norm": 0.9615530967712402, "learning_rate": 6.847361364064239e-06, "loss": 0.3715, "num_input_tokens_seen": 91626928, "step": 95965 }, { "epoch": 7.8285341381841915, "grad_norm": 1.6668280363082886, "learning_rate": 6.844914245709347e-06, "loss": 0.2843, "num_input_tokens_seen": 91632368, "step": 95970 }, { "epoch": 7.8289420017946, "grad_norm": 34.234195709228516, "learning_rate": 6.842467495350327e-06, "loss": 0.3095, "num_input_tokens_seen": 91637568, "step": 95975 }, { "epoch": 7.829349865405009, "grad_norm": 81.72762298583984, "learning_rate": 6.840021113036776e-06, "loss": 0.3975, "num_input_tokens_seen": 91642416, "step": 95980 }, { "epoch": 7.829757729015418, "grad_norm": 0.7203027009963989, "learning_rate": 6.837575098818283e-06, "loss": 0.4268, "num_input_tokens_seen": 91647280, "step": 95985 }, { "epoch": 7.830165592625826, "grad_norm": 0.7219815254211426, "learning_rate": 6.835129452744421e-06, "loss": 0.3759, "num_input_tokens_seen": 91651424, "step": 95990 }, { "epoch": 7.830573456236235, "grad_norm": 37.196937561035156, "learning_rate": 6.83268417486477e-06, "loss": 0.3353, "num_input_tokens_seen": 91657360, "step": 95995 }, { "epoch": 7.830981319846643, "grad_norm": 0.8104854226112366, "learning_rate": 6.83023926522888e-06, "loss": 0.4374, "num_input_tokens_seen": 91662496, "step": 96000 }, { "epoch": 7.831389183457052, "grad_norm": 111.84565734863281, "learning_rate": 6.82779472388633e-06, "loss": 0.5807, "num_input_tokens_seen": 91666752, "step": 96005 }, { "epoch": 7.831797047067461, "grad_norm": 1.3983429670333862, "learning_rate": 6.825350550886658e-06, "loss": 0.2276, "num_input_tokens_seen": 91671120, "step": 96010 }, { "epoch": 7.832204910677869, "grad_norm": 0.6430536508560181, "learning_rate": 6.822906746279406e-06, "loss": 0.3897, "num_input_tokens_seen": 91675616, "step": 96015 }, { "epoch": 7.832612774288278, "grad_norm": 0.6692298650741577, "learning_rate": 6.820463310114103e-06, "loss": 0.2262, "num_input_tokens_seen": 91680768, "step": 96020 }, { "epoch": 7.833020637898686, "grad_norm": 3.9762635231018066, "learning_rate": 6.818020242440293e-06, "loss": 0.4176, "num_input_tokens_seen": 91685936, "step": 96025 }, { "epoch": 7.833428501509095, "grad_norm": 0.3371990919113159, "learning_rate": 6.815577543307486e-06, "loss": 0.4308, "num_input_tokens_seen": 91690128, "step": 96030 }, { "epoch": 7.833836365119504, "grad_norm": 0.519412636756897, "learning_rate": 6.8131352127651955e-06, "loss": 0.3547, "num_input_tokens_seen": 91694368, "step": 96035 }, { "epoch": 7.834244228729912, "grad_norm": 21.13364601135254, "learning_rate": 6.810693250862918e-06, "loss": 0.5178, "num_input_tokens_seen": 91698944, "step": 96040 }, { "epoch": 7.834652092340321, "grad_norm": 0.5988839268684387, "learning_rate": 6.808251657650166e-06, "loss": 0.3422, "num_input_tokens_seen": 91703792, "step": 96045 }, { "epoch": 7.83505995595073, "grad_norm": 0.274156391620636, "learning_rate": 6.805810433176424e-06, "loss": 0.3433, "num_input_tokens_seen": 91708288, "step": 96050 }, { "epoch": 7.8354678195611385, "grad_norm": 54.0175895690918, "learning_rate": 6.803369577491172e-06, "loss": 0.3562, "num_input_tokens_seen": 91713888, "step": 96055 }, { "epoch": 7.8358756831715475, "grad_norm": 0.7794265151023865, "learning_rate": 6.80092909064389e-06, "loss": 0.3473, "num_input_tokens_seen": 91719088, "step": 96060 }, { "epoch": 7.8362835467819565, "grad_norm": 3.5721898078918457, "learning_rate": 6.79848897268403e-06, "loss": 0.3019, "num_input_tokens_seen": 91724048, "step": 96065 }, { "epoch": 7.836691410392365, "grad_norm": 0.4119349718093872, "learning_rate": 6.796049223661075e-06, "loss": 0.3704, "num_input_tokens_seen": 91728960, "step": 96070 }, { "epoch": 7.837099274002774, "grad_norm": 0.5462456345558167, "learning_rate": 6.793609843624466e-06, "loss": 0.3141, "num_input_tokens_seen": 91733440, "step": 96075 }, { "epoch": 7.837507137613182, "grad_norm": 0.573225736618042, "learning_rate": 6.791170832623647e-06, "loss": 0.3449, "num_input_tokens_seen": 91738528, "step": 96080 }, { "epoch": 7.837915001223591, "grad_norm": 0.8393514156341553, "learning_rate": 6.788732190708053e-06, "loss": 0.4129, "num_input_tokens_seen": 91743616, "step": 96085 }, { "epoch": 7.838322864834, "grad_norm": 0.5107699036598206, "learning_rate": 6.786293917927125e-06, "loss": 0.2689, "num_input_tokens_seen": 91747728, "step": 96090 }, { "epoch": 7.838730728444408, "grad_norm": 37.30651092529297, "learning_rate": 6.783856014330281e-06, "loss": 0.3556, "num_input_tokens_seen": 91752048, "step": 96095 }, { "epoch": 7.839138592054817, "grad_norm": 3.3265347480773926, "learning_rate": 6.781418479966936e-06, "loss": 0.4805, "num_input_tokens_seen": 91757056, "step": 96100 }, { "epoch": 7.839546455665226, "grad_norm": 0.5132907629013062, "learning_rate": 6.778981314886496e-06, "loss": 0.3168, "num_input_tokens_seen": 91761664, "step": 96105 }, { "epoch": 7.839954319275634, "grad_norm": 0.4734112322330475, "learning_rate": 6.776544519138362e-06, "loss": 0.4018, "num_input_tokens_seen": 91765936, "step": 96110 }, { "epoch": 7.840362182886043, "grad_norm": 28.96234703063965, "learning_rate": 6.774108092771925e-06, "loss": 0.3375, "num_input_tokens_seen": 91771376, "step": 96115 }, { "epoch": 7.840770046496452, "grad_norm": 0.9970617294311523, "learning_rate": 6.771672035836574e-06, "loss": 0.3216, "num_input_tokens_seen": 91775440, "step": 96120 }, { "epoch": 7.84117791010686, "grad_norm": 0.4892914295196533, "learning_rate": 6.769236348381685e-06, "loss": 0.27, "num_input_tokens_seen": 91780608, "step": 96125 }, { "epoch": 7.841585773717269, "grad_norm": 1.871675968170166, "learning_rate": 6.766801030456621e-06, "loss": 0.2487, "num_input_tokens_seen": 91785536, "step": 96130 }, { "epoch": 7.841993637327677, "grad_norm": 1.0604004859924316, "learning_rate": 6.764366082110759e-06, "loss": 0.4594, "num_input_tokens_seen": 91790208, "step": 96135 }, { "epoch": 7.842401500938086, "grad_norm": 12.884743690490723, "learning_rate": 6.7619315033934485e-06, "loss": 0.1857, "num_input_tokens_seen": 91795248, "step": 96140 }, { "epoch": 7.842809364548495, "grad_norm": 2.4094362258911133, "learning_rate": 6.759497294354036e-06, "loss": 0.2657, "num_input_tokens_seen": 91799840, "step": 96145 }, { "epoch": 7.843217228158903, "grad_norm": 0.5911406874656677, "learning_rate": 6.757063455041865e-06, "loss": 0.3099, "num_input_tokens_seen": 91805104, "step": 96150 }, { "epoch": 7.843625091769312, "grad_norm": 0.9312556982040405, "learning_rate": 6.754629985506256e-06, "loss": 0.2993, "num_input_tokens_seen": 91809584, "step": 96155 }, { "epoch": 7.844032955379721, "grad_norm": 30.71678352355957, "learning_rate": 6.752196885796552e-06, "loss": 0.3565, "num_input_tokens_seen": 91815232, "step": 96160 }, { "epoch": 7.84444081899013, "grad_norm": 1.0516854524612427, "learning_rate": 6.749764155962062e-06, "loss": 0.3146, "num_input_tokens_seen": 91819424, "step": 96165 }, { "epoch": 7.844848682600539, "grad_norm": 1.0116957426071167, "learning_rate": 6.7473317960521005e-06, "loss": 0.38, "num_input_tokens_seen": 91823824, "step": 96170 }, { "epoch": 7.845256546210947, "grad_norm": 7.5723490715026855, "learning_rate": 6.744899806115959e-06, "loss": 0.3607, "num_input_tokens_seen": 91828512, "step": 96175 }, { "epoch": 7.845664409821356, "grad_norm": 1.7266662120819092, "learning_rate": 6.742468186202949e-06, "loss": 0.4632, "num_input_tokens_seen": 91833504, "step": 96180 }, { "epoch": 7.846072273431765, "grad_norm": 7.329978942871094, "learning_rate": 6.740036936362348e-06, "loss": 0.3152, "num_input_tokens_seen": 91838816, "step": 96185 }, { "epoch": 7.846480137042173, "grad_norm": 0.5242847800254822, "learning_rate": 6.737606056643441e-06, "loss": 0.3916, "num_input_tokens_seen": 91843568, "step": 96190 }, { "epoch": 7.846888000652582, "grad_norm": 0.8812377452850342, "learning_rate": 6.735175547095493e-06, "loss": 0.4824, "num_input_tokens_seen": 91848784, "step": 96195 }, { "epoch": 7.847295864262991, "grad_norm": 0.9140903949737549, "learning_rate": 6.732745407767782e-06, "loss": 0.3208, "num_input_tokens_seen": 91853408, "step": 96200 }, { "epoch": 7.847703727873399, "grad_norm": 1.6636439561843872, "learning_rate": 6.730315638709561e-06, "loss": 0.318, "num_input_tokens_seen": 91858848, "step": 96205 }, { "epoch": 7.848111591483808, "grad_norm": 2.1289567947387695, "learning_rate": 6.727886239970077e-06, "loss": 0.3238, "num_input_tokens_seen": 91864240, "step": 96210 }, { "epoch": 7.848519455094216, "grad_norm": 1.275356650352478, "learning_rate": 6.725457211598574e-06, "loss": 0.3125, "num_input_tokens_seen": 91869120, "step": 96215 }, { "epoch": 7.848927318704625, "grad_norm": 3.8898746967315674, "learning_rate": 6.723028553644281e-06, "loss": 0.2243, "num_input_tokens_seen": 91874704, "step": 96220 }, { "epoch": 7.849335182315034, "grad_norm": 1.016095519065857, "learning_rate": 6.720600266156441e-06, "loss": 0.301, "num_input_tokens_seen": 91879840, "step": 96225 }, { "epoch": 7.849743045925442, "grad_norm": 6.607588768005371, "learning_rate": 6.718172349184265e-06, "loss": 0.3224, "num_input_tokens_seen": 91884288, "step": 96230 }, { "epoch": 7.850150909535851, "grad_norm": 1.16496741771698, "learning_rate": 6.7157448027769685e-06, "loss": 0.4366, "num_input_tokens_seen": 91888672, "step": 96235 }, { "epoch": 7.85055877314626, "grad_norm": 3.102003812789917, "learning_rate": 6.713317626983753e-06, "loss": 0.2758, "num_input_tokens_seen": 91892480, "step": 96240 }, { "epoch": 7.850966636756668, "grad_norm": 0.917729914188385, "learning_rate": 6.710890821853821e-06, "loss": 0.337, "num_input_tokens_seen": 91897840, "step": 96245 }, { "epoch": 7.851374500367077, "grad_norm": 16.56142807006836, "learning_rate": 6.708464387436361e-06, "loss": 0.591, "num_input_tokens_seen": 91902624, "step": 96250 }, { "epoch": 7.851782363977486, "grad_norm": 26.691621780395508, "learning_rate": 6.706038323780553e-06, "loss": 0.3694, "num_input_tokens_seen": 91907392, "step": 96255 }, { "epoch": 7.8521902275878945, "grad_norm": 1.338394045829773, "learning_rate": 6.703612630935577e-06, "loss": 0.3274, "num_input_tokens_seen": 91911920, "step": 96260 }, { "epoch": 7.8525980911983035, "grad_norm": 0.691068708896637, "learning_rate": 6.701187308950587e-06, "loss": 0.2474, "num_input_tokens_seen": 91917280, "step": 96265 }, { "epoch": 7.853005954808712, "grad_norm": 0.6350158452987671, "learning_rate": 6.698762357874766e-06, "loss": 0.3217, "num_input_tokens_seen": 91921312, "step": 96270 }, { "epoch": 7.853413818419121, "grad_norm": 16.869659423828125, "learning_rate": 6.696337777757256e-06, "loss": 0.2734, "num_input_tokens_seen": 91925120, "step": 96275 }, { "epoch": 7.85382168202953, "grad_norm": 1.5620718002319336, "learning_rate": 6.693913568647198e-06, "loss": 0.2764, "num_input_tokens_seen": 91930048, "step": 96280 }, { "epoch": 7.854229545639938, "grad_norm": 2.7019107341766357, "learning_rate": 6.691489730593728e-06, "loss": 0.319, "num_input_tokens_seen": 91933904, "step": 96285 }, { "epoch": 7.854637409250347, "grad_norm": 1.5682355165481567, "learning_rate": 6.689066263645988e-06, "loss": 0.2975, "num_input_tokens_seen": 91938656, "step": 96290 }, { "epoch": 7.855045272860755, "grad_norm": 1.2823796272277832, "learning_rate": 6.6866431678530935e-06, "loss": 0.5202, "num_input_tokens_seen": 91943664, "step": 96295 }, { "epoch": 7.855453136471164, "grad_norm": 0.3710100054740906, "learning_rate": 6.684220443264161e-06, "loss": 0.3497, "num_input_tokens_seen": 91949200, "step": 96300 }, { "epoch": 7.855861000081573, "grad_norm": 2.548447608947754, "learning_rate": 6.6817980899282984e-06, "loss": 0.3551, "num_input_tokens_seen": 91953936, "step": 96305 }, { "epoch": 7.856268863691981, "grad_norm": 4.0703582763671875, "learning_rate": 6.679376107894597e-06, "loss": 0.2941, "num_input_tokens_seen": 91958464, "step": 96310 }, { "epoch": 7.85667672730239, "grad_norm": 1.2372812032699585, "learning_rate": 6.6769544972121625e-06, "loss": 0.3976, "num_input_tokens_seen": 91962720, "step": 96315 }, { "epoch": 7.857084590912799, "grad_norm": 1.8241409063339233, "learning_rate": 6.674533257930074e-06, "loss": 0.2605, "num_input_tokens_seen": 91967408, "step": 96320 }, { "epoch": 7.857492454523207, "grad_norm": 2.788151502609253, "learning_rate": 6.6721123900974125e-06, "loss": 0.3692, "num_input_tokens_seen": 91972016, "step": 96325 }, { "epoch": 7.857900318133616, "grad_norm": 1.0509634017944336, "learning_rate": 6.669691893763236e-06, "loss": 0.4549, "num_input_tokens_seen": 91976304, "step": 96330 }, { "epoch": 7.858308181744025, "grad_norm": 0.8341285586357117, "learning_rate": 6.667271768976621e-06, "loss": 0.3449, "num_input_tokens_seen": 91981008, "step": 96335 }, { "epoch": 7.858716045354433, "grad_norm": 1.414876103401184, "learning_rate": 6.664852015786621e-06, "loss": 0.2958, "num_input_tokens_seen": 91985424, "step": 96340 }, { "epoch": 7.859123908964842, "grad_norm": 1.2086924314498901, "learning_rate": 6.662432634242277e-06, "loss": 0.3692, "num_input_tokens_seen": 91990336, "step": 96345 }, { "epoch": 7.8595317725752505, "grad_norm": 2.8983683586120605, "learning_rate": 6.660013624392633e-06, "loss": 0.3942, "num_input_tokens_seen": 91994272, "step": 96350 }, { "epoch": 7.8599396361856595, "grad_norm": 34.316673278808594, "learning_rate": 6.657594986286711e-06, "loss": 0.3375, "num_input_tokens_seen": 91999792, "step": 96355 }, { "epoch": 7.8603474997960685, "grad_norm": 15.911584854125977, "learning_rate": 6.655176719973552e-06, "loss": 0.3194, "num_input_tokens_seen": 92004400, "step": 96360 }, { "epoch": 7.860755363406477, "grad_norm": 10.2064208984375, "learning_rate": 6.652758825502167e-06, "loss": 0.2023, "num_input_tokens_seen": 92008592, "step": 96365 }, { "epoch": 7.861163227016886, "grad_norm": 19.006486892700195, "learning_rate": 6.6503413029215615e-06, "loss": 0.3978, "num_input_tokens_seen": 92013520, "step": 96370 }, { "epoch": 7.861571090627294, "grad_norm": 1.491526484489441, "learning_rate": 6.647924152280743e-06, "loss": 0.3239, "num_input_tokens_seen": 92018000, "step": 96375 }, { "epoch": 7.861978954237703, "grad_norm": 0.5872836112976074, "learning_rate": 6.645507373628704e-06, "loss": 0.5485, "num_input_tokens_seen": 92021744, "step": 96380 }, { "epoch": 7.862386817848112, "grad_norm": 20.19546890258789, "learning_rate": 6.643090967014432e-06, "loss": 0.2702, "num_input_tokens_seen": 92026640, "step": 96385 }, { "epoch": 7.86279468145852, "grad_norm": 1.0258762836456299, "learning_rate": 6.640674932486901e-06, "loss": 0.4999, "num_input_tokens_seen": 92031200, "step": 96390 }, { "epoch": 7.863202545068929, "grad_norm": 20.177438735961914, "learning_rate": 6.638259270095085e-06, "loss": 0.5126, "num_input_tokens_seen": 92036144, "step": 96395 }, { "epoch": 7.863610408679338, "grad_norm": 1.4265565872192383, "learning_rate": 6.635843979887957e-06, "loss": 0.4445, "num_input_tokens_seen": 92040160, "step": 96400 }, { "epoch": 7.864018272289746, "grad_norm": 0.922624409198761, "learning_rate": 6.633429061914467e-06, "loss": 0.2961, "num_input_tokens_seen": 92044224, "step": 96405 }, { "epoch": 7.864426135900155, "grad_norm": 0.3308897912502289, "learning_rate": 6.631014516223569e-06, "loss": 0.5699, "num_input_tokens_seen": 92048608, "step": 96410 }, { "epoch": 7.864833999510564, "grad_norm": 1.6533764600753784, "learning_rate": 6.6286003428641965e-06, "loss": 0.3648, "num_input_tokens_seen": 92054608, "step": 96415 }, { "epoch": 7.865241863120972, "grad_norm": 6.201361656188965, "learning_rate": 6.6261865418852845e-06, "loss": 0.2101, "num_input_tokens_seen": 92059984, "step": 96420 }, { "epoch": 7.865649726731381, "grad_norm": 1.2047793865203857, "learning_rate": 6.623773113335771e-06, "loss": 0.4331, "num_input_tokens_seen": 92064704, "step": 96425 }, { "epoch": 7.866057590341789, "grad_norm": 14.898078918457031, "learning_rate": 6.621360057264567e-06, "loss": 0.3763, "num_input_tokens_seen": 92069776, "step": 96430 }, { "epoch": 7.866465453952198, "grad_norm": 30.920236587524414, "learning_rate": 6.618947373720585e-06, "loss": 0.345, "num_input_tokens_seen": 92075360, "step": 96435 }, { "epoch": 7.866873317562607, "grad_norm": 0.6750487685203552, "learning_rate": 6.616535062752724e-06, "loss": 0.33, "num_input_tokens_seen": 92080080, "step": 96440 }, { "epoch": 7.8672811811730154, "grad_norm": 1.4738918542861938, "learning_rate": 6.614123124409891e-06, "loss": 0.3864, "num_input_tokens_seen": 92085408, "step": 96445 }, { "epoch": 7.8676890447834245, "grad_norm": 1.0841312408447266, "learning_rate": 6.611711558740969e-06, "loss": 0.2642, "num_input_tokens_seen": 92090928, "step": 96450 }, { "epoch": 7.8680969083938335, "grad_norm": 25.861370086669922, "learning_rate": 6.60930036579484e-06, "loss": 0.3621, "num_input_tokens_seen": 92095248, "step": 96455 }, { "epoch": 7.868504772004242, "grad_norm": 0.8040089011192322, "learning_rate": 6.606889545620379e-06, "loss": 0.3511, "num_input_tokens_seen": 92100784, "step": 96460 }, { "epoch": 7.868912635614651, "grad_norm": 38.231266021728516, "learning_rate": 6.604479098266445e-06, "loss": 0.3324, "num_input_tokens_seen": 92105600, "step": 96465 }, { "epoch": 7.86932049922506, "grad_norm": 0.547275960445404, "learning_rate": 6.6020690237819095e-06, "loss": 0.4224, "num_input_tokens_seen": 92110272, "step": 96470 }, { "epoch": 7.869728362835468, "grad_norm": 6.805372714996338, "learning_rate": 6.599659322215615e-06, "loss": 0.2177, "num_input_tokens_seen": 92114944, "step": 96475 }, { "epoch": 7.870136226445877, "grad_norm": 0.6072078943252563, "learning_rate": 6.5972499936164084e-06, "loss": 0.3514, "num_input_tokens_seen": 92120320, "step": 96480 }, { "epoch": 7.870544090056285, "grad_norm": 4.634647369384766, "learning_rate": 6.59484103803312e-06, "loss": 0.3348, "num_input_tokens_seen": 92124320, "step": 96485 }, { "epoch": 7.870951953666694, "grad_norm": 0.7848624587059021, "learning_rate": 6.5924324555145855e-06, "loss": 0.3091, "num_input_tokens_seen": 92129088, "step": 96490 }, { "epoch": 7.871359817277103, "grad_norm": 1.1669580936431885, "learning_rate": 6.590024246109627e-06, "loss": 0.2871, "num_input_tokens_seen": 92133280, "step": 96495 }, { "epoch": 7.871767680887511, "grad_norm": 0.504045844078064, "learning_rate": 6.587616409867053e-06, "loss": 0.3267, "num_input_tokens_seen": 92138128, "step": 96500 }, { "epoch": 7.87217554449792, "grad_norm": 18.50261116027832, "learning_rate": 6.585208946835669e-06, "loss": 0.2231, "num_input_tokens_seen": 92143664, "step": 96505 }, { "epoch": 7.872583408108328, "grad_norm": 2.23535418510437, "learning_rate": 6.582801857064277e-06, "loss": 0.4304, "num_input_tokens_seen": 92148432, "step": 96510 }, { "epoch": 7.872991271718737, "grad_norm": 0.4421333372592926, "learning_rate": 6.580395140601664e-06, "loss": 0.4543, "num_input_tokens_seen": 92152800, "step": 96515 }, { "epoch": 7.873399135329146, "grad_norm": 0.4474929869174957, "learning_rate": 6.577988797496617e-06, "loss": 0.3723, "num_input_tokens_seen": 92158048, "step": 96520 }, { "epoch": 7.873806998939554, "grad_norm": 4.549267768859863, "learning_rate": 6.575582827797907e-06, "loss": 0.2803, "num_input_tokens_seen": 92162640, "step": 96525 }, { "epoch": 7.874214862549963, "grad_norm": 0.7897415161132812, "learning_rate": 6.573177231554298e-06, "loss": 0.3649, "num_input_tokens_seen": 92168016, "step": 96530 }, { "epoch": 7.874622726160372, "grad_norm": 0.8099258542060852, "learning_rate": 6.57077200881456e-06, "loss": 0.287, "num_input_tokens_seen": 92172848, "step": 96535 }, { "epoch": 7.87503058977078, "grad_norm": 114.30354309082031, "learning_rate": 6.568367159627448e-06, "loss": 0.3661, "num_input_tokens_seen": 92176624, "step": 96540 }, { "epoch": 7.875438453381189, "grad_norm": 20.270572662353516, "learning_rate": 6.565962684041696e-06, "loss": 0.419, "num_input_tokens_seen": 92181232, "step": 96545 }, { "epoch": 7.875846316991598, "grad_norm": 0.378522127866745, "learning_rate": 6.563558582106053e-06, "loss": 0.2187, "num_input_tokens_seen": 92186688, "step": 96550 }, { "epoch": 7.8762541806020065, "grad_norm": 0.9579943418502808, "learning_rate": 6.561154853869231e-06, "loss": 0.3332, "num_input_tokens_seen": 92191536, "step": 96555 }, { "epoch": 7.876662044212416, "grad_norm": 14.842976570129395, "learning_rate": 6.558751499379976e-06, "loss": 0.3633, "num_input_tokens_seen": 92196752, "step": 96560 }, { "epoch": 7.877069907822824, "grad_norm": 37.63529586791992, "learning_rate": 6.556348518686989e-06, "loss": 0.4167, "num_input_tokens_seen": 92201344, "step": 96565 }, { "epoch": 7.877477771433233, "grad_norm": 9.223776817321777, "learning_rate": 6.553945911838982e-06, "loss": 0.5134, "num_input_tokens_seen": 92206016, "step": 96570 }, { "epoch": 7.877885635043642, "grad_norm": 2.5894172191619873, "learning_rate": 6.551543678884645e-06, "loss": 0.348, "num_input_tokens_seen": 92210752, "step": 96575 }, { "epoch": 7.87829349865405, "grad_norm": 30.9095458984375, "learning_rate": 6.549141819872687e-06, "loss": 0.4034, "num_input_tokens_seen": 92215728, "step": 96580 }, { "epoch": 7.878701362264459, "grad_norm": 11.36620044708252, "learning_rate": 6.5467403348517835e-06, "loss": 0.4376, "num_input_tokens_seen": 92220240, "step": 96585 }, { "epoch": 7.879109225874867, "grad_norm": 1.844889521598816, "learning_rate": 6.54433922387061e-06, "loss": 0.4082, "num_input_tokens_seen": 92225520, "step": 96590 }, { "epoch": 7.879517089485276, "grad_norm": 1.1744743585586548, "learning_rate": 6.5419384869778335e-06, "loss": 0.3694, "num_input_tokens_seen": 92230368, "step": 96595 }, { "epoch": 7.879924953095685, "grad_norm": 0.5772964358329773, "learning_rate": 6.539538124222127e-06, "loss": 0.4588, "num_input_tokens_seen": 92235888, "step": 96600 }, { "epoch": 7.880332816706094, "grad_norm": 30.98187828063965, "learning_rate": 6.53713813565214e-06, "loss": 0.5321, "num_input_tokens_seen": 92241280, "step": 96605 }, { "epoch": 7.880740680316502, "grad_norm": 3.5366806983947754, "learning_rate": 6.5347385213165165e-06, "loss": 0.2618, "num_input_tokens_seen": 92246496, "step": 96610 }, { "epoch": 7.881148543926911, "grad_norm": 12.143420219421387, "learning_rate": 6.532339281263897e-06, "loss": 0.255, "num_input_tokens_seen": 92250912, "step": 96615 }, { "epoch": 7.881556407537319, "grad_norm": 0.9368565082550049, "learning_rate": 6.529940415542904e-06, "loss": 0.2843, "num_input_tokens_seen": 92255680, "step": 96620 }, { "epoch": 7.881964271147728, "grad_norm": 104.36203002929688, "learning_rate": 6.52754192420218e-06, "loss": 0.3003, "num_input_tokens_seen": 92260864, "step": 96625 }, { "epoch": 7.882372134758137, "grad_norm": 1.0706608295440674, "learning_rate": 6.52514380729033e-06, "loss": 0.3685, "num_input_tokens_seen": 92265488, "step": 96630 }, { "epoch": 7.882779998368545, "grad_norm": 1.332854986190796, "learning_rate": 6.522746064855964e-06, "loss": 0.2712, "num_input_tokens_seen": 92270976, "step": 96635 }, { "epoch": 7.883187861978954, "grad_norm": 0.5925281643867493, "learning_rate": 6.520348696947684e-06, "loss": 0.2414, "num_input_tokens_seen": 92274336, "step": 96640 }, { "epoch": 7.8835957255893625, "grad_norm": 2.464616537094116, "learning_rate": 6.517951703614086e-06, "loss": 0.5472, "num_input_tokens_seen": 92280288, "step": 96645 }, { "epoch": 7.8840035891997715, "grad_norm": 53.7281608581543, "learning_rate": 6.515555084903752e-06, "loss": 0.4302, "num_input_tokens_seen": 92285392, "step": 96650 }, { "epoch": 7.8844114528101805, "grad_norm": 3.8618946075439453, "learning_rate": 6.513158840865255e-06, "loss": 0.3227, "num_input_tokens_seen": 92290368, "step": 96655 }, { "epoch": 7.884819316420589, "grad_norm": 1.1275503635406494, "learning_rate": 6.5107629715471765e-06, "loss": 0.215, "num_input_tokens_seen": 92295392, "step": 96660 }, { "epoch": 7.885227180030998, "grad_norm": 117.34098052978516, "learning_rate": 6.508367476998081e-06, "loss": 0.4008, "num_input_tokens_seen": 92299712, "step": 96665 }, { "epoch": 7.885635043641407, "grad_norm": 2.1930737495422363, "learning_rate": 6.5059723572665185e-06, "loss": 0.3988, "num_input_tokens_seen": 92304720, "step": 96670 }, { "epoch": 7.886042907251815, "grad_norm": 1.8521289825439453, "learning_rate": 6.503577612401035e-06, "loss": 0.2997, "num_input_tokens_seen": 92309472, "step": 96675 }, { "epoch": 7.886450770862224, "grad_norm": 0.9499881863594055, "learning_rate": 6.5011832424501765e-06, "loss": 0.3666, "num_input_tokens_seen": 92313792, "step": 96680 }, { "epoch": 7.886858634472633, "grad_norm": 3.5286693572998047, "learning_rate": 6.498789247462464e-06, "loss": 0.448, "num_input_tokens_seen": 92318640, "step": 96685 }, { "epoch": 7.887266498083041, "grad_norm": 6.693701267242432, "learning_rate": 6.496395627486437e-06, "loss": 0.3446, "num_input_tokens_seen": 92323200, "step": 96690 }, { "epoch": 7.88767436169345, "grad_norm": 2.0051910877227783, "learning_rate": 6.4940023825706134e-06, "loss": 0.2895, "num_input_tokens_seen": 92328224, "step": 96695 }, { "epoch": 7.888082225303858, "grad_norm": 1.5701332092285156, "learning_rate": 6.491609512763494e-06, "loss": 0.3505, "num_input_tokens_seen": 92332272, "step": 96700 }, { "epoch": 7.888490088914267, "grad_norm": 3.688568115234375, "learning_rate": 6.489217018113586e-06, "loss": 0.367, "num_input_tokens_seen": 92336592, "step": 96705 }, { "epoch": 7.888897952524676, "grad_norm": 1.2267851829528809, "learning_rate": 6.486824898669374e-06, "loss": 0.3079, "num_input_tokens_seen": 92341360, "step": 96710 }, { "epoch": 7.889305816135084, "grad_norm": 1.4789124727249146, "learning_rate": 6.484433154479366e-06, "loss": 0.3825, "num_input_tokens_seen": 92345440, "step": 96715 }, { "epoch": 7.889713679745493, "grad_norm": 3.524592161178589, "learning_rate": 6.482041785592027e-06, "loss": 0.2411, "num_input_tokens_seen": 92350736, "step": 96720 }, { "epoch": 7.890121543355901, "grad_norm": 1.6609381437301636, "learning_rate": 6.479650792055833e-06, "loss": 0.3216, "num_input_tokens_seen": 92355184, "step": 96725 }, { "epoch": 7.89052940696631, "grad_norm": 1.6031290292739868, "learning_rate": 6.477260173919242e-06, "loss": 0.3753, "num_input_tokens_seen": 92360112, "step": 96730 }, { "epoch": 7.890937270576719, "grad_norm": 1.1686924695968628, "learning_rate": 6.474869931230723e-06, "loss": 0.4599, "num_input_tokens_seen": 92364368, "step": 96735 }, { "epoch": 7.8913451341871275, "grad_norm": 9.8691987991333, "learning_rate": 6.472480064038716e-06, "loss": 0.3685, "num_input_tokens_seen": 92369200, "step": 96740 }, { "epoch": 7.8917529977975365, "grad_norm": 35.24549865722656, "learning_rate": 6.470090572391668e-06, "loss": 0.3767, "num_input_tokens_seen": 92374384, "step": 96745 }, { "epoch": 7.8921608614079455, "grad_norm": 23.710390090942383, "learning_rate": 6.46770145633801e-06, "loss": 0.2092, "num_input_tokens_seen": 92378608, "step": 96750 }, { "epoch": 7.892568725018354, "grad_norm": 25.640098571777344, "learning_rate": 6.465312715926161e-06, "loss": 0.2312, "num_input_tokens_seen": 92383120, "step": 96755 }, { "epoch": 7.892976588628763, "grad_norm": 13.144672393798828, "learning_rate": 6.462924351204555e-06, "loss": 0.2715, "num_input_tokens_seen": 92387456, "step": 96760 }, { "epoch": 7.893384452239172, "grad_norm": 4.3130693435668945, "learning_rate": 6.460536362221595e-06, "loss": 0.4167, "num_input_tokens_seen": 92391664, "step": 96765 }, { "epoch": 7.89379231584958, "grad_norm": 1.3513306379318237, "learning_rate": 6.458148749025686e-06, "loss": 0.4172, "num_input_tokens_seen": 92396496, "step": 96770 }, { "epoch": 7.894200179459989, "grad_norm": 2.0403499603271484, "learning_rate": 6.455761511665215e-06, "loss": 0.3108, "num_input_tokens_seen": 92400976, "step": 96775 }, { "epoch": 7.894608043070397, "grad_norm": 0.6274423599243164, "learning_rate": 6.453374650188584e-06, "loss": 0.3071, "num_input_tokens_seen": 92405920, "step": 96780 }, { "epoch": 7.895015906680806, "grad_norm": 20.987346649169922, "learning_rate": 6.45098816464417e-06, "loss": 0.4174, "num_input_tokens_seen": 92410432, "step": 96785 }, { "epoch": 7.895423770291215, "grad_norm": 3.5205066204071045, "learning_rate": 6.4486020550803435e-06, "loss": 0.2599, "num_input_tokens_seen": 92414704, "step": 96790 }, { "epoch": 7.895831633901623, "grad_norm": 2.1807830333709717, "learning_rate": 6.44621632154547e-06, "loss": 0.3163, "num_input_tokens_seen": 92419424, "step": 96795 }, { "epoch": 7.896239497512032, "grad_norm": 0.4580422341823578, "learning_rate": 6.443830964087908e-06, "loss": 0.4528, "num_input_tokens_seen": 92424208, "step": 96800 }, { "epoch": 7.896647361122441, "grad_norm": 3.211540937423706, "learning_rate": 6.441445982756008e-06, "loss": 0.3454, "num_input_tokens_seen": 92427760, "step": 96805 }, { "epoch": 7.897055224732849, "grad_norm": 41.22861099243164, "learning_rate": 6.43906137759811e-06, "loss": 0.2923, "num_input_tokens_seen": 92433184, "step": 96810 }, { "epoch": 7.897463088343258, "grad_norm": 0.9215708374977112, "learning_rate": 6.436677148662554e-06, "loss": 0.2741, "num_input_tokens_seen": 92437376, "step": 96815 }, { "epoch": 7.897870951953667, "grad_norm": 13.187625885009766, "learning_rate": 6.4342932959976535e-06, "loss": 0.2315, "num_input_tokens_seen": 92441984, "step": 96820 }, { "epoch": 7.898278815564075, "grad_norm": 55.394710540771484, "learning_rate": 6.431909819651749e-06, "loss": 0.3752, "num_input_tokens_seen": 92446624, "step": 96825 }, { "epoch": 7.898686679174484, "grad_norm": 28.0909423828125, "learning_rate": 6.4295267196731405e-06, "loss": 0.3793, "num_input_tokens_seen": 92451760, "step": 96830 }, { "epoch": 7.899094542784892, "grad_norm": 11.805865287780762, "learning_rate": 6.427143996110135e-06, "loss": 0.3439, "num_input_tokens_seen": 92457488, "step": 96835 }, { "epoch": 7.899502406395301, "grad_norm": 1.4182578325271606, "learning_rate": 6.424761649011022e-06, "loss": 0.3788, "num_input_tokens_seen": 92461232, "step": 96840 }, { "epoch": 7.8999102700057104, "grad_norm": 0.583338737487793, "learning_rate": 6.422379678424103e-06, "loss": 0.4818, "num_input_tokens_seen": 92466256, "step": 96845 }, { "epoch": 7.900318133616119, "grad_norm": 46.82817077636719, "learning_rate": 6.419998084397655e-06, "loss": 0.3773, "num_input_tokens_seen": 92471168, "step": 96850 }, { "epoch": 7.900725997226528, "grad_norm": 21.731037139892578, "learning_rate": 6.41761686697995e-06, "loss": 0.3213, "num_input_tokens_seen": 92475184, "step": 96855 }, { "epoch": 7.901133860836936, "grad_norm": 0.9842221736907959, "learning_rate": 6.415236026219254e-06, "loss": 0.4563, "num_input_tokens_seen": 92480640, "step": 96860 }, { "epoch": 7.901541724447345, "grad_norm": 17.462533950805664, "learning_rate": 6.412855562163822e-06, "loss": 0.3728, "num_input_tokens_seen": 92484688, "step": 96865 }, { "epoch": 7.901949588057754, "grad_norm": 2.5592093467712402, "learning_rate": 6.410475474861912e-06, "loss": 0.4094, "num_input_tokens_seen": 92489984, "step": 96870 }, { "epoch": 7.902357451668162, "grad_norm": 36.35065841674805, "learning_rate": 6.408095764361771e-06, "loss": 0.3704, "num_input_tokens_seen": 92495296, "step": 96875 }, { "epoch": 7.902765315278571, "grad_norm": 0.521295428276062, "learning_rate": 6.4057164307116254e-06, "loss": 0.3867, "num_input_tokens_seen": 92499888, "step": 96880 }, { "epoch": 7.90317317888898, "grad_norm": 4.861600399017334, "learning_rate": 6.4033374739597e-06, "loss": 0.3067, "num_input_tokens_seen": 92504336, "step": 96885 }, { "epoch": 7.903581042499388, "grad_norm": 0.3953690528869629, "learning_rate": 6.40095889415423e-06, "loss": 0.4248, "num_input_tokens_seen": 92508640, "step": 96890 }, { "epoch": 7.903988906109797, "grad_norm": 11.884883880615234, "learning_rate": 6.398580691343417e-06, "loss": 0.2713, "num_input_tokens_seen": 92513856, "step": 96895 }, { "epoch": 7.904396769720206, "grad_norm": 0.5391229391098022, "learning_rate": 6.396202865575471e-06, "loss": 0.2268, "num_input_tokens_seen": 92519488, "step": 96900 }, { "epoch": 7.904804633330614, "grad_norm": 6.4093499183654785, "learning_rate": 6.393825416898588e-06, "loss": 0.41, "num_input_tokens_seen": 92523968, "step": 96905 }, { "epoch": 7.905212496941023, "grad_norm": 3.206432819366455, "learning_rate": 6.391448345360951e-06, "loss": 0.3228, "num_input_tokens_seen": 92529152, "step": 96910 }, { "epoch": 7.905620360551431, "grad_norm": 0.6181529760360718, "learning_rate": 6.389071651010756e-06, "loss": 0.359, "num_input_tokens_seen": 92534704, "step": 96915 }, { "epoch": 7.90602822416184, "grad_norm": 0.7835957407951355, "learning_rate": 6.38669533389617e-06, "loss": 0.3136, "num_input_tokens_seen": 92539376, "step": 96920 }, { "epoch": 7.906436087772249, "grad_norm": 55.42733383178711, "learning_rate": 6.384319394065358e-06, "loss": 0.6193, "num_input_tokens_seen": 92544304, "step": 96925 }, { "epoch": 7.906843951382657, "grad_norm": 39.22624969482422, "learning_rate": 6.381943831566484e-06, "loss": 0.4971, "num_input_tokens_seen": 92549568, "step": 96930 }, { "epoch": 7.907251814993066, "grad_norm": 13.749727249145508, "learning_rate": 6.379568646447697e-06, "loss": 0.3722, "num_input_tokens_seen": 92553776, "step": 96935 }, { "epoch": 7.9076596786034745, "grad_norm": 0.5515837669372559, "learning_rate": 6.3771938387571395e-06, "loss": 0.2441, "num_input_tokens_seen": 92558448, "step": 96940 }, { "epoch": 7.9080675422138835, "grad_norm": 0.5556431412696838, "learning_rate": 6.374819408542951e-06, "loss": 0.3486, "num_input_tokens_seen": 92563040, "step": 96945 }, { "epoch": 7.9084754058242925, "grad_norm": 83.06230926513672, "learning_rate": 6.3724453558532515e-06, "loss": 0.3818, "num_input_tokens_seen": 92567552, "step": 96950 }, { "epoch": 7.908883269434701, "grad_norm": 22.25527572631836, "learning_rate": 6.370071680736178e-06, "loss": 0.4205, "num_input_tokens_seen": 92572368, "step": 96955 }, { "epoch": 7.90929113304511, "grad_norm": 4.106470584869385, "learning_rate": 6.367698383239834e-06, "loss": 0.3247, "num_input_tokens_seen": 92577264, "step": 96960 }, { "epoch": 7.909698996655519, "grad_norm": 36.32085418701172, "learning_rate": 6.365325463412325e-06, "loss": 0.3936, "num_input_tokens_seen": 92581792, "step": 96965 }, { "epoch": 7.910106860265927, "grad_norm": 1.1297725439071655, "learning_rate": 6.362952921301751e-06, "loss": 0.3376, "num_input_tokens_seen": 92586784, "step": 96970 }, { "epoch": 7.910514723876336, "grad_norm": 8.99837589263916, "learning_rate": 6.360580756956197e-06, "loss": 0.2661, "num_input_tokens_seen": 92590976, "step": 96975 }, { "epoch": 7.910922587486745, "grad_norm": 11.566054344177246, "learning_rate": 6.358208970423757e-06, "loss": 0.3099, "num_input_tokens_seen": 92595264, "step": 96980 }, { "epoch": 7.911330451097153, "grad_norm": 0.6387268900871277, "learning_rate": 6.355837561752498e-06, "loss": 0.2526, "num_input_tokens_seen": 92599600, "step": 96985 }, { "epoch": 7.911738314707562, "grad_norm": 60.436275482177734, "learning_rate": 6.3534665309904904e-06, "loss": 0.4684, "num_input_tokens_seen": 92604432, "step": 96990 }, { "epoch": 7.91214617831797, "grad_norm": 0.6708790063858032, "learning_rate": 6.351095878185786e-06, "loss": 0.2791, "num_input_tokens_seen": 92609920, "step": 96995 }, { "epoch": 7.912554041928379, "grad_norm": 40.58078384399414, "learning_rate": 6.348725603386449e-06, "loss": 0.5109, "num_input_tokens_seen": 92614368, "step": 97000 }, { "epoch": 7.912961905538788, "grad_norm": 2.508570909500122, "learning_rate": 6.346355706640522e-06, "loss": 0.187, "num_input_tokens_seen": 92619552, "step": 97005 }, { "epoch": 7.913369769149196, "grad_norm": 24.455015182495117, "learning_rate": 6.343986187996034e-06, "loss": 0.3118, "num_input_tokens_seen": 92623872, "step": 97010 }, { "epoch": 7.913777632759605, "grad_norm": 53.415225982666016, "learning_rate": 6.341617047501023e-06, "loss": 0.3581, "num_input_tokens_seen": 92628240, "step": 97015 }, { "epoch": 7.914185496370014, "grad_norm": 33.14815902709961, "learning_rate": 6.339248285203497e-06, "loss": 0.329, "num_input_tokens_seen": 92633024, "step": 97020 }, { "epoch": 7.914593359980422, "grad_norm": 41.315223693847656, "learning_rate": 6.336879901151485e-06, "loss": 0.4017, "num_input_tokens_seen": 92638176, "step": 97025 }, { "epoch": 7.915001223590831, "grad_norm": 0.5317657589912415, "learning_rate": 6.334511895392989e-06, "loss": 0.2153, "num_input_tokens_seen": 92644144, "step": 97030 }, { "epoch": 7.91540908720124, "grad_norm": 0.6353974342346191, "learning_rate": 6.3321442679760035e-06, "loss": 0.528, "num_input_tokens_seen": 92648768, "step": 97035 }, { "epoch": 7.9158169508116485, "grad_norm": 0.3798716962337494, "learning_rate": 6.329777018948515e-06, "loss": 0.2328, "num_input_tokens_seen": 92653808, "step": 97040 }, { "epoch": 7.9162248144220575, "grad_norm": 2.2741947174072266, "learning_rate": 6.32741014835852e-06, "loss": 0.422, "num_input_tokens_seen": 92657232, "step": 97045 }, { "epoch": 7.916632678032466, "grad_norm": 42.40135192871094, "learning_rate": 6.3250436562539875e-06, "loss": 0.6394, "num_input_tokens_seen": 92661568, "step": 97050 }, { "epoch": 7.917040541642875, "grad_norm": 0.9763385057449341, "learning_rate": 6.3226775426828836e-06, "loss": 0.5807, "num_input_tokens_seen": 92666608, "step": 97055 }, { "epoch": 7.917448405253284, "grad_norm": 0.4903947114944458, "learning_rate": 6.320311807693169e-06, "loss": 0.3652, "num_input_tokens_seen": 92671360, "step": 97060 }, { "epoch": 7.917856268863692, "grad_norm": 16.387489318847656, "learning_rate": 6.3179464513327986e-06, "loss": 0.3176, "num_input_tokens_seen": 92677024, "step": 97065 }, { "epoch": 7.918264132474101, "grad_norm": 6.825987815856934, "learning_rate": 6.315581473649712e-06, "loss": 0.3933, "num_input_tokens_seen": 92681408, "step": 97070 }, { "epoch": 7.918671996084509, "grad_norm": 5.729376792907715, "learning_rate": 6.313216874691852e-06, "loss": 0.219, "num_input_tokens_seen": 92685888, "step": 97075 }, { "epoch": 7.919079859694918, "grad_norm": 1.2520534992218018, "learning_rate": 6.310852654507146e-06, "loss": 0.4857, "num_input_tokens_seen": 92691088, "step": 97080 }, { "epoch": 7.919487723305327, "grad_norm": 4.7328386306762695, "learning_rate": 6.308488813143507e-06, "loss": 0.4289, "num_input_tokens_seen": 92694864, "step": 97085 }, { "epoch": 7.919895586915735, "grad_norm": 1.8594751358032227, "learning_rate": 6.3061253506488615e-06, "loss": 0.3477, "num_input_tokens_seen": 92700368, "step": 97090 }, { "epoch": 7.920303450526144, "grad_norm": 1.0883039236068726, "learning_rate": 6.303762267071117e-06, "loss": 0.3941, "num_input_tokens_seen": 92705472, "step": 97095 }, { "epoch": 7.920711314136553, "grad_norm": 9.86762523651123, "learning_rate": 6.301399562458163e-06, "loss": 0.293, "num_input_tokens_seen": 92709568, "step": 97100 }, { "epoch": 7.921119177746961, "grad_norm": 1.7604645490646362, "learning_rate": 6.299037236857896e-06, "loss": 0.2575, "num_input_tokens_seen": 92714176, "step": 97105 }, { "epoch": 7.92152704135737, "grad_norm": 2.0369176864624023, "learning_rate": 6.296675290318188e-06, "loss": 0.4794, "num_input_tokens_seen": 92718576, "step": 97110 }, { "epoch": 7.921934904967779, "grad_norm": 0.5312582850456238, "learning_rate": 6.2943137228869354e-06, "loss": 0.2362, "num_input_tokens_seen": 92723312, "step": 97115 }, { "epoch": 7.922342768578187, "grad_norm": 13.176048278808594, "learning_rate": 6.2919525346119915e-06, "loss": 0.4527, "num_input_tokens_seen": 92728000, "step": 97120 }, { "epoch": 7.922750632188596, "grad_norm": 0.526982843875885, "learning_rate": 6.289591725541225e-06, "loss": 0.3944, "num_input_tokens_seen": 92733104, "step": 97125 }, { "epoch": 7.923158495799004, "grad_norm": 1.1442698240280151, "learning_rate": 6.28723129572247e-06, "loss": 0.226, "num_input_tokens_seen": 92737584, "step": 97130 }, { "epoch": 7.9235663594094135, "grad_norm": 24.44256019592285, "learning_rate": 6.284871245203597e-06, "loss": 0.2669, "num_input_tokens_seen": 92742256, "step": 97135 }, { "epoch": 7.9239742230198225, "grad_norm": 5.789156913757324, "learning_rate": 6.282511574032429e-06, "loss": 0.2331, "num_input_tokens_seen": 92747104, "step": 97140 }, { "epoch": 7.924382086630231, "grad_norm": 39.552608489990234, "learning_rate": 6.280152282256802e-06, "loss": 0.4141, "num_input_tokens_seen": 92751952, "step": 97145 }, { "epoch": 7.92478995024064, "grad_norm": 2.079533100128174, "learning_rate": 6.277793369924523e-06, "loss": 0.4176, "num_input_tokens_seen": 92756208, "step": 97150 }, { "epoch": 7.925197813851048, "grad_norm": 1.7249778509140015, "learning_rate": 6.275434837083424e-06, "loss": 0.3624, "num_input_tokens_seen": 92761184, "step": 97155 }, { "epoch": 7.925605677461457, "grad_norm": 0.447782427072525, "learning_rate": 6.2730766837813035e-06, "loss": 0.287, "num_input_tokens_seen": 92766288, "step": 97160 }, { "epoch": 7.926013541071866, "grad_norm": 1.2133327722549438, "learning_rate": 6.270718910065962e-06, "loss": 0.2782, "num_input_tokens_seen": 92771120, "step": 97165 }, { "epoch": 7.926421404682275, "grad_norm": 0.8649516701698303, "learning_rate": 6.268361515985191e-06, "loss": 0.4054, "num_input_tokens_seen": 92774608, "step": 97170 }, { "epoch": 7.926829268292683, "grad_norm": 0.44923797249794006, "learning_rate": 6.266004501586764e-06, "loss": 0.3671, "num_input_tokens_seen": 92779520, "step": 97175 }, { "epoch": 7.927237131903092, "grad_norm": 8.82214641571045, "learning_rate": 6.263647866918474e-06, "loss": 0.3169, "num_input_tokens_seen": 92784528, "step": 97180 }, { "epoch": 7.9276449955135, "grad_norm": 8.116847038269043, "learning_rate": 6.261291612028078e-06, "loss": 0.3092, "num_input_tokens_seen": 92788864, "step": 97185 }, { "epoch": 7.928052859123909, "grad_norm": 0.2565903067588806, "learning_rate": 6.258935736963339e-06, "loss": 0.2333, "num_input_tokens_seen": 92793664, "step": 97190 }, { "epoch": 7.928460722734318, "grad_norm": 56.06647872924805, "learning_rate": 6.256580241772009e-06, "loss": 0.267, "num_input_tokens_seen": 92798512, "step": 97195 }, { "epoch": 7.928868586344726, "grad_norm": 0.5636565685272217, "learning_rate": 6.254225126501834e-06, "loss": 0.3445, "num_input_tokens_seen": 92803744, "step": 97200 }, { "epoch": 7.929276449955135, "grad_norm": 30.931692123413086, "learning_rate": 6.251870391200549e-06, "loss": 0.3044, "num_input_tokens_seen": 92808432, "step": 97205 }, { "epoch": 7.929684313565543, "grad_norm": 0.5594926476478577, "learning_rate": 6.249516035915884e-06, "loss": 0.2552, "num_input_tokens_seen": 92813440, "step": 97210 }, { "epoch": 7.930092177175952, "grad_norm": 66.29459381103516, "learning_rate": 6.247162060695563e-06, "loss": 0.5776, "num_input_tokens_seen": 92817312, "step": 97215 }, { "epoch": 7.930500040786361, "grad_norm": 24.690940856933594, "learning_rate": 6.24480846558729e-06, "loss": 0.308, "num_input_tokens_seen": 92822480, "step": 97220 }, { "epoch": 7.930907904396769, "grad_norm": 15.751351356506348, "learning_rate": 6.242455250638787e-06, "loss": 0.1776, "num_input_tokens_seen": 92827168, "step": 97225 }, { "epoch": 7.931315768007178, "grad_norm": 11.63039779663086, "learning_rate": 6.240102415897747e-06, "loss": 0.4225, "num_input_tokens_seen": 92832112, "step": 97230 }, { "epoch": 7.931723631617587, "grad_norm": 28.785236358642578, "learning_rate": 6.2377499614118595e-06, "loss": 0.3487, "num_input_tokens_seen": 92836896, "step": 97235 }, { "epoch": 7.9321314952279955, "grad_norm": 0.507554292678833, "learning_rate": 6.2353978872288e-06, "loss": 0.4206, "num_input_tokens_seen": 92841280, "step": 97240 }, { "epoch": 7.9325393588384046, "grad_norm": 1.3960098028182983, "learning_rate": 6.233046193396258e-06, "loss": 0.2427, "num_input_tokens_seen": 92846608, "step": 97245 }, { "epoch": 7.932947222448814, "grad_norm": 2.6965866088867188, "learning_rate": 6.230694879961896e-06, "loss": 0.4133, "num_input_tokens_seen": 92850960, "step": 97250 }, { "epoch": 7.933355086059222, "grad_norm": 0.35074204206466675, "learning_rate": 6.228343946973372e-06, "loss": 0.3356, "num_input_tokens_seen": 92856256, "step": 97255 }, { "epoch": 7.933762949669631, "grad_norm": 29.905996322631836, "learning_rate": 6.225993394478341e-06, "loss": 0.2152, "num_input_tokens_seen": 92861328, "step": 97260 }, { "epoch": 7.934170813280039, "grad_norm": 4.247211456298828, "learning_rate": 6.223643222524439e-06, "loss": 0.4156, "num_input_tokens_seen": 92866368, "step": 97265 }, { "epoch": 7.934578676890448, "grad_norm": 0.5173700451850891, "learning_rate": 6.221293431159317e-06, "loss": 0.3128, "num_input_tokens_seen": 92871232, "step": 97270 }, { "epoch": 7.934986540500857, "grad_norm": 34.7541389465332, "learning_rate": 6.2189440204305976e-06, "loss": 0.1796, "num_input_tokens_seen": 92876336, "step": 97275 }, { "epoch": 7.935394404111265, "grad_norm": 0.46282729506492615, "learning_rate": 6.2165949903859046e-06, "loss": 0.2383, "num_input_tokens_seen": 92881392, "step": 97280 }, { "epoch": 7.935802267721674, "grad_norm": 47.347198486328125, "learning_rate": 6.214246341072841e-06, "loss": 0.4049, "num_input_tokens_seen": 92885152, "step": 97285 }, { "epoch": 7.936210131332082, "grad_norm": 1.184336543083191, "learning_rate": 6.211898072539027e-06, "loss": 0.4311, "num_input_tokens_seen": 92889632, "step": 97290 }, { "epoch": 7.936617994942491, "grad_norm": 7.134445667266846, "learning_rate": 6.209550184832055e-06, "loss": 0.3438, "num_input_tokens_seen": 92894816, "step": 97295 }, { "epoch": 7.9370258585529, "grad_norm": 22.341323852539062, "learning_rate": 6.207202677999518e-06, "loss": 0.3059, "num_input_tokens_seen": 92899712, "step": 97300 }, { "epoch": 7.937433722163308, "grad_norm": 36.022178649902344, "learning_rate": 6.204855552088998e-06, "loss": 0.3628, "num_input_tokens_seen": 92904688, "step": 97305 }, { "epoch": 7.937841585773717, "grad_norm": 0.6857573390007019, "learning_rate": 6.202508807148061e-06, "loss": 0.4198, "num_input_tokens_seen": 92909664, "step": 97310 }, { "epoch": 7.938249449384126, "grad_norm": 19.430559158325195, "learning_rate": 6.2001624432242915e-06, "loss": 0.4243, "num_input_tokens_seen": 92914160, "step": 97315 }, { "epoch": 7.938657312994534, "grad_norm": 41.76509475708008, "learning_rate": 6.197816460365238e-06, "loss": 0.6148, "num_input_tokens_seen": 92917856, "step": 97320 }, { "epoch": 7.939065176604943, "grad_norm": 34.70654296875, "learning_rate": 6.195470858618457e-06, "loss": 0.3265, "num_input_tokens_seen": 92922000, "step": 97325 }, { "epoch": 7.939473040215352, "grad_norm": 9.062228202819824, "learning_rate": 6.193125638031491e-06, "loss": 0.2412, "num_input_tokens_seen": 92926656, "step": 97330 }, { "epoch": 7.9398809038257605, "grad_norm": 4.981637954711914, "learning_rate": 6.190780798651877e-06, "loss": 0.4718, "num_input_tokens_seen": 92931248, "step": 97335 }, { "epoch": 7.9402887674361695, "grad_norm": 35.2362174987793, "learning_rate": 6.188436340527146e-06, "loss": 0.4555, "num_input_tokens_seen": 92936592, "step": 97340 }, { "epoch": 7.940696631046578, "grad_norm": 0.41280412673950195, "learning_rate": 6.186092263704812e-06, "loss": 0.4903, "num_input_tokens_seen": 92941344, "step": 97345 }, { "epoch": 7.941104494656987, "grad_norm": 0.47305238246917725, "learning_rate": 6.183748568232389e-06, "loss": 0.5179, "num_input_tokens_seen": 92945984, "step": 97350 }, { "epoch": 7.941512358267396, "grad_norm": 3.7160511016845703, "learning_rate": 6.1814052541573936e-06, "loss": 0.3587, "num_input_tokens_seen": 92950944, "step": 97355 }, { "epoch": 7.941920221877804, "grad_norm": 28.85357666015625, "learning_rate": 6.179062321527318e-06, "loss": 0.4195, "num_input_tokens_seen": 92956640, "step": 97360 }, { "epoch": 7.942328085488213, "grad_norm": 0.43206700682640076, "learning_rate": 6.176719770389652e-06, "loss": 0.263, "num_input_tokens_seen": 92962112, "step": 97365 }, { "epoch": 7.942735949098622, "grad_norm": 0.4707315266132355, "learning_rate": 6.174377600791876e-06, "loss": 0.4027, "num_input_tokens_seen": 92967168, "step": 97370 }, { "epoch": 7.94314381270903, "grad_norm": 0.4823857247829437, "learning_rate": 6.17203581278146e-06, "loss": 0.3391, "num_input_tokens_seen": 92971232, "step": 97375 }, { "epoch": 7.943551676319439, "grad_norm": 1.1332571506500244, "learning_rate": 6.1696944064058844e-06, "loss": 0.4156, "num_input_tokens_seen": 92975776, "step": 97380 }, { "epoch": 7.943959539929848, "grad_norm": 0.580204963684082, "learning_rate": 6.1673533817126035e-06, "loss": 0.3702, "num_input_tokens_seen": 92980224, "step": 97385 }, { "epoch": 7.944367403540256, "grad_norm": 1.7216832637786865, "learning_rate": 6.165012738749066e-06, "loss": 0.4742, "num_input_tokens_seen": 92984544, "step": 97390 }, { "epoch": 7.944775267150665, "grad_norm": 0.45605170726776123, "learning_rate": 6.162672477562709e-06, "loss": 0.2789, "num_input_tokens_seen": 92988736, "step": 97395 }, { "epoch": 7.945183130761073, "grad_norm": 27.42194366455078, "learning_rate": 6.160332598200982e-06, "loss": 0.3002, "num_input_tokens_seen": 92993056, "step": 97400 }, { "epoch": 7.945590994371482, "grad_norm": 14.905659675598145, "learning_rate": 6.157993100711312e-06, "loss": 0.4424, "num_input_tokens_seen": 92997760, "step": 97405 }, { "epoch": 7.945998857981891, "grad_norm": 1.3000468015670776, "learning_rate": 6.155653985141114e-06, "loss": 0.3771, "num_input_tokens_seen": 93003024, "step": 97410 }, { "epoch": 7.946406721592299, "grad_norm": 21.961627960205078, "learning_rate": 6.1533152515378e-06, "loss": 0.3339, "num_input_tokens_seen": 93008304, "step": 97415 }, { "epoch": 7.946814585202708, "grad_norm": 12.862072944641113, "learning_rate": 6.150976899948771e-06, "loss": 0.4715, "num_input_tokens_seen": 93012656, "step": 97420 }, { "epoch": 7.9472224488131165, "grad_norm": 16.864063262939453, "learning_rate": 6.148638930421438e-06, "loss": 0.3248, "num_input_tokens_seen": 93016384, "step": 97425 }, { "epoch": 7.9476303124235255, "grad_norm": 59.58588409423828, "learning_rate": 6.146301343003186e-06, "loss": 0.3082, "num_input_tokens_seen": 93021328, "step": 97430 }, { "epoch": 7.9480381760339345, "grad_norm": 25.771827697753906, "learning_rate": 6.1439641377413925e-06, "loss": 0.3531, "num_input_tokens_seen": 93026016, "step": 97435 }, { "epoch": 7.948446039644343, "grad_norm": 4.033287525177002, "learning_rate": 6.141627314683426e-06, "loss": 0.5042, "num_input_tokens_seen": 93031136, "step": 97440 }, { "epoch": 7.948853903254752, "grad_norm": 29.44585609436035, "learning_rate": 6.139290873876668e-06, "loss": 0.4279, "num_input_tokens_seen": 93036592, "step": 97445 }, { "epoch": 7.949261766865161, "grad_norm": 7.844661235809326, "learning_rate": 6.13695481536847e-06, "loss": 0.366, "num_input_tokens_seen": 93041232, "step": 97450 }, { "epoch": 7.949669630475569, "grad_norm": 0.7978794574737549, "learning_rate": 6.1346191392061794e-06, "loss": 0.2855, "num_input_tokens_seen": 93046128, "step": 97455 }, { "epoch": 7.950077494085978, "grad_norm": 0.5461978316307068, "learning_rate": 6.132283845437148e-06, "loss": 0.2584, "num_input_tokens_seen": 93050992, "step": 97460 }, { "epoch": 7.950485357696387, "grad_norm": 0.7292492389678955, "learning_rate": 6.129948934108701e-06, "loss": 0.4495, "num_input_tokens_seen": 93055424, "step": 97465 }, { "epoch": 7.950893221306795, "grad_norm": 2.4671051502227783, "learning_rate": 6.127614405268165e-06, "loss": 0.3552, "num_input_tokens_seen": 93059984, "step": 97470 }, { "epoch": 7.951301084917204, "grad_norm": 0.5503339767456055, "learning_rate": 6.125280258962873e-06, "loss": 0.4592, "num_input_tokens_seen": 93064752, "step": 97475 }, { "epoch": 7.951708948527612, "grad_norm": 6.125795364379883, "learning_rate": 6.122946495240131e-06, "loss": 0.482, "num_input_tokens_seen": 93069712, "step": 97480 }, { "epoch": 7.952116812138021, "grad_norm": 0.8222345113754272, "learning_rate": 6.120613114147242e-06, "loss": 0.2806, "num_input_tokens_seen": 93073744, "step": 97485 }, { "epoch": 7.95252467574843, "grad_norm": 0.35317090153694153, "learning_rate": 6.1182801157315005e-06, "loss": 0.3049, "num_input_tokens_seen": 93078736, "step": 97490 }, { "epoch": 7.952932539358838, "grad_norm": 15.107060432434082, "learning_rate": 6.1159475000402e-06, "loss": 0.3264, "num_input_tokens_seen": 93083520, "step": 97495 }, { "epoch": 7.953340402969247, "grad_norm": 9.04468822479248, "learning_rate": 6.113615267120617e-06, "loss": 0.3024, "num_input_tokens_seen": 93088480, "step": 97500 }, { "epoch": 7.953748266579655, "grad_norm": 16.599088668823242, "learning_rate": 6.111283417020022e-06, "loss": 0.2677, "num_input_tokens_seen": 93092768, "step": 97505 }, { "epoch": 7.954156130190064, "grad_norm": 1.8146781921386719, "learning_rate": 6.108951949785694e-06, "loss": 0.264, "num_input_tokens_seen": 93097792, "step": 97510 }, { "epoch": 7.954563993800473, "grad_norm": 0.47639575600624084, "learning_rate": 6.1066208654648795e-06, "loss": 0.2878, "num_input_tokens_seen": 93102656, "step": 97515 }, { "epoch": 7.954971857410881, "grad_norm": 22.15570640563965, "learning_rate": 6.1042901641048335e-06, "loss": 0.2842, "num_input_tokens_seen": 93108176, "step": 97520 }, { "epoch": 7.95537972102129, "grad_norm": 26.68231773376465, "learning_rate": 6.101959845752797e-06, "loss": 0.4195, "num_input_tokens_seen": 93113456, "step": 97525 }, { "epoch": 7.955787584631699, "grad_norm": 0.6854815483093262, "learning_rate": 6.099629910455998e-06, "loss": 0.2241, "num_input_tokens_seen": 93117952, "step": 97530 }, { "epoch": 7.956195448242108, "grad_norm": 31.78957176208496, "learning_rate": 6.097300358261674e-06, "loss": 0.3622, "num_input_tokens_seen": 93123408, "step": 97535 }, { "epoch": 7.956603311852517, "grad_norm": 13.82067584991455, "learning_rate": 6.094971189217042e-06, "loss": 0.3492, "num_input_tokens_seen": 93127728, "step": 97540 }, { "epoch": 7.957011175462926, "grad_norm": 3.236649990081787, "learning_rate": 6.092642403369309e-06, "loss": 0.225, "num_input_tokens_seen": 93132384, "step": 97545 }, { "epoch": 7.957419039073334, "grad_norm": 16.423503875732422, "learning_rate": 6.090314000765674e-06, "loss": 0.4811, "num_input_tokens_seen": 93138592, "step": 97550 }, { "epoch": 7.957826902683743, "grad_norm": 0.8486534357070923, "learning_rate": 6.0879859814533454e-06, "loss": 0.5015, "num_input_tokens_seen": 93143920, "step": 97555 }, { "epoch": 7.958234766294151, "grad_norm": 1.5453728437423706, "learning_rate": 6.085658345479505e-06, "loss": 0.3473, "num_input_tokens_seen": 93147744, "step": 97560 }, { "epoch": 7.95864262990456, "grad_norm": 0.7498776316642761, "learning_rate": 6.083331092891334e-06, "loss": 0.1936, "num_input_tokens_seen": 93152752, "step": 97565 }, { "epoch": 7.959050493514969, "grad_norm": 4.805014610290527, "learning_rate": 6.081004223736e-06, "loss": 0.2737, "num_input_tokens_seen": 93157808, "step": 97570 }, { "epoch": 7.959458357125377, "grad_norm": 11.404853820800781, "learning_rate": 6.078677738060667e-06, "loss": 0.4444, "num_input_tokens_seen": 93162800, "step": 97575 }, { "epoch": 7.959866220735786, "grad_norm": 0.41871145367622375, "learning_rate": 6.076351635912503e-06, "loss": 0.3252, "num_input_tokens_seen": 93167824, "step": 97580 }, { "epoch": 7.960274084346195, "grad_norm": 13.641716957092285, "learning_rate": 6.074025917338649e-06, "loss": 0.359, "num_input_tokens_seen": 93171952, "step": 97585 }, { "epoch": 7.960681947956603, "grad_norm": 0.3251833915710449, "learning_rate": 6.07170058238625e-06, "loss": 0.2814, "num_input_tokens_seen": 93176784, "step": 97590 }, { "epoch": 7.961089811567012, "grad_norm": 0.48653337359428406, "learning_rate": 6.069375631102428e-06, "loss": 0.2888, "num_input_tokens_seen": 93182000, "step": 97595 }, { "epoch": 7.961497675177421, "grad_norm": 8.725449562072754, "learning_rate": 6.067051063534326e-06, "loss": 0.3457, "num_input_tokens_seen": 93187968, "step": 97600 }, { "epoch": 7.961905538787829, "grad_norm": 18.26580810546875, "learning_rate": 6.064726879729052e-06, "loss": 0.4353, "num_input_tokens_seen": 93193360, "step": 97605 }, { "epoch": 7.962313402398238, "grad_norm": 14.389034271240234, "learning_rate": 6.06240307973372e-06, "loss": 0.413, "num_input_tokens_seen": 93198032, "step": 97610 }, { "epoch": 7.962721266008646, "grad_norm": 26.292221069335938, "learning_rate": 6.0600796635954305e-06, "loss": 0.3606, "num_input_tokens_seen": 93202976, "step": 97615 }, { "epoch": 7.963129129619055, "grad_norm": 0.3864724040031433, "learning_rate": 6.057756631361278e-06, "loss": 0.3537, "num_input_tokens_seen": 93207696, "step": 97620 }, { "epoch": 7.963536993229464, "grad_norm": 28.28447151184082, "learning_rate": 6.055433983078349e-06, "loss": 0.4076, "num_input_tokens_seen": 93212192, "step": 97625 }, { "epoch": 7.9639448568398725, "grad_norm": 1.3011270761489868, "learning_rate": 6.053111718793722e-06, "loss": 0.3261, "num_input_tokens_seen": 93216784, "step": 97630 }, { "epoch": 7.9643527204502815, "grad_norm": 35.29359817504883, "learning_rate": 6.0507898385544715e-06, "loss": 0.4244, "num_input_tokens_seen": 93220896, "step": 97635 }, { "epoch": 7.96476058406069, "grad_norm": 5.804825305938721, "learning_rate": 6.048468342407651e-06, "loss": 0.3205, "num_input_tokens_seen": 93226464, "step": 97640 }, { "epoch": 7.965168447671099, "grad_norm": 0.7212347388267517, "learning_rate": 6.0461472304003315e-06, "loss": 0.3162, "num_input_tokens_seen": 93231520, "step": 97645 }, { "epoch": 7.965576311281508, "grad_norm": 11.976125717163086, "learning_rate": 6.043826502579553e-06, "loss": 0.4309, "num_input_tokens_seen": 93236752, "step": 97650 }, { "epoch": 7.965984174891916, "grad_norm": 0.8862615823745728, "learning_rate": 6.041506158992355e-06, "loss": 0.2255, "num_input_tokens_seen": 93241856, "step": 97655 }, { "epoch": 7.966392038502325, "grad_norm": 14.06313419342041, "learning_rate": 6.039186199685773e-06, "loss": 0.3731, "num_input_tokens_seen": 93246784, "step": 97660 }, { "epoch": 7.966799902112734, "grad_norm": 20.43889617919922, "learning_rate": 6.036866624706824e-06, "loss": 0.4048, "num_input_tokens_seen": 93251616, "step": 97665 }, { "epoch": 7.967207765723142, "grad_norm": 0.46856385469436646, "learning_rate": 6.034547434102536e-06, "loss": 0.2692, "num_input_tokens_seen": 93256400, "step": 97670 }, { "epoch": 7.967615629333551, "grad_norm": 0.5284930467605591, "learning_rate": 6.032228627919911e-06, "loss": 0.4019, "num_input_tokens_seen": 93261872, "step": 97675 }, { "epoch": 7.96802349294396, "grad_norm": 31.341257095336914, "learning_rate": 6.029910206205955e-06, "loss": 0.3882, "num_input_tokens_seen": 93266784, "step": 97680 }, { "epoch": 7.968431356554368, "grad_norm": 0.5449298620223999, "learning_rate": 6.027592169007651e-06, "loss": 0.2617, "num_input_tokens_seen": 93271296, "step": 97685 }, { "epoch": 7.968839220164777, "grad_norm": 0.4966725707054138, "learning_rate": 6.025274516371998e-06, "loss": 0.2596, "num_input_tokens_seen": 93275648, "step": 97690 }, { "epoch": 7.969247083775185, "grad_norm": 28.70612907409668, "learning_rate": 6.022957248345967e-06, "loss": 0.3757, "num_input_tokens_seen": 93280272, "step": 97695 }, { "epoch": 7.969654947385594, "grad_norm": 0.5354653596878052, "learning_rate": 6.020640364976529e-06, "loss": 0.3557, "num_input_tokens_seen": 93285552, "step": 97700 }, { "epoch": 7.970062810996003, "grad_norm": 3.6573996543884277, "learning_rate": 6.018323866310638e-06, "loss": 0.3899, "num_input_tokens_seen": 93289808, "step": 97705 }, { "epoch": 7.970470674606411, "grad_norm": 2.918633222579956, "learning_rate": 6.016007752395267e-06, "loss": 0.3509, "num_input_tokens_seen": 93294624, "step": 97710 }, { "epoch": 7.97087853821682, "grad_norm": 34.233009338378906, "learning_rate": 6.013692023277348e-06, "loss": 0.386, "num_input_tokens_seen": 93299488, "step": 97715 }, { "epoch": 7.971286401827229, "grad_norm": 19.761091232299805, "learning_rate": 6.011376679003827e-06, "loss": 0.3758, "num_input_tokens_seen": 93304544, "step": 97720 }, { "epoch": 7.9716942654376375, "grad_norm": 0.4006724953651428, "learning_rate": 6.009061719621631e-06, "loss": 0.3466, "num_input_tokens_seen": 93308224, "step": 97725 }, { "epoch": 7.9721021290480465, "grad_norm": 31.395008087158203, "learning_rate": 6.006747145177674e-06, "loss": 0.2921, "num_input_tokens_seen": 93312400, "step": 97730 }, { "epoch": 7.9725099926584555, "grad_norm": 3.454604387283325, "learning_rate": 6.004432955718892e-06, "loss": 0.1994, "num_input_tokens_seen": 93317328, "step": 97735 }, { "epoch": 7.972917856268864, "grad_norm": 3.1141014099121094, "learning_rate": 6.002119151292182e-06, "loss": 0.4056, "num_input_tokens_seen": 93321152, "step": 97740 }, { "epoch": 7.973325719879273, "grad_norm": 1.3880670070648193, "learning_rate": 5.999805731944444e-06, "loss": 0.3599, "num_input_tokens_seen": 93326736, "step": 97745 }, { "epoch": 7.973733583489681, "grad_norm": 17.447465896606445, "learning_rate": 5.997492697722571e-06, "loss": 0.23, "num_input_tokens_seen": 93331920, "step": 97750 }, { "epoch": 7.97414144710009, "grad_norm": 3.263005256652832, "learning_rate": 5.995180048673446e-06, "loss": 0.3238, "num_input_tokens_seen": 93336208, "step": 97755 }, { "epoch": 7.974549310710499, "grad_norm": 1.2585461139678955, "learning_rate": 5.9928677848439445e-06, "loss": 0.2809, "num_input_tokens_seen": 93341136, "step": 97760 }, { "epoch": 7.974957174320907, "grad_norm": 0.38424795866012573, "learning_rate": 5.990555906280937e-06, "loss": 0.3, "num_input_tokens_seen": 93345344, "step": 97765 }, { "epoch": 7.975365037931316, "grad_norm": 0.42898786067962646, "learning_rate": 5.988244413031282e-06, "loss": 0.3597, "num_input_tokens_seen": 93349824, "step": 97770 }, { "epoch": 7.975772901541724, "grad_norm": 0.4480639696121216, "learning_rate": 5.98593330514183e-06, "loss": 0.2674, "num_input_tokens_seen": 93354384, "step": 97775 }, { "epoch": 7.976180765152133, "grad_norm": 1.5003373622894287, "learning_rate": 5.983622582659437e-06, "loss": 0.2829, "num_input_tokens_seen": 93359280, "step": 97780 }, { "epoch": 7.976588628762542, "grad_norm": 1.9070674180984497, "learning_rate": 5.981312245630932e-06, "loss": 0.3585, "num_input_tokens_seen": 93364192, "step": 97785 }, { "epoch": 7.97699649237295, "grad_norm": 0.5262851119041443, "learning_rate": 5.979002294103147e-06, "loss": 0.3923, "num_input_tokens_seen": 93368176, "step": 97790 }, { "epoch": 7.977404355983359, "grad_norm": 2.3776721954345703, "learning_rate": 5.976692728122896e-06, "loss": 0.3003, "num_input_tokens_seen": 93372608, "step": 97795 }, { "epoch": 7.977812219593768, "grad_norm": 11.498403549194336, "learning_rate": 5.974383547737009e-06, "loss": 0.3328, "num_input_tokens_seen": 93376688, "step": 97800 }, { "epoch": 7.978220083204176, "grad_norm": 21.71247100830078, "learning_rate": 5.9720747529922795e-06, "loss": 0.3866, "num_input_tokens_seen": 93381024, "step": 97805 }, { "epoch": 7.978627946814585, "grad_norm": 1.2119940519332886, "learning_rate": 5.969766343935512e-06, "loss": 0.4245, "num_input_tokens_seen": 93385792, "step": 97810 }, { "epoch": 7.979035810424994, "grad_norm": 0.34230029582977295, "learning_rate": 5.967458320613492e-06, "loss": 0.4453, "num_input_tokens_seen": 93390176, "step": 97815 }, { "epoch": 7.9794436740354024, "grad_norm": 0.41586461663246155, "learning_rate": 5.965150683072998e-06, "loss": 0.261, "num_input_tokens_seen": 93394480, "step": 97820 }, { "epoch": 7.9798515376458115, "grad_norm": 1.7206511497497559, "learning_rate": 5.96284343136082e-06, "loss": 0.3744, "num_input_tokens_seen": 93399440, "step": 97825 }, { "epoch": 7.98025940125622, "grad_norm": 0.5239515900611877, "learning_rate": 5.960536565523714e-06, "loss": 0.2255, "num_input_tokens_seen": 93404880, "step": 97830 }, { "epoch": 7.980667264866629, "grad_norm": 42.624385833740234, "learning_rate": 5.958230085608443e-06, "loss": 0.4244, "num_input_tokens_seen": 93409328, "step": 97835 }, { "epoch": 7.981075128477038, "grad_norm": 1.181130290031433, "learning_rate": 5.955923991661749e-06, "loss": 0.5127, "num_input_tokens_seen": 93414544, "step": 97840 }, { "epoch": 7.981482992087446, "grad_norm": 0.3169002830982208, "learning_rate": 5.953618283730392e-06, "loss": 0.2787, "num_input_tokens_seen": 93418736, "step": 97845 }, { "epoch": 7.981890855697855, "grad_norm": 1.582207202911377, "learning_rate": 5.951312961861097e-06, "loss": 0.3663, "num_input_tokens_seen": 93424272, "step": 97850 }, { "epoch": 7.982298719308263, "grad_norm": 0.45879894495010376, "learning_rate": 5.949008026100594e-06, "loss": 0.2323, "num_input_tokens_seen": 93429312, "step": 97855 }, { "epoch": 7.982706582918672, "grad_norm": 0.3461037278175354, "learning_rate": 5.9467034764956046e-06, "loss": 0.2818, "num_input_tokens_seen": 93434512, "step": 97860 }, { "epoch": 7.983114446529081, "grad_norm": 2.0231738090515137, "learning_rate": 5.944399313092833e-06, "loss": 0.2689, "num_input_tokens_seen": 93439376, "step": 97865 }, { "epoch": 7.983522310139489, "grad_norm": 0.35963431000709534, "learning_rate": 5.942095535938996e-06, "loss": 0.2721, "num_input_tokens_seen": 93443808, "step": 97870 }, { "epoch": 7.983930173749898, "grad_norm": 2.199106216430664, "learning_rate": 5.939792145080783e-06, "loss": 0.2754, "num_input_tokens_seen": 93448800, "step": 97875 }, { "epoch": 7.984338037360307, "grad_norm": 0.40210574865341187, "learning_rate": 5.937489140564887e-06, "loss": 0.262, "num_input_tokens_seen": 93454304, "step": 97880 }, { "epoch": 7.984745900970715, "grad_norm": 1.9834388494491577, "learning_rate": 5.9351865224379825e-06, "loss": 0.2597, "num_input_tokens_seen": 93459840, "step": 97885 }, { "epoch": 7.985153764581124, "grad_norm": 28.14080047607422, "learning_rate": 5.9328842907467484e-06, "loss": 0.3662, "num_input_tokens_seen": 93464304, "step": 97890 }, { "epoch": 7.985561628191533, "grad_norm": 0.5274257659912109, "learning_rate": 5.930582445537847e-06, "loss": 0.2917, "num_input_tokens_seen": 93469200, "step": 97895 }, { "epoch": 7.985969491801941, "grad_norm": 1.546265959739685, "learning_rate": 5.928280986857937e-06, "loss": 0.4911, "num_input_tokens_seen": 93474848, "step": 97900 }, { "epoch": 7.98637735541235, "grad_norm": 0.4597804844379425, "learning_rate": 5.92597991475366e-06, "loss": 0.3452, "num_input_tokens_seen": 93479792, "step": 97905 }, { "epoch": 7.986785219022758, "grad_norm": 1.1556164026260376, "learning_rate": 5.923679229271669e-06, "loss": 0.4287, "num_input_tokens_seen": 93484720, "step": 97910 }, { "epoch": 7.987193082633167, "grad_norm": 13.619821548461914, "learning_rate": 5.921378930458599e-06, "loss": 0.4181, "num_input_tokens_seen": 93489008, "step": 97915 }, { "epoch": 7.987600946243576, "grad_norm": 10.902901649475098, "learning_rate": 5.919079018361068e-06, "loss": 0.5028, "num_input_tokens_seen": 93493904, "step": 97920 }, { "epoch": 7.9880088098539845, "grad_norm": 3.721452474594116, "learning_rate": 5.916779493025698e-06, "loss": 0.4, "num_input_tokens_seen": 93498816, "step": 97925 }, { "epoch": 7.9884166734643935, "grad_norm": 19.337766647338867, "learning_rate": 5.914480354499091e-06, "loss": 0.1804, "num_input_tokens_seen": 93504032, "step": 97930 }, { "epoch": 7.988824537074803, "grad_norm": 28.9725341796875, "learning_rate": 5.912181602827865e-06, "loss": 0.378, "num_input_tokens_seen": 93509264, "step": 97935 }, { "epoch": 7.989232400685211, "grad_norm": 7.455938816070557, "learning_rate": 5.909883238058608e-06, "loss": 0.3829, "num_input_tokens_seen": 93513584, "step": 97940 }, { "epoch": 7.98964026429562, "grad_norm": 5.361911296844482, "learning_rate": 5.907585260237905e-06, "loss": 0.3767, "num_input_tokens_seen": 93519152, "step": 97945 }, { "epoch": 7.990048127906029, "grad_norm": 1.9378187656402588, "learning_rate": 5.905287669412329e-06, "loss": 0.2991, "num_input_tokens_seen": 93523488, "step": 97950 }, { "epoch": 7.990455991516437, "grad_norm": 0.4508562386035919, "learning_rate": 5.902990465628466e-06, "loss": 0.3429, "num_input_tokens_seen": 93528448, "step": 97955 }, { "epoch": 7.990863855126846, "grad_norm": 0.4522305130958557, "learning_rate": 5.90069364893287e-06, "loss": 0.2135, "num_input_tokens_seen": 93533408, "step": 97960 }, { "epoch": 7.991271718737254, "grad_norm": 11.936993598937988, "learning_rate": 5.8983972193720985e-06, "loss": 0.3461, "num_input_tokens_seen": 93538048, "step": 97965 }, { "epoch": 7.991679582347663, "grad_norm": 0.44825440645217896, "learning_rate": 5.896101176992699e-06, "loss": 0.28, "num_input_tokens_seen": 93542848, "step": 97970 }, { "epoch": 7.992087445958072, "grad_norm": 40.387386322021484, "learning_rate": 5.893805521841203e-06, "loss": 0.3119, "num_input_tokens_seen": 93547520, "step": 97975 }, { "epoch": 7.99249530956848, "grad_norm": 1.6379235982894897, "learning_rate": 5.891510253964158e-06, "loss": 0.278, "num_input_tokens_seen": 93552432, "step": 97980 }, { "epoch": 7.992903173178889, "grad_norm": 23.238744735717773, "learning_rate": 5.88921537340808e-06, "loss": 0.4315, "num_input_tokens_seen": 93557408, "step": 97985 }, { "epoch": 7.993311036789297, "grad_norm": 59.60924530029297, "learning_rate": 5.886920880219485e-06, "loss": 0.5686, "num_input_tokens_seen": 93562304, "step": 97990 }, { "epoch": 7.993718900399706, "grad_norm": 21.291404724121094, "learning_rate": 5.884626774444876e-06, "loss": 0.4018, "num_input_tokens_seen": 93567344, "step": 97995 }, { "epoch": 7.994126764010115, "grad_norm": 17.000638961791992, "learning_rate": 5.8823330561307675e-06, "loss": 0.2825, "num_input_tokens_seen": 93572544, "step": 98000 }, { "epoch": 7.994534627620523, "grad_norm": 0.5932804942131042, "learning_rate": 5.880039725323644e-06, "loss": 0.443, "num_input_tokens_seen": 93577408, "step": 98005 }, { "epoch": 7.994942491230932, "grad_norm": 0.5102822184562683, "learning_rate": 5.87774678206999e-06, "loss": 0.3363, "num_input_tokens_seen": 93582608, "step": 98010 }, { "epoch": 7.995350354841341, "grad_norm": 4.201125621795654, "learning_rate": 5.8754542264162825e-06, "loss": 0.4972, "num_input_tokens_seen": 93586976, "step": 98015 }, { "epoch": 7.9957582184517495, "grad_norm": 3.9831109046936035, "learning_rate": 5.873162058408993e-06, "loss": 0.4636, "num_input_tokens_seen": 93592176, "step": 98020 }, { "epoch": 7.9961660820621585, "grad_norm": 2.125410318374634, "learning_rate": 5.870870278094581e-06, "loss": 0.4035, "num_input_tokens_seen": 93597328, "step": 98025 }, { "epoch": 7.9965739456725675, "grad_norm": 1.5831753015518188, "learning_rate": 5.868578885519499e-06, "loss": 0.4209, "num_input_tokens_seen": 93602512, "step": 98030 }, { "epoch": 7.996981809282976, "grad_norm": 2.819143056869507, "learning_rate": 5.866287880730195e-06, "loss": 0.3861, "num_input_tokens_seen": 93607584, "step": 98035 }, { "epoch": 7.997389672893385, "grad_norm": 11.716912269592285, "learning_rate": 5.863997263773096e-06, "loss": 0.3202, "num_input_tokens_seen": 93613168, "step": 98040 }, { "epoch": 7.997797536503793, "grad_norm": 0.5504457950592041, "learning_rate": 5.861707034694649e-06, "loss": 0.4119, "num_input_tokens_seen": 93618576, "step": 98045 }, { "epoch": 7.998205400114202, "grad_norm": 28.816492080688477, "learning_rate": 5.8594171935412695e-06, "loss": 0.3072, "num_input_tokens_seen": 93622464, "step": 98050 }, { "epoch": 7.998613263724611, "grad_norm": 0.5568426847457886, "learning_rate": 5.8571277403593694e-06, "loss": 0.3782, "num_input_tokens_seen": 93627472, "step": 98055 }, { "epoch": 7.999021127335019, "grad_norm": 20.524887084960938, "learning_rate": 5.854838675195348e-06, "loss": 0.2826, "num_input_tokens_seen": 93633024, "step": 98060 }, { "epoch": 7.999428990945428, "grad_norm": 8.405793190002441, "learning_rate": 5.852549998095619e-06, "loss": 0.2593, "num_input_tokens_seen": 93637808, "step": 98065 }, { "epoch": 7.999836854555836, "grad_norm": 0.3735765516757965, "learning_rate": 5.8502617091065665e-06, "loss": 0.3487, "num_input_tokens_seen": 93642208, "step": 98070 }, { "epoch": 8.000244718166245, "grad_norm": 0.5333326458930969, "learning_rate": 5.84797380827457e-06, "loss": 0.3596, "num_input_tokens_seen": 93646656, "step": 98075 }, { "epoch": 8.000652581776654, "grad_norm": 22.73033905029297, "learning_rate": 5.8456862956460055e-06, "loss": 0.2519, "num_input_tokens_seen": 93652576, "step": 98080 }, { "epoch": 8.000652581776654, "eval_loss": 0.3265881836414337, "eval_runtime": 570.8851, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.388, "num_input_tokens_seen": 93652576, "step": 98080 }, { "epoch": 8.001060445387063, "grad_norm": 23.86806297302246, "learning_rate": 5.843399171267236e-06, "loss": 0.2536, "num_input_tokens_seen": 93656656, "step": 98085 }, { "epoch": 8.001468308997472, "grad_norm": 2.786170482635498, "learning_rate": 5.841112435184631e-06, "loss": 0.2748, "num_input_tokens_seen": 93660368, "step": 98090 }, { "epoch": 8.00187617260788, "grad_norm": 39.75179672241211, "learning_rate": 5.838826087444535e-06, "loss": 0.3966, "num_input_tokens_seen": 93664656, "step": 98095 }, { "epoch": 8.002284036218288, "grad_norm": 24.45681381225586, "learning_rate": 5.836540128093293e-06, "loss": 0.2716, "num_input_tokens_seen": 93669056, "step": 98100 }, { "epoch": 8.002691899828697, "grad_norm": 24.65431785583496, "learning_rate": 5.834254557177232e-06, "loss": 0.3691, "num_input_tokens_seen": 93674352, "step": 98105 }, { "epoch": 8.003099763439106, "grad_norm": 0.5251656174659729, "learning_rate": 5.831969374742697e-06, "loss": 0.3738, "num_input_tokens_seen": 93678928, "step": 98110 }, { "epoch": 8.003507627049515, "grad_norm": 2.691098690032959, "learning_rate": 5.8296845808359925e-06, "loss": 0.3925, "num_input_tokens_seen": 93683552, "step": 98115 }, { "epoch": 8.003915490659923, "grad_norm": 8.380885124206543, "learning_rate": 5.827400175503439e-06, "loss": 0.3084, "num_input_tokens_seen": 93688560, "step": 98120 }, { "epoch": 8.004323354270332, "grad_norm": 14.973474502563477, "learning_rate": 5.825116158791339e-06, "loss": 0.4511, "num_input_tokens_seen": 93693456, "step": 98125 }, { "epoch": 8.00473121788074, "grad_norm": 5.495951175689697, "learning_rate": 5.822832530745976e-06, "loss": 0.2195, "num_input_tokens_seen": 93698624, "step": 98130 }, { "epoch": 8.00513908149115, "grad_norm": 1.8018385171890259, "learning_rate": 5.820549291413658e-06, "loss": 0.4423, "num_input_tokens_seen": 93703184, "step": 98135 }, { "epoch": 8.005546945101559, "grad_norm": 6.652774333953857, "learning_rate": 5.818266440840653e-06, "loss": 0.2323, "num_input_tokens_seen": 93707376, "step": 98140 }, { "epoch": 8.005954808711966, "grad_norm": 19.20405387878418, "learning_rate": 5.815983979073239e-06, "loss": 0.3585, "num_input_tokens_seen": 93712464, "step": 98145 }, { "epoch": 8.006362672322375, "grad_norm": 0.85944664478302, "learning_rate": 5.813701906157676e-06, "loss": 0.3225, "num_input_tokens_seen": 93718032, "step": 98150 }, { "epoch": 8.006770535932784, "grad_norm": 0.4824252426624298, "learning_rate": 5.811420222140224e-06, "loss": 0.2541, "num_input_tokens_seen": 93723424, "step": 98155 }, { "epoch": 8.007178399543193, "grad_norm": 21.628389358520508, "learning_rate": 5.809138927067123e-06, "loss": 0.4881, "num_input_tokens_seen": 93728640, "step": 98160 }, { "epoch": 8.007586263153602, "grad_norm": 21.091781616210938, "learning_rate": 5.806858020984629e-06, "loss": 0.3047, "num_input_tokens_seen": 93733408, "step": 98165 }, { "epoch": 8.007994126764011, "grad_norm": 1.1935291290283203, "learning_rate": 5.804577503938965e-06, "loss": 0.4455, "num_input_tokens_seen": 93737376, "step": 98170 }, { "epoch": 8.008401990374418, "grad_norm": 22.80481719970703, "learning_rate": 5.802297375976362e-06, "loss": 0.2951, "num_input_tokens_seen": 93741888, "step": 98175 }, { "epoch": 8.008809853984827, "grad_norm": 33.75436019897461, "learning_rate": 5.800017637143032e-06, "loss": 0.3584, "num_input_tokens_seen": 93746640, "step": 98180 }, { "epoch": 8.009217717595236, "grad_norm": 0.3673854470252991, "learning_rate": 5.797738287485186e-06, "loss": 0.2966, "num_input_tokens_seen": 93750896, "step": 98185 }, { "epoch": 8.009625581205645, "grad_norm": 3.1189048290252686, "learning_rate": 5.795459327049024e-06, "loss": 0.3308, "num_input_tokens_seen": 93756192, "step": 98190 }, { "epoch": 8.010033444816054, "grad_norm": 18.967618942260742, "learning_rate": 5.7931807558807335e-06, "loss": 0.3249, "num_input_tokens_seen": 93760080, "step": 98195 }, { "epoch": 8.010441308426461, "grad_norm": 35.470333099365234, "learning_rate": 5.790902574026519e-06, "loss": 0.3672, "num_input_tokens_seen": 93764496, "step": 98200 }, { "epoch": 8.01084917203687, "grad_norm": 13.854839324951172, "learning_rate": 5.788624781532542e-06, "loss": 0.2511, "num_input_tokens_seen": 93768576, "step": 98205 }, { "epoch": 8.01125703564728, "grad_norm": 2.1539013385772705, "learning_rate": 5.786347378444978e-06, "loss": 0.4527, "num_input_tokens_seen": 93772464, "step": 98210 }, { "epoch": 8.011664899257688, "grad_norm": 55.50807189941406, "learning_rate": 5.78407036480999e-06, "loss": 0.196, "num_input_tokens_seen": 93777408, "step": 98215 }, { "epoch": 8.012072762868097, "grad_norm": 1.7748113870620728, "learning_rate": 5.781793740673724e-06, "loss": 0.3372, "num_input_tokens_seen": 93782384, "step": 98220 }, { "epoch": 8.012480626478506, "grad_norm": 10.488556861877441, "learning_rate": 5.7795175060823385e-06, "loss": 0.1876, "num_input_tokens_seen": 93786944, "step": 98225 }, { "epoch": 8.012888490088914, "grad_norm": 62.38413619995117, "learning_rate": 5.777241661081964e-06, "loss": 0.2699, "num_input_tokens_seen": 93791536, "step": 98230 }, { "epoch": 8.013296353699323, "grad_norm": 5.56161642074585, "learning_rate": 5.7749662057187344e-06, "loss": 0.3399, "num_input_tokens_seen": 93796976, "step": 98235 }, { "epoch": 8.013704217309732, "grad_norm": 1.8847028017044067, "learning_rate": 5.7726911400387664e-06, "loss": 0.3893, "num_input_tokens_seen": 93801648, "step": 98240 }, { "epoch": 8.01411208092014, "grad_norm": 2.9320485591888428, "learning_rate": 5.770416464088182e-06, "loss": 0.2239, "num_input_tokens_seen": 93806224, "step": 98245 }, { "epoch": 8.01451994453055, "grad_norm": 20.691986083984375, "learning_rate": 5.768142177913085e-06, "loss": 0.5282, "num_input_tokens_seen": 93810192, "step": 98250 }, { "epoch": 8.014927808140957, "grad_norm": 2.7036356925964355, "learning_rate": 5.765868281559575e-06, "loss": 0.5892, "num_input_tokens_seen": 93814592, "step": 98255 }, { "epoch": 8.015335671751366, "grad_norm": 3.690934419631958, "learning_rate": 5.7635947750737355e-06, "loss": 0.3876, "num_input_tokens_seen": 93819760, "step": 98260 }, { "epoch": 8.015743535361775, "grad_norm": 24.007341384887695, "learning_rate": 5.761321658501662e-06, "loss": 0.2095, "num_input_tokens_seen": 93825056, "step": 98265 }, { "epoch": 8.016151398972184, "grad_norm": 5.812723636627197, "learning_rate": 5.759048931889424e-06, "loss": 0.2646, "num_input_tokens_seen": 93829200, "step": 98270 }, { "epoch": 8.016559262582593, "grad_norm": 6.98845100402832, "learning_rate": 5.7567765952830865e-06, "loss": 0.3817, "num_input_tokens_seen": 93833472, "step": 98275 }, { "epoch": 8.016967126193, "grad_norm": 16.253768920898438, "learning_rate": 5.754504648728712e-06, "loss": 0.4023, "num_input_tokens_seen": 93838544, "step": 98280 }, { "epoch": 8.01737498980341, "grad_norm": 50.27107238769531, "learning_rate": 5.752233092272344e-06, "loss": 0.4267, "num_input_tokens_seen": 93841856, "step": 98285 }, { "epoch": 8.017782853413818, "grad_norm": 11.010072708129883, "learning_rate": 5.749961925960037e-06, "loss": 0.2644, "num_input_tokens_seen": 93847152, "step": 98290 }, { "epoch": 8.018190717024227, "grad_norm": 46.01755142211914, "learning_rate": 5.747691149837825e-06, "loss": 0.359, "num_input_tokens_seen": 93851936, "step": 98295 }, { "epoch": 8.018598580634636, "grad_norm": 32.58598327636719, "learning_rate": 5.745420763951731e-06, "loss": 0.2755, "num_input_tokens_seen": 93856624, "step": 98300 }, { "epoch": 8.019006444245045, "grad_norm": 0.4100016951560974, "learning_rate": 5.743150768347774e-06, "loss": 0.3087, "num_input_tokens_seen": 93861408, "step": 98305 }, { "epoch": 8.019414307855453, "grad_norm": 62.81800079345703, "learning_rate": 5.740881163071971e-06, "loss": 0.4445, "num_input_tokens_seen": 93865648, "step": 98310 }, { "epoch": 8.019822171465862, "grad_norm": 35.03501510620117, "learning_rate": 5.738611948170322e-06, "loss": 0.3569, "num_input_tokens_seen": 93871632, "step": 98315 }, { "epoch": 8.02023003507627, "grad_norm": 36.65781784057617, "learning_rate": 5.736343123688825e-06, "loss": 0.2711, "num_input_tokens_seen": 93876656, "step": 98320 }, { "epoch": 8.02063789868668, "grad_norm": 1.999098777770996, "learning_rate": 5.734074689673469e-06, "loss": 0.2675, "num_input_tokens_seen": 93881408, "step": 98325 }, { "epoch": 8.021045762297089, "grad_norm": 38.90167236328125, "learning_rate": 5.731806646170224e-06, "loss": 0.2856, "num_input_tokens_seen": 93885664, "step": 98330 }, { "epoch": 8.021453625907496, "grad_norm": 0.9973719716072083, "learning_rate": 5.7295389932250755e-06, "loss": 0.3733, "num_input_tokens_seen": 93890752, "step": 98335 }, { "epoch": 8.021861489517905, "grad_norm": 0.46732866764068604, "learning_rate": 5.727271730883985e-06, "loss": 0.3544, "num_input_tokens_seen": 93895024, "step": 98340 }, { "epoch": 8.022269353128314, "grad_norm": 0.4209289252758026, "learning_rate": 5.725004859192906e-06, "loss": 0.2956, "num_input_tokens_seen": 93900400, "step": 98345 }, { "epoch": 8.022677216738723, "grad_norm": 1.7928205728530884, "learning_rate": 5.722738378197781e-06, "loss": 0.3767, "num_input_tokens_seen": 93903744, "step": 98350 }, { "epoch": 8.023085080349132, "grad_norm": 5.499029636383057, "learning_rate": 5.720472287944567e-06, "loss": 0.4422, "num_input_tokens_seen": 93908704, "step": 98355 }, { "epoch": 8.023492943959539, "grad_norm": 2.7690651416778564, "learning_rate": 5.7182065884791855e-06, "loss": 0.3103, "num_input_tokens_seen": 93913024, "step": 98360 }, { "epoch": 8.023900807569948, "grad_norm": 1.3574026823043823, "learning_rate": 5.715941279847564e-06, "loss": 0.4672, "num_input_tokens_seen": 93918096, "step": 98365 }, { "epoch": 8.024308671180357, "grad_norm": 30.775293350219727, "learning_rate": 5.713676362095619e-06, "loss": 0.2883, "num_input_tokens_seen": 93923264, "step": 98370 }, { "epoch": 8.024716534790766, "grad_norm": 0.5885896682739258, "learning_rate": 5.711411835269248e-06, "loss": 0.3262, "num_input_tokens_seen": 93928464, "step": 98375 }, { "epoch": 8.025124398401175, "grad_norm": 18.219621658325195, "learning_rate": 5.709147699414374e-06, "loss": 0.3111, "num_input_tokens_seen": 93932256, "step": 98380 }, { "epoch": 8.025532262011584, "grad_norm": 49.75662612915039, "learning_rate": 5.706883954576877e-06, "loss": 0.4624, "num_input_tokens_seen": 93937552, "step": 98385 }, { "epoch": 8.025940125621991, "grad_norm": 4.937489986419678, "learning_rate": 5.704620600802646e-06, "loss": 0.3258, "num_input_tokens_seen": 93942640, "step": 98390 }, { "epoch": 8.0263479892324, "grad_norm": 35.2008171081543, "learning_rate": 5.7023576381375485e-06, "loss": 0.2995, "num_input_tokens_seen": 93947456, "step": 98395 }, { "epoch": 8.02675585284281, "grad_norm": 0.44346627593040466, "learning_rate": 5.700095066627467e-06, "loss": 0.2175, "num_input_tokens_seen": 93952272, "step": 98400 }, { "epoch": 8.027163716453218, "grad_norm": 1.9718858003616333, "learning_rate": 5.697832886318258e-06, "loss": 0.3589, "num_input_tokens_seen": 93956304, "step": 98405 }, { "epoch": 8.027571580063627, "grad_norm": 0.6419834494590759, "learning_rate": 5.695571097255775e-06, "loss": 0.4288, "num_input_tokens_seen": 93961232, "step": 98410 }, { "epoch": 8.027979443674035, "grad_norm": 25.317819595336914, "learning_rate": 5.693309699485863e-06, "loss": 0.3186, "num_input_tokens_seen": 93965328, "step": 98415 }, { "epoch": 8.028387307284444, "grad_norm": 4.724203109741211, "learning_rate": 5.691048693054355e-06, "loss": 0.3295, "num_input_tokens_seen": 93970048, "step": 98420 }, { "epoch": 8.028795170894853, "grad_norm": 1.57478928565979, "learning_rate": 5.688788078007087e-06, "loss": 0.3622, "num_input_tokens_seen": 93974608, "step": 98425 }, { "epoch": 8.029203034505262, "grad_norm": 0.40781262516975403, "learning_rate": 5.6865278543898835e-06, "loss": 0.2636, "num_input_tokens_seen": 93980576, "step": 98430 }, { "epoch": 8.02961089811567, "grad_norm": 0.5328963994979858, "learning_rate": 5.684268022248551e-06, "loss": 0.3012, "num_input_tokens_seen": 93984496, "step": 98435 }, { "epoch": 8.03001876172608, "grad_norm": 1.6609915494918823, "learning_rate": 5.682008581628898e-06, "loss": 0.3955, "num_input_tokens_seen": 93989360, "step": 98440 }, { "epoch": 8.030426625336487, "grad_norm": 0.4843182861804962, "learning_rate": 5.679749532576722e-06, "loss": 0.2375, "num_input_tokens_seen": 93994448, "step": 98445 }, { "epoch": 8.030834488946896, "grad_norm": 38.66808319091797, "learning_rate": 5.677490875137814e-06, "loss": 0.2799, "num_input_tokens_seen": 93999280, "step": 98450 }, { "epoch": 8.031242352557305, "grad_norm": 19.23367691040039, "learning_rate": 5.675232609357956e-06, "loss": 0.2981, "num_input_tokens_seen": 94003872, "step": 98455 }, { "epoch": 8.031650216167714, "grad_norm": 6.172013282775879, "learning_rate": 5.672974735282916e-06, "loss": 0.3671, "num_input_tokens_seen": 94008752, "step": 98460 }, { "epoch": 8.032058079778123, "grad_norm": 1.66310453414917, "learning_rate": 5.670717252958469e-06, "loss": 0.4621, "num_input_tokens_seen": 94013936, "step": 98465 }, { "epoch": 8.03246594338853, "grad_norm": 2.3381404876708984, "learning_rate": 5.66846016243037e-06, "loss": 0.3753, "num_input_tokens_seen": 94017984, "step": 98470 }, { "epoch": 8.03287380699894, "grad_norm": 0.43338581919670105, "learning_rate": 5.666203463744371e-06, "loss": 0.2708, "num_input_tokens_seen": 94022624, "step": 98475 }, { "epoch": 8.033281670609348, "grad_norm": 0.9519089460372925, "learning_rate": 5.663947156946212e-06, "loss": 0.3933, "num_input_tokens_seen": 94027872, "step": 98480 }, { "epoch": 8.033689534219757, "grad_norm": 18.05579376220703, "learning_rate": 5.661691242081621e-06, "loss": 0.2964, "num_input_tokens_seen": 94032928, "step": 98485 }, { "epoch": 8.034097397830166, "grad_norm": 4.518935203552246, "learning_rate": 5.659435719196338e-06, "loss": 0.5806, "num_input_tokens_seen": 94037952, "step": 98490 }, { "epoch": 8.034505261440573, "grad_norm": 0.34502872824668884, "learning_rate": 5.657180588336072e-06, "loss": 0.263, "num_input_tokens_seen": 94042896, "step": 98495 }, { "epoch": 8.034913125050982, "grad_norm": 1.5879396200180054, "learning_rate": 5.654925849546539e-06, "loss": 0.2291, "num_input_tokens_seen": 94047888, "step": 98500 }, { "epoch": 8.035320988661391, "grad_norm": 0.8827229738235474, "learning_rate": 5.65267150287343e-06, "loss": 0.3027, "num_input_tokens_seen": 94052608, "step": 98505 }, { "epoch": 8.0357288522718, "grad_norm": 2.677570104598999, "learning_rate": 5.650417548362458e-06, "loss": 0.4519, "num_input_tokens_seen": 94056176, "step": 98510 }, { "epoch": 8.03613671588221, "grad_norm": 0.3622083365917206, "learning_rate": 5.648163986059299e-06, "loss": 0.2882, "num_input_tokens_seen": 94060784, "step": 98515 }, { "epoch": 8.036544579492618, "grad_norm": 2.805521011352539, "learning_rate": 5.645910816009633e-06, "loss": 0.2923, "num_input_tokens_seen": 94066736, "step": 98520 }, { "epoch": 8.036952443103026, "grad_norm": 1.087110161781311, "learning_rate": 5.6436580382591306e-06, "loss": 0.3051, "num_input_tokens_seen": 94071760, "step": 98525 }, { "epoch": 8.037360306713435, "grad_norm": 0.43318232893943787, "learning_rate": 5.641405652853446e-06, "loss": 0.2288, "num_input_tokens_seen": 94076400, "step": 98530 }, { "epoch": 8.037768170323844, "grad_norm": 0.4961603283882141, "learning_rate": 5.639153659838253e-06, "loss": 0.451, "num_input_tokens_seen": 94081168, "step": 98535 }, { "epoch": 8.038176033934253, "grad_norm": 0.4825020134449005, "learning_rate": 5.636902059259186e-06, "loss": 0.2464, "num_input_tokens_seen": 94084592, "step": 98540 }, { "epoch": 8.038583897544662, "grad_norm": 20.40407943725586, "learning_rate": 5.634650851161888e-06, "loss": 0.4643, "num_input_tokens_seen": 94089920, "step": 98545 }, { "epoch": 8.038991761155069, "grad_norm": 14.337997436523438, "learning_rate": 5.632400035591981e-06, "loss": 0.3937, "num_input_tokens_seen": 94095088, "step": 98550 }, { "epoch": 8.039399624765478, "grad_norm": 2.2155632972717285, "learning_rate": 5.630149612595101e-06, "loss": 0.3328, "num_input_tokens_seen": 94100832, "step": 98555 }, { "epoch": 8.039807488375887, "grad_norm": 6.439341068267822, "learning_rate": 5.6278995822168585e-06, "loss": 0.3151, "num_input_tokens_seen": 94105728, "step": 98560 }, { "epoch": 8.040215351986296, "grad_norm": 65.39741516113281, "learning_rate": 5.625649944502861e-06, "loss": 0.3003, "num_input_tokens_seen": 94110448, "step": 98565 }, { "epoch": 8.040623215596705, "grad_norm": 6.012545585632324, "learning_rate": 5.6234006994987045e-06, "loss": 0.4478, "num_input_tokens_seen": 94115552, "step": 98570 }, { "epoch": 8.041031079207114, "grad_norm": 3.7590460777282715, "learning_rate": 5.621151847249984e-06, "loss": 0.4498, "num_input_tokens_seen": 94119760, "step": 98575 }, { "epoch": 8.041438942817521, "grad_norm": 46.89328384399414, "learning_rate": 5.618903387802279e-06, "loss": 0.3617, "num_input_tokens_seen": 94124128, "step": 98580 }, { "epoch": 8.04184680642793, "grad_norm": 0.5314189195632935, "learning_rate": 5.6166553212011694e-06, "loss": 0.251, "num_input_tokens_seen": 94129216, "step": 98585 }, { "epoch": 8.04225467003834, "grad_norm": 1.600885272026062, "learning_rate": 5.614407647492217e-06, "loss": 0.3616, "num_input_tokens_seen": 94134016, "step": 98590 }, { "epoch": 8.042662533648748, "grad_norm": 0.5059406161308289, "learning_rate": 5.612160366720978e-06, "loss": 0.3366, "num_input_tokens_seen": 94137920, "step": 98595 }, { "epoch": 8.043070397259157, "grad_norm": 0.37760117650032043, "learning_rate": 5.609913478933018e-06, "loss": 0.3635, "num_input_tokens_seen": 94143184, "step": 98600 }, { "epoch": 8.043478260869565, "grad_norm": 0.7102500796318054, "learning_rate": 5.607666984173873e-06, "loss": 0.2855, "num_input_tokens_seen": 94148640, "step": 98605 }, { "epoch": 8.043886124479974, "grad_norm": 5.4191575050354, "learning_rate": 5.605420882489077e-06, "loss": 0.4733, "num_input_tokens_seen": 94153088, "step": 98610 }, { "epoch": 8.044293988090383, "grad_norm": 1.3044531345367432, "learning_rate": 5.603175173924152e-06, "loss": 0.3102, "num_input_tokens_seen": 94158784, "step": 98615 }, { "epoch": 8.044701851700792, "grad_norm": 1.736021876335144, "learning_rate": 5.600929858524631e-06, "loss": 0.2809, "num_input_tokens_seen": 94163472, "step": 98620 }, { "epoch": 8.0451097153112, "grad_norm": 10.072609901428223, "learning_rate": 5.598684936336019e-06, "loss": 0.2571, "num_input_tokens_seen": 94168000, "step": 98625 }, { "epoch": 8.045517578921608, "grad_norm": 9.789483070373535, "learning_rate": 5.596440407403816e-06, "loss": 0.3112, "num_input_tokens_seen": 94173024, "step": 98630 }, { "epoch": 8.045925442532017, "grad_norm": 5.317586898803711, "learning_rate": 5.594196271773524e-06, "loss": 0.2546, "num_input_tokens_seen": 94177744, "step": 98635 }, { "epoch": 8.046333306142426, "grad_norm": 0.8937179446220398, "learning_rate": 5.5919525294906185e-06, "loss": 0.3997, "num_input_tokens_seen": 94182640, "step": 98640 }, { "epoch": 8.046741169752835, "grad_norm": 13.124220848083496, "learning_rate": 5.589709180600597e-06, "loss": 0.3499, "num_input_tokens_seen": 94187824, "step": 98645 }, { "epoch": 8.047149033363244, "grad_norm": 0.564998984336853, "learning_rate": 5.587466225148921e-06, "loss": 0.4052, "num_input_tokens_seen": 94193296, "step": 98650 }, { "epoch": 8.047556896973653, "grad_norm": 30.635343551635742, "learning_rate": 5.585223663181058e-06, "loss": 0.3267, "num_input_tokens_seen": 94198112, "step": 98655 }, { "epoch": 8.04796476058406, "grad_norm": 6.731871128082275, "learning_rate": 5.582981494742454e-06, "loss": 0.2456, "num_input_tokens_seen": 94202096, "step": 98660 }, { "epoch": 8.048372624194469, "grad_norm": 46.95587921142578, "learning_rate": 5.580739719878569e-06, "loss": 0.3879, "num_input_tokens_seen": 94206400, "step": 98665 }, { "epoch": 8.048780487804878, "grad_norm": 14.291888236999512, "learning_rate": 5.578498338634841e-06, "loss": 0.2947, "num_input_tokens_seen": 94210800, "step": 98670 }, { "epoch": 8.049188351415287, "grad_norm": 17.376317977905273, "learning_rate": 5.5762573510566964e-06, "loss": 0.3803, "num_input_tokens_seen": 94215072, "step": 98675 }, { "epoch": 8.049596215025696, "grad_norm": 7.877410411834717, "learning_rate": 5.574016757189563e-06, "loss": 0.364, "num_input_tokens_seen": 94219232, "step": 98680 }, { "epoch": 8.050004078636103, "grad_norm": 3.013561487197876, "learning_rate": 5.571776557078848e-06, "loss": 0.3053, "num_input_tokens_seen": 94223376, "step": 98685 }, { "epoch": 8.050411942246512, "grad_norm": 1.145721435546875, "learning_rate": 5.5695367507699744e-06, "loss": 0.3335, "num_input_tokens_seen": 94228544, "step": 98690 }, { "epoch": 8.050819805856921, "grad_norm": 0.6560250520706177, "learning_rate": 5.567297338308333e-06, "loss": 0.3881, "num_input_tokens_seen": 94233712, "step": 98695 }, { "epoch": 8.05122766946733, "grad_norm": 0.4513601064682007, "learning_rate": 5.565058319739319e-06, "loss": 0.2613, "num_input_tokens_seen": 94238816, "step": 98700 }, { "epoch": 8.05163553307774, "grad_norm": 1.8022668361663818, "learning_rate": 5.562819695108312e-06, "loss": 0.3134, "num_input_tokens_seen": 94242784, "step": 98705 }, { "epoch": 8.052043396688147, "grad_norm": 4.348175048828125, "learning_rate": 5.560581464460693e-06, "loss": 0.3654, "num_input_tokens_seen": 94247056, "step": 98710 }, { "epoch": 8.052451260298556, "grad_norm": 7.921708583831787, "learning_rate": 5.558343627841825e-06, "loss": 0.3019, "num_input_tokens_seen": 94252304, "step": 98715 }, { "epoch": 8.052859123908965, "grad_norm": 2.0645461082458496, "learning_rate": 5.556106185297072e-06, "loss": 0.5075, "num_input_tokens_seen": 94258304, "step": 98720 }, { "epoch": 8.053266987519374, "grad_norm": 0.4696991741657257, "learning_rate": 5.553869136871783e-06, "loss": 0.2774, "num_input_tokens_seen": 94263632, "step": 98725 }, { "epoch": 8.053674851129783, "grad_norm": 2.644296646118164, "learning_rate": 5.551632482611299e-06, "loss": 0.2367, "num_input_tokens_seen": 94268720, "step": 98730 }, { "epoch": 8.054082714740192, "grad_norm": 11.899980545043945, "learning_rate": 5.549396222560965e-06, "loss": 0.3813, "num_input_tokens_seen": 94273200, "step": 98735 }, { "epoch": 8.054490578350599, "grad_norm": 6.843111038208008, "learning_rate": 5.547160356766107e-06, "loss": 0.3678, "num_input_tokens_seen": 94278368, "step": 98740 }, { "epoch": 8.054898441961008, "grad_norm": 1.526007056236267, "learning_rate": 5.54492488527204e-06, "loss": 0.3873, "num_input_tokens_seen": 94283024, "step": 98745 }, { "epoch": 8.055306305571417, "grad_norm": 15.371628761291504, "learning_rate": 5.5426898081240715e-06, "loss": 0.1513, "num_input_tokens_seen": 94288208, "step": 98750 }, { "epoch": 8.055714169181826, "grad_norm": 5.460968494415283, "learning_rate": 5.540455125367522e-06, "loss": 0.3926, "num_input_tokens_seen": 94291856, "step": 98755 }, { "epoch": 8.056122032792235, "grad_norm": 0.3099380433559418, "learning_rate": 5.538220837047675e-06, "loss": 0.175, "num_input_tokens_seen": 94296064, "step": 98760 }, { "epoch": 8.056529896402642, "grad_norm": 0.36643099784851074, "learning_rate": 5.535986943209825e-06, "loss": 0.2737, "num_input_tokens_seen": 94300512, "step": 98765 }, { "epoch": 8.056937760013051, "grad_norm": 20.24506378173828, "learning_rate": 5.5337534438992475e-06, "loss": 0.4466, "num_input_tokens_seen": 94304960, "step": 98770 }, { "epoch": 8.05734562362346, "grad_norm": 1.9209784269332886, "learning_rate": 5.531520339161209e-06, "loss": 0.4665, "num_input_tokens_seen": 94309280, "step": 98775 }, { "epoch": 8.05775348723387, "grad_norm": 8.43838882446289, "learning_rate": 5.529287629040988e-06, "loss": 0.3904, "num_input_tokens_seen": 94313120, "step": 98780 }, { "epoch": 8.058161350844278, "grad_norm": 2.025282621383667, "learning_rate": 5.527055313583834e-06, "loss": 0.3903, "num_input_tokens_seen": 94317696, "step": 98785 }, { "epoch": 8.058569214454687, "grad_norm": 6.1768903732299805, "learning_rate": 5.524823392834993e-06, "loss": 0.2177, "num_input_tokens_seen": 94322112, "step": 98790 }, { "epoch": 8.058977078065094, "grad_norm": 73.72755432128906, "learning_rate": 5.5225918668396995e-06, "loss": 0.3034, "num_input_tokens_seen": 94327136, "step": 98795 }, { "epoch": 8.059384941675503, "grad_norm": 8.16812801361084, "learning_rate": 5.520360735643199e-06, "loss": 0.554, "num_input_tokens_seen": 94331408, "step": 98800 }, { "epoch": 8.059792805285912, "grad_norm": 17.596738815307617, "learning_rate": 5.518129999290708e-06, "loss": 0.4266, "num_input_tokens_seen": 94336560, "step": 98805 }, { "epoch": 8.060200668896321, "grad_norm": 56.5561637878418, "learning_rate": 5.515899657827444e-06, "loss": 0.5813, "num_input_tokens_seen": 94341584, "step": 98810 }, { "epoch": 8.06060853250673, "grad_norm": 45.03914260864258, "learning_rate": 5.513669711298611e-06, "loss": 0.3877, "num_input_tokens_seen": 94346464, "step": 98815 }, { "epoch": 8.061016396117138, "grad_norm": 23.220090866088867, "learning_rate": 5.5114401597494174e-06, "loss": 0.1412, "num_input_tokens_seen": 94351840, "step": 98820 }, { "epoch": 8.061424259727547, "grad_norm": 0.8662272095680237, "learning_rate": 5.509211003225051e-06, "loss": 0.4847, "num_input_tokens_seen": 94357072, "step": 98825 }, { "epoch": 8.061832123337956, "grad_norm": 18.783275604248047, "learning_rate": 5.506982241770694e-06, "loss": 0.3827, "num_input_tokens_seen": 94362224, "step": 98830 }, { "epoch": 8.062239986948365, "grad_norm": 0.4245700538158417, "learning_rate": 5.504753875431523e-06, "loss": 0.3209, "num_input_tokens_seen": 94367776, "step": 98835 }, { "epoch": 8.062647850558774, "grad_norm": 2.5008673667907715, "learning_rate": 5.502525904252709e-06, "loss": 0.1511, "num_input_tokens_seen": 94372656, "step": 98840 }, { "epoch": 8.063055714169181, "grad_norm": 1.805451512336731, "learning_rate": 5.50029832827941e-06, "loss": 0.4173, "num_input_tokens_seen": 94377552, "step": 98845 }, { "epoch": 8.06346357777959, "grad_norm": 4.130075454711914, "learning_rate": 5.498071147556777e-06, "loss": 0.3134, "num_input_tokens_seen": 94382208, "step": 98850 }, { "epoch": 8.063871441389999, "grad_norm": 21.40778160095215, "learning_rate": 5.495844362129951e-06, "loss": 0.3621, "num_input_tokens_seen": 94387744, "step": 98855 }, { "epoch": 8.064279305000408, "grad_norm": 15.539812088012695, "learning_rate": 5.493617972044074e-06, "loss": 0.3104, "num_input_tokens_seen": 94392464, "step": 98860 }, { "epoch": 8.064687168610817, "grad_norm": 0.7437007427215576, "learning_rate": 5.491391977344276e-06, "loss": 0.356, "num_input_tokens_seen": 94397488, "step": 98865 }, { "epoch": 8.065095032221226, "grad_norm": 24.72296142578125, "learning_rate": 5.489166378075672e-06, "loss": 0.3847, "num_input_tokens_seen": 94402688, "step": 98870 }, { "epoch": 8.065502895831633, "grad_norm": 11.180670738220215, "learning_rate": 5.486941174283372e-06, "loss": 0.5845, "num_input_tokens_seen": 94407536, "step": 98875 }, { "epoch": 8.065910759442042, "grad_norm": 3.3437891006469727, "learning_rate": 5.484716366012485e-06, "loss": 0.2698, "num_input_tokens_seen": 94412064, "step": 98880 }, { "epoch": 8.066318623052451, "grad_norm": 24.278125762939453, "learning_rate": 5.4824919533081e-06, "loss": 0.4101, "num_input_tokens_seen": 94416560, "step": 98885 }, { "epoch": 8.06672648666286, "grad_norm": 0.38990676403045654, "learning_rate": 5.480267936215314e-06, "loss": 0.2489, "num_input_tokens_seen": 94420928, "step": 98890 }, { "epoch": 8.06713435027327, "grad_norm": 27.98659896850586, "learning_rate": 5.478044314779201e-06, "loss": 0.3043, "num_input_tokens_seen": 94425680, "step": 98895 }, { "epoch": 8.067542213883677, "grad_norm": 4.0474934577941895, "learning_rate": 5.475821089044836e-06, "loss": 0.1548, "num_input_tokens_seen": 94430752, "step": 98900 }, { "epoch": 8.067950077494086, "grad_norm": 9.326081275939941, "learning_rate": 5.473598259057275e-06, "loss": 0.3791, "num_input_tokens_seen": 94434944, "step": 98905 }, { "epoch": 8.068357941104495, "grad_norm": 60.52264404296875, "learning_rate": 5.471375824861589e-06, "loss": 0.516, "num_input_tokens_seen": 94439744, "step": 98910 }, { "epoch": 8.068765804714904, "grad_norm": 39.39717483520508, "learning_rate": 5.469153786502812e-06, "loss": 0.2645, "num_input_tokens_seen": 94444592, "step": 98915 }, { "epoch": 8.069173668325313, "grad_norm": 19.097156524658203, "learning_rate": 5.466932144025991e-06, "loss": 0.3466, "num_input_tokens_seen": 94448976, "step": 98920 }, { "epoch": 8.069581531935722, "grad_norm": 20.75044822692871, "learning_rate": 5.464710897476155e-06, "loss": 0.3121, "num_input_tokens_seen": 94453312, "step": 98925 }, { "epoch": 8.069989395546129, "grad_norm": 83.27706146240234, "learning_rate": 5.462490046898322e-06, "loss": 0.5582, "num_input_tokens_seen": 94457648, "step": 98930 }, { "epoch": 8.070397259156538, "grad_norm": 2.209648609161377, "learning_rate": 5.460269592337519e-06, "loss": 0.1441, "num_input_tokens_seen": 94461744, "step": 98935 }, { "epoch": 8.070805122766947, "grad_norm": 3.944462299346924, "learning_rate": 5.458049533838749e-06, "loss": 0.7508, "num_input_tokens_seen": 94466576, "step": 98940 }, { "epoch": 8.071212986377356, "grad_norm": 21.905473709106445, "learning_rate": 5.455829871447013e-06, "loss": 0.37, "num_input_tokens_seen": 94471072, "step": 98945 }, { "epoch": 8.071620849987765, "grad_norm": 0.7369412779808044, "learning_rate": 5.45361060520729e-06, "loss": 0.4271, "num_input_tokens_seen": 94476160, "step": 98950 }, { "epoch": 8.072028713598172, "grad_norm": 10.184049606323242, "learning_rate": 5.451391735164585e-06, "loss": 0.6784, "num_input_tokens_seen": 94480656, "step": 98955 }, { "epoch": 8.072436577208581, "grad_norm": 0.5888832211494446, "learning_rate": 5.4491732613638605e-06, "loss": 0.4274, "num_input_tokens_seen": 94484960, "step": 98960 }, { "epoch": 8.07284444081899, "grad_norm": 14.62613296508789, "learning_rate": 5.4469551838500885e-06, "loss": 0.5331, "num_input_tokens_seen": 94490368, "step": 98965 }, { "epoch": 8.0732523044294, "grad_norm": 2.058382749557495, "learning_rate": 5.444737502668223e-06, "loss": 0.3654, "num_input_tokens_seen": 94495904, "step": 98970 }, { "epoch": 8.073660168039808, "grad_norm": 5.688492298126221, "learning_rate": 5.442520217863215e-06, "loss": 0.2678, "num_input_tokens_seen": 94500880, "step": 98975 }, { "epoch": 8.074068031650215, "grad_norm": 8.885550498962402, "learning_rate": 5.440303329480018e-06, "loss": 0.2757, "num_input_tokens_seen": 94505312, "step": 98980 }, { "epoch": 8.074475895260624, "grad_norm": 3.594510555267334, "learning_rate": 5.438086837563561e-06, "loss": 0.366, "num_input_tokens_seen": 94509472, "step": 98985 }, { "epoch": 8.074883758871033, "grad_norm": 16.07642364501953, "learning_rate": 5.435870742158769e-06, "loss": 0.4787, "num_input_tokens_seen": 94514480, "step": 98990 }, { "epoch": 8.075291622481442, "grad_norm": 7.845526218414307, "learning_rate": 5.4336550433105645e-06, "loss": 0.562, "num_input_tokens_seen": 94518992, "step": 98995 }, { "epoch": 8.075699486091851, "grad_norm": 2.9476423263549805, "learning_rate": 5.43143974106386e-06, "loss": 0.4562, "num_input_tokens_seen": 94524320, "step": 99000 }, { "epoch": 8.07610734970226, "grad_norm": 2.765122175216675, "learning_rate": 5.429224835463556e-06, "loss": 0.2902, "num_input_tokens_seen": 94528528, "step": 99005 }, { "epoch": 8.076515213312668, "grad_norm": 18.555252075195312, "learning_rate": 5.427010326554549e-06, "loss": 0.3029, "num_input_tokens_seen": 94533920, "step": 99010 }, { "epoch": 8.076923076923077, "grad_norm": 0.40510857105255127, "learning_rate": 5.424796214381719e-06, "loss": 0.3411, "num_input_tokens_seen": 94538704, "step": 99015 }, { "epoch": 8.077330940533486, "grad_norm": 9.383584022521973, "learning_rate": 5.422582498989956e-06, "loss": 0.3742, "num_input_tokens_seen": 94543952, "step": 99020 }, { "epoch": 8.077738804143895, "grad_norm": 4.2245259284973145, "learning_rate": 5.420369180424128e-06, "loss": 0.2742, "num_input_tokens_seen": 94549728, "step": 99025 }, { "epoch": 8.078146667754304, "grad_norm": 13.58127498626709, "learning_rate": 5.4181562587290984e-06, "loss": 0.2206, "num_input_tokens_seen": 94554144, "step": 99030 }, { "epoch": 8.078554531364711, "grad_norm": 2.6468067169189453, "learning_rate": 5.4159437339497215e-06, "loss": 0.3295, "num_input_tokens_seen": 94558384, "step": 99035 }, { "epoch": 8.07896239497512, "grad_norm": 0.8514538407325745, "learning_rate": 5.4137316061308344e-06, "loss": 0.2595, "num_input_tokens_seen": 94563472, "step": 99040 }, { "epoch": 8.079370258585529, "grad_norm": 1.6222532987594604, "learning_rate": 5.411519875317292e-06, "loss": 0.3784, "num_input_tokens_seen": 94568096, "step": 99045 }, { "epoch": 8.079778122195938, "grad_norm": 28.381017684936523, "learning_rate": 5.409308541553918e-06, "loss": 0.4028, "num_input_tokens_seen": 94573760, "step": 99050 }, { "epoch": 8.080185985806347, "grad_norm": 1.4283571243286133, "learning_rate": 5.407097604885533e-06, "loss": 0.206, "num_input_tokens_seen": 94579184, "step": 99055 }, { "epoch": 8.080593849416754, "grad_norm": 49.3770866394043, "learning_rate": 5.40488706535695e-06, "loss": 0.4687, "num_input_tokens_seen": 94582928, "step": 99060 }, { "epoch": 8.081001713027163, "grad_norm": 10.829076766967773, "learning_rate": 5.402676923012986e-06, "loss": 0.2705, "num_input_tokens_seen": 94587184, "step": 99065 }, { "epoch": 8.081409576637572, "grad_norm": 4.202199459075928, "learning_rate": 5.400467177898433e-06, "loss": 0.4066, "num_input_tokens_seen": 94591664, "step": 99070 }, { "epoch": 8.081817440247981, "grad_norm": 4.754466533660889, "learning_rate": 5.398257830058085e-06, "loss": 0.4046, "num_input_tokens_seen": 94596096, "step": 99075 }, { "epoch": 8.08222530385839, "grad_norm": 2.633226156234741, "learning_rate": 5.396048879536716e-06, "loss": 0.2406, "num_input_tokens_seen": 94601280, "step": 99080 }, { "epoch": 8.0826331674688, "grad_norm": 6.579080581665039, "learning_rate": 5.393840326379105e-06, "loss": 0.411, "num_input_tokens_seen": 94606016, "step": 99085 }, { "epoch": 8.083041031079206, "grad_norm": 2.2622158527374268, "learning_rate": 5.39163217063002e-06, "loss": 0.4362, "num_input_tokens_seen": 94611312, "step": 99090 }, { "epoch": 8.083448894689615, "grad_norm": 40.468441009521484, "learning_rate": 5.389424412334224e-06, "loss": 0.3411, "num_input_tokens_seen": 94615216, "step": 99095 }, { "epoch": 8.083856758300024, "grad_norm": 4.1941962242126465, "learning_rate": 5.3872170515364615e-06, "loss": 0.4167, "num_input_tokens_seen": 94619712, "step": 99100 }, { "epoch": 8.084264621910433, "grad_norm": 7.461925983428955, "learning_rate": 5.385010088281467e-06, "loss": 0.3517, "num_input_tokens_seen": 94625024, "step": 99105 }, { "epoch": 8.084672485520842, "grad_norm": 4.4564971923828125, "learning_rate": 5.382803522613988e-06, "loss": 0.418, "num_input_tokens_seen": 94630176, "step": 99110 }, { "epoch": 8.08508034913125, "grad_norm": 1.8816035985946655, "learning_rate": 5.380597354578748e-06, "loss": 0.1887, "num_input_tokens_seen": 94635168, "step": 99115 }, { "epoch": 8.085488212741659, "grad_norm": 8.589498519897461, "learning_rate": 5.378391584220463e-06, "loss": 0.4269, "num_input_tokens_seen": 94640688, "step": 99120 }, { "epoch": 8.085896076352068, "grad_norm": 3.2699012756347656, "learning_rate": 5.3761862115838415e-06, "loss": 0.2233, "num_input_tokens_seen": 94645520, "step": 99125 }, { "epoch": 8.086303939962477, "grad_norm": 2.782006025314331, "learning_rate": 5.37398123671359e-06, "loss": 0.4031, "num_input_tokens_seen": 94650368, "step": 99130 }, { "epoch": 8.086711803572886, "grad_norm": 10.039647102355957, "learning_rate": 5.371776659654396e-06, "loss": 0.4376, "num_input_tokens_seen": 94656272, "step": 99135 }, { "epoch": 8.087119667183295, "grad_norm": 1.4831832647323608, "learning_rate": 5.369572480450951e-06, "loss": 0.2021, "num_input_tokens_seen": 94662368, "step": 99140 }, { "epoch": 8.087527530793702, "grad_norm": 45.1295051574707, "learning_rate": 5.367368699147929e-06, "loss": 0.4557, "num_input_tokens_seen": 94667424, "step": 99145 }, { "epoch": 8.087935394404111, "grad_norm": 3.1343770027160645, "learning_rate": 5.365165315789994e-06, "loss": 0.2723, "num_input_tokens_seen": 94671616, "step": 99150 }, { "epoch": 8.08834325801452, "grad_norm": 8.104714393615723, "learning_rate": 5.362962330421822e-06, "loss": 0.3979, "num_input_tokens_seen": 94677776, "step": 99155 }, { "epoch": 8.088751121624929, "grad_norm": 13.416653633117676, "learning_rate": 5.360759743088059e-06, "loss": 0.4503, "num_input_tokens_seen": 94682992, "step": 99160 }, { "epoch": 8.089158985235338, "grad_norm": 37.374813079833984, "learning_rate": 5.358557553833351e-06, "loss": 0.3069, "num_input_tokens_seen": 94688016, "step": 99165 }, { "epoch": 8.089566848845745, "grad_norm": 7.160991191864014, "learning_rate": 5.356355762702336e-06, "loss": 0.2384, "num_input_tokens_seen": 94692960, "step": 99170 }, { "epoch": 8.089974712456154, "grad_norm": 12.931781768798828, "learning_rate": 5.354154369739634e-06, "loss": 0.5027, "num_input_tokens_seen": 94698048, "step": 99175 }, { "epoch": 8.090382576066563, "grad_norm": 9.395130157470703, "learning_rate": 5.351953374989882e-06, "loss": 0.3547, "num_input_tokens_seen": 94701888, "step": 99180 }, { "epoch": 8.090790439676972, "grad_norm": 12.82552433013916, "learning_rate": 5.349752778497688e-06, "loss": 0.4357, "num_input_tokens_seen": 94706240, "step": 99185 }, { "epoch": 8.091198303287381, "grad_norm": 3.5293772220611572, "learning_rate": 5.347552580307655e-06, "loss": 0.4039, "num_input_tokens_seen": 94711280, "step": 99190 }, { "epoch": 8.091606166897789, "grad_norm": 0.8455134630203247, "learning_rate": 5.345352780464371e-06, "loss": 0.2896, "num_input_tokens_seen": 94716768, "step": 99195 }, { "epoch": 8.092014030508198, "grad_norm": 18.107255935668945, "learning_rate": 5.343153379012444e-06, "loss": 0.4087, "num_input_tokens_seen": 94721984, "step": 99200 }, { "epoch": 8.092421894118607, "grad_norm": 1.3514763116836548, "learning_rate": 5.340954375996443e-06, "loss": 0.2725, "num_input_tokens_seen": 94726256, "step": 99205 }, { "epoch": 8.092829757729016, "grad_norm": 4.284788131713867, "learning_rate": 5.338755771460943e-06, "loss": 0.3262, "num_input_tokens_seen": 94731744, "step": 99210 }, { "epoch": 8.093237621339425, "grad_norm": 35.82036209106445, "learning_rate": 5.336557565450506e-06, "loss": 0.439, "num_input_tokens_seen": 94736720, "step": 99215 }, { "epoch": 8.093645484949834, "grad_norm": 2.2428507804870605, "learning_rate": 5.3343597580096935e-06, "loss": 0.3311, "num_input_tokens_seen": 94741920, "step": 99220 }, { "epoch": 8.09405334856024, "grad_norm": 2.0567498207092285, "learning_rate": 5.332162349183057e-06, "loss": 0.3579, "num_input_tokens_seen": 94747056, "step": 99225 }, { "epoch": 8.09446121217065, "grad_norm": 39.9378662109375, "learning_rate": 5.3299653390151275e-06, "loss": 0.34, "num_input_tokens_seen": 94751072, "step": 99230 }, { "epoch": 8.094869075781059, "grad_norm": 8.021913528442383, "learning_rate": 5.3277687275504475e-06, "loss": 0.4977, "num_input_tokens_seen": 94756032, "step": 99235 }, { "epoch": 8.095276939391468, "grad_norm": 40.61300277709961, "learning_rate": 5.325572514833524e-06, "loss": 0.375, "num_input_tokens_seen": 94760768, "step": 99240 }, { "epoch": 8.095684803001877, "grad_norm": 8.129118919372559, "learning_rate": 5.323376700908897e-06, "loss": 0.436, "num_input_tokens_seen": 94764976, "step": 99245 }, { "epoch": 8.096092666612284, "grad_norm": 42.56023025512695, "learning_rate": 5.321181285821061e-06, "loss": 0.3539, "num_input_tokens_seen": 94769456, "step": 99250 }, { "epoch": 8.096500530222693, "grad_norm": 7.598016262054443, "learning_rate": 5.318986269614518e-06, "loss": 0.3528, "num_input_tokens_seen": 94774480, "step": 99255 }, { "epoch": 8.096908393833102, "grad_norm": 15.51065444946289, "learning_rate": 5.316791652333761e-06, "loss": 0.3551, "num_input_tokens_seen": 94779376, "step": 99260 }, { "epoch": 8.097316257443511, "grad_norm": 10.528168678283691, "learning_rate": 5.314597434023272e-06, "loss": 0.1937, "num_input_tokens_seen": 94783792, "step": 99265 }, { "epoch": 8.09772412105392, "grad_norm": 4.148336410522461, "learning_rate": 5.312403614727529e-06, "loss": 0.3985, "num_input_tokens_seen": 94788528, "step": 99270 }, { "epoch": 8.098131984664327, "grad_norm": 8.779085159301758, "learning_rate": 5.3102101944909995e-06, "loss": 0.3373, "num_input_tokens_seen": 94793472, "step": 99275 }, { "epoch": 8.098539848274736, "grad_norm": 8.829812049865723, "learning_rate": 5.308017173358143e-06, "loss": 0.2648, "num_input_tokens_seen": 94798656, "step": 99280 }, { "epoch": 8.098947711885145, "grad_norm": 86.14661407470703, "learning_rate": 5.305824551373403e-06, "loss": 0.4072, "num_input_tokens_seen": 94803392, "step": 99285 }, { "epoch": 8.099355575495554, "grad_norm": 0.517820417881012, "learning_rate": 5.303632328581237e-06, "loss": 0.3567, "num_input_tokens_seen": 94808128, "step": 99290 }, { "epoch": 8.099763439105963, "grad_norm": 2.4860894680023193, "learning_rate": 5.3014405050260755e-06, "loss": 0.2304, "num_input_tokens_seen": 94813056, "step": 99295 }, { "epoch": 8.100171302716372, "grad_norm": 3.7532036304473877, "learning_rate": 5.299249080752344e-06, "loss": 0.2983, "num_input_tokens_seen": 94817888, "step": 99300 }, { "epoch": 8.10057916632678, "grad_norm": 1.990485429763794, "learning_rate": 5.297058055804455e-06, "loss": 0.2815, "num_input_tokens_seen": 94822832, "step": 99305 }, { "epoch": 8.100987029937189, "grad_norm": 3.1089258193969727, "learning_rate": 5.294867430226835e-06, "loss": 0.3426, "num_input_tokens_seen": 94827456, "step": 99310 }, { "epoch": 8.101394893547598, "grad_norm": 4.406589984893799, "learning_rate": 5.292677204063878e-06, "loss": 0.2502, "num_input_tokens_seen": 94832080, "step": 99315 }, { "epoch": 8.101802757158007, "grad_norm": 45.29235076904297, "learning_rate": 5.290487377359979e-06, "loss": 0.36, "num_input_tokens_seen": 94837248, "step": 99320 }, { "epoch": 8.102210620768416, "grad_norm": 3.36026930809021, "learning_rate": 5.288297950159526e-06, "loss": 0.3874, "num_input_tokens_seen": 94841728, "step": 99325 }, { "epoch": 8.102618484378823, "grad_norm": 3.962428092956543, "learning_rate": 5.286108922506894e-06, "loss": 0.3885, "num_input_tokens_seen": 94847376, "step": 99330 }, { "epoch": 8.103026347989232, "grad_norm": 9.720333099365234, "learning_rate": 5.283920294446462e-06, "loss": 0.2295, "num_input_tokens_seen": 94852240, "step": 99335 }, { "epoch": 8.103434211599641, "grad_norm": 46.6651496887207, "learning_rate": 5.281732066022588e-06, "loss": 0.4275, "num_input_tokens_seen": 94857088, "step": 99340 }, { "epoch": 8.10384207521005, "grad_norm": 27.37666130065918, "learning_rate": 5.279544237279624e-06, "loss": 0.2016, "num_input_tokens_seen": 94862656, "step": 99345 }, { "epoch": 8.104249938820459, "grad_norm": 1.7409051656723022, "learning_rate": 5.277356808261913e-06, "loss": 0.3699, "num_input_tokens_seen": 94868112, "step": 99350 }, { "epoch": 8.104657802430868, "grad_norm": 25.687578201293945, "learning_rate": 5.2751697790138054e-06, "loss": 0.3433, "num_input_tokens_seen": 94874032, "step": 99355 }, { "epoch": 8.105065666041275, "grad_norm": 34.463375091552734, "learning_rate": 5.272983149579627e-06, "loss": 0.5586, "num_input_tokens_seen": 94878608, "step": 99360 }, { "epoch": 8.105473529651684, "grad_norm": 8.664554595947266, "learning_rate": 5.270796920003696e-06, "loss": 0.5048, "num_input_tokens_seen": 94883296, "step": 99365 }, { "epoch": 8.105881393262093, "grad_norm": 28.586101531982422, "learning_rate": 5.268611090330322e-06, "loss": 0.4502, "num_input_tokens_seen": 94887808, "step": 99370 }, { "epoch": 8.106289256872502, "grad_norm": 2.6701457500457764, "learning_rate": 5.266425660603821e-06, "loss": 0.3297, "num_input_tokens_seen": 94892720, "step": 99375 }, { "epoch": 8.106697120482911, "grad_norm": 26.856590270996094, "learning_rate": 5.264240630868489e-06, "loss": 0.3399, "num_input_tokens_seen": 94898512, "step": 99380 }, { "epoch": 8.107104984093318, "grad_norm": 12.350048065185547, "learning_rate": 5.262056001168614e-06, "loss": 0.3361, "num_input_tokens_seen": 94903248, "step": 99385 }, { "epoch": 8.107512847703727, "grad_norm": 3.32112979888916, "learning_rate": 5.259871771548475e-06, "loss": 0.4604, "num_input_tokens_seen": 94907744, "step": 99390 }, { "epoch": 8.107920711314136, "grad_norm": 1.3127504587173462, "learning_rate": 5.257687942052348e-06, "loss": 0.3235, "num_input_tokens_seen": 94912464, "step": 99395 }, { "epoch": 8.108328574924546, "grad_norm": 3.9792022705078125, "learning_rate": 5.255504512724496e-06, "loss": 0.3276, "num_input_tokens_seen": 94916688, "step": 99400 }, { "epoch": 8.108736438534955, "grad_norm": 6.7596435546875, "learning_rate": 5.253321483609178e-06, "loss": 0.3843, "num_input_tokens_seen": 94921152, "step": 99405 }, { "epoch": 8.109144302145362, "grad_norm": 6.520021438598633, "learning_rate": 5.251138854750642e-06, "loss": 0.517, "num_input_tokens_seen": 94925968, "step": 99410 }, { "epoch": 8.10955216575577, "grad_norm": 17.26279640197754, "learning_rate": 5.248956626193124e-06, "loss": 0.5116, "num_input_tokens_seen": 94930400, "step": 99415 }, { "epoch": 8.10996002936618, "grad_norm": 2.778486490249634, "learning_rate": 5.246774797980869e-06, "loss": 0.2668, "num_input_tokens_seen": 94935264, "step": 99420 }, { "epoch": 8.110367892976589, "grad_norm": 16.17957305908203, "learning_rate": 5.244593370158096e-06, "loss": 0.34, "num_input_tokens_seen": 94940096, "step": 99425 }, { "epoch": 8.110775756586998, "grad_norm": 8.4788236618042, "learning_rate": 5.2424123427690204e-06, "loss": 0.4489, "num_input_tokens_seen": 94944704, "step": 99430 }, { "epoch": 8.111183620197407, "grad_norm": 12.2962007522583, "learning_rate": 5.2402317158578515e-06, "loss": 0.3339, "num_input_tokens_seen": 94949616, "step": 99435 }, { "epoch": 8.111591483807814, "grad_norm": 18.00261878967285, "learning_rate": 5.2380514894687815e-06, "loss": 0.2893, "num_input_tokens_seen": 94953792, "step": 99440 }, { "epoch": 8.111999347418223, "grad_norm": 24.756404876708984, "learning_rate": 5.235871663646016e-06, "loss": 0.3731, "num_input_tokens_seen": 94957984, "step": 99445 }, { "epoch": 8.112407211028632, "grad_norm": 9.492887496948242, "learning_rate": 5.233692238433738e-06, "loss": 0.2922, "num_input_tokens_seen": 94962928, "step": 99450 }, { "epoch": 8.112815074639041, "grad_norm": 2.049790382385254, "learning_rate": 5.231513213876116e-06, "loss": 0.4861, "num_input_tokens_seen": 94967936, "step": 99455 }, { "epoch": 8.11322293824945, "grad_norm": 7.386457443237305, "learning_rate": 5.229334590017312e-06, "loss": 0.2336, "num_input_tokens_seen": 94972336, "step": 99460 }, { "epoch": 8.113630801859857, "grad_norm": 3.9832921028137207, "learning_rate": 5.2271563669015065e-06, "loss": 0.4451, "num_input_tokens_seen": 94977376, "step": 99465 }, { "epoch": 8.114038665470266, "grad_norm": 33.780609130859375, "learning_rate": 5.2249785445728366e-06, "loss": 0.4645, "num_input_tokens_seen": 94982720, "step": 99470 }, { "epoch": 8.114446529080675, "grad_norm": 1.7314397096633911, "learning_rate": 5.22280112307545e-06, "loss": 0.3902, "num_input_tokens_seen": 94987360, "step": 99475 }, { "epoch": 8.114854392691084, "grad_norm": 8.364965438842773, "learning_rate": 5.220624102453478e-06, "loss": 0.374, "num_input_tokens_seen": 94991856, "step": 99480 }, { "epoch": 8.115262256301493, "grad_norm": 10.043447494506836, "learning_rate": 5.218447482751046e-06, "loss": 0.3484, "num_input_tokens_seen": 94996304, "step": 99485 }, { "epoch": 8.1156701199119, "grad_norm": 10.237276077270508, "learning_rate": 5.216271264012285e-06, "loss": 0.3793, "num_input_tokens_seen": 95000672, "step": 99490 }, { "epoch": 8.11607798352231, "grad_norm": 3.520388126373291, "learning_rate": 5.2140954462812975e-06, "loss": 0.2233, "num_input_tokens_seen": 95004640, "step": 99495 }, { "epoch": 8.116485847132719, "grad_norm": 17.142959594726562, "learning_rate": 5.211920029602191e-06, "loss": 0.4209, "num_input_tokens_seen": 95009104, "step": 99500 }, { "epoch": 8.116893710743128, "grad_norm": 4.369042873382568, "learning_rate": 5.209745014019046e-06, "loss": 0.2709, "num_input_tokens_seen": 95012992, "step": 99505 }, { "epoch": 8.117301574353537, "grad_norm": 13.149910926818848, "learning_rate": 5.20757039957597e-06, "loss": 0.3757, "num_input_tokens_seen": 95018912, "step": 99510 }, { "epoch": 8.117709437963946, "grad_norm": 6.157339572906494, "learning_rate": 5.20539618631703e-06, "loss": 0.3566, "num_input_tokens_seen": 95023840, "step": 99515 }, { "epoch": 8.118117301574353, "grad_norm": 12.607356071472168, "learning_rate": 5.2032223742862985e-06, "loss": 0.3626, "num_input_tokens_seen": 95027888, "step": 99520 }, { "epoch": 8.118525165184762, "grad_norm": 4.165176868438721, "learning_rate": 5.2010489635278384e-06, "loss": 0.3976, "num_input_tokens_seen": 95033152, "step": 99525 }, { "epoch": 8.11893302879517, "grad_norm": 7.867062568664551, "learning_rate": 5.1988759540856995e-06, "loss": 0.4155, "num_input_tokens_seen": 95036960, "step": 99530 }, { "epoch": 8.11934089240558, "grad_norm": 29.382383346557617, "learning_rate": 5.196703346003933e-06, "loss": 0.3328, "num_input_tokens_seen": 95041712, "step": 99535 }, { "epoch": 8.119748756015989, "grad_norm": 37.16123580932617, "learning_rate": 5.194531139326575e-06, "loss": 0.3047, "num_input_tokens_seen": 95046208, "step": 99540 }, { "epoch": 8.120156619626396, "grad_norm": 55.09003448486328, "learning_rate": 5.1923593340976575e-06, "loss": 0.391, "num_input_tokens_seen": 95050704, "step": 99545 }, { "epoch": 8.120564483236805, "grad_norm": 45.78192901611328, "learning_rate": 5.1901879303611875e-06, "loss": 0.3018, "num_input_tokens_seen": 95055648, "step": 99550 }, { "epoch": 8.120972346847214, "grad_norm": 29.09649658203125, "learning_rate": 5.188016928161202e-06, "loss": 0.2961, "num_input_tokens_seen": 95060944, "step": 99555 }, { "epoch": 8.121380210457623, "grad_norm": 4.837088584899902, "learning_rate": 5.185846327541691e-06, "loss": 0.4184, "num_input_tokens_seen": 95066544, "step": 99560 }, { "epoch": 8.121788074068032, "grad_norm": 2.8098292350769043, "learning_rate": 5.183676128546658e-06, "loss": 0.2665, "num_input_tokens_seen": 95071248, "step": 99565 }, { "epoch": 8.122195937678441, "grad_norm": 12.943857192993164, "learning_rate": 5.181506331220082e-06, "loss": 0.2808, "num_input_tokens_seen": 95075920, "step": 99570 }, { "epoch": 8.122603801288848, "grad_norm": 26.545724868774414, "learning_rate": 5.179336935605958e-06, "loss": 0.3243, "num_input_tokens_seen": 95080624, "step": 99575 }, { "epoch": 8.123011664899257, "grad_norm": 7.054232120513916, "learning_rate": 5.177167941748251e-06, "loss": 0.3072, "num_input_tokens_seen": 95085232, "step": 99580 }, { "epoch": 8.123419528509666, "grad_norm": 2.7696549892425537, "learning_rate": 5.174999349690926e-06, "loss": 0.2991, "num_input_tokens_seen": 95090048, "step": 99585 }, { "epoch": 8.123827392120075, "grad_norm": 9.46561336517334, "learning_rate": 5.1728311594779425e-06, "loss": 0.283, "num_input_tokens_seen": 95095696, "step": 99590 }, { "epoch": 8.124235255730484, "grad_norm": 27.394699096679688, "learning_rate": 5.170663371153239e-06, "loss": 0.4265, "num_input_tokens_seen": 95100816, "step": 99595 }, { "epoch": 8.124643119340892, "grad_norm": 12.302408218383789, "learning_rate": 5.168495984760768e-06, "loss": 0.337, "num_input_tokens_seen": 95105552, "step": 99600 }, { "epoch": 8.1250509829513, "grad_norm": 1.1198358535766602, "learning_rate": 5.166329000344458e-06, "loss": 0.3633, "num_input_tokens_seen": 95109392, "step": 99605 }, { "epoch": 8.12545884656171, "grad_norm": 29.982637405395508, "learning_rate": 5.164162417948231e-06, "loss": 0.3231, "num_input_tokens_seen": 95113840, "step": 99610 }, { "epoch": 8.125866710172119, "grad_norm": 30.262893676757812, "learning_rate": 5.161996237615996e-06, "loss": 0.3325, "num_input_tokens_seen": 95117920, "step": 99615 }, { "epoch": 8.126274573782528, "grad_norm": 26.496936798095703, "learning_rate": 5.159830459391674e-06, "loss": 0.3619, "num_input_tokens_seen": 95122576, "step": 99620 }, { "epoch": 8.126682437392935, "grad_norm": 13.334745407104492, "learning_rate": 5.157665083319157e-06, "loss": 0.4508, "num_input_tokens_seen": 95127648, "step": 99625 }, { "epoch": 8.127090301003344, "grad_norm": 2.8731818199157715, "learning_rate": 5.15550010944234e-06, "loss": 0.3657, "num_input_tokens_seen": 95132400, "step": 99630 }, { "epoch": 8.127498164613753, "grad_norm": 1.1041196584701538, "learning_rate": 5.153335537805104e-06, "loss": 0.3772, "num_input_tokens_seen": 95137440, "step": 99635 }, { "epoch": 8.127906028224162, "grad_norm": 22.93071746826172, "learning_rate": 5.151171368451313e-06, "loss": 0.4846, "num_input_tokens_seen": 95141888, "step": 99640 }, { "epoch": 8.128313891834571, "grad_norm": 2.4613332748413086, "learning_rate": 5.1490076014248515e-06, "loss": 0.28, "num_input_tokens_seen": 95145952, "step": 99645 }, { "epoch": 8.12872175544498, "grad_norm": 2.1730830669403076, "learning_rate": 5.146844236769574e-06, "loss": 0.3728, "num_input_tokens_seen": 95151664, "step": 99650 }, { "epoch": 8.129129619055387, "grad_norm": 8.555667877197266, "learning_rate": 5.144681274529326e-06, "loss": 0.3707, "num_input_tokens_seen": 95156976, "step": 99655 }, { "epoch": 8.129537482665796, "grad_norm": 23.86290168762207, "learning_rate": 5.142518714747949e-06, "loss": 0.2701, "num_input_tokens_seen": 95160912, "step": 99660 }, { "epoch": 8.129945346276205, "grad_norm": 6.361480236053467, "learning_rate": 5.140356557469281e-06, "loss": 0.283, "num_input_tokens_seen": 95164944, "step": 99665 }, { "epoch": 8.130353209886614, "grad_norm": 26.877941131591797, "learning_rate": 5.138194802737142e-06, "loss": 0.4009, "num_input_tokens_seen": 95169552, "step": 99670 }, { "epoch": 8.130761073497023, "grad_norm": 0.9384047985076904, "learning_rate": 5.13603345059536e-06, "loss": 0.3207, "num_input_tokens_seen": 95174160, "step": 99675 }, { "epoch": 8.13116893710743, "grad_norm": 18.315013885498047, "learning_rate": 5.133872501087738e-06, "loss": 0.3309, "num_input_tokens_seen": 95178448, "step": 99680 }, { "epoch": 8.13157680071784, "grad_norm": 11.517472267150879, "learning_rate": 5.131711954258081e-06, "loss": 0.3759, "num_input_tokens_seen": 95183248, "step": 99685 }, { "epoch": 8.131984664328249, "grad_norm": 9.874896049499512, "learning_rate": 5.129551810150179e-06, "loss": 0.378, "num_input_tokens_seen": 95187904, "step": 99690 }, { "epoch": 8.132392527938658, "grad_norm": 7.866204261779785, "learning_rate": 5.127392068807818e-06, "loss": 0.4932, "num_input_tokens_seen": 95192464, "step": 99695 }, { "epoch": 8.132800391549067, "grad_norm": 10.62039566040039, "learning_rate": 5.125232730274776e-06, "loss": 0.287, "num_input_tokens_seen": 95197520, "step": 99700 }, { "epoch": 8.133208255159476, "grad_norm": 20.076263427734375, "learning_rate": 5.1230737945948125e-06, "loss": 0.2901, "num_input_tokens_seen": 95201824, "step": 99705 }, { "epoch": 8.133616118769883, "grad_norm": 58.565826416015625, "learning_rate": 5.120915261811707e-06, "loss": 0.3885, "num_input_tokens_seen": 95206064, "step": 99710 }, { "epoch": 8.134023982380292, "grad_norm": 2.4944849014282227, "learning_rate": 5.118757131969201e-06, "loss": 0.4141, "num_input_tokens_seen": 95211424, "step": 99715 }, { "epoch": 8.1344318459907, "grad_norm": 33.817962646484375, "learning_rate": 5.116599405111039e-06, "loss": 0.4785, "num_input_tokens_seen": 95216832, "step": 99720 }, { "epoch": 8.13483970960111, "grad_norm": 33.996681213378906, "learning_rate": 5.114442081280959e-06, "loss": 0.4276, "num_input_tokens_seen": 95221904, "step": 99725 }, { "epoch": 8.135247573211519, "grad_norm": 1.9522188901901245, "learning_rate": 5.112285160522681e-06, "loss": 0.5911, "num_input_tokens_seen": 95226912, "step": 99730 }, { "epoch": 8.135655436821926, "grad_norm": 13.3911771774292, "learning_rate": 5.110128642879938e-06, "loss": 0.4554, "num_input_tokens_seen": 95231600, "step": 99735 }, { "epoch": 8.136063300432335, "grad_norm": 7.201084613800049, "learning_rate": 5.107972528396435e-06, "loss": 0.3105, "num_input_tokens_seen": 95236304, "step": 99740 }, { "epoch": 8.136471164042744, "grad_norm": 1.9394431114196777, "learning_rate": 5.10581681711588e-06, "loss": 0.4142, "num_input_tokens_seen": 95240880, "step": 99745 }, { "epoch": 8.136879027653153, "grad_norm": 9.408235549926758, "learning_rate": 5.103661509081953e-06, "loss": 0.5073, "num_input_tokens_seen": 95245552, "step": 99750 }, { "epoch": 8.137286891263562, "grad_norm": 4.350893497467041, "learning_rate": 5.101506604338363e-06, "loss": 0.5157, "num_input_tokens_seen": 95249968, "step": 99755 }, { "epoch": 8.13769475487397, "grad_norm": 1.1161211729049683, "learning_rate": 5.099352102928778e-06, "loss": 0.2841, "num_input_tokens_seen": 95253824, "step": 99760 }, { "epoch": 8.138102618484378, "grad_norm": 10.032445907592773, "learning_rate": 5.097198004896869e-06, "loss": 0.4663, "num_input_tokens_seen": 95258256, "step": 99765 }, { "epoch": 8.138510482094787, "grad_norm": 21.291296005249023, "learning_rate": 5.095044310286292e-06, "loss": 0.3709, "num_input_tokens_seen": 95262848, "step": 99770 }, { "epoch": 8.138918345705196, "grad_norm": 3.118600845336914, "learning_rate": 5.0928910191407164e-06, "loss": 0.3081, "num_input_tokens_seen": 95267088, "step": 99775 }, { "epoch": 8.139326209315605, "grad_norm": 11.002099990844727, "learning_rate": 5.09073813150378e-06, "loss": 0.5397, "num_input_tokens_seen": 95271344, "step": 99780 }, { "epoch": 8.139734072926014, "grad_norm": 11.211435317993164, "learning_rate": 5.088585647419122e-06, "loss": 0.5198, "num_input_tokens_seen": 95276176, "step": 99785 }, { "epoch": 8.140141936536422, "grad_norm": 33.140933990478516, "learning_rate": 5.0864335669303735e-06, "loss": 0.2801, "num_input_tokens_seen": 95280592, "step": 99790 }, { "epoch": 8.14054980014683, "grad_norm": 50.85379409790039, "learning_rate": 5.0842818900811465e-06, "loss": 0.4763, "num_input_tokens_seen": 95285040, "step": 99795 }, { "epoch": 8.14095766375724, "grad_norm": 37.146785736083984, "learning_rate": 5.082130616915071e-06, "loss": 0.346, "num_input_tokens_seen": 95290160, "step": 99800 }, { "epoch": 8.141365527367649, "grad_norm": 6.1976447105407715, "learning_rate": 5.079979747475741e-06, "loss": 0.4009, "num_input_tokens_seen": 95295104, "step": 99805 }, { "epoch": 8.141773390978058, "grad_norm": 30.61247444152832, "learning_rate": 5.077829281806759e-06, "loss": 0.4075, "num_input_tokens_seen": 95299872, "step": 99810 }, { "epoch": 8.142181254588465, "grad_norm": 0.8979613184928894, "learning_rate": 5.075679219951712e-06, "loss": 0.3796, "num_input_tokens_seen": 95304448, "step": 99815 }, { "epoch": 8.142589118198874, "grad_norm": 1.4147623777389526, "learning_rate": 5.073529561954182e-06, "loss": 0.3863, "num_input_tokens_seen": 95309456, "step": 99820 }, { "epoch": 8.142996981809283, "grad_norm": 25.193721771240234, "learning_rate": 5.0713803078577395e-06, "loss": 0.356, "num_input_tokens_seen": 95314928, "step": 99825 }, { "epoch": 8.143404845419692, "grad_norm": 19.9426212310791, "learning_rate": 5.06923145770595e-06, "loss": 0.3279, "num_input_tokens_seen": 95319328, "step": 99830 }, { "epoch": 8.143812709030101, "grad_norm": 2.1003053188323975, "learning_rate": 5.06708301154237e-06, "loss": 0.3231, "num_input_tokens_seen": 95324432, "step": 99835 }, { "epoch": 8.14422057264051, "grad_norm": 1.559498906135559, "learning_rate": 5.06493496941054e-06, "loss": 0.3138, "num_input_tokens_seen": 95329200, "step": 99840 }, { "epoch": 8.144628436250917, "grad_norm": 4.976739883422852, "learning_rate": 5.062787331354016e-06, "loss": 0.2945, "num_input_tokens_seen": 95334864, "step": 99845 }, { "epoch": 8.145036299861326, "grad_norm": 16.978670120239258, "learning_rate": 5.06064009741632e-06, "loss": 0.4132, "num_input_tokens_seen": 95339792, "step": 99850 }, { "epoch": 8.145444163471735, "grad_norm": 76.65135192871094, "learning_rate": 5.058493267640974e-06, "loss": 0.3985, "num_input_tokens_seen": 95345440, "step": 99855 }, { "epoch": 8.145852027082144, "grad_norm": 14.355690956115723, "learning_rate": 5.056346842071491e-06, "loss": 0.2962, "num_input_tokens_seen": 95350224, "step": 99860 }, { "epoch": 8.146259890692553, "grad_norm": 4.849240303039551, "learning_rate": 5.05420082075139e-06, "loss": 0.293, "num_input_tokens_seen": 95355328, "step": 99865 }, { "epoch": 8.14666775430296, "grad_norm": 7.541445255279541, "learning_rate": 5.052055203724163e-06, "loss": 0.335, "num_input_tokens_seen": 95359984, "step": 99870 }, { "epoch": 8.14707561791337, "grad_norm": 10.765009880065918, "learning_rate": 5.0499099910332995e-06, "loss": 0.1942, "num_input_tokens_seen": 95364784, "step": 99875 }, { "epoch": 8.147483481523778, "grad_norm": 13.503479957580566, "learning_rate": 5.047765182722283e-06, "loss": 0.433, "num_input_tokens_seen": 95369808, "step": 99880 }, { "epoch": 8.147891345134187, "grad_norm": 2.4631097316741943, "learning_rate": 5.0456207788345835e-06, "loss": 0.3424, "num_input_tokens_seen": 95374496, "step": 99885 }, { "epoch": 8.148299208744596, "grad_norm": 9.340547561645508, "learning_rate": 5.043476779413675e-06, "loss": 0.2535, "num_input_tokens_seen": 95379168, "step": 99890 }, { "epoch": 8.148707072355004, "grad_norm": 3.077887773513794, "learning_rate": 5.0413331845030135e-06, "loss": 0.4867, "num_input_tokens_seen": 95384112, "step": 99895 }, { "epoch": 8.149114935965413, "grad_norm": 12.770820617675781, "learning_rate": 5.039189994146049e-06, "loss": 0.4066, "num_input_tokens_seen": 95388880, "step": 99900 }, { "epoch": 8.149522799575822, "grad_norm": 4.8378801345825195, "learning_rate": 5.037047208386214e-06, "loss": 0.3575, "num_input_tokens_seen": 95393744, "step": 99905 }, { "epoch": 8.14993066318623, "grad_norm": 10.432463645935059, "learning_rate": 5.0349048272669565e-06, "loss": 0.3562, "num_input_tokens_seen": 95398256, "step": 99910 }, { "epoch": 8.15033852679664, "grad_norm": 66.06790924072266, "learning_rate": 5.032762850831693e-06, "loss": 0.4575, "num_input_tokens_seen": 95403824, "step": 99915 }, { "epoch": 8.150746390407049, "grad_norm": 11.698305130004883, "learning_rate": 5.0306212791238425e-06, "loss": 0.233, "num_input_tokens_seen": 95408784, "step": 99920 }, { "epoch": 8.151154254017456, "grad_norm": 8.052840232849121, "learning_rate": 5.028480112186806e-06, "loss": 0.4599, "num_input_tokens_seen": 95413200, "step": 99925 }, { "epoch": 8.151562117627865, "grad_norm": 1.2813138961791992, "learning_rate": 5.0263393500639996e-06, "loss": 0.2572, "num_input_tokens_seen": 95417920, "step": 99930 }, { "epoch": 8.151969981238274, "grad_norm": 9.052362442016602, "learning_rate": 5.024198992798806e-06, "loss": 0.4277, "num_input_tokens_seen": 95423696, "step": 99935 }, { "epoch": 8.152377844848683, "grad_norm": 5.488761901855469, "learning_rate": 5.022059040434612e-06, "loss": 0.5856, "num_input_tokens_seen": 95428912, "step": 99940 }, { "epoch": 8.152785708459092, "grad_norm": 5.041763782501221, "learning_rate": 5.019919493014791e-06, "loss": 0.2793, "num_input_tokens_seen": 95434416, "step": 99945 }, { "epoch": 8.1531935720695, "grad_norm": 7.5405659675598145, "learning_rate": 5.01778035058271e-06, "loss": 0.1942, "num_input_tokens_seen": 95438896, "step": 99950 }, { "epoch": 8.153601435679908, "grad_norm": 3.4646317958831787, "learning_rate": 5.015641613181732e-06, "loss": 0.3738, "num_input_tokens_seen": 95443696, "step": 99955 }, { "epoch": 8.154009299290317, "grad_norm": 36.00621795654297, "learning_rate": 5.013503280855208e-06, "loss": 0.3633, "num_input_tokens_seen": 95447328, "step": 99960 }, { "epoch": 8.154417162900726, "grad_norm": 2.4928245544433594, "learning_rate": 5.0113653536464776e-06, "loss": 0.3098, "num_input_tokens_seen": 95452416, "step": 99965 }, { "epoch": 8.154825026511135, "grad_norm": 9.883405685424805, "learning_rate": 5.009227831598873e-06, "loss": 0.4632, "num_input_tokens_seen": 95456304, "step": 99970 }, { "epoch": 8.155232890121543, "grad_norm": 14.333229064941406, "learning_rate": 5.00709071475573e-06, "loss": 0.3133, "num_input_tokens_seen": 95461424, "step": 99975 }, { "epoch": 8.155640753731952, "grad_norm": 24.267202377319336, "learning_rate": 5.004954003160364e-06, "loss": 0.2045, "num_input_tokens_seen": 95466064, "step": 99980 }, { "epoch": 8.15604861734236, "grad_norm": 2.409151554107666, "learning_rate": 5.002817696856086e-06, "loss": 0.28, "num_input_tokens_seen": 95470496, "step": 99985 }, { "epoch": 8.15645648095277, "grad_norm": 16.27009391784668, "learning_rate": 5.000681795886192e-06, "loss": 0.3326, "num_input_tokens_seen": 95474896, "step": 99990 }, { "epoch": 8.156864344563179, "grad_norm": 7.416680812835693, "learning_rate": 4.998546300293974e-06, "loss": 0.462, "num_input_tokens_seen": 95479712, "step": 99995 }, { "epoch": 8.157272208173588, "grad_norm": 4.639903545379639, "learning_rate": 4.996411210122731e-06, "loss": 0.2316, "num_input_tokens_seen": 95484368, "step": 100000 }, { "epoch": 8.157680071783995, "grad_norm": 13.636744499206543, "learning_rate": 4.994276525415734e-06, "loss": 0.3552, "num_input_tokens_seen": 95489568, "step": 100005 }, { "epoch": 8.158087935394404, "grad_norm": 2.282832384109497, "learning_rate": 4.99214224621625e-06, "loss": 0.2722, "num_input_tokens_seen": 95493744, "step": 100010 }, { "epoch": 8.158495799004813, "grad_norm": 2.2026467323303223, "learning_rate": 4.990008372567534e-06, "loss": 0.3486, "num_input_tokens_seen": 95499488, "step": 100015 }, { "epoch": 8.158903662615222, "grad_norm": 16.739376068115234, "learning_rate": 4.987874904512851e-06, "loss": 0.4081, "num_input_tokens_seen": 95504640, "step": 100020 }, { "epoch": 8.15931152622563, "grad_norm": 6.791686058044434, "learning_rate": 4.985741842095442e-06, "loss": 0.3674, "num_input_tokens_seen": 95509520, "step": 100025 }, { "epoch": 8.159719389836038, "grad_norm": 0.887308657169342, "learning_rate": 4.9836091853585395e-06, "loss": 0.3183, "num_input_tokens_seen": 95515104, "step": 100030 }, { "epoch": 8.160127253446447, "grad_norm": 8.229482650756836, "learning_rate": 4.981476934345375e-06, "loss": 0.3506, "num_input_tokens_seen": 95519584, "step": 100035 }, { "epoch": 8.160535117056856, "grad_norm": 14.087122917175293, "learning_rate": 4.979345089099157e-06, "loss": 0.5684, "num_input_tokens_seen": 95524688, "step": 100040 }, { "epoch": 8.160942980667265, "grad_norm": 11.522516250610352, "learning_rate": 4.977213649663118e-06, "loss": 0.2974, "num_input_tokens_seen": 95530400, "step": 100045 }, { "epoch": 8.161350844277674, "grad_norm": 3.5217509269714355, "learning_rate": 4.975082616080448e-06, "loss": 0.4619, "num_input_tokens_seen": 95534784, "step": 100050 }, { "epoch": 8.161758707888083, "grad_norm": 4.442265510559082, "learning_rate": 4.972951988394342e-06, "loss": 0.3856, "num_input_tokens_seen": 95539648, "step": 100055 }, { "epoch": 8.16216657149849, "grad_norm": 26.242403030395508, "learning_rate": 4.970821766647985e-06, "loss": 0.4341, "num_input_tokens_seen": 95543824, "step": 100060 }, { "epoch": 8.1625744351089, "grad_norm": 29.95796775817871, "learning_rate": 4.968691950884566e-06, "loss": 0.3284, "num_input_tokens_seen": 95548272, "step": 100065 }, { "epoch": 8.162982298719308, "grad_norm": 21.146278381347656, "learning_rate": 4.966562541147249e-06, "loss": 0.2262, "num_input_tokens_seen": 95553312, "step": 100070 }, { "epoch": 8.163390162329717, "grad_norm": 29.104408264160156, "learning_rate": 4.9644335374791965e-06, "loss": 0.2841, "num_input_tokens_seen": 95557088, "step": 100075 }, { "epoch": 8.163798025940126, "grad_norm": 2.855870246887207, "learning_rate": 4.962304939923565e-06, "loss": 0.4518, "num_input_tokens_seen": 95562304, "step": 100080 }, { "epoch": 8.164205889550534, "grad_norm": 0.7719515562057495, "learning_rate": 4.9601767485234955e-06, "loss": 0.2062, "num_input_tokens_seen": 95566688, "step": 100085 }, { "epoch": 8.164613753160943, "grad_norm": 11.37720012664795, "learning_rate": 4.958048963322126e-06, "loss": 0.3619, "num_input_tokens_seen": 95571312, "step": 100090 }, { "epoch": 8.165021616771352, "grad_norm": 2.022993564605713, "learning_rate": 4.955921584362591e-06, "loss": 0.3803, "num_input_tokens_seen": 95576400, "step": 100095 }, { "epoch": 8.16542948038176, "grad_norm": 22.72535514831543, "learning_rate": 4.953794611688007e-06, "loss": 0.3594, "num_input_tokens_seen": 95581280, "step": 100100 }, { "epoch": 8.16583734399217, "grad_norm": 2.3992958068847656, "learning_rate": 4.951668045341482e-06, "loss": 0.2985, "num_input_tokens_seen": 95585232, "step": 100105 }, { "epoch": 8.166245207602577, "grad_norm": 23.713167190551758, "learning_rate": 4.9495418853661325e-06, "loss": 0.4788, "num_input_tokens_seen": 95590560, "step": 100110 }, { "epoch": 8.166653071212986, "grad_norm": 2.4468307495117188, "learning_rate": 4.947416131805049e-06, "loss": 0.2917, "num_input_tokens_seen": 95595472, "step": 100115 }, { "epoch": 8.167060934823395, "grad_norm": 2.1137452125549316, "learning_rate": 4.94529078470132e-06, "loss": 0.3646, "num_input_tokens_seen": 95599840, "step": 100120 }, { "epoch": 8.167468798433804, "grad_norm": 12.32359504699707, "learning_rate": 4.943165844098019e-06, "loss": 0.5073, "num_input_tokens_seen": 95604448, "step": 100125 }, { "epoch": 8.167876662044213, "grad_norm": 18.548961639404297, "learning_rate": 4.94104131003823e-06, "loss": 0.3291, "num_input_tokens_seen": 95609536, "step": 100130 }, { "epoch": 8.168284525654622, "grad_norm": 11.610872268676758, "learning_rate": 4.938917182565009e-06, "loss": 0.3161, "num_input_tokens_seen": 95614848, "step": 100135 }, { "epoch": 8.16869238926503, "grad_norm": 3.4200351238250732, "learning_rate": 4.936793461721414e-06, "loss": 0.3596, "num_input_tokens_seen": 95619216, "step": 100140 }, { "epoch": 8.169100252875438, "grad_norm": 14.836660385131836, "learning_rate": 4.934670147550488e-06, "loss": 0.2698, "num_input_tokens_seen": 95623504, "step": 100145 }, { "epoch": 8.169508116485847, "grad_norm": 6.9069437980651855, "learning_rate": 4.9325472400952675e-06, "loss": 0.3353, "num_input_tokens_seen": 95628032, "step": 100150 }, { "epoch": 8.169915980096256, "grad_norm": 13.774322509765625, "learning_rate": 4.930424739398793e-06, "loss": 0.2739, "num_input_tokens_seen": 95632944, "step": 100155 }, { "epoch": 8.170323843706665, "grad_norm": 2.853886604309082, "learning_rate": 4.9283026455040795e-06, "loss": 0.2452, "num_input_tokens_seen": 95637984, "step": 100160 }, { "epoch": 8.170731707317072, "grad_norm": 58.26839828491211, "learning_rate": 4.926180958454144e-06, "loss": 0.4642, "num_input_tokens_seen": 95643232, "step": 100165 }, { "epoch": 8.171139570927481, "grad_norm": 0.7066859602928162, "learning_rate": 4.924059678291984e-06, "loss": 0.3276, "num_input_tokens_seen": 95648432, "step": 100170 }, { "epoch": 8.17154743453789, "grad_norm": 17.68460464477539, "learning_rate": 4.921938805060608e-06, "loss": 0.3209, "num_input_tokens_seen": 95654160, "step": 100175 }, { "epoch": 8.1719552981483, "grad_norm": 19.921358108520508, "learning_rate": 4.9198183388030035e-06, "loss": 0.4054, "num_input_tokens_seen": 95658512, "step": 100180 }, { "epoch": 8.172363161758708, "grad_norm": 1.5836663246154785, "learning_rate": 4.917698279562147e-06, "loss": 0.3154, "num_input_tokens_seen": 95663424, "step": 100185 }, { "epoch": 8.172771025369116, "grad_norm": 14.812543869018555, "learning_rate": 4.915578627381015e-06, "loss": 0.3663, "num_input_tokens_seen": 95667344, "step": 100190 }, { "epoch": 8.173178888979525, "grad_norm": 8.660862922668457, "learning_rate": 4.913459382302563e-06, "loss": 0.3003, "num_input_tokens_seen": 95672144, "step": 100195 }, { "epoch": 8.173586752589934, "grad_norm": 11.800005912780762, "learning_rate": 4.911340544369758e-06, "loss": 0.3074, "num_input_tokens_seen": 95676992, "step": 100200 }, { "epoch": 8.173994616200343, "grad_norm": 1.5873891115188599, "learning_rate": 4.9092221136255444e-06, "loss": 0.1593, "num_input_tokens_seen": 95682336, "step": 100205 }, { "epoch": 8.174402479810752, "grad_norm": 0.5760295987129211, "learning_rate": 4.907104090112863e-06, "loss": 0.2655, "num_input_tokens_seen": 95686768, "step": 100210 }, { "epoch": 8.17481034342116, "grad_norm": 10.398754119873047, "learning_rate": 4.904986473874643e-06, "loss": 0.4449, "num_input_tokens_seen": 95691744, "step": 100215 }, { "epoch": 8.175218207031568, "grad_norm": 10.312092781066895, "learning_rate": 4.902869264953808e-06, "loss": 0.4468, "num_input_tokens_seen": 95696096, "step": 100220 }, { "epoch": 8.175626070641977, "grad_norm": 28.43292236328125, "learning_rate": 4.900752463393274e-06, "loss": 0.4334, "num_input_tokens_seen": 95699952, "step": 100225 }, { "epoch": 8.176033934252386, "grad_norm": 30.575057983398438, "learning_rate": 4.898636069235948e-06, "loss": 0.5304, "num_input_tokens_seen": 95705008, "step": 100230 }, { "epoch": 8.176441797862795, "grad_norm": 5.1850175857543945, "learning_rate": 4.8965200825247245e-06, "loss": 0.3001, "num_input_tokens_seen": 95710064, "step": 100235 }, { "epoch": 8.176849661473204, "grad_norm": 20.927810668945312, "learning_rate": 4.894404503302491e-06, "loss": 0.2663, "num_input_tokens_seen": 95715056, "step": 100240 }, { "epoch": 8.177257525083611, "grad_norm": 17.840553283691406, "learning_rate": 4.892289331612143e-06, "loss": 0.2675, "num_input_tokens_seen": 95720432, "step": 100245 }, { "epoch": 8.17766538869402, "grad_norm": 8.066201210021973, "learning_rate": 4.890174567496544e-06, "loss": 0.3665, "num_input_tokens_seen": 95725040, "step": 100250 }, { "epoch": 8.17807325230443, "grad_norm": 10.37077522277832, "learning_rate": 4.888060210998563e-06, "loss": 0.3611, "num_input_tokens_seen": 95729456, "step": 100255 }, { "epoch": 8.178481115914838, "grad_norm": 4.88014554977417, "learning_rate": 4.885946262161045e-06, "loss": 0.2441, "num_input_tokens_seen": 95734000, "step": 100260 }, { "epoch": 8.178888979525247, "grad_norm": 3.7506871223449707, "learning_rate": 4.8838327210268596e-06, "loss": 0.451, "num_input_tokens_seen": 95738816, "step": 100265 }, { "epoch": 8.179296843135656, "grad_norm": 14.124041557312012, "learning_rate": 4.881719587638836e-06, "loss": 0.2376, "num_input_tokens_seen": 95744784, "step": 100270 }, { "epoch": 8.179704706746064, "grad_norm": 4.001627445220947, "learning_rate": 4.879606862039809e-06, "loss": 0.3074, "num_input_tokens_seen": 95749952, "step": 100275 }, { "epoch": 8.180112570356473, "grad_norm": 19.332225799560547, "learning_rate": 4.877494544272598e-06, "loss": 0.3297, "num_input_tokens_seen": 95754768, "step": 100280 }, { "epoch": 8.180520433966882, "grad_norm": 0.9327279925346375, "learning_rate": 4.875382634380018e-06, "loss": 0.4559, "num_input_tokens_seen": 95759104, "step": 100285 }, { "epoch": 8.18092829757729, "grad_norm": 2.793543577194214, "learning_rate": 4.8732711324048854e-06, "loss": 0.254, "num_input_tokens_seen": 95763808, "step": 100290 }, { "epoch": 8.1813361611877, "grad_norm": 2.003899574279785, "learning_rate": 4.871160038389996e-06, "loss": 0.3024, "num_input_tokens_seen": 95767216, "step": 100295 }, { "epoch": 8.181744024798107, "grad_norm": 47.76886749267578, "learning_rate": 4.869049352378141e-06, "loss": 0.3047, "num_input_tokens_seen": 95772672, "step": 100300 }, { "epoch": 8.182151888408516, "grad_norm": 1.0730986595153809, "learning_rate": 4.866939074412092e-06, "loss": 0.2784, "num_input_tokens_seen": 95777280, "step": 100305 }, { "epoch": 8.182559752018925, "grad_norm": 4.136938095092773, "learning_rate": 4.8648292045346405e-06, "loss": 0.5147, "num_input_tokens_seen": 95782224, "step": 100310 }, { "epoch": 8.182967615629334, "grad_norm": 12.408591270446777, "learning_rate": 4.862719742788546e-06, "loss": 0.2536, "num_input_tokens_seen": 95787072, "step": 100315 }, { "epoch": 8.183375479239743, "grad_norm": 10.533042907714844, "learning_rate": 4.860610689216563e-06, "loss": 0.3822, "num_input_tokens_seen": 95791472, "step": 100320 }, { "epoch": 8.18378334285015, "grad_norm": 2.1372954845428467, "learning_rate": 4.85850204386144e-06, "loss": 0.265, "num_input_tokens_seen": 95796528, "step": 100325 }, { "epoch": 8.184191206460559, "grad_norm": 48.86426544189453, "learning_rate": 4.856393806765927e-06, "loss": 0.2666, "num_input_tokens_seen": 95801376, "step": 100330 }, { "epoch": 8.184599070070968, "grad_norm": 34.92398452758789, "learning_rate": 4.85428597797275e-06, "loss": 0.392, "num_input_tokens_seen": 95806752, "step": 100335 }, { "epoch": 8.185006933681377, "grad_norm": 22.039134979248047, "learning_rate": 4.8521785575246394e-06, "loss": 0.3371, "num_input_tokens_seen": 95811520, "step": 100340 }, { "epoch": 8.185414797291786, "grad_norm": 32.89873123168945, "learning_rate": 4.8500715454643045e-06, "loss": 0.3882, "num_input_tokens_seen": 95816432, "step": 100345 }, { "epoch": 8.185822660902195, "grad_norm": 1.8852319717407227, "learning_rate": 4.847964941834459e-06, "loss": 0.4908, "num_input_tokens_seen": 95822208, "step": 100350 }, { "epoch": 8.186230524512602, "grad_norm": 0.6268742680549622, "learning_rate": 4.845858746677803e-06, "loss": 0.3345, "num_input_tokens_seen": 95827616, "step": 100355 }, { "epoch": 8.186638388123011, "grad_norm": 7.088033199310303, "learning_rate": 4.843752960037018e-06, "loss": 0.463, "num_input_tokens_seen": 95832624, "step": 100360 }, { "epoch": 8.18704625173342, "grad_norm": 45.67543411254883, "learning_rate": 4.8416475819548e-06, "loss": 0.2703, "num_input_tokens_seen": 95837680, "step": 100365 }, { "epoch": 8.18745411534383, "grad_norm": 5.514235019683838, "learning_rate": 4.839542612473822e-06, "loss": 0.4391, "num_input_tokens_seen": 95842240, "step": 100370 }, { "epoch": 8.187861978954238, "grad_norm": 3.468895196914673, "learning_rate": 4.83743805163675e-06, "loss": 0.4644, "num_input_tokens_seen": 95847024, "step": 100375 }, { "epoch": 8.188269842564646, "grad_norm": 58.81505584716797, "learning_rate": 4.835333899486239e-06, "loss": 0.4064, "num_input_tokens_seen": 95851776, "step": 100380 }, { "epoch": 8.188677706175055, "grad_norm": 32.96146011352539, "learning_rate": 4.8332301560649415e-06, "loss": 0.4786, "num_input_tokens_seen": 95856560, "step": 100385 }, { "epoch": 8.189085569785464, "grad_norm": 1.288061499595642, "learning_rate": 4.831126821415499e-06, "loss": 0.2141, "num_input_tokens_seen": 95861360, "step": 100390 }, { "epoch": 8.189493433395873, "grad_norm": 30.08710289001465, "learning_rate": 4.829023895580539e-06, "loss": 0.3659, "num_input_tokens_seen": 95866240, "step": 100395 }, { "epoch": 8.189901297006282, "grad_norm": 1.1103742122650146, "learning_rate": 4.8269213786027e-06, "loss": 0.2675, "num_input_tokens_seen": 95870848, "step": 100400 }, { "epoch": 8.190309160616689, "grad_norm": 18.87725830078125, "learning_rate": 4.824819270524594e-06, "loss": 0.3286, "num_input_tokens_seen": 95875872, "step": 100405 }, { "epoch": 8.190717024227098, "grad_norm": 0.9203776717185974, "learning_rate": 4.822717571388827e-06, "loss": 0.2907, "num_input_tokens_seen": 95880752, "step": 100410 }, { "epoch": 8.191124887837507, "grad_norm": 12.93315601348877, "learning_rate": 4.820616281237994e-06, "loss": 0.2475, "num_input_tokens_seen": 95885776, "step": 100415 }, { "epoch": 8.191532751447916, "grad_norm": 1.0536497831344604, "learning_rate": 4.818515400114704e-06, "loss": 0.4091, "num_input_tokens_seen": 95891024, "step": 100420 }, { "epoch": 8.191940615058325, "grad_norm": 1.0622942447662354, "learning_rate": 4.816414928061527e-06, "loss": 0.4249, "num_input_tokens_seen": 95895904, "step": 100425 }, { "epoch": 8.192348478668734, "grad_norm": 4.561130523681641, "learning_rate": 4.814314865121044e-06, "loss": 0.46, "num_input_tokens_seen": 95900576, "step": 100430 }, { "epoch": 8.192756342279141, "grad_norm": 52.241127014160156, "learning_rate": 4.8122152113358245e-06, "loss": 0.4213, "num_input_tokens_seen": 95904832, "step": 100435 }, { "epoch": 8.19316420588955, "grad_norm": 6.798852443695068, "learning_rate": 4.810115966748416e-06, "loss": 0.3337, "num_input_tokens_seen": 95909920, "step": 100440 }, { "epoch": 8.19357206949996, "grad_norm": 0.9935775399208069, "learning_rate": 4.808017131401383e-06, "loss": 0.3011, "num_input_tokens_seen": 95914080, "step": 100445 }, { "epoch": 8.193979933110368, "grad_norm": 16.89394187927246, "learning_rate": 4.805918705337262e-06, "loss": 0.2454, "num_input_tokens_seen": 95918576, "step": 100450 }, { "epoch": 8.194387796720777, "grad_norm": 9.221953392028809, "learning_rate": 4.80382068859859e-06, "loss": 0.3499, "num_input_tokens_seen": 95923296, "step": 100455 }, { "epoch": 8.194795660331184, "grad_norm": 34.558319091796875, "learning_rate": 4.801723081227883e-06, "loss": 0.3222, "num_input_tokens_seen": 95927920, "step": 100460 }, { "epoch": 8.195203523941593, "grad_norm": 17.38916778564453, "learning_rate": 4.799625883267672e-06, "loss": 0.4641, "num_input_tokens_seen": 95933312, "step": 100465 }, { "epoch": 8.195611387552002, "grad_norm": 47.6989860534668, "learning_rate": 4.797529094760464e-06, "loss": 0.2821, "num_input_tokens_seen": 95937584, "step": 100470 }, { "epoch": 8.196019251162411, "grad_norm": 17.39530372619629, "learning_rate": 4.795432715748754e-06, "loss": 0.2982, "num_input_tokens_seen": 95942464, "step": 100475 }, { "epoch": 8.19642711477282, "grad_norm": 33.884010314941406, "learning_rate": 4.793336746275037e-06, "loss": 0.3177, "num_input_tokens_seen": 95947600, "step": 100480 }, { "epoch": 8.19683497838323, "grad_norm": 31.321989059448242, "learning_rate": 4.7912411863817926e-06, "loss": 0.3099, "num_input_tokens_seen": 95952016, "step": 100485 }, { "epoch": 8.197242841993637, "grad_norm": 22.562524795532227, "learning_rate": 4.789146036111508e-06, "loss": 0.4572, "num_input_tokens_seen": 95956992, "step": 100490 }, { "epoch": 8.197650705604046, "grad_norm": 1.9387356042861938, "learning_rate": 4.787051295506648e-06, "loss": 0.3967, "num_input_tokens_seen": 95962128, "step": 100495 }, { "epoch": 8.198058569214455, "grad_norm": 1.498769998550415, "learning_rate": 4.784956964609666e-06, "loss": 0.2188, "num_input_tokens_seen": 95967248, "step": 100500 }, { "epoch": 8.198466432824864, "grad_norm": 12.406081199645996, "learning_rate": 4.782863043463018e-06, "loss": 0.3795, "num_input_tokens_seen": 95971520, "step": 100505 }, { "epoch": 8.198874296435273, "grad_norm": 0.45121344923973083, "learning_rate": 4.7807695321091445e-06, "loss": 0.4537, "num_input_tokens_seen": 95976704, "step": 100510 }, { "epoch": 8.19928216004568, "grad_norm": 1.9270291328430176, "learning_rate": 4.7786764305904785e-06, "loss": 0.3639, "num_input_tokens_seen": 95981632, "step": 100515 }, { "epoch": 8.199690023656089, "grad_norm": 9.952432632446289, "learning_rate": 4.776583738949453e-06, "loss": 0.3178, "num_input_tokens_seen": 95986512, "step": 100520 }, { "epoch": 8.200097887266498, "grad_norm": 0.7071719169616699, "learning_rate": 4.774491457228472e-06, "loss": 0.3194, "num_input_tokens_seen": 95992160, "step": 100525 }, { "epoch": 8.200505750876907, "grad_norm": 17.17484474182129, "learning_rate": 4.7723995854699634e-06, "loss": 0.3019, "num_input_tokens_seen": 95996400, "step": 100530 }, { "epoch": 8.200913614487316, "grad_norm": 14.240211486816406, "learning_rate": 4.770308123716319e-06, "loss": 0.2427, "num_input_tokens_seen": 96001232, "step": 100535 }, { "epoch": 8.201321478097723, "grad_norm": 2.776779890060425, "learning_rate": 4.768217072009934e-06, "loss": 0.4966, "num_input_tokens_seen": 96005760, "step": 100540 }, { "epoch": 8.201729341708132, "grad_norm": 1.1418274641036987, "learning_rate": 4.766126430393189e-06, "loss": 0.362, "num_input_tokens_seen": 96010608, "step": 100545 }, { "epoch": 8.202137205318541, "grad_norm": 23.438678741455078, "learning_rate": 4.764036198908461e-06, "loss": 0.2999, "num_input_tokens_seen": 96015616, "step": 100550 }, { "epoch": 8.20254506892895, "grad_norm": 11.2959566116333, "learning_rate": 4.761946377598123e-06, "loss": 0.2442, "num_input_tokens_seen": 96019856, "step": 100555 }, { "epoch": 8.20295293253936, "grad_norm": 2.0392160415649414, "learning_rate": 4.759856966504536e-06, "loss": 0.3035, "num_input_tokens_seen": 96024288, "step": 100560 }, { "epoch": 8.203360796149768, "grad_norm": 7.02086067199707, "learning_rate": 4.7577679656700456e-06, "loss": 0.3611, "num_input_tokens_seen": 96029280, "step": 100565 }, { "epoch": 8.203768659760176, "grad_norm": 8.92320728302002, "learning_rate": 4.75567937513699e-06, "loss": 0.3918, "num_input_tokens_seen": 96033856, "step": 100570 }, { "epoch": 8.204176523370585, "grad_norm": 43.81953430175781, "learning_rate": 4.753591194947718e-06, "loss": 0.4387, "num_input_tokens_seen": 96038304, "step": 100575 }, { "epoch": 8.204584386980994, "grad_norm": 32.160804748535156, "learning_rate": 4.751503425144549e-06, "loss": 0.3708, "num_input_tokens_seen": 96042864, "step": 100580 }, { "epoch": 8.204992250591403, "grad_norm": 36.749942779541016, "learning_rate": 4.749416065769805e-06, "loss": 0.3085, "num_input_tokens_seen": 96046048, "step": 100585 }, { "epoch": 8.205400114201812, "grad_norm": 11.094740867614746, "learning_rate": 4.747329116865787e-06, "loss": 0.3247, "num_input_tokens_seen": 96050496, "step": 100590 }, { "epoch": 8.205807977812219, "grad_norm": 0.3445422053337097, "learning_rate": 4.745242578474798e-06, "loss": 0.3577, "num_input_tokens_seen": 96054944, "step": 100595 }, { "epoch": 8.206215841422628, "grad_norm": 14.41032600402832, "learning_rate": 4.743156450639141e-06, "loss": 0.3969, "num_input_tokens_seen": 96059552, "step": 100600 }, { "epoch": 8.206623705033037, "grad_norm": 7.5498433113098145, "learning_rate": 4.741070733401095e-06, "loss": 0.3491, "num_input_tokens_seen": 96064496, "step": 100605 }, { "epoch": 8.207031568643446, "grad_norm": 1.6643027067184448, "learning_rate": 4.738985426802939e-06, "loss": 0.3868, "num_input_tokens_seen": 96068816, "step": 100610 }, { "epoch": 8.207439432253855, "grad_norm": 6.497992992401123, "learning_rate": 4.736900530886931e-06, "loss": 0.2812, "num_input_tokens_seen": 96073696, "step": 100615 }, { "epoch": 8.207847295864262, "grad_norm": 0.551162600517273, "learning_rate": 4.734816045695345e-06, "loss": 0.2154, "num_input_tokens_seen": 96078592, "step": 100620 }, { "epoch": 8.208255159474671, "grad_norm": 1.5832101106643677, "learning_rate": 4.732731971270427e-06, "loss": 0.3559, "num_input_tokens_seen": 96083824, "step": 100625 }, { "epoch": 8.20866302308508, "grad_norm": 0.7142858505249023, "learning_rate": 4.73064830765442e-06, "loss": 0.3645, "num_input_tokens_seen": 96088368, "step": 100630 }, { "epoch": 8.20907088669549, "grad_norm": 1.9471386671066284, "learning_rate": 4.728565054889559e-06, "loss": 0.3085, "num_input_tokens_seen": 96093840, "step": 100635 }, { "epoch": 8.209478750305898, "grad_norm": 20.98107147216797, "learning_rate": 4.726482213018068e-06, "loss": 0.4174, "num_input_tokens_seen": 96098384, "step": 100640 }, { "epoch": 8.209886613916307, "grad_norm": 19.16914939880371, "learning_rate": 4.72439978208217e-06, "loss": 0.3282, "num_input_tokens_seen": 96103552, "step": 100645 }, { "epoch": 8.210294477526714, "grad_norm": 23.115537643432617, "learning_rate": 4.722317762124073e-06, "loss": 0.4044, "num_input_tokens_seen": 96108384, "step": 100650 }, { "epoch": 8.210702341137123, "grad_norm": 46.23080062866211, "learning_rate": 4.720236153185978e-06, "loss": 0.2806, "num_input_tokens_seen": 96113264, "step": 100655 }, { "epoch": 8.211110204747532, "grad_norm": 58.144805908203125, "learning_rate": 4.718154955310073e-06, "loss": 0.4826, "num_input_tokens_seen": 96117376, "step": 100660 }, { "epoch": 8.211518068357941, "grad_norm": 5.859470367431641, "learning_rate": 4.716074168538554e-06, "loss": 0.3078, "num_input_tokens_seen": 96122464, "step": 100665 }, { "epoch": 8.21192593196835, "grad_norm": 3.3023767471313477, "learning_rate": 4.713993792913593e-06, "loss": 0.3886, "num_input_tokens_seen": 96128368, "step": 100670 }, { "epoch": 8.212333795578758, "grad_norm": 2.1826870441436768, "learning_rate": 4.711913828477358e-06, "loss": 0.3028, "num_input_tokens_seen": 96133296, "step": 100675 }, { "epoch": 8.212741659189167, "grad_norm": 24.47062873840332, "learning_rate": 4.709834275272002e-06, "loss": 0.3941, "num_input_tokens_seen": 96137712, "step": 100680 }, { "epoch": 8.213149522799576, "grad_norm": 50.83053970336914, "learning_rate": 4.707755133339692e-06, "loss": 0.4729, "num_input_tokens_seen": 96141952, "step": 100685 }, { "epoch": 8.213557386409985, "grad_norm": 24.215036392211914, "learning_rate": 4.705676402722558e-06, "loss": 0.2786, "num_input_tokens_seen": 96146896, "step": 100690 }, { "epoch": 8.213965250020394, "grad_norm": 1.4927958250045776, "learning_rate": 4.703598083462743e-06, "loss": 0.3965, "num_input_tokens_seen": 96151744, "step": 100695 }, { "epoch": 8.214373113630803, "grad_norm": 1.2489880323410034, "learning_rate": 4.70152017560237e-06, "loss": 0.2812, "num_input_tokens_seen": 96157360, "step": 100700 }, { "epoch": 8.21478097724121, "grad_norm": 3.8000240325927734, "learning_rate": 4.699442679183552e-06, "loss": 0.4442, "num_input_tokens_seen": 96162080, "step": 100705 }, { "epoch": 8.215188840851619, "grad_norm": 1.6071341037750244, "learning_rate": 4.69736559424841e-06, "loss": 0.2544, "num_input_tokens_seen": 96166928, "step": 100710 }, { "epoch": 8.215596704462028, "grad_norm": 50.233848571777344, "learning_rate": 4.695288920839041e-06, "loss": 0.4464, "num_input_tokens_seen": 96172272, "step": 100715 }, { "epoch": 8.216004568072437, "grad_norm": 30.47551727294922, "learning_rate": 4.6932126589975365e-06, "loss": 0.3297, "num_input_tokens_seen": 96176304, "step": 100720 }, { "epoch": 8.216412431682846, "grad_norm": 3.9973015785217285, "learning_rate": 4.691136808765978e-06, "loss": 0.3917, "num_input_tokens_seen": 96181648, "step": 100725 }, { "epoch": 8.216820295293253, "grad_norm": 4.531330108642578, "learning_rate": 4.689061370186451e-06, "loss": 0.3465, "num_input_tokens_seen": 96186784, "step": 100730 }, { "epoch": 8.217228158903662, "grad_norm": 4.033801078796387, "learning_rate": 4.68698634330102e-06, "loss": 0.2813, "num_input_tokens_seen": 96190992, "step": 100735 }, { "epoch": 8.217636022514071, "grad_norm": 3.6564948558807373, "learning_rate": 4.684911728151744e-06, "loss": 0.2861, "num_input_tokens_seen": 96195504, "step": 100740 }, { "epoch": 8.21804388612448, "grad_norm": 6.656063556671143, "learning_rate": 4.682837524780675e-06, "loss": 0.298, "num_input_tokens_seen": 96200384, "step": 100745 }, { "epoch": 8.21845174973489, "grad_norm": 16.85687828063965, "learning_rate": 4.680763733229851e-06, "loss": 0.3786, "num_input_tokens_seen": 96205296, "step": 100750 }, { "epoch": 8.218859613345296, "grad_norm": 10.499091148376465, "learning_rate": 4.678690353541318e-06, "loss": 0.293, "num_input_tokens_seen": 96210592, "step": 100755 }, { "epoch": 8.219267476955705, "grad_norm": 0.947932243347168, "learning_rate": 4.676617385757095e-06, "loss": 0.3985, "num_input_tokens_seen": 96215248, "step": 100760 }, { "epoch": 8.219675340566114, "grad_norm": 0.8985366821289062, "learning_rate": 4.674544829919203e-06, "loss": 0.292, "num_input_tokens_seen": 96220480, "step": 100765 }, { "epoch": 8.220083204176523, "grad_norm": 3.1116063594818115, "learning_rate": 4.672472686069648e-06, "loss": 0.2374, "num_input_tokens_seen": 96225296, "step": 100770 }, { "epoch": 8.220491067786933, "grad_norm": 1.5187219381332397, "learning_rate": 4.670400954250436e-06, "loss": 0.2908, "num_input_tokens_seen": 96229056, "step": 100775 }, { "epoch": 8.220898931397342, "grad_norm": 0.33149096369743347, "learning_rate": 4.66832963450356e-06, "loss": 0.3542, "num_input_tokens_seen": 96234608, "step": 100780 }, { "epoch": 8.221306795007749, "grad_norm": 0.4761655628681183, "learning_rate": 4.666258726871001e-06, "loss": 0.3491, "num_input_tokens_seen": 96239152, "step": 100785 }, { "epoch": 8.221714658618158, "grad_norm": 2.4298646450042725, "learning_rate": 4.664188231394736e-06, "loss": 0.4214, "num_input_tokens_seen": 96242752, "step": 100790 }, { "epoch": 8.222122522228567, "grad_norm": 30.824020385742188, "learning_rate": 4.662118148116729e-06, "loss": 0.5311, "num_input_tokens_seen": 96247904, "step": 100795 }, { "epoch": 8.222530385838976, "grad_norm": 1.2330234050750732, "learning_rate": 4.660048477078952e-06, "loss": 0.3356, "num_input_tokens_seen": 96252688, "step": 100800 }, { "epoch": 8.222938249449385, "grad_norm": 5.581427097320557, "learning_rate": 4.65797921832335e-06, "loss": 0.3968, "num_input_tokens_seen": 96258208, "step": 100805 }, { "epoch": 8.223346113059792, "grad_norm": 2.1435744762420654, "learning_rate": 4.655910371891864e-06, "loss": 0.4078, "num_input_tokens_seen": 96262496, "step": 100810 }, { "epoch": 8.223753976670201, "grad_norm": 8.398019790649414, "learning_rate": 4.653841937826422e-06, "loss": 0.2954, "num_input_tokens_seen": 96268048, "step": 100815 }, { "epoch": 8.22416184028061, "grad_norm": 36.84529113769531, "learning_rate": 4.651773916168967e-06, "loss": 0.2998, "num_input_tokens_seen": 96272976, "step": 100820 }, { "epoch": 8.224569703891019, "grad_norm": 2.937814950942993, "learning_rate": 4.64970630696141e-06, "loss": 0.3325, "num_input_tokens_seen": 96277632, "step": 100825 }, { "epoch": 8.224977567501428, "grad_norm": 11.512587547302246, "learning_rate": 4.647639110245655e-06, "loss": 0.2554, "num_input_tokens_seen": 96283136, "step": 100830 }, { "epoch": 8.225385431111837, "grad_norm": 12.923799514770508, "learning_rate": 4.64557232606361e-06, "loss": 0.3862, "num_input_tokens_seen": 96287776, "step": 100835 }, { "epoch": 8.225793294722244, "grad_norm": 35.0845832824707, "learning_rate": 4.643505954457155e-06, "loss": 0.3481, "num_input_tokens_seen": 96293328, "step": 100840 }, { "epoch": 8.226201158332653, "grad_norm": 11.873587608337402, "learning_rate": 4.6414399954681946e-06, "loss": 0.3383, "num_input_tokens_seen": 96298560, "step": 100845 }, { "epoch": 8.226609021943062, "grad_norm": 35.832855224609375, "learning_rate": 4.639374449138592e-06, "loss": 0.3365, "num_input_tokens_seen": 96303616, "step": 100850 }, { "epoch": 8.227016885553471, "grad_norm": 4.490286827087402, "learning_rate": 4.637309315510218e-06, "loss": 0.3389, "num_input_tokens_seen": 96308608, "step": 100855 }, { "epoch": 8.22742474916388, "grad_norm": 1.886103868484497, "learning_rate": 4.635244594624926e-06, "loss": 0.3145, "num_input_tokens_seen": 96313104, "step": 100860 }, { "epoch": 8.227832612774288, "grad_norm": 13.737648010253906, "learning_rate": 4.63318028652458e-06, "loss": 0.2736, "num_input_tokens_seen": 96318048, "step": 100865 }, { "epoch": 8.228240476384697, "grad_norm": 1.1030704975128174, "learning_rate": 4.631116391251014e-06, "loss": 0.3019, "num_input_tokens_seen": 96323376, "step": 100870 }, { "epoch": 8.228648339995106, "grad_norm": 9.422518730163574, "learning_rate": 4.629052908846062e-06, "loss": 0.358, "num_input_tokens_seen": 96327744, "step": 100875 }, { "epoch": 8.229056203605515, "grad_norm": 21.371416091918945, "learning_rate": 4.6269898393515475e-06, "loss": 0.3368, "num_input_tokens_seen": 96332656, "step": 100880 }, { "epoch": 8.229464067215924, "grad_norm": 43.81484603881836, "learning_rate": 4.624927182809297e-06, "loss": 0.4665, "num_input_tokens_seen": 96336960, "step": 100885 }, { "epoch": 8.22987193082633, "grad_norm": 3.212282180786133, "learning_rate": 4.622864939261115e-06, "loss": 0.3336, "num_input_tokens_seen": 96341568, "step": 100890 }, { "epoch": 8.23027979443674, "grad_norm": 2.236640214920044, "learning_rate": 4.6208031087488006e-06, "loss": 0.3805, "num_input_tokens_seen": 96346512, "step": 100895 }, { "epoch": 8.230687658047149, "grad_norm": 3.464733600616455, "learning_rate": 4.618741691314149e-06, "loss": 0.5816, "num_input_tokens_seen": 96351616, "step": 100900 }, { "epoch": 8.231095521657558, "grad_norm": 28.684816360473633, "learning_rate": 4.616680686998942e-06, "loss": 0.2525, "num_input_tokens_seen": 96356496, "step": 100905 }, { "epoch": 8.231503385267967, "grad_norm": 19.030038833618164, "learning_rate": 4.614620095844957e-06, "loss": 0.2312, "num_input_tokens_seen": 96360992, "step": 100910 }, { "epoch": 8.231911248878376, "grad_norm": 0.8515158295631409, "learning_rate": 4.612559917893958e-06, "loss": 0.3167, "num_input_tokens_seen": 96365584, "step": 100915 }, { "epoch": 8.232319112488783, "grad_norm": 3.9932918548583984, "learning_rate": 4.610500153187708e-06, "loss": 0.3083, "num_input_tokens_seen": 96370944, "step": 100920 }, { "epoch": 8.232726976099192, "grad_norm": 36.00257110595703, "learning_rate": 4.60844080176795e-06, "loss": 0.2967, "num_input_tokens_seen": 96375920, "step": 100925 }, { "epoch": 8.233134839709601, "grad_norm": 4.317503929138184, "learning_rate": 4.6063818636764376e-06, "loss": 0.4074, "num_input_tokens_seen": 96379344, "step": 100930 }, { "epoch": 8.23354270332001, "grad_norm": 1.7475314140319824, "learning_rate": 4.6043233389549e-06, "loss": 0.411, "num_input_tokens_seen": 96384720, "step": 100935 }, { "epoch": 8.23395056693042, "grad_norm": 3.430450677871704, "learning_rate": 4.6022652276450615e-06, "loss": 0.3675, "num_input_tokens_seen": 96389056, "step": 100940 }, { "epoch": 8.234358430540826, "grad_norm": 6.477347373962402, "learning_rate": 4.600207529788639e-06, "loss": 0.4528, "num_input_tokens_seen": 96394272, "step": 100945 }, { "epoch": 8.234766294151235, "grad_norm": 3.7366855144500732, "learning_rate": 4.598150245427335e-06, "loss": 0.2744, "num_input_tokens_seen": 96399584, "step": 100950 }, { "epoch": 8.235174157761644, "grad_norm": 1.2906774282455444, "learning_rate": 4.596093374602864e-06, "loss": 0.3822, "num_input_tokens_seen": 96404672, "step": 100955 }, { "epoch": 8.235582021372053, "grad_norm": 10.232562065124512, "learning_rate": 4.594036917356909e-06, "loss": 0.34, "num_input_tokens_seen": 96410336, "step": 100960 }, { "epoch": 8.235989884982462, "grad_norm": 2.228975772857666, "learning_rate": 4.591980873731153e-06, "loss": 0.3259, "num_input_tokens_seen": 96415584, "step": 100965 }, { "epoch": 8.236397748592871, "grad_norm": 28.44925308227539, "learning_rate": 4.589925243767268e-06, "loss": 0.3087, "num_input_tokens_seen": 96420384, "step": 100970 }, { "epoch": 8.236805612203279, "grad_norm": 14.112955093383789, "learning_rate": 4.587870027506933e-06, "loss": 0.3656, "num_input_tokens_seen": 96424912, "step": 100975 }, { "epoch": 8.237213475813688, "grad_norm": 9.912013053894043, "learning_rate": 4.585815224991796e-06, "loss": 0.4155, "num_input_tokens_seen": 96430096, "step": 100980 }, { "epoch": 8.237621339424097, "grad_norm": 39.205726623535156, "learning_rate": 4.583760836263512e-06, "loss": 0.3611, "num_input_tokens_seen": 96434240, "step": 100985 }, { "epoch": 8.238029203034506, "grad_norm": 26.877578735351562, "learning_rate": 4.581706861363722e-06, "loss": 0.488, "num_input_tokens_seen": 96439920, "step": 100990 }, { "epoch": 8.238437066644915, "grad_norm": 1.859645128250122, "learning_rate": 4.579653300334047e-06, "loss": 0.2775, "num_input_tokens_seen": 96444880, "step": 100995 }, { "epoch": 8.238844930255322, "grad_norm": 20.18914222717285, "learning_rate": 4.577600153216133e-06, "loss": 0.5088, "num_input_tokens_seen": 96450368, "step": 101000 }, { "epoch": 8.239252793865731, "grad_norm": 2.36868953704834, "learning_rate": 4.575547420051582e-06, "loss": 0.2452, "num_input_tokens_seen": 96454352, "step": 101005 }, { "epoch": 8.23966065747614, "grad_norm": 13.828394889831543, "learning_rate": 4.5734951008820086e-06, "loss": 0.3788, "num_input_tokens_seen": 96459392, "step": 101010 }, { "epoch": 8.240068521086549, "grad_norm": 11.410258293151855, "learning_rate": 4.571443195749003e-06, "loss": 0.3006, "num_input_tokens_seen": 96464720, "step": 101015 }, { "epoch": 8.240476384696958, "grad_norm": 30.50951385498047, "learning_rate": 4.56939170469417e-06, "loss": 0.3068, "num_input_tokens_seen": 96468800, "step": 101020 }, { "epoch": 8.240884248307365, "grad_norm": 16.374956130981445, "learning_rate": 4.567340627759084e-06, "loss": 0.2895, "num_input_tokens_seen": 96474320, "step": 101025 }, { "epoch": 8.241292111917774, "grad_norm": 27.95671844482422, "learning_rate": 4.565289964985323e-06, "loss": 0.2669, "num_input_tokens_seen": 96479856, "step": 101030 }, { "epoch": 8.241699975528183, "grad_norm": 38.26142120361328, "learning_rate": 4.56323971641445e-06, "loss": 0.4464, "num_input_tokens_seen": 96484752, "step": 101035 }, { "epoch": 8.242107839138592, "grad_norm": 27.16118049621582, "learning_rate": 4.561189882088024e-06, "loss": 0.3357, "num_input_tokens_seen": 96490080, "step": 101040 }, { "epoch": 8.242515702749001, "grad_norm": 35.772247314453125, "learning_rate": 4.559140462047595e-06, "loss": 0.4455, "num_input_tokens_seen": 96494192, "step": 101045 }, { "epoch": 8.24292356635941, "grad_norm": 20.738035202026367, "learning_rate": 4.557091456334703e-06, "loss": 0.277, "num_input_tokens_seen": 96498432, "step": 101050 }, { "epoch": 8.243331429969817, "grad_norm": 65.24891662597656, "learning_rate": 4.555042864990872e-06, "loss": 0.2436, "num_input_tokens_seen": 96503264, "step": 101055 }, { "epoch": 8.243739293580227, "grad_norm": 19.965665817260742, "learning_rate": 4.552994688057644e-06, "loss": 0.328, "num_input_tokens_seen": 96508096, "step": 101060 }, { "epoch": 8.244147157190636, "grad_norm": 10.917240142822266, "learning_rate": 4.550946925576522e-06, "loss": 0.2879, "num_input_tokens_seen": 96512912, "step": 101065 }, { "epoch": 8.244555020801045, "grad_norm": 48.847496032714844, "learning_rate": 4.548899577589019e-06, "loss": 0.3564, "num_input_tokens_seen": 96517168, "step": 101070 }, { "epoch": 8.244962884411454, "grad_norm": 5.13309383392334, "learning_rate": 4.546852644136629e-06, "loss": 0.2566, "num_input_tokens_seen": 96521536, "step": 101075 }, { "epoch": 8.24537074802186, "grad_norm": 5.419529914855957, "learning_rate": 4.544806125260839e-06, "loss": 0.4436, "num_input_tokens_seen": 96525984, "step": 101080 }, { "epoch": 8.24577861163227, "grad_norm": 28.435903549194336, "learning_rate": 4.5427600210031455e-06, "loss": 0.3741, "num_input_tokens_seen": 96529920, "step": 101085 }, { "epoch": 8.246186475242679, "grad_norm": 2.6580684185028076, "learning_rate": 4.540714331405013e-06, "loss": 0.374, "num_input_tokens_seen": 96534576, "step": 101090 }, { "epoch": 8.246594338853088, "grad_norm": 1.1743769645690918, "learning_rate": 4.5386690565079045e-06, "loss": 0.3051, "num_input_tokens_seen": 96539424, "step": 101095 }, { "epoch": 8.247002202463497, "grad_norm": 43.545589447021484, "learning_rate": 4.536624196353284e-06, "loss": 0.3093, "num_input_tokens_seen": 96544016, "step": 101100 }, { "epoch": 8.247410066073904, "grad_norm": 25.970748901367188, "learning_rate": 4.534579750982587e-06, "loss": 0.2701, "num_input_tokens_seen": 96548688, "step": 101105 }, { "epoch": 8.247817929684313, "grad_norm": 7.992231369018555, "learning_rate": 4.532535720437267e-06, "loss": 0.4816, "num_input_tokens_seen": 96553936, "step": 101110 }, { "epoch": 8.248225793294722, "grad_norm": 1.9986631870269775, "learning_rate": 4.530492104758752e-06, "loss": 0.3303, "num_input_tokens_seen": 96559440, "step": 101115 }, { "epoch": 8.248633656905131, "grad_norm": 25.667587280273438, "learning_rate": 4.5284489039884635e-06, "loss": 0.3576, "num_input_tokens_seen": 96564096, "step": 101120 }, { "epoch": 8.24904152051554, "grad_norm": 2.3468563556671143, "learning_rate": 4.5264061181678116e-06, "loss": 0.1981, "num_input_tokens_seen": 96569136, "step": 101125 }, { "epoch": 8.249449384125949, "grad_norm": 45.51004409790039, "learning_rate": 4.5243637473382145e-06, "loss": 0.3633, "num_input_tokens_seen": 96574080, "step": 101130 }, { "epoch": 8.249857247736356, "grad_norm": 47.874603271484375, "learning_rate": 4.5223217915410625e-06, "loss": 0.1762, "num_input_tokens_seen": 96578960, "step": 101135 }, { "epoch": 8.250265111346765, "grad_norm": 7.742948055267334, "learning_rate": 4.520280250817749e-06, "loss": 0.2602, "num_input_tokens_seen": 96583472, "step": 101140 }, { "epoch": 8.250672974957174, "grad_norm": 21.576797485351562, "learning_rate": 4.51823912520965e-06, "loss": 0.5089, "num_input_tokens_seen": 96589104, "step": 101145 }, { "epoch": 8.251080838567583, "grad_norm": 14.779906272888184, "learning_rate": 4.516198414758133e-06, "loss": 0.3416, "num_input_tokens_seen": 96593696, "step": 101150 }, { "epoch": 8.251488702177992, "grad_norm": 2.815101146697998, "learning_rate": 4.514158119504578e-06, "loss": 0.3059, "num_input_tokens_seen": 96599008, "step": 101155 }, { "epoch": 8.2518965657884, "grad_norm": 57.50611877441406, "learning_rate": 4.512118239490335e-06, "loss": 0.4059, "num_input_tokens_seen": 96602976, "step": 101160 }, { "epoch": 8.252304429398809, "grad_norm": 37.04460906982422, "learning_rate": 4.510078774756746e-06, "loss": 0.326, "num_input_tokens_seen": 96607440, "step": 101165 }, { "epoch": 8.252712293009218, "grad_norm": 2.2874720096588135, "learning_rate": 4.5080397253451565e-06, "loss": 0.1582, "num_input_tokens_seen": 96611840, "step": 101170 }, { "epoch": 8.253120156619627, "grad_norm": 4.136963367462158, "learning_rate": 4.5060010912968854e-06, "loss": 0.3613, "num_input_tokens_seen": 96616640, "step": 101175 }, { "epoch": 8.253528020230036, "grad_norm": 6.231744766235352, "learning_rate": 4.5039628726532695e-06, "loss": 0.3184, "num_input_tokens_seen": 96621024, "step": 101180 }, { "epoch": 8.253935883840445, "grad_norm": 3.0744497776031494, "learning_rate": 4.50192506945562e-06, "loss": 0.5607, "num_input_tokens_seen": 96625648, "step": 101185 }, { "epoch": 8.254343747450852, "grad_norm": 2.9106059074401855, "learning_rate": 4.499887681745238e-06, "loss": 0.3912, "num_input_tokens_seen": 96630464, "step": 101190 }, { "epoch": 8.25475161106126, "grad_norm": 6.9155592918396, "learning_rate": 4.49785070956342e-06, "loss": 0.5572, "num_input_tokens_seen": 96636112, "step": 101195 }, { "epoch": 8.25515947467167, "grad_norm": 26.762317657470703, "learning_rate": 4.4958141529514584e-06, "loss": 0.3311, "num_input_tokens_seen": 96641008, "step": 101200 }, { "epoch": 8.255567338282079, "grad_norm": 2.649545192718506, "learning_rate": 4.493778011950631e-06, "loss": 0.368, "num_input_tokens_seen": 96646560, "step": 101205 }, { "epoch": 8.255975201892488, "grad_norm": 6.044105052947998, "learning_rate": 4.491742286602207e-06, "loss": 0.4582, "num_input_tokens_seen": 96650864, "step": 101210 }, { "epoch": 8.256383065502895, "grad_norm": 22.432558059692383, "learning_rate": 4.48970697694745e-06, "loss": 0.3353, "num_input_tokens_seen": 96655136, "step": 101215 }, { "epoch": 8.256790929113304, "grad_norm": 0.7225881218910217, "learning_rate": 4.487672083027619e-06, "loss": 0.3319, "num_input_tokens_seen": 96659536, "step": 101220 }, { "epoch": 8.257198792723713, "grad_norm": 1.3513708114624023, "learning_rate": 4.485637604883963e-06, "loss": 0.289, "num_input_tokens_seen": 96664336, "step": 101225 }, { "epoch": 8.257606656334122, "grad_norm": 1.349252700805664, "learning_rate": 4.483603542557713e-06, "loss": 0.3531, "num_input_tokens_seen": 96669488, "step": 101230 }, { "epoch": 8.258014519944531, "grad_norm": 16.839130401611328, "learning_rate": 4.481569896090096e-06, "loss": 0.3453, "num_input_tokens_seen": 96675136, "step": 101235 }, { "epoch": 8.258422383554938, "grad_norm": 0.5749370455741882, "learning_rate": 4.479536665522344e-06, "loss": 0.2338, "num_input_tokens_seen": 96680464, "step": 101240 }, { "epoch": 8.258830247165347, "grad_norm": 0.3289729356765747, "learning_rate": 4.477503850895665e-06, "loss": 0.3497, "num_input_tokens_seen": 96684720, "step": 101245 }, { "epoch": 8.259238110775756, "grad_norm": 3.103203058242798, "learning_rate": 4.475471452251262e-06, "loss": 0.226, "num_input_tokens_seen": 96689968, "step": 101250 }, { "epoch": 8.259645974386165, "grad_norm": 8.842883110046387, "learning_rate": 4.473439469630331e-06, "loss": 0.3772, "num_input_tokens_seen": 96695056, "step": 101255 }, { "epoch": 8.260053837996574, "grad_norm": 16.20804786682129, "learning_rate": 4.471407903074054e-06, "loss": 0.2618, "num_input_tokens_seen": 96699056, "step": 101260 }, { "epoch": 8.260461701606983, "grad_norm": 26.87622833251953, "learning_rate": 4.4693767526236215e-06, "loss": 0.3703, "num_input_tokens_seen": 96703888, "step": 101265 }, { "epoch": 8.26086956521739, "grad_norm": 16.11612319946289, "learning_rate": 4.467346018320198e-06, "loss": 0.2851, "num_input_tokens_seen": 96708160, "step": 101270 }, { "epoch": 8.2612774288278, "grad_norm": 34.235103607177734, "learning_rate": 4.465315700204947e-06, "loss": 0.3927, "num_input_tokens_seen": 96712560, "step": 101275 }, { "epoch": 8.261685292438209, "grad_norm": 17.839433670043945, "learning_rate": 4.463285798319014e-06, "loss": 0.3903, "num_input_tokens_seen": 96716928, "step": 101280 }, { "epoch": 8.262093156048618, "grad_norm": 43.44668960571289, "learning_rate": 4.461256312703557e-06, "loss": 0.4178, "num_input_tokens_seen": 96721632, "step": 101285 }, { "epoch": 8.262501019659027, "grad_norm": 0.7339630722999573, "learning_rate": 4.4592272433997075e-06, "loss": 0.3497, "num_input_tokens_seen": 96726560, "step": 101290 }, { "epoch": 8.262908883269434, "grad_norm": 6.553764343261719, "learning_rate": 4.457198590448594e-06, "loss": 0.3277, "num_input_tokens_seen": 96731344, "step": 101295 }, { "epoch": 8.263316746879843, "grad_norm": 0.708906352519989, "learning_rate": 4.455170353891336e-06, "loss": 0.2468, "num_input_tokens_seen": 96735712, "step": 101300 }, { "epoch": 8.263724610490252, "grad_norm": 19.375682830810547, "learning_rate": 4.453142533769036e-06, "loss": 0.3952, "num_input_tokens_seen": 96739728, "step": 101305 }, { "epoch": 8.264132474100661, "grad_norm": 4.329888343811035, "learning_rate": 4.451115130122816e-06, "loss": 0.2858, "num_input_tokens_seen": 96744592, "step": 101310 }, { "epoch": 8.26454033771107, "grad_norm": 10.003399848937988, "learning_rate": 4.44908814299376e-06, "loss": 0.3222, "num_input_tokens_seen": 96748656, "step": 101315 }, { "epoch": 8.264948201321477, "grad_norm": 10.501737594604492, "learning_rate": 4.447061572422953e-06, "loss": 0.325, "num_input_tokens_seen": 96753056, "step": 101320 }, { "epoch": 8.265356064931886, "grad_norm": 33.02754211425781, "learning_rate": 4.445035418451474e-06, "loss": 0.4312, "num_input_tokens_seen": 96758080, "step": 101325 }, { "epoch": 8.265763928542295, "grad_norm": 1.9238533973693848, "learning_rate": 4.443009681120394e-06, "loss": 0.2732, "num_input_tokens_seen": 96763296, "step": 101330 }, { "epoch": 8.266171792152704, "grad_norm": 5.293185234069824, "learning_rate": 4.440984360470774e-06, "loss": 0.3689, "num_input_tokens_seen": 96767776, "step": 101335 }, { "epoch": 8.266579655763113, "grad_norm": 4.836169719696045, "learning_rate": 4.438959456543662e-06, "loss": 0.2611, "num_input_tokens_seen": 96771808, "step": 101340 }, { "epoch": 8.266987519373522, "grad_norm": 25.1151065826416, "learning_rate": 4.436934969380108e-06, "loss": 0.2584, "num_input_tokens_seen": 96776512, "step": 101345 }, { "epoch": 8.26739538298393, "grad_norm": 2.162865161895752, "learning_rate": 4.4349108990211355e-06, "loss": 0.3099, "num_input_tokens_seen": 96780688, "step": 101350 }, { "epoch": 8.267803246594339, "grad_norm": 25.14187240600586, "learning_rate": 4.432887245507786e-06, "loss": 0.3889, "num_input_tokens_seen": 96785680, "step": 101355 }, { "epoch": 8.268211110204748, "grad_norm": 6.211065292358398, "learning_rate": 4.430864008881075e-06, "loss": 0.3277, "num_input_tokens_seen": 96790496, "step": 101360 }, { "epoch": 8.268618973815157, "grad_norm": 46.085575103759766, "learning_rate": 4.428841189182009e-06, "loss": 0.2317, "num_input_tokens_seen": 96795216, "step": 101365 }, { "epoch": 8.269026837425566, "grad_norm": 47.043601989746094, "learning_rate": 4.426818786451586e-06, "loss": 0.4125, "num_input_tokens_seen": 96799024, "step": 101370 }, { "epoch": 8.269434701035973, "grad_norm": 2.9264230728149414, "learning_rate": 4.42479680073081e-06, "loss": 0.3507, "num_input_tokens_seen": 96802480, "step": 101375 }, { "epoch": 8.269842564646382, "grad_norm": 1.1579002141952515, "learning_rate": 4.422775232060658e-06, "loss": 0.2477, "num_input_tokens_seen": 96807456, "step": 101380 }, { "epoch": 8.27025042825679, "grad_norm": 12.941314697265625, "learning_rate": 4.420754080482109e-06, "loss": 0.4194, "num_input_tokens_seen": 96811824, "step": 101385 }, { "epoch": 8.2706582918672, "grad_norm": 6.108404159545898, "learning_rate": 4.41873334603613e-06, "loss": 0.5347, "num_input_tokens_seen": 96816768, "step": 101390 }, { "epoch": 8.271066155477609, "grad_norm": 3.577120780944824, "learning_rate": 4.4167130287636746e-06, "loss": 0.4666, "num_input_tokens_seen": 96822400, "step": 101395 }, { "epoch": 8.271474019088018, "grad_norm": 9.146321296691895, "learning_rate": 4.414693128705707e-06, "loss": 0.2904, "num_input_tokens_seen": 96827600, "step": 101400 }, { "epoch": 8.271881882698425, "grad_norm": 3.1526129245758057, "learning_rate": 4.412673645903162e-06, "loss": 0.1625, "num_input_tokens_seen": 96832800, "step": 101405 }, { "epoch": 8.272289746308834, "grad_norm": 13.637117385864258, "learning_rate": 4.4106545803969755e-06, "loss": 0.4609, "num_input_tokens_seen": 96838192, "step": 101410 }, { "epoch": 8.272697609919243, "grad_norm": 36.833919525146484, "learning_rate": 4.408635932228064e-06, "loss": 0.4905, "num_input_tokens_seen": 96842704, "step": 101415 }, { "epoch": 8.273105473529652, "grad_norm": 8.096632957458496, "learning_rate": 4.406617701437357e-06, "loss": 0.3334, "num_input_tokens_seen": 96847632, "step": 101420 }, { "epoch": 8.273513337140061, "grad_norm": 1.5901857614517212, "learning_rate": 4.404599888065761e-06, "loss": 0.1969, "num_input_tokens_seen": 96851824, "step": 101425 }, { "epoch": 8.273921200750468, "grad_norm": 28.465688705444336, "learning_rate": 4.402582492154172e-06, "loss": 0.4727, "num_input_tokens_seen": 96856608, "step": 101430 }, { "epoch": 8.274329064360877, "grad_norm": 18.633983612060547, "learning_rate": 4.400565513743479e-06, "loss": 0.3464, "num_input_tokens_seen": 96862176, "step": 101435 }, { "epoch": 8.274736927971286, "grad_norm": 1.042917251586914, "learning_rate": 4.398548952874573e-06, "loss": 0.1874, "num_input_tokens_seen": 96865856, "step": 101440 }, { "epoch": 8.275144791581695, "grad_norm": 1.9360811710357666, "learning_rate": 4.3965328095883305e-06, "loss": 0.4313, "num_input_tokens_seen": 96869648, "step": 101445 }, { "epoch": 8.275552655192104, "grad_norm": 42.29878616333008, "learning_rate": 4.394517083925609e-06, "loss": 0.3603, "num_input_tokens_seen": 96874688, "step": 101450 }, { "epoch": 8.275960518802512, "grad_norm": 22.545928955078125, "learning_rate": 4.392501775927272e-06, "loss": 0.4862, "num_input_tokens_seen": 96879872, "step": 101455 }, { "epoch": 8.27636838241292, "grad_norm": 24.32882308959961, "learning_rate": 4.390486885634168e-06, "loss": 0.3974, "num_input_tokens_seen": 96884000, "step": 101460 }, { "epoch": 8.27677624602333, "grad_norm": 26.891809463500977, "learning_rate": 4.388472413087136e-06, "loss": 0.4328, "num_input_tokens_seen": 96888576, "step": 101465 }, { "epoch": 8.277184109633739, "grad_norm": 41.960205078125, "learning_rate": 4.386458358327008e-06, "loss": 0.3753, "num_input_tokens_seen": 96893072, "step": 101470 }, { "epoch": 8.277591973244148, "grad_norm": 2.617544174194336, "learning_rate": 4.384444721394612e-06, "loss": 0.3227, "num_input_tokens_seen": 96898048, "step": 101475 }, { "epoch": 8.277999836854557, "grad_norm": 57.150970458984375, "learning_rate": 4.382431502330755e-06, "loss": 0.3437, "num_input_tokens_seen": 96902656, "step": 101480 }, { "epoch": 8.278407700464964, "grad_norm": 5.434482097625732, "learning_rate": 4.380418701176259e-06, "loss": 0.3035, "num_input_tokens_seen": 96908016, "step": 101485 }, { "epoch": 8.278815564075373, "grad_norm": 38.61659240722656, "learning_rate": 4.378406317971911e-06, "loss": 0.2148, "num_input_tokens_seen": 96912672, "step": 101490 }, { "epoch": 8.279223427685782, "grad_norm": 1.4673447608947754, "learning_rate": 4.376394352758506e-06, "loss": 0.2975, "num_input_tokens_seen": 96917328, "step": 101495 }, { "epoch": 8.279631291296191, "grad_norm": 16.05996322631836, "learning_rate": 4.374382805576824e-06, "loss": 0.2866, "num_input_tokens_seen": 96921824, "step": 101500 }, { "epoch": 8.2800391549066, "grad_norm": 23.234806060791016, "learning_rate": 4.372371676467629e-06, "loss": 0.453, "num_input_tokens_seen": 96926448, "step": 101505 }, { "epoch": 8.280447018517007, "grad_norm": 26.42141342163086, "learning_rate": 4.370360965471704e-06, "loss": 0.3301, "num_input_tokens_seen": 96930896, "step": 101510 }, { "epoch": 8.280854882127416, "grad_norm": 7.823176860809326, "learning_rate": 4.3683506726297954e-06, "loss": 0.3028, "num_input_tokens_seen": 96935952, "step": 101515 }, { "epoch": 8.281262745737825, "grad_norm": 16.227375030517578, "learning_rate": 4.366340797982652e-06, "loss": 0.311, "num_input_tokens_seen": 96940624, "step": 101520 }, { "epoch": 8.281670609348234, "grad_norm": 2.4332380294799805, "learning_rate": 4.364331341571007e-06, "loss": 0.442, "num_input_tokens_seen": 96944720, "step": 101525 }, { "epoch": 8.282078472958643, "grad_norm": 1.2145793437957764, "learning_rate": 4.3623223034356035e-06, "loss": 0.2411, "num_input_tokens_seen": 96949600, "step": 101530 }, { "epoch": 8.28248633656905, "grad_norm": 14.392610549926758, "learning_rate": 4.360313683617157e-06, "loss": 0.46, "num_input_tokens_seen": 96955104, "step": 101535 }, { "epoch": 8.28289420017946, "grad_norm": 0.9892817139625549, "learning_rate": 4.3583054821563805e-06, "loss": 0.2026, "num_input_tokens_seen": 96959824, "step": 101540 }, { "epoch": 8.283302063789868, "grad_norm": 5.2200026512146, "learning_rate": 4.356297699093981e-06, "loss": 0.2768, "num_input_tokens_seen": 96965296, "step": 101545 }, { "epoch": 8.283709927400277, "grad_norm": 5.4414849281311035, "learning_rate": 4.354290334470651e-06, "loss": 0.3335, "num_input_tokens_seen": 96970224, "step": 101550 }, { "epoch": 8.284117791010686, "grad_norm": 23.79343605041504, "learning_rate": 4.352283388327089e-06, "loss": 0.4028, "num_input_tokens_seen": 96973968, "step": 101555 }, { "epoch": 8.284525654621095, "grad_norm": 32.25836944580078, "learning_rate": 4.3502768607039665e-06, "loss": 0.2567, "num_input_tokens_seen": 96978688, "step": 101560 }, { "epoch": 8.284933518231503, "grad_norm": 48.38505935668945, "learning_rate": 4.3482707516419585e-06, "loss": 0.356, "num_input_tokens_seen": 96984640, "step": 101565 }, { "epoch": 8.285341381841912, "grad_norm": 39.49352264404297, "learning_rate": 4.34626506118172e-06, "loss": 0.501, "num_input_tokens_seen": 96989552, "step": 101570 }, { "epoch": 8.28574924545232, "grad_norm": 44.605628967285156, "learning_rate": 4.3442597893639216e-06, "loss": 0.3435, "num_input_tokens_seen": 96994736, "step": 101575 }, { "epoch": 8.28615710906273, "grad_norm": 16.10593605041504, "learning_rate": 4.342254936229198e-06, "loss": 0.2955, "num_input_tokens_seen": 97000240, "step": 101580 }, { "epoch": 8.286564972673139, "grad_norm": 18.05838966369629, "learning_rate": 4.340250501818189e-06, "loss": 0.2322, "num_input_tokens_seen": 97005440, "step": 101585 }, { "epoch": 8.286972836283546, "grad_norm": 4.894779682159424, "learning_rate": 4.338246486171524e-06, "loss": 0.2675, "num_input_tokens_seen": 97009184, "step": 101590 }, { "epoch": 8.287380699893955, "grad_norm": 0.638372004032135, "learning_rate": 4.336242889329822e-06, "loss": 0.2113, "num_input_tokens_seen": 97014720, "step": 101595 }, { "epoch": 8.287788563504364, "grad_norm": 4.5691046714782715, "learning_rate": 4.334239711333696e-06, "loss": 0.4525, "num_input_tokens_seen": 97019424, "step": 101600 }, { "epoch": 8.288196427114773, "grad_norm": 2.5027599334716797, "learning_rate": 4.3322369522237505e-06, "loss": 0.1664, "num_input_tokens_seen": 97023536, "step": 101605 }, { "epoch": 8.288604290725182, "grad_norm": 19.914920806884766, "learning_rate": 4.330234612040579e-06, "loss": 0.531, "num_input_tokens_seen": 97028320, "step": 101610 }, { "epoch": 8.289012154335591, "grad_norm": 8.683724403381348, "learning_rate": 4.328232690824763e-06, "loss": 0.4312, "num_input_tokens_seen": 97033888, "step": 101615 }, { "epoch": 8.289420017945998, "grad_norm": 7.986408233642578, "learning_rate": 4.326231188616892e-06, "loss": 0.2416, "num_input_tokens_seen": 97038848, "step": 101620 }, { "epoch": 8.289827881556407, "grad_norm": 11.016071319580078, "learning_rate": 4.32423010545753e-06, "loss": 0.3791, "num_input_tokens_seen": 97044464, "step": 101625 }, { "epoch": 8.290235745166816, "grad_norm": 57.44044494628906, "learning_rate": 4.322229441387241e-06, "loss": 0.2757, "num_input_tokens_seen": 97049872, "step": 101630 }, { "epoch": 8.290643608777225, "grad_norm": 15.455826759338379, "learning_rate": 4.320229196446565e-06, "loss": 0.3311, "num_input_tokens_seen": 97055136, "step": 101635 }, { "epoch": 8.291051472387634, "grad_norm": 2.6416375637054443, "learning_rate": 4.318229370676064e-06, "loss": 0.2822, "num_input_tokens_seen": 97059472, "step": 101640 }, { "epoch": 8.291459335998042, "grad_norm": 11.194411277770996, "learning_rate": 4.316229964116264e-06, "loss": 0.4807, "num_input_tokens_seen": 97065280, "step": 101645 }, { "epoch": 8.29186719960845, "grad_norm": 30.297428131103516, "learning_rate": 4.314230976807692e-06, "loss": 0.2935, "num_input_tokens_seen": 97069344, "step": 101650 }, { "epoch": 8.29227506321886, "grad_norm": 8.450202941894531, "learning_rate": 4.312232408790873e-06, "loss": 0.4179, "num_input_tokens_seen": 97074608, "step": 101655 }, { "epoch": 8.292682926829269, "grad_norm": 42.731414794921875, "learning_rate": 4.310234260106303e-06, "loss": 0.3716, "num_input_tokens_seen": 97079744, "step": 101660 }, { "epoch": 8.293090790439678, "grad_norm": 1.6917299032211304, "learning_rate": 4.308236530794499e-06, "loss": 0.3403, "num_input_tokens_seen": 97084672, "step": 101665 }, { "epoch": 8.293498654050085, "grad_norm": 13.19846248626709, "learning_rate": 4.30623922089595e-06, "loss": 0.4474, "num_input_tokens_seen": 97089728, "step": 101670 }, { "epoch": 8.293906517660494, "grad_norm": 1.3261795043945312, "learning_rate": 4.304242330451136e-06, "loss": 0.3387, "num_input_tokens_seen": 97094288, "step": 101675 }, { "epoch": 8.294314381270903, "grad_norm": 71.62247467041016, "learning_rate": 4.302245859500531e-06, "loss": 0.392, "num_input_tokens_seen": 97099200, "step": 101680 }, { "epoch": 8.294722244881312, "grad_norm": 10.476455688476562, "learning_rate": 4.300249808084614e-06, "loss": 0.2239, "num_input_tokens_seen": 97103984, "step": 101685 }, { "epoch": 8.29513010849172, "grad_norm": 23.228271484375, "learning_rate": 4.298254176243835e-06, "loss": 0.3353, "num_input_tokens_seen": 97108336, "step": 101690 }, { "epoch": 8.29553797210213, "grad_norm": 3.2776825428009033, "learning_rate": 4.296258964018651e-06, "loss": 0.402, "num_input_tokens_seen": 97113344, "step": 101695 }, { "epoch": 8.295945835712537, "grad_norm": 5.740549087524414, "learning_rate": 4.294264171449497e-06, "loss": 0.2823, "num_input_tokens_seen": 97118160, "step": 101700 }, { "epoch": 8.296353699322946, "grad_norm": 34.27235412597656, "learning_rate": 4.292269798576806e-06, "loss": 0.429, "num_input_tokens_seen": 97122928, "step": 101705 }, { "epoch": 8.296761562933355, "grad_norm": 2.824997663497925, "learning_rate": 4.290275845441013e-06, "loss": 0.3294, "num_input_tokens_seen": 97128112, "step": 101710 }, { "epoch": 8.297169426543764, "grad_norm": 4.599106788635254, "learning_rate": 4.288282312082528e-06, "loss": 0.3049, "num_input_tokens_seen": 97132992, "step": 101715 }, { "epoch": 8.297577290154173, "grad_norm": 27.417322158813477, "learning_rate": 4.28628919854176e-06, "loss": 0.2344, "num_input_tokens_seen": 97138528, "step": 101720 }, { "epoch": 8.29798515376458, "grad_norm": 9.389050483703613, "learning_rate": 4.284296504859106e-06, "loss": 0.5196, "num_input_tokens_seen": 97144320, "step": 101725 }, { "epoch": 8.29839301737499, "grad_norm": 2.633946180343628, "learning_rate": 4.28230423107496e-06, "loss": 0.2024, "num_input_tokens_seen": 97148416, "step": 101730 }, { "epoch": 8.298800880985398, "grad_norm": 15.290743827819824, "learning_rate": 4.280312377229706e-06, "loss": 0.3044, "num_input_tokens_seen": 97152512, "step": 101735 }, { "epoch": 8.299208744595807, "grad_norm": 42.919891357421875, "learning_rate": 4.278320943363715e-06, "loss": 0.4347, "num_input_tokens_seen": 97157824, "step": 101740 }, { "epoch": 8.299616608206216, "grad_norm": 0.9533632397651672, "learning_rate": 4.2763299295173504e-06, "loss": 0.2588, "num_input_tokens_seen": 97162928, "step": 101745 }, { "epoch": 8.300024471816624, "grad_norm": 2.9376845359802246, "learning_rate": 4.274339335730967e-06, "loss": 0.3249, "num_input_tokens_seen": 97166352, "step": 101750 }, { "epoch": 8.300432335427033, "grad_norm": 22.67440414428711, "learning_rate": 4.272349162044928e-06, "loss": 0.3826, "num_input_tokens_seen": 97170768, "step": 101755 }, { "epoch": 8.300840199037442, "grad_norm": 2.5940475463867188, "learning_rate": 4.2703594084995605e-06, "loss": 0.4227, "num_input_tokens_seen": 97175248, "step": 101760 }, { "epoch": 8.30124806264785, "grad_norm": 7.938169002532959, "learning_rate": 4.2683700751352005e-06, "loss": 0.3721, "num_input_tokens_seen": 97180784, "step": 101765 }, { "epoch": 8.30165592625826, "grad_norm": 2.92006778717041, "learning_rate": 4.266381161992164e-06, "loss": 0.2936, "num_input_tokens_seen": 97185968, "step": 101770 }, { "epoch": 8.302063789868669, "grad_norm": 7.144894599914551, "learning_rate": 4.2643926691107756e-06, "loss": 0.275, "num_input_tokens_seen": 97191232, "step": 101775 }, { "epoch": 8.302471653479076, "grad_norm": 31.417715072631836, "learning_rate": 4.262404596531339e-06, "loss": 0.4071, "num_input_tokens_seen": 97195568, "step": 101780 }, { "epoch": 8.302879517089485, "grad_norm": 38.47357940673828, "learning_rate": 4.260416944294146e-06, "loss": 0.1969, "num_input_tokens_seen": 97199872, "step": 101785 }, { "epoch": 8.303287380699894, "grad_norm": 46.54734802246094, "learning_rate": 4.258429712439485e-06, "loss": 0.3546, "num_input_tokens_seen": 97204848, "step": 101790 }, { "epoch": 8.303695244310303, "grad_norm": 3.483191728591919, "learning_rate": 4.2564429010076454e-06, "loss": 0.3258, "num_input_tokens_seen": 97210112, "step": 101795 }, { "epoch": 8.304103107920712, "grad_norm": 0.7990214228630066, "learning_rate": 4.254456510038895e-06, "loss": 0.3313, "num_input_tokens_seen": 97214288, "step": 101800 }, { "epoch": 8.30451097153112, "grad_norm": 21.53676986694336, "learning_rate": 4.252470539573494e-06, "loss": 0.4201, "num_input_tokens_seen": 97218464, "step": 101805 }, { "epoch": 8.304918835141528, "grad_norm": 0.8790543675422668, "learning_rate": 4.2504849896516985e-06, "loss": 0.4582, "num_input_tokens_seen": 97223984, "step": 101810 }, { "epoch": 8.305326698751937, "grad_norm": 43.931182861328125, "learning_rate": 4.248499860313751e-06, "loss": 0.3557, "num_input_tokens_seen": 97229040, "step": 101815 }, { "epoch": 8.305734562362346, "grad_norm": 9.069086074829102, "learning_rate": 4.2465151515999e-06, "loss": 0.545, "num_input_tokens_seen": 97234064, "step": 101820 }, { "epoch": 8.306142425972755, "grad_norm": 49.94241714477539, "learning_rate": 4.244530863550367e-06, "loss": 0.308, "num_input_tokens_seen": 97239360, "step": 101825 }, { "epoch": 8.306550289583164, "grad_norm": 70.99473571777344, "learning_rate": 4.2425469962053736e-06, "loss": 0.5493, "num_input_tokens_seen": 97244832, "step": 101830 }, { "epoch": 8.306958153193571, "grad_norm": 14.549026489257812, "learning_rate": 4.240563549605128e-06, "loss": 0.3161, "num_input_tokens_seen": 97249952, "step": 101835 }, { "epoch": 8.30736601680398, "grad_norm": 5.749377727508545, "learning_rate": 4.238580523789842e-06, "loss": 0.2568, "num_input_tokens_seen": 97254816, "step": 101840 }, { "epoch": 8.30777388041439, "grad_norm": 37.011112213134766, "learning_rate": 4.236597918799709e-06, "loss": 0.1902, "num_input_tokens_seen": 97259200, "step": 101845 }, { "epoch": 8.308181744024798, "grad_norm": 5.273017883300781, "learning_rate": 4.23461573467491e-06, "loss": 0.3895, "num_input_tokens_seen": 97263280, "step": 101850 }, { "epoch": 8.308589607635207, "grad_norm": 6.217341423034668, "learning_rate": 4.23263397145563e-06, "loss": 0.5784, "num_input_tokens_seen": 97267696, "step": 101855 }, { "epoch": 8.308997471245615, "grad_norm": 34.540672302246094, "learning_rate": 4.23065262918203e-06, "loss": 0.3776, "num_input_tokens_seen": 97271504, "step": 101860 }, { "epoch": 8.309405334856024, "grad_norm": 19.647634506225586, "learning_rate": 4.22867170789428e-06, "loss": 0.1489, "num_input_tokens_seen": 97276464, "step": 101865 }, { "epoch": 8.309813198466433, "grad_norm": 5.282935619354248, "learning_rate": 4.2266912076325215e-06, "loss": 0.3947, "num_input_tokens_seen": 97280416, "step": 101870 }, { "epoch": 8.310221062076842, "grad_norm": 12.445945739746094, "learning_rate": 4.224711128436909e-06, "loss": 0.3972, "num_input_tokens_seen": 97285680, "step": 101875 }, { "epoch": 8.31062892568725, "grad_norm": 48.4832878112793, "learning_rate": 4.222731470347577e-06, "loss": 0.398, "num_input_tokens_seen": 97290176, "step": 101880 }, { "epoch": 8.31103678929766, "grad_norm": 39.14179229736328, "learning_rate": 4.220752233404648e-06, "loss": 0.3743, "num_input_tokens_seen": 97295184, "step": 101885 }, { "epoch": 8.311444652908067, "grad_norm": 15.514857292175293, "learning_rate": 4.218773417648242e-06, "loss": 0.3488, "num_input_tokens_seen": 97300080, "step": 101890 }, { "epoch": 8.311852516518476, "grad_norm": 3.034303903579712, "learning_rate": 4.216795023118466e-06, "loss": 0.3311, "num_input_tokens_seen": 97304720, "step": 101895 }, { "epoch": 8.312260380128885, "grad_norm": 0.7086004614830017, "learning_rate": 4.214817049855426e-06, "loss": 0.3267, "num_input_tokens_seen": 97309616, "step": 101900 }, { "epoch": 8.312668243739294, "grad_norm": 38.146358489990234, "learning_rate": 4.212839497899204e-06, "loss": 0.3522, "num_input_tokens_seen": 97313872, "step": 101905 }, { "epoch": 8.313076107349703, "grad_norm": 6.3783278465271, "learning_rate": 4.2108623672899e-06, "loss": 0.4144, "num_input_tokens_seen": 97319248, "step": 101910 }, { "epoch": 8.31348397096011, "grad_norm": 21.987234115600586, "learning_rate": 4.208885658067582e-06, "loss": 0.4206, "num_input_tokens_seen": 97324080, "step": 101915 }, { "epoch": 8.31389183457052, "grad_norm": 31.222867965698242, "learning_rate": 4.206909370272319e-06, "loss": 0.3554, "num_input_tokens_seen": 97328336, "step": 101920 }, { "epoch": 8.314299698180928, "grad_norm": 1.312085747718811, "learning_rate": 4.204933503944158e-06, "loss": 0.2078, "num_input_tokens_seen": 97333472, "step": 101925 }, { "epoch": 8.314707561791337, "grad_norm": 8.896895408630371, "learning_rate": 4.202958059123166e-06, "loss": 0.3963, "num_input_tokens_seen": 97338112, "step": 101930 }, { "epoch": 8.315115425401746, "grad_norm": 4.659789562225342, "learning_rate": 4.200983035849376e-06, "loss": 0.3907, "num_input_tokens_seen": 97342416, "step": 101935 }, { "epoch": 8.315523289012154, "grad_norm": 35.37031173706055, "learning_rate": 4.199008434162824e-06, "loss": 0.1922, "num_input_tokens_seen": 97347040, "step": 101940 }, { "epoch": 8.315931152622563, "grad_norm": 23.838775634765625, "learning_rate": 4.197034254103529e-06, "loss": 0.3321, "num_input_tokens_seen": 97352352, "step": 101945 }, { "epoch": 8.316339016232972, "grad_norm": 19.909130096435547, "learning_rate": 4.195060495711506e-06, "loss": 0.4063, "num_input_tokens_seen": 97357536, "step": 101950 }, { "epoch": 8.31674687984338, "grad_norm": 4.762560844421387, "learning_rate": 4.193087159026773e-06, "loss": 0.5273, "num_input_tokens_seen": 97361728, "step": 101955 }, { "epoch": 8.31715474345379, "grad_norm": 3.1395504474639893, "learning_rate": 4.19111424408932e-06, "loss": 0.3007, "num_input_tokens_seen": 97365392, "step": 101960 }, { "epoch": 8.317562607064199, "grad_norm": 33.91535186767578, "learning_rate": 4.1891417509391405e-06, "loss": 0.4597, "num_input_tokens_seen": 97370432, "step": 101965 }, { "epoch": 8.317970470674606, "grad_norm": 10.678362846374512, "learning_rate": 4.18716967961621e-06, "loss": 0.3515, "num_input_tokens_seen": 97375696, "step": 101970 }, { "epoch": 8.318378334285015, "grad_norm": 16.55506134033203, "learning_rate": 4.1851980301605085e-06, "loss": 0.4696, "num_input_tokens_seen": 97379488, "step": 101975 }, { "epoch": 8.318786197895424, "grad_norm": 14.154608726501465, "learning_rate": 4.183226802612e-06, "loss": 0.2917, "num_input_tokens_seen": 97384320, "step": 101980 }, { "epoch": 8.319194061505833, "grad_norm": 40.05553436279297, "learning_rate": 4.181255997010638e-06, "loss": 0.1717, "num_input_tokens_seen": 97389392, "step": 101985 }, { "epoch": 8.319601925116242, "grad_norm": 5.794755458831787, "learning_rate": 4.179285613396367e-06, "loss": 0.2966, "num_input_tokens_seen": 97394192, "step": 101990 }, { "epoch": 8.320009788726649, "grad_norm": 18.59364128112793, "learning_rate": 4.177315651809132e-06, "loss": 0.411, "num_input_tokens_seen": 97399856, "step": 101995 }, { "epoch": 8.320417652337058, "grad_norm": 2.4565043449401855, "learning_rate": 4.175346112288864e-06, "loss": 0.4427, "num_input_tokens_seen": 97404912, "step": 102000 }, { "epoch": 8.320825515947467, "grad_norm": 9.3003568649292, "learning_rate": 4.173376994875478e-06, "loss": 0.4412, "num_input_tokens_seen": 97409728, "step": 102005 }, { "epoch": 8.321233379557876, "grad_norm": 28.752561569213867, "learning_rate": 4.171408299608892e-06, "loss": 0.367, "num_input_tokens_seen": 97415216, "step": 102010 }, { "epoch": 8.321641243168285, "grad_norm": 5.58803129196167, "learning_rate": 4.169440026529009e-06, "loss": 0.3212, "num_input_tokens_seen": 97420240, "step": 102015 }, { "epoch": 8.322049106778692, "grad_norm": 4.564986705780029, "learning_rate": 4.167472175675727e-06, "loss": 0.4604, "num_input_tokens_seen": 97425360, "step": 102020 }, { "epoch": 8.322456970389101, "grad_norm": 26.431198120117188, "learning_rate": 4.1655047470889306e-06, "loss": 0.4453, "num_input_tokens_seen": 97430144, "step": 102025 }, { "epoch": 8.32286483399951, "grad_norm": 2.374027967453003, "learning_rate": 4.163537740808498e-06, "loss": 0.3276, "num_input_tokens_seen": 97434352, "step": 102030 }, { "epoch": 8.32327269760992, "grad_norm": 0.5884933471679688, "learning_rate": 4.161571156874297e-06, "loss": 0.221, "num_input_tokens_seen": 97438800, "step": 102035 }, { "epoch": 8.323680561220328, "grad_norm": 35.4378662109375, "learning_rate": 4.1596049953261996e-06, "loss": 0.136, "num_input_tokens_seen": 97443728, "step": 102040 }, { "epoch": 8.324088424830737, "grad_norm": 10.477002143859863, "learning_rate": 4.157639256204054e-06, "loss": 0.3532, "num_input_tokens_seen": 97447824, "step": 102045 }, { "epoch": 8.324496288441145, "grad_norm": 2.0852088928222656, "learning_rate": 4.155673939547702e-06, "loss": 0.4321, "num_input_tokens_seen": 97451984, "step": 102050 }, { "epoch": 8.324904152051554, "grad_norm": 32.509483337402344, "learning_rate": 4.153709045396983e-06, "loss": 0.3575, "num_input_tokens_seen": 97455776, "step": 102055 }, { "epoch": 8.325312015661963, "grad_norm": 33.60895919799805, "learning_rate": 4.1517445737917185e-06, "loss": 0.3758, "num_input_tokens_seen": 97460864, "step": 102060 }, { "epoch": 8.325719879272372, "grad_norm": 12.920207977294922, "learning_rate": 4.14978052477174e-06, "loss": 0.4142, "num_input_tokens_seen": 97465632, "step": 102065 }, { "epoch": 8.32612774288278, "grad_norm": 8.742095947265625, "learning_rate": 4.147816898376847e-06, "loss": 0.5303, "num_input_tokens_seen": 97470736, "step": 102070 }, { "epoch": 8.326535606493188, "grad_norm": 16.049821853637695, "learning_rate": 4.145853694646848e-06, "loss": 0.3978, "num_input_tokens_seen": 97475936, "step": 102075 }, { "epoch": 8.326943470103597, "grad_norm": 30.79852294921875, "learning_rate": 4.143890913621526e-06, "loss": 0.2888, "num_input_tokens_seen": 97480656, "step": 102080 }, { "epoch": 8.327351333714006, "grad_norm": 3.3530397415161133, "learning_rate": 4.141928555340677e-06, "loss": 0.341, "num_input_tokens_seen": 97485792, "step": 102085 }, { "epoch": 8.327759197324415, "grad_norm": 4.084222793579102, "learning_rate": 4.139966619844074e-06, "loss": 0.3078, "num_input_tokens_seen": 97490576, "step": 102090 }, { "epoch": 8.328167060934824, "grad_norm": 7.09938383102417, "learning_rate": 4.138005107171486e-06, "loss": 0.2507, "num_input_tokens_seen": 97494528, "step": 102095 }, { "epoch": 8.328574924545233, "grad_norm": 1.507509708404541, "learning_rate": 4.136044017362667e-06, "loss": 0.2996, "num_input_tokens_seen": 97498928, "step": 102100 }, { "epoch": 8.32898278815564, "grad_norm": 45.139102935791016, "learning_rate": 4.1340833504573655e-06, "loss": 0.2703, "num_input_tokens_seen": 97504064, "step": 102105 }, { "epoch": 8.32939065176605, "grad_norm": 3.219998598098755, "learning_rate": 4.132123106495333e-06, "loss": 0.2891, "num_input_tokens_seen": 97509024, "step": 102110 }, { "epoch": 8.329798515376458, "grad_norm": 53.84601974487305, "learning_rate": 4.1301632855162975e-06, "loss": 0.328, "num_input_tokens_seen": 97514096, "step": 102115 }, { "epoch": 8.330206378986867, "grad_norm": 11.655267715454102, "learning_rate": 4.128203887559986e-06, "loss": 0.2039, "num_input_tokens_seen": 97519104, "step": 102120 }, { "epoch": 8.330614242597276, "grad_norm": 7.455440044403076, "learning_rate": 4.126244912666105e-06, "loss": 0.2612, "num_input_tokens_seen": 97523680, "step": 102125 }, { "epoch": 8.331022106207683, "grad_norm": 9.25622844696045, "learning_rate": 4.124286360874377e-06, "loss": 0.3749, "num_input_tokens_seen": 97528896, "step": 102130 }, { "epoch": 8.331429969818092, "grad_norm": 20.52307891845703, "learning_rate": 4.122328232224493e-06, "loss": 0.3582, "num_input_tokens_seen": 97533648, "step": 102135 }, { "epoch": 8.331837833428501, "grad_norm": 0.5985831022262573, "learning_rate": 4.120370526756145e-06, "loss": 0.2875, "num_input_tokens_seen": 97538384, "step": 102140 }, { "epoch": 8.33224569703891, "grad_norm": 7.482212543487549, "learning_rate": 4.118413244509015e-06, "loss": 0.5019, "num_input_tokens_seen": 97543328, "step": 102145 }, { "epoch": 8.33265356064932, "grad_norm": 30.74384117126465, "learning_rate": 4.116456385522774e-06, "loss": 0.4247, "num_input_tokens_seen": 97547472, "step": 102150 }, { "epoch": 8.333061424259727, "grad_norm": 3.4228029251098633, "learning_rate": 4.114499949837086e-06, "loss": 0.3267, "num_input_tokens_seen": 97552880, "step": 102155 }, { "epoch": 8.333469287870136, "grad_norm": 2.0362777709960938, "learning_rate": 4.112543937491614e-06, "loss": 0.4793, "num_input_tokens_seen": 97557776, "step": 102160 }, { "epoch": 8.333877151480545, "grad_norm": 2.4753384590148926, "learning_rate": 4.110588348525998e-06, "loss": 0.3004, "num_input_tokens_seen": 97562576, "step": 102165 }, { "epoch": 8.334285015090954, "grad_norm": 15.474595069885254, "learning_rate": 4.108633182979873e-06, "loss": 0.4857, "num_input_tokens_seen": 97567440, "step": 102170 }, { "epoch": 8.334692878701363, "grad_norm": 5.046517372131348, "learning_rate": 4.106678440892883e-06, "loss": 0.4091, "num_input_tokens_seen": 97572288, "step": 102175 }, { "epoch": 8.335100742311772, "grad_norm": 2.5440220832824707, "learning_rate": 4.104724122304643e-06, "loss": 0.3712, "num_input_tokens_seen": 97577120, "step": 102180 }, { "epoch": 8.335508605922179, "grad_norm": 2.481921672821045, "learning_rate": 4.102770227254765e-06, "loss": 0.2926, "num_input_tokens_seen": 97581712, "step": 102185 }, { "epoch": 8.335916469532588, "grad_norm": 22.74390411376953, "learning_rate": 4.100816755782852e-06, "loss": 0.2485, "num_input_tokens_seen": 97586224, "step": 102190 }, { "epoch": 8.336324333142997, "grad_norm": 8.021341323852539, "learning_rate": 4.098863707928504e-06, "loss": 0.3393, "num_input_tokens_seen": 97590992, "step": 102195 }, { "epoch": 8.336732196753406, "grad_norm": 10.018786430358887, "learning_rate": 4.0969110837313115e-06, "loss": 0.275, "num_input_tokens_seen": 97595856, "step": 102200 }, { "epoch": 8.337140060363815, "grad_norm": 31.22965431213379, "learning_rate": 4.0949588832308485e-06, "loss": 0.2275, "num_input_tokens_seen": 97600752, "step": 102205 }, { "epoch": 8.337547923974222, "grad_norm": 1.8494360446929932, "learning_rate": 4.093007106466684e-06, "loss": 0.3751, "num_input_tokens_seen": 97605280, "step": 102210 }, { "epoch": 8.337955787584631, "grad_norm": 17.42169189453125, "learning_rate": 4.091055753478376e-06, "loss": 0.5025, "num_input_tokens_seen": 97609664, "step": 102215 }, { "epoch": 8.33836365119504, "grad_norm": 10.458596229553223, "learning_rate": 4.089104824305489e-06, "loss": 0.4316, "num_input_tokens_seen": 97614528, "step": 102220 }, { "epoch": 8.33877151480545, "grad_norm": 2.483872413635254, "learning_rate": 4.087154318987563e-06, "loss": 0.4156, "num_input_tokens_seen": 97619616, "step": 102225 }, { "epoch": 8.339179378415858, "grad_norm": 9.471197128295898, "learning_rate": 4.08520423756413e-06, "loss": 0.6048, "num_input_tokens_seen": 97624240, "step": 102230 }, { "epoch": 8.339587242026266, "grad_norm": 1.4392309188842773, "learning_rate": 4.083254580074716e-06, "loss": 0.2339, "num_input_tokens_seen": 97629872, "step": 102235 }, { "epoch": 8.339995105636675, "grad_norm": 6.300327301025391, "learning_rate": 4.08130534655885e-06, "loss": 0.3965, "num_input_tokens_seen": 97634480, "step": 102240 }, { "epoch": 8.340402969247084, "grad_norm": 39.0241813659668, "learning_rate": 4.079356537056033e-06, "loss": 0.3556, "num_input_tokens_seen": 97639328, "step": 102245 }, { "epoch": 8.340810832857493, "grad_norm": 3.854871988296509, "learning_rate": 4.0774081516057714e-06, "loss": 0.3539, "num_input_tokens_seen": 97644480, "step": 102250 }, { "epoch": 8.341218696467902, "grad_norm": 36.589996337890625, "learning_rate": 4.075460190247557e-06, "loss": 0.2777, "num_input_tokens_seen": 97649536, "step": 102255 }, { "epoch": 8.34162656007831, "grad_norm": 30.3453426361084, "learning_rate": 4.073512653020864e-06, "loss": 0.2242, "num_input_tokens_seen": 97654400, "step": 102260 }, { "epoch": 8.342034423688718, "grad_norm": 14.473502159118652, "learning_rate": 4.071565539965186e-06, "loss": 0.4133, "num_input_tokens_seen": 97659056, "step": 102265 }, { "epoch": 8.342442287299127, "grad_norm": 26.5194034576416, "learning_rate": 4.06961885111998e-06, "loss": 0.3832, "num_input_tokens_seen": 97662880, "step": 102270 }, { "epoch": 8.342850150909536, "grad_norm": 21.261505126953125, "learning_rate": 4.067672586524707e-06, "loss": 0.3904, "num_input_tokens_seen": 97668192, "step": 102275 }, { "epoch": 8.343258014519945, "grad_norm": 0.8057704567909241, "learning_rate": 4.0657267462188175e-06, "loss": 0.2505, "num_input_tokens_seen": 97673856, "step": 102280 }, { "epoch": 8.343665878130354, "grad_norm": 6.508625030517578, "learning_rate": 4.063781330241748e-06, "loss": 0.3543, "num_input_tokens_seen": 97678352, "step": 102285 }, { "epoch": 8.344073741740761, "grad_norm": 4.702462673187256, "learning_rate": 4.061836338632935e-06, "loss": 0.41, "num_input_tokens_seen": 97683312, "step": 102290 }, { "epoch": 8.34448160535117, "grad_norm": 8.832000732421875, "learning_rate": 4.059891771431804e-06, "loss": 0.4355, "num_input_tokens_seen": 97688048, "step": 102295 }, { "epoch": 8.34488946896158, "grad_norm": 5.373420715332031, "learning_rate": 4.057947628677769e-06, "loss": 0.2544, "num_input_tokens_seen": 97693136, "step": 102300 }, { "epoch": 8.345297332571988, "grad_norm": 11.857915878295898, "learning_rate": 4.0560039104102305e-06, "loss": 0.359, "num_input_tokens_seen": 97698256, "step": 102305 }, { "epoch": 8.345705196182397, "grad_norm": 4.386357307434082, "learning_rate": 4.054060616668598e-06, "loss": 0.3711, "num_input_tokens_seen": 97703600, "step": 102310 }, { "epoch": 8.346113059792806, "grad_norm": 14.358330726623535, "learning_rate": 4.052117747492256e-06, "loss": 0.2597, "num_input_tokens_seen": 97709184, "step": 102315 }, { "epoch": 8.346520923403213, "grad_norm": 23.09657096862793, "learning_rate": 4.050175302920586e-06, "loss": 0.233, "num_input_tokens_seen": 97713728, "step": 102320 }, { "epoch": 8.346928787013622, "grad_norm": 9.377854347229004, "learning_rate": 4.048233282992955e-06, "loss": 0.3487, "num_input_tokens_seen": 97718336, "step": 102325 }, { "epoch": 8.347336650624031, "grad_norm": 23.910049438476562, "learning_rate": 4.04629168774874e-06, "loss": 0.4656, "num_input_tokens_seen": 97723056, "step": 102330 }, { "epoch": 8.34774451423444, "grad_norm": 22.087108612060547, "learning_rate": 4.044350517227288e-06, "loss": 0.472, "num_input_tokens_seen": 97727952, "step": 102335 }, { "epoch": 8.34815237784485, "grad_norm": 7.8670573234558105, "learning_rate": 4.042409771467945e-06, "loss": 0.3731, "num_input_tokens_seen": 97731152, "step": 102340 }, { "epoch": 8.348560241455257, "grad_norm": 8.485844612121582, "learning_rate": 4.040469450510046e-06, "loss": 0.3395, "num_input_tokens_seen": 97735936, "step": 102345 }, { "epoch": 8.348968105065666, "grad_norm": 32.27139663696289, "learning_rate": 4.038529554392931e-06, "loss": 0.4067, "num_input_tokens_seen": 97741040, "step": 102350 }, { "epoch": 8.349375968676075, "grad_norm": 16.5465030670166, "learning_rate": 4.0365900831559135e-06, "loss": 0.5296, "num_input_tokens_seen": 97746192, "step": 102355 }, { "epoch": 8.349783832286484, "grad_norm": 28.224590301513672, "learning_rate": 4.034651036838308e-06, "loss": 0.3614, "num_input_tokens_seen": 97750672, "step": 102360 }, { "epoch": 8.350191695896893, "grad_norm": 55.434974670410156, "learning_rate": 4.0327124154794175e-06, "loss": 0.5361, "num_input_tokens_seen": 97755584, "step": 102365 }, { "epoch": 8.3505995595073, "grad_norm": 20.697189331054688, "learning_rate": 4.03077421911853e-06, "loss": 0.5823, "num_input_tokens_seen": 97760720, "step": 102370 }, { "epoch": 8.351007423117709, "grad_norm": 1.1646136045455933, "learning_rate": 4.0288364477949455e-06, "loss": 0.4431, "num_input_tokens_seen": 97766256, "step": 102375 }, { "epoch": 8.351415286728118, "grad_norm": 28.209644317626953, "learning_rate": 4.026899101547935e-06, "loss": 0.3338, "num_input_tokens_seen": 97770592, "step": 102380 }, { "epoch": 8.351823150338527, "grad_norm": 11.333151817321777, "learning_rate": 4.0249621804167665e-06, "loss": 0.3304, "num_input_tokens_seen": 97775344, "step": 102385 }, { "epoch": 8.352231013948936, "grad_norm": 16.910511016845703, "learning_rate": 4.023025684440696e-06, "loss": 0.3057, "num_input_tokens_seen": 97780768, "step": 102390 }, { "epoch": 8.352638877559345, "grad_norm": 3.155618667602539, "learning_rate": 4.021089613658987e-06, "loss": 0.1867, "num_input_tokens_seen": 97786064, "step": 102395 }, { "epoch": 8.353046741169752, "grad_norm": 27.255815505981445, "learning_rate": 4.019153968110878e-06, "loss": 0.2895, "num_input_tokens_seen": 97791248, "step": 102400 }, { "epoch": 8.353454604780161, "grad_norm": 33.56089782714844, "learning_rate": 4.017218747835602e-06, "loss": 0.3201, "num_input_tokens_seen": 97795776, "step": 102405 }, { "epoch": 8.35386246839057, "grad_norm": 3.408963680267334, "learning_rate": 4.015283952872387e-06, "loss": 0.3635, "num_input_tokens_seen": 97801728, "step": 102410 }, { "epoch": 8.35427033200098, "grad_norm": 11.631674766540527, "learning_rate": 4.0133495832604475e-06, "loss": 0.336, "num_input_tokens_seen": 97806336, "step": 102415 }, { "epoch": 8.354678195611388, "grad_norm": 61.30131912231445, "learning_rate": 4.011415639038998e-06, "loss": 0.3436, "num_input_tokens_seen": 97811840, "step": 102420 }, { "epoch": 8.355086059221795, "grad_norm": 8.56577205657959, "learning_rate": 4.009482120247232e-06, "loss": 0.3635, "num_input_tokens_seen": 97817088, "step": 102425 }, { "epoch": 8.355493922832204, "grad_norm": 3.1051809787750244, "learning_rate": 4.007549026924342e-06, "loss": 0.3455, "num_input_tokens_seen": 97822016, "step": 102430 }, { "epoch": 8.355901786442613, "grad_norm": 14.220428466796875, "learning_rate": 4.005616359109513e-06, "loss": 0.2658, "num_input_tokens_seen": 97826608, "step": 102435 }, { "epoch": 8.356309650053023, "grad_norm": 17.594547271728516, "learning_rate": 4.0036841168419184e-06, "loss": 0.2888, "num_input_tokens_seen": 97831216, "step": 102440 }, { "epoch": 8.356717513663432, "grad_norm": 7.658364295959473, "learning_rate": 4.00175230016073e-06, "loss": 0.3833, "num_input_tokens_seen": 97835264, "step": 102445 }, { "epoch": 8.357125377273839, "grad_norm": 62.20974349975586, "learning_rate": 3.999820909105098e-06, "loss": 0.5859, "num_input_tokens_seen": 97840016, "step": 102450 }, { "epoch": 8.357533240884248, "grad_norm": 46.75632858276367, "learning_rate": 3.997889943714172e-06, "loss": 0.3395, "num_input_tokens_seen": 97845264, "step": 102455 }, { "epoch": 8.357941104494657, "grad_norm": 2.5008082389831543, "learning_rate": 3.995959404027089e-06, "loss": 0.392, "num_input_tokens_seen": 97849408, "step": 102460 }, { "epoch": 8.358348968105066, "grad_norm": 31.379850387573242, "learning_rate": 3.994029290082988e-06, "loss": 0.4379, "num_input_tokens_seen": 97853200, "step": 102465 }, { "epoch": 8.358756831715475, "grad_norm": 2.4605937004089355, "learning_rate": 3.992099601920984e-06, "loss": 0.2873, "num_input_tokens_seen": 97857808, "step": 102470 }, { "epoch": 8.359164695325884, "grad_norm": 9.347268104553223, "learning_rate": 3.9901703395801985e-06, "loss": 0.4233, "num_input_tokens_seen": 97863616, "step": 102475 }, { "epoch": 8.359572558936291, "grad_norm": 1.53514564037323, "learning_rate": 3.988241503099727e-06, "loss": 0.3066, "num_input_tokens_seen": 97868240, "step": 102480 }, { "epoch": 8.3599804225467, "grad_norm": 0.9186265468597412, "learning_rate": 3.986313092518676e-06, "loss": 0.2665, "num_input_tokens_seen": 97873200, "step": 102485 }, { "epoch": 8.360388286157109, "grad_norm": 37.32079315185547, "learning_rate": 3.984385107876129e-06, "loss": 0.4626, "num_input_tokens_seen": 97878496, "step": 102490 }, { "epoch": 8.360796149767518, "grad_norm": 9.662575721740723, "learning_rate": 3.982457549211166e-06, "loss": 0.5183, "num_input_tokens_seen": 97883920, "step": 102495 }, { "epoch": 8.361204013377927, "grad_norm": 9.634346961975098, "learning_rate": 3.980530416562858e-06, "loss": 0.4646, "num_input_tokens_seen": 97887920, "step": 102500 }, { "epoch": 8.361611876988334, "grad_norm": 8.36585807800293, "learning_rate": 3.97860370997026e-06, "loss": 0.2659, "num_input_tokens_seen": 97892560, "step": 102505 }, { "epoch": 8.362019740598743, "grad_norm": 56.92696762084961, "learning_rate": 3.976677429472442e-06, "loss": 0.6225, "num_input_tokens_seen": 97897280, "step": 102510 }, { "epoch": 8.362427604209152, "grad_norm": 29.647111892700195, "learning_rate": 3.9747515751084344e-06, "loss": 0.5081, "num_input_tokens_seen": 97901568, "step": 102515 }, { "epoch": 8.362835467819561, "grad_norm": 3.3070125579833984, "learning_rate": 3.97282614691728e-06, "loss": 0.3562, "num_input_tokens_seen": 97906832, "step": 102520 }, { "epoch": 8.36324333142997, "grad_norm": 30.78510856628418, "learning_rate": 3.9709011449379995e-06, "loss": 0.2558, "num_input_tokens_seen": 97911776, "step": 102525 }, { "epoch": 8.36365119504038, "grad_norm": 1.9724221229553223, "learning_rate": 3.96897656920962e-06, "loss": 0.349, "num_input_tokens_seen": 97916432, "step": 102530 }, { "epoch": 8.364059058650787, "grad_norm": 11.365760803222656, "learning_rate": 3.967052419771153e-06, "loss": 0.3086, "num_input_tokens_seen": 97920496, "step": 102535 }, { "epoch": 8.364466922261196, "grad_norm": 34.40869140625, "learning_rate": 3.965128696661594e-06, "loss": 0.4701, "num_input_tokens_seen": 97924960, "step": 102540 }, { "epoch": 8.364874785871605, "grad_norm": 3.9954612255096436, "learning_rate": 3.9632053999199365e-06, "loss": 0.3769, "num_input_tokens_seen": 97929856, "step": 102545 }, { "epoch": 8.365282649482014, "grad_norm": 33.305179595947266, "learning_rate": 3.9612825295851695e-06, "loss": 0.537, "num_input_tokens_seen": 97934480, "step": 102550 }, { "epoch": 8.365690513092423, "grad_norm": 7.7356438636779785, "learning_rate": 3.959360085696265e-06, "loss": 0.3832, "num_input_tokens_seen": 97939184, "step": 102555 }, { "epoch": 8.36609837670283, "grad_norm": 17.999914169311523, "learning_rate": 3.957438068292182e-06, "loss": 0.3491, "num_input_tokens_seen": 97943632, "step": 102560 }, { "epoch": 8.366506240313239, "grad_norm": 41.31308364868164, "learning_rate": 3.955516477411897e-06, "loss": 0.3808, "num_input_tokens_seen": 97948304, "step": 102565 }, { "epoch": 8.366914103923648, "grad_norm": 1.985223412513733, "learning_rate": 3.953595313094349e-06, "loss": 0.4466, "num_input_tokens_seen": 97952880, "step": 102570 }, { "epoch": 8.367321967534057, "grad_norm": 5.6004638671875, "learning_rate": 3.951674575378481e-06, "loss": 0.2882, "num_input_tokens_seen": 97956320, "step": 102575 }, { "epoch": 8.367729831144466, "grad_norm": 3.2350993156433105, "learning_rate": 3.949754264303227e-06, "loss": 0.4483, "num_input_tokens_seen": 97961040, "step": 102580 }, { "epoch": 8.368137694754873, "grad_norm": 21.760408401489258, "learning_rate": 3.9478343799075086e-06, "loss": 0.3235, "num_input_tokens_seen": 97966128, "step": 102585 }, { "epoch": 8.368545558365282, "grad_norm": 13.396963119506836, "learning_rate": 3.945914922230234e-06, "loss": 0.2709, "num_input_tokens_seen": 97970848, "step": 102590 }, { "epoch": 8.368953421975691, "grad_norm": 5.1361165046691895, "learning_rate": 3.943995891310326e-06, "loss": 0.2833, "num_input_tokens_seen": 97975344, "step": 102595 }, { "epoch": 8.3693612855861, "grad_norm": 3.649888753890991, "learning_rate": 3.942077287186671e-06, "loss": 0.3013, "num_input_tokens_seen": 97980352, "step": 102600 }, { "epoch": 8.36976914919651, "grad_norm": 26.702787399291992, "learning_rate": 3.940159109898165e-06, "loss": 0.5171, "num_input_tokens_seen": 97986048, "step": 102605 }, { "epoch": 8.370177012806918, "grad_norm": 36.03451919555664, "learning_rate": 3.9382413594836825e-06, "loss": 0.2288, "num_input_tokens_seen": 97991280, "step": 102610 }, { "epoch": 8.370584876417325, "grad_norm": 9.10914134979248, "learning_rate": 3.9363240359820915e-06, "loss": 0.3053, "num_input_tokens_seen": 97995616, "step": 102615 }, { "epoch": 8.370992740027734, "grad_norm": 9.440994262695312, "learning_rate": 3.9344071394322694e-06, "loss": 0.2716, "num_input_tokens_seen": 98001312, "step": 102620 }, { "epoch": 8.371400603638143, "grad_norm": 19.030122756958008, "learning_rate": 3.932490669873062e-06, "loss": 0.2334, "num_input_tokens_seen": 98006368, "step": 102625 }, { "epoch": 8.371808467248552, "grad_norm": 11.984835624694824, "learning_rate": 3.930574627343317e-06, "loss": 0.3827, "num_input_tokens_seen": 98010720, "step": 102630 }, { "epoch": 8.372216330858961, "grad_norm": 30.592758178710938, "learning_rate": 3.928659011881866e-06, "loss": 0.2749, "num_input_tokens_seen": 98015248, "step": 102635 }, { "epoch": 8.372624194469369, "grad_norm": 7.874413013458252, "learning_rate": 3.9267438235275485e-06, "loss": 0.4419, "num_input_tokens_seen": 98020304, "step": 102640 }, { "epoch": 8.373032058079778, "grad_norm": 0.82585608959198, "learning_rate": 3.92482906231918e-06, "loss": 0.3142, "num_input_tokens_seen": 98024640, "step": 102645 }, { "epoch": 8.373439921690187, "grad_norm": 47.93879699707031, "learning_rate": 3.922914728295571e-06, "loss": 0.3426, "num_input_tokens_seen": 98028960, "step": 102650 }, { "epoch": 8.373847785300596, "grad_norm": 20.9448299407959, "learning_rate": 3.921000821495524e-06, "loss": 0.2859, "num_input_tokens_seen": 98033152, "step": 102655 }, { "epoch": 8.374255648911005, "grad_norm": 16.95606803894043, "learning_rate": 3.919087341957825e-06, "loss": 0.2824, "num_input_tokens_seen": 98038016, "step": 102660 }, { "epoch": 8.374663512521412, "grad_norm": 1.4662318229675293, "learning_rate": 3.917174289721276e-06, "loss": 0.2828, "num_input_tokens_seen": 98042752, "step": 102665 }, { "epoch": 8.375071376131821, "grad_norm": 17.67656707763672, "learning_rate": 3.915261664824646e-06, "loss": 0.406, "num_input_tokens_seen": 98047024, "step": 102670 }, { "epoch": 8.37547923974223, "grad_norm": 6.409722328186035, "learning_rate": 3.913349467306701e-06, "loss": 0.378, "num_input_tokens_seen": 98050752, "step": 102675 }, { "epoch": 8.375887103352639, "grad_norm": 3.4167463779449463, "learning_rate": 3.911437697206202e-06, "loss": 0.3462, "num_input_tokens_seen": 98055920, "step": 102680 }, { "epoch": 8.376294966963048, "grad_norm": 8.12582015991211, "learning_rate": 3.909526354561896e-06, "loss": 0.2358, "num_input_tokens_seen": 98060160, "step": 102685 }, { "epoch": 8.376702830573457, "grad_norm": 29.231624603271484, "learning_rate": 3.907615439412535e-06, "loss": 0.435, "num_input_tokens_seen": 98064592, "step": 102690 }, { "epoch": 8.377110694183864, "grad_norm": 31.329984664916992, "learning_rate": 3.905704951796843e-06, "loss": 0.3081, "num_input_tokens_seen": 98069328, "step": 102695 }, { "epoch": 8.377518557794273, "grad_norm": 0.6616762280464172, "learning_rate": 3.903794891753551e-06, "loss": 0.2627, "num_input_tokens_seen": 98074256, "step": 102700 }, { "epoch": 8.377926421404682, "grad_norm": 2.162851572036743, "learning_rate": 3.901885259321369e-06, "loss": 0.2653, "num_input_tokens_seen": 98078656, "step": 102705 }, { "epoch": 8.378334285015091, "grad_norm": 6.6958184242248535, "learning_rate": 3.899976054539012e-06, "loss": 0.3879, "num_input_tokens_seen": 98083600, "step": 102710 }, { "epoch": 8.3787421486255, "grad_norm": 2.7645249366760254, "learning_rate": 3.898067277445172e-06, "loss": 0.275, "num_input_tokens_seen": 98088304, "step": 102715 }, { "epoch": 8.379150012235907, "grad_norm": 10.305205345153809, "learning_rate": 3.896158928078541e-06, "loss": 0.403, "num_input_tokens_seen": 98092576, "step": 102720 }, { "epoch": 8.379557875846317, "grad_norm": 6.341622829437256, "learning_rate": 3.894251006477797e-06, "loss": 0.349, "num_input_tokens_seen": 98097072, "step": 102725 }, { "epoch": 8.379965739456726, "grad_norm": 4.955416679382324, "learning_rate": 3.89234351268162e-06, "loss": 0.2406, "num_input_tokens_seen": 98101184, "step": 102730 }, { "epoch": 8.380373603067135, "grad_norm": 7.047757148742676, "learning_rate": 3.890436446728674e-06, "loss": 0.3077, "num_input_tokens_seen": 98106512, "step": 102735 }, { "epoch": 8.380781466677544, "grad_norm": 2.4422430992126465, "learning_rate": 3.888529808657609e-06, "loss": 0.4425, "num_input_tokens_seen": 98111088, "step": 102740 }, { "epoch": 8.381189330287953, "grad_norm": 7.639534950256348, "learning_rate": 3.886623598507069e-06, "loss": 0.342, "num_input_tokens_seen": 98116576, "step": 102745 }, { "epoch": 8.38159719389836, "grad_norm": 3.8883728981018066, "learning_rate": 3.884717816315702e-06, "loss": 0.2822, "num_input_tokens_seen": 98121408, "step": 102750 }, { "epoch": 8.382005057508769, "grad_norm": 5.958366394042969, "learning_rate": 3.8828124621221345e-06, "loss": 0.4317, "num_input_tokens_seen": 98125616, "step": 102755 }, { "epoch": 8.382412921119178, "grad_norm": 32.74300765991211, "learning_rate": 3.8809075359649834e-06, "loss": 0.3922, "num_input_tokens_seen": 98130528, "step": 102760 }, { "epoch": 8.382820784729587, "grad_norm": 42.700706481933594, "learning_rate": 3.879003037882861e-06, "loss": 0.3096, "num_input_tokens_seen": 98135840, "step": 102765 }, { "epoch": 8.383228648339996, "grad_norm": 6.947768688201904, "learning_rate": 3.877098967914369e-06, "loss": 0.4619, "num_input_tokens_seen": 98140320, "step": 102770 }, { "epoch": 8.383636511950403, "grad_norm": 3.4898693561553955, "learning_rate": 3.875195326098111e-06, "loss": 0.3218, "num_input_tokens_seen": 98145200, "step": 102775 }, { "epoch": 8.384044375560812, "grad_norm": 7.712503433227539, "learning_rate": 3.873292112472665e-06, "loss": 0.2649, "num_input_tokens_seen": 98150800, "step": 102780 }, { "epoch": 8.384452239171221, "grad_norm": 4.101628303527832, "learning_rate": 3.871389327076613e-06, "loss": 0.3274, "num_input_tokens_seen": 98155328, "step": 102785 }, { "epoch": 8.38486010278163, "grad_norm": 39.193931579589844, "learning_rate": 3.869486969948513e-06, "loss": 0.4381, "num_input_tokens_seen": 98160912, "step": 102790 }, { "epoch": 8.385267966392039, "grad_norm": 20.39476203918457, "learning_rate": 3.867585041126942e-06, "loss": 0.2909, "num_input_tokens_seen": 98166080, "step": 102795 }, { "epoch": 8.385675830002448, "grad_norm": 25.237945556640625, "learning_rate": 3.865683540650442e-06, "loss": 0.4902, "num_input_tokens_seen": 98171216, "step": 102800 }, { "epoch": 8.386083693612855, "grad_norm": 72.55821990966797, "learning_rate": 3.863782468557553e-06, "loss": 0.5098, "num_input_tokens_seen": 98175328, "step": 102805 }, { "epoch": 8.386491557223264, "grad_norm": 80.53543090820312, "learning_rate": 3.861881824886815e-06, "loss": 0.5728, "num_input_tokens_seen": 98180800, "step": 102810 }, { "epoch": 8.386899420833673, "grad_norm": 2.562333345413208, "learning_rate": 3.859981609676744e-06, "loss": 0.297, "num_input_tokens_seen": 98185392, "step": 102815 }, { "epoch": 8.387307284444082, "grad_norm": 8.929771423339844, "learning_rate": 3.858081822965867e-06, "loss": 0.2768, "num_input_tokens_seen": 98189920, "step": 102820 }, { "epoch": 8.387715148054491, "grad_norm": 2.947336196899414, "learning_rate": 3.85618246479269e-06, "loss": 0.3968, "num_input_tokens_seen": 98194720, "step": 102825 }, { "epoch": 8.388123011664899, "grad_norm": 6.8303728103637695, "learning_rate": 3.85428353519571e-06, "loss": 0.3445, "num_input_tokens_seen": 98199264, "step": 102830 }, { "epoch": 8.388530875275308, "grad_norm": 5.336745738983154, "learning_rate": 3.852385034213415e-06, "loss": 0.361, "num_input_tokens_seen": 98203776, "step": 102835 }, { "epoch": 8.388938738885717, "grad_norm": 23.938827514648438, "learning_rate": 3.850486961884292e-06, "loss": 0.3707, "num_input_tokens_seen": 98209392, "step": 102840 }, { "epoch": 8.389346602496126, "grad_norm": 18.13888931274414, "learning_rate": 3.848589318246809e-06, "loss": 0.2103, "num_input_tokens_seen": 98214512, "step": 102845 }, { "epoch": 8.389754466106535, "grad_norm": 9.090846061706543, "learning_rate": 3.846692103339433e-06, "loss": 0.3448, "num_input_tokens_seen": 98219696, "step": 102850 }, { "epoch": 8.390162329716942, "grad_norm": 3.0494155883789062, "learning_rate": 3.84479531720062e-06, "loss": 0.324, "num_input_tokens_seen": 98225120, "step": 102855 }, { "epoch": 8.39057019332735, "grad_norm": 29.792030334472656, "learning_rate": 3.842898959868813e-06, "loss": 0.3274, "num_input_tokens_seen": 98230400, "step": 102860 }, { "epoch": 8.39097805693776, "grad_norm": 26.019075393676758, "learning_rate": 3.841003031382456e-06, "loss": 0.2696, "num_input_tokens_seen": 98235104, "step": 102865 }, { "epoch": 8.391385920548169, "grad_norm": 4.859197616577148, "learning_rate": 3.839107531779978e-06, "loss": 0.4675, "num_input_tokens_seen": 98239824, "step": 102870 }, { "epoch": 8.391793784158578, "grad_norm": 15.534768104553223, "learning_rate": 3.837212461099799e-06, "loss": 0.3649, "num_input_tokens_seen": 98244672, "step": 102875 }, { "epoch": 8.392201647768985, "grad_norm": 1.1521291732788086, "learning_rate": 3.8353178193803245e-06, "loss": 0.4824, "num_input_tokens_seen": 98248768, "step": 102880 }, { "epoch": 8.392609511379394, "grad_norm": 29.407451629638672, "learning_rate": 3.833423606659969e-06, "loss": 0.3873, "num_input_tokens_seen": 98253616, "step": 102885 }, { "epoch": 8.393017374989803, "grad_norm": 4.4376749992370605, "learning_rate": 3.831529822977126e-06, "loss": 0.2738, "num_input_tokens_seen": 98257968, "step": 102890 }, { "epoch": 8.393425238600212, "grad_norm": 71.74498748779297, "learning_rate": 3.829636468370173e-06, "loss": 0.5695, "num_input_tokens_seen": 98262832, "step": 102895 }, { "epoch": 8.393833102210621, "grad_norm": 2.990210771560669, "learning_rate": 3.8277435428774935e-06, "loss": 0.4883, "num_input_tokens_seen": 98266928, "step": 102900 }, { "epoch": 8.39424096582103, "grad_norm": 39.44697952270508, "learning_rate": 3.8258510465374565e-06, "loss": 0.2996, "num_input_tokens_seen": 98271808, "step": 102905 }, { "epoch": 8.394648829431437, "grad_norm": 63.20743942260742, "learning_rate": 3.823958979388425e-06, "loss": 0.4787, "num_input_tokens_seen": 98275568, "step": 102910 }, { "epoch": 8.395056693041846, "grad_norm": 73.22157287597656, "learning_rate": 3.822067341468743e-06, "loss": 0.5107, "num_input_tokens_seen": 98280608, "step": 102915 }, { "epoch": 8.395464556652255, "grad_norm": 16.48496437072754, "learning_rate": 3.8201761328167605e-06, "loss": 0.3409, "num_input_tokens_seen": 98285872, "step": 102920 }, { "epoch": 8.395872420262664, "grad_norm": 5.733757972717285, "learning_rate": 3.818285353470802e-06, "loss": 0.3545, "num_input_tokens_seen": 98290560, "step": 102925 }, { "epoch": 8.396280283873073, "grad_norm": 3.5895519256591797, "learning_rate": 3.816395003469203e-06, "loss": 0.2749, "num_input_tokens_seen": 98295984, "step": 102930 }, { "epoch": 8.39668814748348, "grad_norm": 10.839695930480957, "learning_rate": 3.8145050828502755e-06, "loss": 0.3223, "num_input_tokens_seen": 98300064, "step": 102935 }, { "epoch": 8.39709601109389, "grad_norm": 22.859786987304688, "learning_rate": 3.8126155916523272e-06, "loss": 0.3126, "num_input_tokens_seen": 98305504, "step": 102940 }, { "epoch": 8.397503874704299, "grad_norm": 7.263566017150879, "learning_rate": 3.8107265299136553e-06, "loss": 0.2659, "num_input_tokens_seen": 98309856, "step": 102945 }, { "epoch": 8.397911738314708, "grad_norm": 2.691833734512329, "learning_rate": 3.8088378976725543e-06, "loss": 0.3786, "num_input_tokens_seen": 98313808, "step": 102950 }, { "epoch": 8.398319601925117, "grad_norm": 5.324961185455322, "learning_rate": 3.8069496949673074e-06, "loss": 0.275, "num_input_tokens_seen": 98318560, "step": 102955 }, { "epoch": 8.398727465535526, "grad_norm": 4.071514129638672, "learning_rate": 3.8050619218361838e-06, "loss": 0.3506, "num_input_tokens_seen": 98323984, "step": 102960 }, { "epoch": 8.399135329145933, "grad_norm": 20.001800537109375, "learning_rate": 3.8031745783174503e-06, "loss": 0.4566, "num_input_tokens_seen": 98329408, "step": 102965 }, { "epoch": 8.399543192756342, "grad_norm": 13.379361152648926, "learning_rate": 3.8012876644493574e-06, "loss": 0.4025, "num_input_tokens_seen": 98333824, "step": 102970 }, { "epoch": 8.399951056366751, "grad_norm": 76.12957763671875, "learning_rate": 3.7994011802701595e-06, "loss": 0.4404, "num_input_tokens_seen": 98338128, "step": 102975 }, { "epoch": 8.40035891997716, "grad_norm": 12.084670066833496, "learning_rate": 3.797515125818088e-06, "loss": 0.3959, "num_input_tokens_seen": 98342688, "step": 102980 }, { "epoch": 8.400766783587569, "grad_norm": 2.0135021209716797, "learning_rate": 3.7956295011313785e-06, "loss": 0.2462, "num_input_tokens_seen": 98347584, "step": 102985 }, { "epoch": 8.401174647197976, "grad_norm": 3.141814708709717, "learning_rate": 3.7937443062482397e-06, "loss": 0.2901, "num_input_tokens_seen": 98351456, "step": 102990 }, { "epoch": 8.401582510808385, "grad_norm": 1.684092402458191, "learning_rate": 3.791859541206899e-06, "loss": 0.3131, "num_input_tokens_seen": 98356720, "step": 102995 }, { "epoch": 8.401990374418794, "grad_norm": 18.65999984741211, "learning_rate": 3.7899752060455566e-06, "loss": 0.3115, "num_input_tokens_seen": 98361840, "step": 103000 }, { "epoch": 8.402398238029203, "grad_norm": 11.140974998474121, "learning_rate": 3.7880913008024015e-06, "loss": 0.3438, "num_input_tokens_seen": 98367280, "step": 103005 }, { "epoch": 8.402806101639612, "grad_norm": 29.573869705200195, "learning_rate": 3.7862078255156226e-06, "loss": 0.3737, "num_input_tokens_seen": 98371936, "step": 103010 }, { "epoch": 8.403213965250021, "grad_norm": 2.632439374923706, "learning_rate": 3.784324780223392e-06, "loss": 0.273, "num_input_tokens_seen": 98376096, "step": 103015 }, { "epoch": 8.403621828860429, "grad_norm": 2.776437282562256, "learning_rate": 3.782442164963887e-06, "loss": 0.2439, "num_input_tokens_seen": 98381040, "step": 103020 }, { "epoch": 8.404029692470838, "grad_norm": 13.559477806091309, "learning_rate": 3.780559979775264e-06, "loss": 0.4472, "num_input_tokens_seen": 98385616, "step": 103025 }, { "epoch": 8.404437556081247, "grad_norm": 2.5887575149536133, "learning_rate": 3.7786782246956755e-06, "loss": 0.4713, "num_input_tokens_seen": 98389856, "step": 103030 }, { "epoch": 8.404845419691656, "grad_norm": 27.714221954345703, "learning_rate": 3.776796899763252e-06, "loss": 0.3647, "num_input_tokens_seen": 98394816, "step": 103035 }, { "epoch": 8.405253283302065, "grad_norm": 20.28826332092285, "learning_rate": 3.7749160050161466e-06, "loss": 0.3985, "num_input_tokens_seen": 98399808, "step": 103040 }, { "epoch": 8.405661146912472, "grad_norm": 34.96482467651367, "learning_rate": 3.773035540492473e-06, "loss": 0.411, "num_input_tokens_seen": 98404672, "step": 103045 }, { "epoch": 8.40606901052288, "grad_norm": 11.514532089233398, "learning_rate": 3.7711555062303503e-06, "loss": 0.2395, "num_input_tokens_seen": 98409312, "step": 103050 }, { "epoch": 8.40647687413329, "grad_norm": 6.457484722137451, "learning_rate": 3.7692759022678823e-06, "loss": 0.4726, "num_input_tokens_seen": 98414112, "step": 103055 }, { "epoch": 8.406884737743699, "grad_norm": 1.0672942399978638, "learning_rate": 3.7673967286431688e-06, "loss": 0.1627, "num_input_tokens_seen": 98418816, "step": 103060 }, { "epoch": 8.407292601354108, "grad_norm": 25.651588439941406, "learning_rate": 3.765517985394304e-06, "loss": 0.4235, "num_input_tokens_seen": 98423952, "step": 103065 }, { "epoch": 8.407700464964515, "grad_norm": 23.252338409423828, "learning_rate": 3.763639672559366e-06, "loss": 0.4318, "num_input_tokens_seen": 98428816, "step": 103070 }, { "epoch": 8.408108328574924, "grad_norm": 2.6055922508239746, "learning_rate": 3.7617617901764296e-06, "loss": 0.2878, "num_input_tokens_seen": 98433216, "step": 103075 }, { "epoch": 8.408516192185333, "grad_norm": 24.821178436279297, "learning_rate": 3.759884338283551e-06, "loss": 0.4107, "num_input_tokens_seen": 98437904, "step": 103080 }, { "epoch": 8.408924055795742, "grad_norm": 4.9508795738220215, "learning_rate": 3.7580073169187962e-06, "loss": 0.2964, "num_input_tokens_seen": 98443232, "step": 103085 }, { "epoch": 8.409331919406151, "grad_norm": 31.307716369628906, "learning_rate": 3.7561307261202077e-06, "loss": 0.3375, "num_input_tokens_seen": 98447264, "step": 103090 }, { "epoch": 8.40973978301656, "grad_norm": 14.991399765014648, "learning_rate": 3.7542545659258214e-06, "loss": 0.3926, "num_input_tokens_seen": 98452160, "step": 103095 }, { "epoch": 8.410147646626967, "grad_norm": 24.653234481811523, "learning_rate": 3.7523788363736683e-06, "loss": 0.318, "num_input_tokens_seen": 98456672, "step": 103100 }, { "epoch": 8.410555510237376, "grad_norm": 30.613922119140625, "learning_rate": 3.750503537501768e-06, "loss": 0.306, "num_input_tokens_seen": 98460656, "step": 103105 }, { "epoch": 8.410963373847785, "grad_norm": 3.518085241317749, "learning_rate": 3.748628669348131e-06, "loss": 0.243, "num_input_tokens_seen": 98464704, "step": 103110 }, { "epoch": 8.411371237458194, "grad_norm": 2.870725631713867, "learning_rate": 3.746754231950761e-06, "loss": 0.2554, "num_input_tokens_seen": 98469696, "step": 103115 }, { "epoch": 8.411779101068603, "grad_norm": 7.670429229736328, "learning_rate": 3.7448802253476495e-06, "loss": 0.2894, "num_input_tokens_seen": 98475616, "step": 103120 }, { "epoch": 8.41218696467901, "grad_norm": 61.1029052734375, "learning_rate": 3.74300664957678e-06, "loss": 0.4858, "num_input_tokens_seen": 98480256, "step": 103125 }, { "epoch": 8.41259482828942, "grad_norm": 28.897357940673828, "learning_rate": 3.741133504676139e-06, "loss": 0.4356, "num_input_tokens_seen": 98485520, "step": 103130 }, { "epoch": 8.413002691899829, "grad_norm": 8.379617691040039, "learning_rate": 3.739260790683688e-06, "loss": 0.3611, "num_input_tokens_seen": 98490352, "step": 103135 }, { "epoch": 8.413410555510238, "grad_norm": 17.5566463470459, "learning_rate": 3.7373885076373875e-06, "loss": 0.3, "num_input_tokens_seen": 98494480, "step": 103140 }, { "epoch": 8.413818419120647, "grad_norm": 45.23036193847656, "learning_rate": 3.7355166555751803e-06, "loss": 0.3868, "num_input_tokens_seen": 98499216, "step": 103145 }, { "epoch": 8.414226282731054, "grad_norm": 15.014547348022461, "learning_rate": 3.7336452345350187e-06, "loss": 0.294, "num_input_tokens_seen": 98504352, "step": 103150 }, { "epoch": 8.414634146341463, "grad_norm": 0.8678473830223083, "learning_rate": 3.731774244554834e-06, "loss": 0.3018, "num_input_tokens_seen": 98508768, "step": 103155 }, { "epoch": 8.415042009951872, "grad_norm": 35.13991928100586, "learning_rate": 3.7299036856725477e-06, "loss": 0.3114, "num_input_tokens_seen": 98513392, "step": 103160 }, { "epoch": 8.415449873562281, "grad_norm": 9.95548152923584, "learning_rate": 3.728033557926075e-06, "loss": 0.1856, "num_input_tokens_seen": 98518336, "step": 103165 }, { "epoch": 8.41585773717269, "grad_norm": 11.904780387878418, "learning_rate": 3.7261638613533157e-06, "loss": 0.3909, "num_input_tokens_seen": 98522432, "step": 103170 }, { "epoch": 8.416265600783099, "grad_norm": 9.94081974029541, "learning_rate": 3.724294595992184e-06, "loss": 0.3005, "num_input_tokens_seen": 98526928, "step": 103175 }, { "epoch": 8.416673464393506, "grad_norm": 8.697615623474121, "learning_rate": 3.7224257618805575e-06, "loss": 0.3734, "num_input_tokens_seen": 98531536, "step": 103180 }, { "epoch": 8.417081328003915, "grad_norm": 10.9343843460083, "learning_rate": 3.7205573590563205e-06, "loss": 0.3947, "num_input_tokens_seen": 98536032, "step": 103185 }, { "epoch": 8.417489191614324, "grad_norm": 12.419471740722656, "learning_rate": 3.7186893875573365e-06, "loss": 0.3526, "num_input_tokens_seen": 98540672, "step": 103190 }, { "epoch": 8.417897055224733, "grad_norm": 14.128317832946777, "learning_rate": 3.716821847421484e-06, "loss": 0.3841, "num_input_tokens_seen": 98545504, "step": 103195 }, { "epoch": 8.418304918835142, "grad_norm": 21.40711212158203, "learning_rate": 3.7149547386866074e-06, "loss": 0.4173, "num_input_tokens_seen": 98550176, "step": 103200 }, { "epoch": 8.41871278244555, "grad_norm": 24.634719848632812, "learning_rate": 3.7130880613905545e-06, "loss": 0.244, "num_input_tokens_seen": 98555696, "step": 103205 }, { "epoch": 8.419120646055958, "grad_norm": 5.808522701263428, "learning_rate": 3.7112218155711613e-06, "loss": 0.3022, "num_input_tokens_seen": 98560208, "step": 103210 }, { "epoch": 8.419528509666367, "grad_norm": 23.570064544677734, "learning_rate": 3.7093560012662483e-06, "loss": 0.3866, "num_input_tokens_seen": 98565280, "step": 103215 }, { "epoch": 8.419936373276776, "grad_norm": 6.670609951019287, "learning_rate": 3.707490618513648e-06, "loss": 0.395, "num_input_tokens_seen": 98570176, "step": 103220 }, { "epoch": 8.420344236887185, "grad_norm": 25.39655876159668, "learning_rate": 3.7056256673511673e-06, "loss": 0.4601, "num_input_tokens_seen": 98575040, "step": 103225 }, { "epoch": 8.420752100497594, "grad_norm": 20.134077072143555, "learning_rate": 3.703761147816606e-06, "loss": 0.3162, "num_input_tokens_seen": 98579952, "step": 103230 }, { "epoch": 8.421159964108002, "grad_norm": 40.21556091308594, "learning_rate": 3.7018970599477537e-06, "loss": 0.4695, "num_input_tokens_seen": 98585152, "step": 103235 }, { "epoch": 8.42156782771841, "grad_norm": 13.44622802734375, "learning_rate": 3.700033403782402e-06, "loss": 0.2305, "num_input_tokens_seen": 98589776, "step": 103240 }, { "epoch": 8.42197569132882, "grad_norm": 4.167993545532227, "learning_rate": 3.6981701793583177e-06, "loss": 0.2751, "num_input_tokens_seen": 98594864, "step": 103245 }, { "epoch": 8.422383554939229, "grad_norm": 10.82215690612793, "learning_rate": 3.696307386713277e-06, "loss": 0.3818, "num_input_tokens_seen": 98599824, "step": 103250 }, { "epoch": 8.422791418549638, "grad_norm": 31.209720611572266, "learning_rate": 3.694445025885024e-06, "loss": 0.2759, "num_input_tokens_seen": 98604032, "step": 103255 }, { "epoch": 8.423199282160045, "grad_norm": 26.554996490478516, "learning_rate": 3.692583096911323e-06, "loss": 0.2946, "num_input_tokens_seen": 98608144, "step": 103260 }, { "epoch": 8.423607145770454, "grad_norm": 66.01013946533203, "learning_rate": 3.6907215998299104e-06, "loss": 0.5188, "num_input_tokens_seen": 98612448, "step": 103265 }, { "epoch": 8.424015009380863, "grad_norm": 7.283468246459961, "learning_rate": 3.688860534678515e-06, "loss": 0.141, "num_input_tokens_seen": 98616880, "step": 103270 }, { "epoch": 8.424422872991272, "grad_norm": 7.173603057861328, "learning_rate": 3.6869999014948616e-06, "loss": 0.3962, "num_input_tokens_seen": 98621888, "step": 103275 }, { "epoch": 8.424830736601681, "grad_norm": 11.683050155639648, "learning_rate": 3.685139700316656e-06, "loss": 0.4332, "num_input_tokens_seen": 98627440, "step": 103280 }, { "epoch": 8.425238600212088, "grad_norm": 50.177520751953125, "learning_rate": 3.6832799311816184e-06, "loss": 0.2508, "num_input_tokens_seen": 98632752, "step": 103285 }, { "epoch": 8.425646463822497, "grad_norm": 4.452228546142578, "learning_rate": 3.681420594127438e-06, "loss": 0.5683, "num_input_tokens_seen": 98637712, "step": 103290 }, { "epoch": 8.426054327432906, "grad_norm": 8.49515438079834, "learning_rate": 3.6795616891918044e-06, "loss": 0.398, "num_input_tokens_seen": 98643216, "step": 103295 }, { "epoch": 8.426462191043315, "grad_norm": 29.387802124023438, "learning_rate": 3.6777032164123896e-06, "loss": 0.2276, "num_input_tokens_seen": 98647888, "step": 103300 }, { "epoch": 8.426870054653724, "grad_norm": 18.705904006958008, "learning_rate": 3.6758451758268754e-06, "loss": 0.5266, "num_input_tokens_seen": 98652304, "step": 103305 }, { "epoch": 8.427277918264133, "grad_norm": 1.4702261686325073, "learning_rate": 3.6739875674729195e-06, "loss": 0.45, "num_input_tokens_seen": 98657376, "step": 103310 }, { "epoch": 8.42768578187454, "grad_norm": 25.78510093688965, "learning_rate": 3.6721303913881734e-06, "loss": 0.3494, "num_input_tokens_seen": 98663120, "step": 103315 }, { "epoch": 8.42809364548495, "grad_norm": 2.370570659637451, "learning_rate": 3.670273647610281e-06, "loss": 0.2753, "num_input_tokens_seen": 98667872, "step": 103320 }, { "epoch": 8.428501509095359, "grad_norm": 2.071230888366699, "learning_rate": 3.668417336176874e-06, "loss": 0.2573, "num_input_tokens_seen": 98672880, "step": 103325 }, { "epoch": 8.428909372705768, "grad_norm": 0.9447436332702637, "learning_rate": 3.666561457125586e-06, "loss": 0.314, "num_input_tokens_seen": 98678448, "step": 103330 }, { "epoch": 8.429317236316177, "grad_norm": 24.017601013183594, "learning_rate": 3.6647060104940343e-06, "loss": 0.4017, "num_input_tokens_seen": 98683344, "step": 103335 }, { "epoch": 8.429725099926584, "grad_norm": 4.122101306915283, "learning_rate": 3.662850996319825e-06, "loss": 0.3462, "num_input_tokens_seen": 98689280, "step": 103340 }, { "epoch": 8.430132963536993, "grad_norm": 20.972166061401367, "learning_rate": 3.660996414640555e-06, "loss": 0.3554, "num_input_tokens_seen": 98692880, "step": 103345 }, { "epoch": 8.430540827147402, "grad_norm": 19.858739852905273, "learning_rate": 3.659142265493823e-06, "loss": 0.2307, "num_input_tokens_seen": 98697888, "step": 103350 }, { "epoch": 8.43094869075781, "grad_norm": 8.5531644821167, "learning_rate": 3.6572885489172122e-06, "loss": 0.4021, "num_input_tokens_seen": 98702304, "step": 103355 }, { "epoch": 8.43135655436822, "grad_norm": 0.8395180106163025, "learning_rate": 3.6554352649482926e-06, "loss": 0.3882, "num_input_tokens_seen": 98706944, "step": 103360 }, { "epoch": 8.431764417978627, "grad_norm": 39.75721740722656, "learning_rate": 3.653582413624629e-06, "loss": 0.3675, "num_input_tokens_seen": 98711968, "step": 103365 }, { "epoch": 8.432172281589036, "grad_norm": 2.3281548023223877, "learning_rate": 3.6517299949837795e-06, "loss": 0.2536, "num_input_tokens_seen": 98716704, "step": 103370 }, { "epoch": 8.432580145199445, "grad_norm": 5.311755657196045, "learning_rate": 3.649878009063287e-06, "loss": 0.4109, "num_input_tokens_seen": 98720560, "step": 103375 }, { "epoch": 8.432988008809854, "grad_norm": 7.313455581665039, "learning_rate": 3.6480264559006992e-06, "loss": 0.4643, "num_input_tokens_seen": 98724976, "step": 103380 }, { "epoch": 8.433395872420263, "grad_norm": 22.578645706176758, "learning_rate": 3.6461753355335443e-06, "loss": 0.375, "num_input_tokens_seen": 98729872, "step": 103385 }, { "epoch": 8.433803736030672, "grad_norm": 5.980731964111328, "learning_rate": 3.6443246479993394e-06, "loss": 0.3614, "num_input_tokens_seen": 98734576, "step": 103390 }, { "epoch": 8.43421159964108, "grad_norm": 6.524577617645264, "learning_rate": 3.6424743933355987e-06, "loss": 0.2028, "num_input_tokens_seen": 98739280, "step": 103395 }, { "epoch": 8.434619463251488, "grad_norm": 20.48297691345215, "learning_rate": 3.6406245715798286e-06, "loss": 0.4675, "num_input_tokens_seen": 98743408, "step": 103400 }, { "epoch": 8.435027326861897, "grad_norm": 0.8652400374412537, "learning_rate": 3.6387751827695216e-06, "loss": 0.4435, "num_input_tokens_seen": 98747776, "step": 103405 }, { "epoch": 8.435435190472306, "grad_norm": 3.4664559364318848, "learning_rate": 3.6369262269421665e-06, "loss": 0.3267, "num_input_tokens_seen": 98752960, "step": 103410 }, { "epoch": 8.435843054082715, "grad_norm": 14.75041675567627, "learning_rate": 3.6350777041352315e-06, "loss": 0.4153, "num_input_tokens_seen": 98757136, "step": 103415 }, { "epoch": 8.436250917693123, "grad_norm": 13.396295547485352, "learning_rate": 3.6332296143861994e-06, "loss": 0.352, "num_input_tokens_seen": 98762016, "step": 103420 }, { "epoch": 8.436658781303532, "grad_norm": 12.92810344696045, "learning_rate": 3.6313819577325215e-06, "loss": 0.364, "num_input_tokens_seen": 98767024, "step": 103425 }, { "epoch": 8.43706664491394, "grad_norm": 13.58802318572998, "learning_rate": 3.6295347342116538e-06, "loss": 0.4849, "num_input_tokens_seen": 98772000, "step": 103430 }, { "epoch": 8.43747450852435, "grad_norm": 1.9705041646957397, "learning_rate": 3.62768794386103e-06, "loss": 0.3137, "num_input_tokens_seen": 98776704, "step": 103435 }, { "epoch": 8.437882372134759, "grad_norm": 18.59562110900879, "learning_rate": 3.6258415867180954e-06, "loss": 0.4518, "num_input_tokens_seen": 98781808, "step": 103440 }, { "epoch": 8.438290235745168, "grad_norm": 2.498708486557007, "learning_rate": 3.62399566282027e-06, "loss": 0.3043, "num_input_tokens_seen": 98786752, "step": 103445 }, { "epoch": 8.438698099355575, "grad_norm": 7.704165935516357, "learning_rate": 3.6221501722049687e-06, "loss": 0.4414, "num_input_tokens_seen": 98791392, "step": 103450 }, { "epoch": 8.439105962965984, "grad_norm": 1.7030366659164429, "learning_rate": 3.6203051149095973e-06, "loss": 0.3668, "num_input_tokens_seen": 98795056, "step": 103455 }, { "epoch": 8.439513826576393, "grad_norm": 27.591304779052734, "learning_rate": 3.6184604909715538e-06, "loss": 0.2887, "num_input_tokens_seen": 98799872, "step": 103460 }, { "epoch": 8.439921690186802, "grad_norm": 1.601244568824768, "learning_rate": 3.6166163004282354e-06, "loss": 0.3443, "num_input_tokens_seen": 98804384, "step": 103465 }, { "epoch": 8.440329553797211, "grad_norm": 16.524478912353516, "learning_rate": 3.6147725433170187e-06, "loss": 0.3815, "num_input_tokens_seen": 98808656, "step": 103470 }, { "epoch": 8.440737417407618, "grad_norm": 13.162837028503418, "learning_rate": 3.6129292196752733e-06, "loss": 0.486, "num_input_tokens_seen": 98813328, "step": 103475 }, { "epoch": 8.441145281018027, "grad_norm": 33.35479736328125, "learning_rate": 3.611086329540361e-06, "loss": 0.3035, "num_input_tokens_seen": 98817616, "step": 103480 }, { "epoch": 8.441553144628436, "grad_norm": 14.06932258605957, "learning_rate": 3.6092438729496468e-06, "loss": 0.1813, "num_input_tokens_seen": 98822704, "step": 103485 }, { "epoch": 8.441961008238845, "grad_norm": 35.92555236816406, "learning_rate": 3.6074018499404673e-06, "loss": 0.4162, "num_input_tokens_seen": 98828048, "step": 103490 }, { "epoch": 8.442368871849254, "grad_norm": 1.001612901687622, "learning_rate": 3.605560260550164e-06, "loss": 0.3867, "num_input_tokens_seen": 98833120, "step": 103495 }, { "epoch": 8.442776735459661, "grad_norm": 15.719100952148438, "learning_rate": 3.603719104816056e-06, "loss": 0.4236, "num_input_tokens_seen": 98837056, "step": 103500 }, { "epoch": 8.44318459907007, "grad_norm": 8.317801475524902, "learning_rate": 3.6018783827754755e-06, "loss": 0.2183, "num_input_tokens_seen": 98842000, "step": 103505 }, { "epoch": 8.44359246268048, "grad_norm": 57.237510681152344, "learning_rate": 3.60003809446573e-06, "loss": 0.2404, "num_input_tokens_seen": 98846864, "step": 103510 }, { "epoch": 8.444000326290888, "grad_norm": 13.711813926696777, "learning_rate": 3.5981982399241165e-06, "loss": 0.36, "num_input_tokens_seen": 98852832, "step": 103515 }, { "epoch": 8.444408189901297, "grad_norm": 5.953942775726318, "learning_rate": 3.596358819187931e-06, "loss": 0.2523, "num_input_tokens_seen": 98857216, "step": 103520 }, { "epoch": 8.444816053511706, "grad_norm": 3.4278504848480225, "learning_rate": 3.5945198322944575e-06, "loss": 0.1817, "num_input_tokens_seen": 98862464, "step": 103525 }, { "epoch": 8.445223917122114, "grad_norm": 27.510732650756836, "learning_rate": 3.5926812792809713e-06, "loss": 0.3762, "num_input_tokens_seen": 98867296, "step": 103530 }, { "epoch": 8.445631780732523, "grad_norm": 3.3379580974578857, "learning_rate": 3.59084316018474e-06, "loss": 0.3252, "num_input_tokens_seen": 98871856, "step": 103535 }, { "epoch": 8.446039644342932, "grad_norm": 25.02740478515625, "learning_rate": 3.589005475043017e-06, "loss": 0.3252, "num_input_tokens_seen": 98876368, "step": 103540 }, { "epoch": 8.44644750795334, "grad_norm": 2.422370433807373, "learning_rate": 3.5871682238930533e-06, "loss": 0.3299, "num_input_tokens_seen": 98880048, "step": 103545 }, { "epoch": 8.44685537156375, "grad_norm": 7.909479141235352, "learning_rate": 3.585331406772094e-06, "loss": 0.2854, "num_input_tokens_seen": 98885536, "step": 103550 }, { "epoch": 8.447263235174157, "grad_norm": 19.421955108642578, "learning_rate": 3.5834950237173674e-06, "loss": 0.4435, "num_input_tokens_seen": 98890432, "step": 103555 }, { "epoch": 8.447671098784566, "grad_norm": 1.6317191123962402, "learning_rate": 3.581659074766097e-06, "loss": 0.508, "num_input_tokens_seen": 98895264, "step": 103560 }, { "epoch": 8.448078962394975, "grad_norm": 2.3841845989227295, "learning_rate": 3.5798235599554977e-06, "loss": 0.2457, "num_input_tokens_seen": 98898816, "step": 103565 }, { "epoch": 8.448486826005384, "grad_norm": 21.480758666992188, "learning_rate": 3.5779884793227663e-06, "loss": 0.2981, "num_input_tokens_seen": 98902880, "step": 103570 }, { "epoch": 8.448894689615793, "grad_norm": 16.184783935546875, "learning_rate": 3.57615383290511e-06, "loss": 0.3214, "num_input_tokens_seen": 98907056, "step": 103575 }, { "epoch": 8.4493025532262, "grad_norm": 25.691144943237305, "learning_rate": 3.5743196207397133e-06, "loss": 0.3882, "num_input_tokens_seen": 98912464, "step": 103580 }, { "epoch": 8.44971041683661, "grad_norm": 2.0725531578063965, "learning_rate": 3.5724858428637515e-06, "loss": 0.4291, "num_input_tokens_seen": 98917264, "step": 103585 }, { "epoch": 8.450118280447018, "grad_norm": 19.7344913482666, "learning_rate": 3.570652499314392e-06, "loss": 0.253, "num_input_tokens_seen": 98921776, "step": 103590 }, { "epoch": 8.450526144057427, "grad_norm": 33.68132781982422, "learning_rate": 3.568819590128808e-06, "loss": 0.3178, "num_input_tokens_seen": 98926128, "step": 103595 }, { "epoch": 8.450934007667836, "grad_norm": 37.474761962890625, "learning_rate": 3.566987115344142e-06, "loss": 0.5261, "num_input_tokens_seen": 98930624, "step": 103600 }, { "epoch": 8.451341871278245, "grad_norm": 45.017818450927734, "learning_rate": 3.5651550749975423e-06, "loss": 0.259, "num_input_tokens_seen": 98935040, "step": 103605 }, { "epoch": 8.451749734888653, "grad_norm": 21.009965896606445, "learning_rate": 3.563323469126137e-06, "loss": 0.3414, "num_input_tokens_seen": 98940560, "step": 103610 }, { "epoch": 8.452157598499062, "grad_norm": 11.08898639678955, "learning_rate": 3.5614922977670523e-06, "loss": 0.3863, "num_input_tokens_seen": 98945392, "step": 103615 }, { "epoch": 8.45256546210947, "grad_norm": 4.77558708190918, "learning_rate": 3.559661560957417e-06, "loss": 0.4818, "num_input_tokens_seen": 98950000, "step": 103620 }, { "epoch": 8.45297332571988, "grad_norm": 33.59299850463867, "learning_rate": 3.5578312587343286e-06, "loss": 0.4685, "num_input_tokens_seen": 98954672, "step": 103625 }, { "epoch": 8.453381189330289, "grad_norm": 21.854598999023438, "learning_rate": 3.5560013911348887e-06, "loss": 0.354, "num_input_tokens_seen": 98959904, "step": 103630 }, { "epoch": 8.453789052940696, "grad_norm": 17.05173110961914, "learning_rate": 3.554171958196184e-06, "loss": 0.248, "num_input_tokens_seen": 98965088, "step": 103635 }, { "epoch": 8.454196916551105, "grad_norm": 36.85604476928711, "learning_rate": 3.552342959955307e-06, "loss": 0.1997, "num_input_tokens_seen": 98970448, "step": 103640 }, { "epoch": 8.454604780161514, "grad_norm": 8.467355728149414, "learning_rate": 3.5505143964493224e-06, "loss": 0.3328, "num_input_tokens_seen": 98975088, "step": 103645 }, { "epoch": 8.455012643771923, "grad_norm": 10.2793607711792, "learning_rate": 3.5486862677152984e-06, "loss": 0.167, "num_input_tokens_seen": 98979920, "step": 103650 }, { "epoch": 8.455420507382332, "grad_norm": 22.428369522094727, "learning_rate": 3.5468585737902875e-06, "loss": 0.4707, "num_input_tokens_seen": 98984608, "step": 103655 }, { "epoch": 8.45582837099274, "grad_norm": 1.536434292793274, "learning_rate": 3.545031314711339e-06, "loss": 0.3663, "num_input_tokens_seen": 98989328, "step": 103660 }, { "epoch": 8.456236234603148, "grad_norm": 3.0982158184051514, "learning_rate": 3.5432044905154892e-06, "loss": 0.3621, "num_input_tokens_seen": 98994448, "step": 103665 }, { "epoch": 8.456644098213557, "grad_norm": 31.038856506347656, "learning_rate": 3.5413781012397647e-06, "loss": 0.4805, "num_input_tokens_seen": 98998624, "step": 103670 }, { "epoch": 8.457051961823966, "grad_norm": 4.3008713722229, "learning_rate": 3.539552146921188e-06, "loss": 0.3771, "num_input_tokens_seen": 99003488, "step": 103675 }, { "epoch": 8.457459825434375, "grad_norm": 4.538627624511719, "learning_rate": 3.537726627596766e-06, "loss": 0.3013, "num_input_tokens_seen": 99008096, "step": 103680 }, { "epoch": 8.457867689044784, "grad_norm": 1.9280767440795898, "learning_rate": 3.53590154330351e-06, "loss": 0.1982, "num_input_tokens_seen": 99012928, "step": 103685 }, { "epoch": 8.458275552655191, "grad_norm": 20.908329010009766, "learning_rate": 3.5340768940784086e-06, "loss": 0.2936, "num_input_tokens_seen": 99018400, "step": 103690 }, { "epoch": 8.4586834162656, "grad_norm": 1.2427668571472168, "learning_rate": 3.532252679958448e-06, "loss": 0.3728, "num_input_tokens_seen": 99023232, "step": 103695 }, { "epoch": 8.45909127987601, "grad_norm": 3.669867753982544, "learning_rate": 3.530428900980598e-06, "loss": 0.3644, "num_input_tokens_seen": 99026624, "step": 103700 }, { "epoch": 8.459499143486418, "grad_norm": 6.171833515167236, "learning_rate": 3.5286055571818357e-06, "loss": 0.3106, "num_input_tokens_seen": 99031456, "step": 103705 }, { "epoch": 8.459907007096827, "grad_norm": 2.814981460571289, "learning_rate": 3.5267826485991166e-06, "loss": 0.3658, "num_input_tokens_seen": 99036752, "step": 103710 }, { "epoch": 8.460314870707235, "grad_norm": 28.060565948486328, "learning_rate": 3.524960175269387e-06, "loss": 0.3763, "num_input_tokens_seen": 99040992, "step": 103715 }, { "epoch": 8.460722734317644, "grad_norm": 2.4517900943756104, "learning_rate": 3.5231381372295925e-06, "loss": 0.2139, "num_input_tokens_seen": 99045472, "step": 103720 }, { "epoch": 8.461130597928053, "grad_norm": 1.1812983751296997, "learning_rate": 3.5213165345166525e-06, "loss": 0.2904, "num_input_tokens_seen": 99050416, "step": 103725 }, { "epoch": 8.461538461538462, "grad_norm": 1.6826815605163574, "learning_rate": 3.519495367167508e-06, "loss": 0.1843, "num_input_tokens_seen": 99055424, "step": 103730 }, { "epoch": 8.46194632514887, "grad_norm": 19.97765350341797, "learning_rate": 3.517674635219062e-06, "loss": 0.4769, "num_input_tokens_seen": 99060496, "step": 103735 }, { "epoch": 8.46235418875928, "grad_norm": 4.140300273895264, "learning_rate": 3.5158543387082244e-06, "loss": 0.2241, "num_input_tokens_seen": 99064944, "step": 103740 }, { "epoch": 8.462762052369687, "grad_norm": 54.98379898071289, "learning_rate": 3.514034477671885e-06, "loss": 0.3945, "num_input_tokens_seen": 99069216, "step": 103745 }, { "epoch": 8.463169915980096, "grad_norm": 21.08067512512207, "learning_rate": 3.5122150521469416e-06, "loss": 0.3192, "num_input_tokens_seen": 99073664, "step": 103750 }, { "epoch": 8.463577779590505, "grad_norm": 34.93134689331055, "learning_rate": 3.5103960621702682e-06, "loss": 0.3871, "num_input_tokens_seen": 99078528, "step": 103755 }, { "epoch": 8.463985643200914, "grad_norm": 14.686380386352539, "learning_rate": 3.5085775077787346e-06, "loss": 0.4411, "num_input_tokens_seen": 99083456, "step": 103760 }, { "epoch": 8.464393506811323, "grad_norm": 2.97359561920166, "learning_rate": 3.506759389009204e-06, "loss": 0.3645, "num_input_tokens_seen": 99088768, "step": 103765 }, { "epoch": 8.46480137042173, "grad_norm": 3.1694538593292236, "learning_rate": 3.5049417058985208e-06, "loss": 0.4526, "num_input_tokens_seen": 99093248, "step": 103770 }, { "epoch": 8.46520923403214, "grad_norm": 7.0519514083862305, "learning_rate": 3.5031244584835423e-06, "loss": 0.4033, "num_input_tokens_seen": 99097984, "step": 103775 }, { "epoch": 8.465617097642548, "grad_norm": 1.41354501247406, "learning_rate": 3.501307646801097e-06, "loss": 0.2671, "num_input_tokens_seen": 99102896, "step": 103780 }, { "epoch": 8.466024961252957, "grad_norm": 3.1690075397491455, "learning_rate": 3.4994912708880086e-06, "loss": 0.2407, "num_input_tokens_seen": 99107808, "step": 103785 }, { "epoch": 8.466432824863366, "grad_norm": 2.5759637355804443, "learning_rate": 3.497675330781097e-06, "loss": 0.289, "num_input_tokens_seen": 99112432, "step": 103790 }, { "epoch": 8.466840688473773, "grad_norm": 14.714922904968262, "learning_rate": 3.495859826517167e-06, "loss": 0.4277, "num_input_tokens_seen": 99117520, "step": 103795 }, { "epoch": 8.467248552084182, "grad_norm": 1.541292667388916, "learning_rate": 3.4940447581330247e-06, "loss": 0.192, "num_input_tokens_seen": 99122208, "step": 103800 }, { "epoch": 8.467656415694591, "grad_norm": 2.246347427368164, "learning_rate": 3.492230125665455e-06, "loss": 0.448, "num_input_tokens_seen": 99127296, "step": 103805 }, { "epoch": 8.468064279305, "grad_norm": 18.350008010864258, "learning_rate": 3.4904159291512387e-06, "loss": 0.3233, "num_input_tokens_seen": 99131600, "step": 103810 }, { "epoch": 8.46847214291541, "grad_norm": 11.385652542114258, "learning_rate": 3.48860216862715e-06, "loss": 0.3637, "num_input_tokens_seen": 99136864, "step": 103815 }, { "epoch": 8.468880006525819, "grad_norm": 19.907175064086914, "learning_rate": 3.4867888441299594e-06, "loss": 0.3935, "num_input_tokens_seen": 99141760, "step": 103820 }, { "epoch": 8.469287870136226, "grad_norm": 6.571784973144531, "learning_rate": 3.484975955696415e-06, "loss": 0.4315, "num_input_tokens_seen": 99146640, "step": 103825 }, { "epoch": 8.469695733746635, "grad_norm": 3.3768227100372314, "learning_rate": 3.483163503363268e-06, "loss": 0.3819, "num_input_tokens_seen": 99151168, "step": 103830 }, { "epoch": 8.470103597357044, "grad_norm": 4.192748069763184, "learning_rate": 3.4813514871672476e-06, "loss": 0.4275, "num_input_tokens_seen": 99155920, "step": 103835 }, { "epoch": 8.470511460967453, "grad_norm": 1.816293478012085, "learning_rate": 3.4795399071450933e-06, "loss": 0.2098, "num_input_tokens_seen": 99160576, "step": 103840 }, { "epoch": 8.470919324577862, "grad_norm": 2.098013401031494, "learning_rate": 3.477728763333524e-06, "loss": 0.3777, "num_input_tokens_seen": 99165040, "step": 103845 }, { "epoch": 8.471327188188269, "grad_norm": 24.400012969970703, "learning_rate": 3.4759180557692423e-06, "loss": 0.3933, "num_input_tokens_seen": 99169056, "step": 103850 }, { "epoch": 8.471735051798678, "grad_norm": 29.403535842895508, "learning_rate": 3.474107784488953e-06, "loss": 0.3644, "num_input_tokens_seen": 99173824, "step": 103855 }, { "epoch": 8.472142915409087, "grad_norm": 6.779539108276367, "learning_rate": 3.4722979495293573e-06, "loss": 0.4705, "num_input_tokens_seen": 99178768, "step": 103860 }, { "epoch": 8.472550779019496, "grad_norm": 62.64946365356445, "learning_rate": 3.470488550927134e-06, "loss": 0.2661, "num_input_tokens_seen": 99184064, "step": 103865 }, { "epoch": 8.472958642629905, "grad_norm": 2.9104576110839844, "learning_rate": 3.4686795887189594e-06, "loss": 0.3043, "num_input_tokens_seen": 99189104, "step": 103870 }, { "epoch": 8.473366506240314, "grad_norm": 8.018743515014648, "learning_rate": 3.4668710629415013e-06, "loss": 0.2722, "num_input_tokens_seen": 99193920, "step": 103875 }, { "epoch": 8.473774369850721, "grad_norm": 42.29243469238281, "learning_rate": 3.4650629736314083e-06, "loss": 0.4463, "num_input_tokens_seen": 99197856, "step": 103880 }, { "epoch": 8.47418223346113, "grad_norm": 55.568359375, "learning_rate": 3.4632553208253456e-06, "loss": 0.5149, "num_input_tokens_seen": 99202480, "step": 103885 }, { "epoch": 8.47459009707154, "grad_norm": 9.078574180603027, "learning_rate": 3.4614481045599472e-06, "loss": 0.2441, "num_input_tokens_seen": 99207104, "step": 103890 }, { "epoch": 8.474997960681948, "grad_norm": 23.607568740844727, "learning_rate": 3.459641324871843e-06, "loss": 0.2965, "num_input_tokens_seen": 99212480, "step": 103895 }, { "epoch": 8.475405824292357, "grad_norm": 13.207721710205078, "learning_rate": 3.4578349817976474e-06, "loss": 0.3044, "num_input_tokens_seen": 99217424, "step": 103900 }, { "epoch": 8.475813687902765, "grad_norm": 2.8047690391540527, "learning_rate": 3.4560290753739927e-06, "loss": 0.4069, "num_input_tokens_seen": 99222384, "step": 103905 }, { "epoch": 8.476221551513174, "grad_norm": 3.3862264156341553, "learning_rate": 3.4542236056374718e-06, "loss": 0.3417, "num_input_tokens_seen": 99227824, "step": 103910 }, { "epoch": 8.476629415123583, "grad_norm": 10.3881254196167, "learning_rate": 3.4524185726246828e-06, "loss": 0.2292, "num_input_tokens_seen": 99232784, "step": 103915 }, { "epoch": 8.477037278733992, "grad_norm": 5.686732769012451, "learning_rate": 3.4506139763722133e-06, "loss": 0.4252, "num_input_tokens_seen": 99238128, "step": 103920 }, { "epoch": 8.4774451423444, "grad_norm": 23.17073631286621, "learning_rate": 3.448809816916643e-06, "loss": 0.3515, "num_input_tokens_seen": 99243376, "step": 103925 }, { "epoch": 8.47785300595481, "grad_norm": 0.7767173647880554, "learning_rate": 3.447006094294539e-06, "loss": 0.2968, "num_input_tokens_seen": 99248320, "step": 103930 }, { "epoch": 8.478260869565217, "grad_norm": 19.867324829101562, "learning_rate": 3.4452028085424637e-06, "loss": 0.3136, "num_input_tokens_seen": 99252592, "step": 103935 }, { "epoch": 8.478668733175626, "grad_norm": 6.321053504943848, "learning_rate": 3.443399959696969e-06, "loss": 0.2808, "num_input_tokens_seen": 99258112, "step": 103940 }, { "epoch": 8.479076596786035, "grad_norm": 26.386560440063477, "learning_rate": 3.4415975477945892e-06, "loss": 0.359, "num_input_tokens_seen": 99262800, "step": 103945 }, { "epoch": 8.479484460396444, "grad_norm": 20.934566497802734, "learning_rate": 3.4397955728718736e-06, "loss": 0.3951, "num_input_tokens_seen": 99267168, "step": 103950 }, { "epoch": 8.479892324006853, "grad_norm": 3.9971566200256348, "learning_rate": 3.437994034965339e-06, "loss": 0.4485, "num_input_tokens_seen": 99272384, "step": 103955 }, { "epoch": 8.48030018761726, "grad_norm": 2.082491874694824, "learning_rate": 3.436192934111504e-06, "loss": 0.3017, "num_input_tokens_seen": 99276688, "step": 103960 }, { "epoch": 8.48070805122767, "grad_norm": 16.526559829711914, "learning_rate": 3.4343922703468755e-06, "loss": 0.3296, "num_input_tokens_seen": 99282192, "step": 103965 }, { "epoch": 8.481115914838078, "grad_norm": 5.407660961151123, "learning_rate": 3.4325920437079468e-06, "loss": 0.284, "num_input_tokens_seen": 99287056, "step": 103970 }, { "epoch": 8.481523778448487, "grad_norm": 5.35256814956665, "learning_rate": 3.430792254231216e-06, "loss": 0.3828, "num_input_tokens_seen": 99292016, "step": 103975 }, { "epoch": 8.481931642058896, "grad_norm": 3.573084831237793, "learning_rate": 3.4289929019531625e-06, "loss": 0.214, "num_input_tokens_seen": 99297344, "step": 103980 }, { "epoch": 8.482339505669303, "grad_norm": 4.610230922698975, "learning_rate": 3.427193986910257e-06, "loss": 0.4531, "num_input_tokens_seen": 99301984, "step": 103985 }, { "epoch": 8.482747369279712, "grad_norm": 17.987722396850586, "learning_rate": 3.4253955091389566e-06, "loss": 0.2933, "num_input_tokens_seen": 99306544, "step": 103990 }, { "epoch": 8.483155232890121, "grad_norm": 2.303199052810669, "learning_rate": 3.4235974686757293e-06, "loss": 0.2471, "num_input_tokens_seen": 99311184, "step": 103995 }, { "epoch": 8.48356309650053, "grad_norm": 1.9332168102264404, "learning_rate": 3.42179986555701e-06, "loss": 0.4465, "num_input_tokens_seen": 99315136, "step": 104000 }, { "epoch": 8.48397096011094, "grad_norm": 2.1362853050231934, "learning_rate": 3.4200026998192384e-06, "loss": 0.3275, "num_input_tokens_seen": 99319248, "step": 104005 }, { "epoch": 8.484378823721348, "grad_norm": 3.757870674133301, "learning_rate": 3.4182059714988447e-06, "loss": 0.303, "num_input_tokens_seen": 99324704, "step": 104010 }, { "epoch": 8.484786687331756, "grad_norm": 2.7880468368530273, "learning_rate": 3.4164096806322377e-06, "loss": 0.2943, "num_input_tokens_seen": 99328992, "step": 104015 }, { "epoch": 8.485194550942165, "grad_norm": 21.817886352539062, "learning_rate": 3.4146138272558414e-06, "loss": 0.3699, "num_input_tokens_seen": 99334240, "step": 104020 }, { "epoch": 8.485602414552574, "grad_norm": 29.808006286621094, "learning_rate": 3.412818411406049e-06, "loss": 0.4271, "num_input_tokens_seen": 99339168, "step": 104025 }, { "epoch": 8.486010278162983, "grad_norm": 2.136587142944336, "learning_rate": 3.4110234331192565e-06, "loss": 0.4029, "num_input_tokens_seen": 99344880, "step": 104030 }, { "epoch": 8.486418141773392, "grad_norm": 11.74386978149414, "learning_rate": 3.409228892431837e-06, "loss": 0.3791, "num_input_tokens_seen": 99349792, "step": 104035 }, { "epoch": 8.486826005383799, "grad_norm": 3.106976270675659, "learning_rate": 3.4074347893801815e-06, "loss": 0.2523, "num_input_tokens_seen": 99354976, "step": 104040 }, { "epoch": 8.487233868994208, "grad_norm": 4.1327667236328125, "learning_rate": 3.405641124000647e-06, "loss": 0.2152, "num_input_tokens_seen": 99359408, "step": 104045 }, { "epoch": 8.487641732604617, "grad_norm": 17.068201065063477, "learning_rate": 3.40384789632959e-06, "loss": 0.406, "num_input_tokens_seen": 99364848, "step": 104050 }, { "epoch": 8.488049596215026, "grad_norm": 27.75351905822754, "learning_rate": 3.402055106403362e-06, "loss": 0.4307, "num_input_tokens_seen": 99369456, "step": 104055 }, { "epoch": 8.488457459825435, "grad_norm": 3.817547082901001, "learning_rate": 3.400262754258296e-06, "loss": 0.3775, "num_input_tokens_seen": 99374560, "step": 104060 }, { "epoch": 8.488865323435842, "grad_norm": 12.903030395507812, "learning_rate": 3.3984708399307286e-06, "loss": 0.3133, "num_input_tokens_seen": 99379392, "step": 104065 }, { "epoch": 8.489273187046251, "grad_norm": 2.596018075942993, "learning_rate": 3.396679363456973e-06, "loss": 0.2385, "num_input_tokens_seen": 99384080, "step": 104070 }, { "epoch": 8.48968105065666, "grad_norm": 12.505476951599121, "learning_rate": 3.3948883248733493e-06, "loss": 0.2791, "num_input_tokens_seen": 99388512, "step": 104075 }, { "epoch": 8.49008891426707, "grad_norm": 1.5704597234725952, "learning_rate": 3.39309772421616e-06, "loss": 0.1964, "num_input_tokens_seen": 99393600, "step": 104080 }, { "epoch": 8.490496777877478, "grad_norm": 9.253271102905273, "learning_rate": 3.3913075615217006e-06, "loss": 0.3503, "num_input_tokens_seen": 99398832, "step": 104085 }, { "epoch": 8.490904641487887, "grad_norm": 4.878957271575928, "learning_rate": 3.3895178368262504e-06, "loss": 0.182, "num_input_tokens_seen": 99402688, "step": 104090 }, { "epoch": 8.491312505098294, "grad_norm": 2.4285881519317627, "learning_rate": 3.3877285501660945e-06, "loss": 0.4011, "num_input_tokens_seen": 99407808, "step": 104095 }, { "epoch": 8.491720368708704, "grad_norm": 6.106419086456299, "learning_rate": 3.3859397015774897e-06, "loss": 0.2719, "num_input_tokens_seen": 99412576, "step": 104100 }, { "epoch": 8.492128232319113, "grad_norm": 3.0324954986572266, "learning_rate": 3.3841512910967097e-06, "loss": 0.4396, "num_input_tokens_seen": 99417088, "step": 104105 }, { "epoch": 8.492536095929522, "grad_norm": 12.981283187866211, "learning_rate": 3.3823633187599983e-06, "loss": 0.3938, "num_input_tokens_seen": 99421584, "step": 104110 }, { "epoch": 8.49294395953993, "grad_norm": 0.9808966517448425, "learning_rate": 3.380575784603596e-06, "loss": 0.379, "num_input_tokens_seen": 99426480, "step": 104115 }, { "epoch": 8.493351823150338, "grad_norm": 19.04770851135254, "learning_rate": 3.3787886886637365e-06, "loss": 0.3446, "num_input_tokens_seen": 99431712, "step": 104120 }, { "epoch": 8.493759686760747, "grad_norm": 4.003406524658203, "learning_rate": 3.3770020309766366e-06, "loss": 0.1789, "num_input_tokens_seen": 99436048, "step": 104125 }, { "epoch": 8.494167550371156, "grad_norm": 5.854340553283691, "learning_rate": 3.375215811578522e-06, "loss": 0.3618, "num_input_tokens_seen": 99441520, "step": 104130 }, { "epoch": 8.494575413981565, "grad_norm": 1.0907050371170044, "learning_rate": 3.373430030505595e-06, "loss": 0.2978, "num_input_tokens_seen": 99446096, "step": 104135 }, { "epoch": 8.494983277591974, "grad_norm": 6.94901704788208, "learning_rate": 3.371644687794051e-06, "loss": 0.3554, "num_input_tokens_seen": 99450896, "step": 104140 }, { "epoch": 8.495391141202383, "grad_norm": 2.041226863861084, "learning_rate": 3.369859783480073e-06, "loss": 0.3709, "num_input_tokens_seen": 99455600, "step": 104145 }, { "epoch": 8.49579900481279, "grad_norm": 25.21512794494629, "learning_rate": 3.368075317599853e-06, "loss": 0.2904, "num_input_tokens_seen": 99460688, "step": 104150 }, { "epoch": 8.496206868423199, "grad_norm": 16.885238647460938, "learning_rate": 3.3662912901895522e-06, "loss": 0.3229, "num_input_tokens_seen": 99465328, "step": 104155 }, { "epoch": 8.496614732033608, "grad_norm": 8.300928115844727, "learning_rate": 3.3645077012853354e-06, "loss": 0.2363, "num_input_tokens_seen": 99469840, "step": 104160 }, { "epoch": 8.497022595644017, "grad_norm": 9.138628005981445, "learning_rate": 3.3627245509233548e-06, "loss": 0.2229, "num_input_tokens_seen": 99474832, "step": 104165 }, { "epoch": 8.497430459254426, "grad_norm": 16.80124282836914, "learning_rate": 3.3609418391397474e-06, "loss": 0.3211, "num_input_tokens_seen": 99479712, "step": 104170 }, { "epoch": 8.497838322864833, "grad_norm": 32.692073822021484, "learning_rate": 3.359159565970657e-06, "loss": 0.3054, "num_input_tokens_seen": 99484784, "step": 104175 }, { "epoch": 8.498246186475242, "grad_norm": 2.402226209640503, "learning_rate": 3.35737773145221e-06, "loss": 0.4873, "num_input_tokens_seen": 99489840, "step": 104180 }, { "epoch": 8.498654050085651, "grad_norm": 28.514638900756836, "learning_rate": 3.3555963356205173e-06, "loss": 0.2321, "num_input_tokens_seen": 99495264, "step": 104185 }, { "epoch": 8.49906191369606, "grad_norm": 4.312603950500488, "learning_rate": 3.353815378511685e-06, "loss": 0.2966, "num_input_tokens_seen": 99500512, "step": 104190 }, { "epoch": 8.49946977730647, "grad_norm": 2.6164512634277344, "learning_rate": 3.3520348601618236e-06, "loss": 0.4458, "num_input_tokens_seen": 99504768, "step": 104195 }, { "epoch": 8.499877640916877, "grad_norm": 11.636343002319336, "learning_rate": 3.350254780607015e-06, "loss": 0.3549, "num_input_tokens_seen": 99509568, "step": 104200 }, { "epoch": 8.500285504527286, "grad_norm": 36.06620788574219, "learning_rate": 3.3484751398833447e-06, "loss": 0.3738, "num_input_tokens_seen": 99514624, "step": 104205 }, { "epoch": 8.500693368137695, "grad_norm": 1.618943691253662, "learning_rate": 3.346695938026881e-06, "loss": 0.388, "num_input_tokens_seen": 99519344, "step": 104210 }, { "epoch": 8.500693368137695, "eval_loss": 0.33680662512779236, "eval_runtime": 570.9244, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 99519344, "step": 104210 }, { "epoch": 8.501101231748104, "grad_norm": 1.1047388315200806, "learning_rate": 3.344917175073692e-06, "loss": 0.3033, "num_input_tokens_seen": 99524752, "step": 104215 }, { "epoch": 8.501509095358513, "grad_norm": 17.867050170898438, "learning_rate": 3.3431388510598266e-06, "loss": 0.2836, "num_input_tokens_seen": 99529312, "step": 104220 }, { "epoch": 8.50191695896892, "grad_norm": 10.521263122558594, "learning_rate": 3.3413609660213367e-06, "loss": 0.5055, "num_input_tokens_seen": 99533808, "step": 104225 }, { "epoch": 8.502324822579329, "grad_norm": 17.528846740722656, "learning_rate": 3.3395835199942547e-06, "loss": 0.4119, "num_input_tokens_seen": 99539120, "step": 104230 }, { "epoch": 8.502732686189738, "grad_norm": 32.413780212402344, "learning_rate": 3.3378065130146074e-06, "loss": 0.3199, "num_input_tokens_seen": 99544848, "step": 104235 }, { "epoch": 8.503140549800147, "grad_norm": 3.4067180156707764, "learning_rate": 3.336029945118421e-06, "loss": 0.4, "num_input_tokens_seen": 99548960, "step": 104240 }, { "epoch": 8.503548413410556, "grad_norm": 28.71266746520996, "learning_rate": 3.334253816341704e-06, "loss": 0.3394, "num_input_tokens_seen": 99554912, "step": 104245 }, { "epoch": 8.503956277020965, "grad_norm": 7.639128684997559, "learning_rate": 3.3324781267204535e-06, "loss": 0.4228, "num_input_tokens_seen": 99559104, "step": 104250 }, { "epoch": 8.504364140631372, "grad_norm": 5.5434675216674805, "learning_rate": 3.330702876290659e-06, "loss": 0.342, "num_input_tokens_seen": 99563008, "step": 104255 }, { "epoch": 8.504772004241781, "grad_norm": 7.258564472198486, "learning_rate": 3.328928065088316e-06, "loss": 0.3119, "num_input_tokens_seen": 99567904, "step": 104260 }, { "epoch": 8.50517986785219, "grad_norm": 24.532167434692383, "learning_rate": 3.327153693149393e-06, "loss": 0.3477, "num_input_tokens_seen": 99571632, "step": 104265 }, { "epoch": 8.5055877314626, "grad_norm": 17.41393280029297, "learning_rate": 3.325379760509853e-06, "loss": 0.375, "num_input_tokens_seen": 99576624, "step": 104270 }, { "epoch": 8.505995595073008, "grad_norm": 2.766831636428833, "learning_rate": 3.323606267205659e-06, "loss": 0.2129, "num_input_tokens_seen": 99581312, "step": 104275 }, { "epoch": 8.506403458683415, "grad_norm": 2.30043888092041, "learning_rate": 3.321833213272746e-06, "loss": 0.3314, "num_input_tokens_seen": 99587152, "step": 104280 }, { "epoch": 8.506811322293824, "grad_norm": 6.624787330627441, "learning_rate": 3.3200605987470684e-06, "loss": 0.4986, "num_input_tokens_seen": 99591744, "step": 104285 }, { "epoch": 8.507219185904233, "grad_norm": 13.275956153869629, "learning_rate": 3.318288423664551e-06, "loss": 0.3433, "num_input_tokens_seen": 99596176, "step": 104290 }, { "epoch": 8.507627049514642, "grad_norm": 15.304018020629883, "learning_rate": 3.3165166880611137e-06, "loss": 0.2664, "num_input_tokens_seen": 99601408, "step": 104295 }, { "epoch": 8.508034913125051, "grad_norm": 3.2079551219940186, "learning_rate": 3.314745391972665e-06, "loss": 0.2568, "num_input_tokens_seen": 99605440, "step": 104300 }, { "epoch": 8.50844277673546, "grad_norm": 2.6453826427459717, "learning_rate": 3.312974535435115e-06, "loss": 0.2856, "num_input_tokens_seen": 99610176, "step": 104305 }, { "epoch": 8.508850640345868, "grad_norm": 6.642709732055664, "learning_rate": 3.3112041184843593e-06, "loss": 0.4084, "num_input_tokens_seen": 99614400, "step": 104310 }, { "epoch": 8.509258503956277, "grad_norm": 109.25859069824219, "learning_rate": 3.3094341411562785e-06, "loss": 0.3153, "num_input_tokens_seen": 99619392, "step": 104315 }, { "epoch": 8.509666367566686, "grad_norm": 27.577350616455078, "learning_rate": 3.3076646034867485e-06, "loss": 0.2964, "num_input_tokens_seen": 99624320, "step": 104320 }, { "epoch": 8.510074231177095, "grad_norm": 2.8984923362731934, "learning_rate": 3.3058955055116385e-06, "loss": 0.2409, "num_input_tokens_seen": 99629296, "step": 104325 }, { "epoch": 8.510482094787504, "grad_norm": 5.854537010192871, "learning_rate": 3.3041268472668115e-06, "loss": 0.3467, "num_input_tokens_seen": 99634064, "step": 104330 }, { "epoch": 8.510889958397911, "grad_norm": 35.43388748168945, "learning_rate": 3.302358628788113e-06, "loss": 0.3429, "num_input_tokens_seen": 99638928, "step": 104335 }, { "epoch": 8.51129782200832, "grad_norm": 25.138578414916992, "learning_rate": 3.3005908501113847e-06, "loss": 0.3238, "num_input_tokens_seen": 99644112, "step": 104340 }, { "epoch": 8.511705685618729, "grad_norm": 24.402530670166016, "learning_rate": 3.298823511272461e-06, "loss": 0.3861, "num_input_tokens_seen": 99649120, "step": 104345 }, { "epoch": 8.512113549229138, "grad_norm": 31.16604232788086, "learning_rate": 3.2970566123071645e-06, "loss": 0.2891, "num_input_tokens_seen": 99654256, "step": 104350 }, { "epoch": 8.512521412839547, "grad_norm": 23.08745574951172, "learning_rate": 3.295290153251307e-06, "loss": 0.3791, "num_input_tokens_seen": 99659680, "step": 104355 }, { "epoch": 8.512929276449956, "grad_norm": 13.962481498718262, "learning_rate": 3.2935241341406934e-06, "loss": 0.3396, "num_input_tokens_seen": 99664704, "step": 104360 }, { "epoch": 8.513337140060363, "grad_norm": 15.230257987976074, "learning_rate": 3.2917585550111233e-06, "loss": 0.4676, "num_input_tokens_seen": 99669024, "step": 104365 }, { "epoch": 8.513745003670772, "grad_norm": 37.47101974487305, "learning_rate": 3.289993415898379e-06, "loss": 0.3834, "num_input_tokens_seen": 99673440, "step": 104370 }, { "epoch": 8.514152867281181, "grad_norm": 11.900821685791016, "learning_rate": 3.288228716838246e-06, "loss": 0.271, "num_input_tokens_seen": 99678608, "step": 104375 }, { "epoch": 8.51456073089159, "grad_norm": 18.703903198242188, "learning_rate": 3.286464457866492e-06, "loss": 0.333, "num_input_tokens_seen": 99682384, "step": 104380 }, { "epoch": 8.514968594502, "grad_norm": 11.7896146774292, "learning_rate": 3.284700639018878e-06, "loss": 0.2592, "num_input_tokens_seen": 99687056, "step": 104385 }, { "epoch": 8.515376458112407, "grad_norm": 15.228059768676758, "learning_rate": 3.282937260331148e-06, "loss": 0.3557, "num_input_tokens_seen": 99691776, "step": 104390 }, { "epoch": 8.515784321722816, "grad_norm": 22.431049346923828, "learning_rate": 3.281174321839056e-06, "loss": 0.2534, "num_input_tokens_seen": 99696816, "step": 104395 }, { "epoch": 8.516192185333225, "grad_norm": 3.891540288925171, "learning_rate": 3.279411823578335e-06, "loss": 0.2377, "num_input_tokens_seen": 99702384, "step": 104400 }, { "epoch": 8.516600048943634, "grad_norm": 12.32789134979248, "learning_rate": 3.277649765584703e-06, "loss": 0.4312, "num_input_tokens_seen": 99706832, "step": 104405 }, { "epoch": 8.517007912554043, "grad_norm": 22.943464279174805, "learning_rate": 3.275888147893877e-06, "loss": 0.4565, "num_input_tokens_seen": 99711024, "step": 104410 }, { "epoch": 8.51741577616445, "grad_norm": 14.129075050354004, "learning_rate": 3.2741269705415716e-06, "loss": 0.3807, "num_input_tokens_seen": 99716016, "step": 104415 }, { "epoch": 8.517823639774859, "grad_norm": 2.712148666381836, "learning_rate": 3.2723662335634785e-06, "loss": 0.3418, "num_input_tokens_seen": 99720384, "step": 104420 }, { "epoch": 8.518231503385268, "grad_norm": 20.47609519958496, "learning_rate": 3.2706059369952942e-06, "loss": 0.3733, "num_input_tokens_seen": 99725296, "step": 104425 }, { "epoch": 8.518639366995677, "grad_norm": 4.882729530334473, "learning_rate": 3.2688460808726894e-06, "loss": 0.374, "num_input_tokens_seen": 99730000, "step": 104430 }, { "epoch": 8.519047230606086, "grad_norm": 27.699365615844727, "learning_rate": 3.2670866652313364e-06, "loss": 0.3006, "num_input_tokens_seen": 99735152, "step": 104435 }, { "epoch": 8.519455094216495, "grad_norm": 11.159358978271484, "learning_rate": 3.2653276901069093e-06, "loss": 0.2775, "num_input_tokens_seen": 99739872, "step": 104440 }, { "epoch": 8.519862957826902, "grad_norm": 15.58582878112793, "learning_rate": 3.2635691555350512e-06, "loss": 0.3691, "num_input_tokens_seen": 99744192, "step": 104445 }, { "epoch": 8.520270821437311, "grad_norm": 4.919487476348877, "learning_rate": 3.2618110615514117e-06, "loss": 0.2886, "num_input_tokens_seen": 99748720, "step": 104450 }, { "epoch": 8.52067868504772, "grad_norm": 26.340444564819336, "learning_rate": 3.2600534081916182e-06, "loss": 0.2614, "num_input_tokens_seen": 99753472, "step": 104455 }, { "epoch": 8.521086548658129, "grad_norm": 12.011009216308594, "learning_rate": 3.2582961954913118e-06, "loss": 0.4576, "num_input_tokens_seen": 99758112, "step": 104460 }, { "epoch": 8.521494412268538, "grad_norm": 4.555675983428955, "learning_rate": 3.2565394234861017e-06, "loss": 0.2222, "num_input_tokens_seen": 99761776, "step": 104465 }, { "epoch": 8.521902275878945, "grad_norm": 13.845499992370605, "learning_rate": 3.2547830922115997e-06, "loss": 0.356, "num_input_tokens_seen": 99766816, "step": 104470 }, { "epoch": 8.522310139489354, "grad_norm": 11.642626762390137, "learning_rate": 3.2530272017034015e-06, "loss": 0.4145, "num_input_tokens_seen": 99772016, "step": 104475 }, { "epoch": 8.522718003099763, "grad_norm": 1.2780441045761108, "learning_rate": 3.2512717519971037e-06, "loss": 0.4198, "num_input_tokens_seen": 99776368, "step": 104480 }, { "epoch": 8.523125866710172, "grad_norm": 9.864950180053711, "learning_rate": 3.2495167431282834e-06, "loss": 0.3853, "num_input_tokens_seen": 99780528, "step": 104485 }, { "epoch": 8.523533730320581, "grad_norm": 2.3621346950531006, "learning_rate": 3.247762175132518e-06, "loss": 0.3289, "num_input_tokens_seen": 99785408, "step": 104490 }, { "epoch": 8.523941593930989, "grad_norm": 8.11005973815918, "learning_rate": 3.2460080480453682e-06, "loss": 0.5019, "num_input_tokens_seen": 99789440, "step": 104495 }, { "epoch": 8.524349457541398, "grad_norm": 21.04694175720215, "learning_rate": 3.2442543619023878e-06, "loss": 0.3285, "num_input_tokens_seen": 99794464, "step": 104500 }, { "epoch": 8.524757321151807, "grad_norm": 1.5567584037780762, "learning_rate": 3.242501116739133e-06, "loss": 0.3202, "num_input_tokens_seen": 99799408, "step": 104505 }, { "epoch": 8.525165184762216, "grad_norm": 3.96694016456604, "learning_rate": 3.2407483125911355e-06, "loss": 0.3212, "num_input_tokens_seen": 99803872, "step": 104510 }, { "epoch": 8.525573048372625, "grad_norm": 1.442168116569519, "learning_rate": 3.23899594949392e-06, "loss": 0.2011, "num_input_tokens_seen": 99809328, "step": 104515 }, { "epoch": 8.525980911983034, "grad_norm": 8.54775333404541, "learning_rate": 3.2372440274830136e-06, "loss": 0.3356, "num_input_tokens_seen": 99813552, "step": 104520 }, { "epoch": 8.52638877559344, "grad_norm": 9.540189743041992, "learning_rate": 3.235492546593916e-06, "loss": 0.319, "num_input_tokens_seen": 99817264, "step": 104525 }, { "epoch": 8.52679663920385, "grad_norm": 7.827259063720703, "learning_rate": 3.2337415068621424e-06, "loss": 0.4558, "num_input_tokens_seen": 99821488, "step": 104530 }, { "epoch": 8.527204502814259, "grad_norm": 23.11235237121582, "learning_rate": 3.2319909083231793e-06, "loss": 0.3347, "num_input_tokens_seen": 99826016, "step": 104535 }, { "epoch": 8.527612366424668, "grad_norm": 4.329336643218994, "learning_rate": 3.2302407510125117e-06, "loss": 0.332, "num_input_tokens_seen": 99831072, "step": 104540 }, { "epoch": 8.528020230035077, "grad_norm": 17.809144973754883, "learning_rate": 3.2284910349656057e-06, "loss": 0.2203, "num_input_tokens_seen": 99835888, "step": 104545 }, { "epoch": 8.528428093645484, "grad_norm": 7.919181823730469, "learning_rate": 3.226741760217941e-06, "loss": 0.5608, "num_input_tokens_seen": 99840528, "step": 104550 }, { "epoch": 8.528835957255893, "grad_norm": 53.987178802490234, "learning_rate": 3.2249929268049706e-06, "loss": 0.42, "num_input_tokens_seen": 99844976, "step": 104555 }, { "epoch": 8.529243820866302, "grad_norm": 14.929831504821777, "learning_rate": 3.2232445347621402e-06, "loss": 0.3251, "num_input_tokens_seen": 99849856, "step": 104560 }, { "epoch": 8.529651684476711, "grad_norm": 6.233778476715088, "learning_rate": 3.221496584124889e-06, "loss": 0.4737, "num_input_tokens_seen": 99854944, "step": 104565 }, { "epoch": 8.53005954808712, "grad_norm": 2.5365920066833496, "learning_rate": 3.219749074928641e-06, "loss": 0.3559, "num_input_tokens_seen": 99860576, "step": 104570 }, { "epoch": 8.53046741169753, "grad_norm": 22.551239013671875, "learning_rate": 3.2180020072088314e-06, "loss": 0.3258, "num_input_tokens_seen": 99864992, "step": 104575 }, { "epoch": 8.530875275307936, "grad_norm": 2.9405579566955566, "learning_rate": 3.2162553810008655e-06, "loss": 0.4304, "num_input_tokens_seen": 99869552, "step": 104580 }, { "epoch": 8.531283138918345, "grad_norm": 24.512895584106445, "learning_rate": 3.2145091963401458e-06, "loss": 0.3305, "num_input_tokens_seen": 99874144, "step": 104585 }, { "epoch": 8.531691002528754, "grad_norm": 4.370418071746826, "learning_rate": 3.2127634532620634e-06, "loss": 0.3974, "num_input_tokens_seen": 99878192, "step": 104590 }, { "epoch": 8.532098866139163, "grad_norm": 4.37608003616333, "learning_rate": 3.2110181518020116e-06, "loss": 0.2945, "num_input_tokens_seen": 99883024, "step": 104595 }, { "epoch": 8.532506729749572, "grad_norm": 30.735776901245117, "learning_rate": 3.2092732919953655e-06, "loss": 0.3427, "num_input_tokens_seen": 99887600, "step": 104600 }, { "epoch": 8.53291459335998, "grad_norm": 2.870988368988037, "learning_rate": 3.2075288738774882e-06, "loss": 0.3837, "num_input_tokens_seen": 99891344, "step": 104605 }, { "epoch": 8.533322456970389, "grad_norm": 5.335362434387207, "learning_rate": 3.2057848974837405e-06, "loss": 0.4454, "num_input_tokens_seen": 99895808, "step": 104610 }, { "epoch": 8.533730320580798, "grad_norm": 6.077967643737793, "learning_rate": 3.204041362849472e-06, "loss": 0.3663, "num_input_tokens_seen": 99900656, "step": 104615 }, { "epoch": 8.534138184191207, "grad_norm": 16.59925079345703, "learning_rate": 3.2022982700100235e-06, "loss": 0.3611, "num_input_tokens_seen": 99904720, "step": 104620 }, { "epoch": 8.534546047801616, "grad_norm": 5.501993179321289, "learning_rate": 3.200555619000728e-06, "loss": 0.4012, "num_input_tokens_seen": 99910016, "step": 104625 }, { "epoch": 8.534953911412025, "grad_norm": 15.438125610351562, "learning_rate": 3.1988134098569073e-06, "loss": 0.4406, "num_input_tokens_seen": 99914512, "step": 104630 }, { "epoch": 8.535361775022432, "grad_norm": 2.4112229347229004, "learning_rate": 3.1970716426138665e-06, "loss": 0.2802, "num_input_tokens_seen": 99918720, "step": 104635 }, { "epoch": 8.535769638632841, "grad_norm": 24.685091018676758, "learning_rate": 3.195330317306927e-06, "loss": 0.2968, "num_input_tokens_seen": 99923136, "step": 104640 }, { "epoch": 8.53617750224325, "grad_norm": 10.486681938171387, "learning_rate": 3.193589433971375e-06, "loss": 0.3648, "num_input_tokens_seen": 99928144, "step": 104645 }, { "epoch": 8.536585365853659, "grad_norm": 14.804079055786133, "learning_rate": 3.191848992642499e-06, "loss": 0.3282, "num_input_tokens_seen": 99932928, "step": 104650 }, { "epoch": 8.536993229464068, "grad_norm": 9.4442720413208, "learning_rate": 3.1901089933555726e-06, "loss": 0.2422, "num_input_tokens_seen": 99938592, "step": 104655 }, { "epoch": 8.537401093074475, "grad_norm": 4.581972122192383, "learning_rate": 3.1883694361458742e-06, "loss": 0.4501, "num_input_tokens_seen": 99943008, "step": 104660 }, { "epoch": 8.537808956684884, "grad_norm": 31.103954315185547, "learning_rate": 3.1866303210486615e-06, "loss": 0.3522, "num_input_tokens_seen": 99948192, "step": 104665 }, { "epoch": 8.538216820295293, "grad_norm": 18.080406188964844, "learning_rate": 3.1848916480991808e-06, "loss": 0.2313, "num_input_tokens_seen": 99953136, "step": 104670 }, { "epoch": 8.538624683905702, "grad_norm": 11.462234497070312, "learning_rate": 3.1831534173326766e-06, "loss": 0.1898, "num_input_tokens_seen": 99957456, "step": 104675 }, { "epoch": 8.539032547516111, "grad_norm": 2.260479688644409, "learning_rate": 3.181415628784376e-06, "loss": 0.2867, "num_input_tokens_seen": 99962336, "step": 104680 }, { "epoch": 8.539440411126519, "grad_norm": 5.6422905921936035, "learning_rate": 3.1796782824895148e-06, "loss": 0.3048, "num_input_tokens_seen": 99967248, "step": 104685 }, { "epoch": 8.539848274736928, "grad_norm": 27.54671859741211, "learning_rate": 3.177941378483304e-06, "loss": 0.292, "num_input_tokens_seen": 99971664, "step": 104690 }, { "epoch": 8.540256138347337, "grad_norm": 7.728543281555176, "learning_rate": 3.1762049168009482e-06, "loss": 0.3796, "num_input_tokens_seen": 99976208, "step": 104695 }, { "epoch": 8.540664001957746, "grad_norm": 10.140132904052734, "learning_rate": 3.1744688974776393e-06, "loss": 0.3932, "num_input_tokens_seen": 99981088, "step": 104700 }, { "epoch": 8.541071865568155, "grad_norm": 5.624790668487549, "learning_rate": 3.1727333205485795e-06, "loss": 0.3385, "num_input_tokens_seen": 99986272, "step": 104705 }, { "epoch": 8.541479729178562, "grad_norm": 16.341115951538086, "learning_rate": 3.1709981860489375e-06, "loss": 0.3719, "num_input_tokens_seen": 99991168, "step": 104710 }, { "epoch": 8.54188759278897, "grad_norm": 17.755508422851562, "learning_rate": 3.169263494013888e-06, "loss": 0.4093, "num_input_tokens_seen": 99995728, "step": 104715 }, { "epoch": 8.54229545639938, "grad_norm": 13.74854850769043, "learning_rate": 3.1675292444785897e-06, "loss": 0.4024, "num_input_tokens_seen": 99999312, "step": 104720 }, { "epoch": 8.542703320009789, "grad_norm": 2.482994318008423, "learning_rate": 3.1657954374781917e-06, "loss": 0.1667, "num_input_tokens_seen": 100003584, "step": 104725 }, { "epoch": 8.543111183620198, "grad_norm": 1.2074705362319946, "learning_rate": 3.164062073047849e-06, "loss": 0.3523, "num_input_tokens_seen": 100007824, "step": 104730 }, { "epoch": 8.543519047230607, "grad_norm": 13.130208015441895, "learning_rate": 3.1623291512226873e-06, "loss": 0.3697, "num_input_tokens_seen": 100012912, "step": 104735 }, { "epoch": 8.543926910841014, "grad_norm": 1.0090975761413574, "learning_rate": 3.1605966720378353e-06, "loss": 0.2347, "num_input_tokens_seen": 100017760, "step": 104740 }, { "epoch": 8.544334774451423, "grad_norm": 104.36807250976562, "learning_rate": 3.158864635528408e-06, "loss": 0.4033, "num_input_tokens_seen": 100022976, "step": 104745 }, { "epoch": 8.544742638061832, "grad_norm": 13.649798393249512, "learning_rate": 3.1571330417295153e-06, "loss": 0.4039, "num_input_tokens_seen": 100028064, "step": 104750 }, { "epoch": 8.545150501672241, "grad_norm": 10.927016258239746, "learning_rate": 3.1554018906762543e-06, "loss": 0.3193, "num_input_tokens_seen": 100032928, "step": 104755 }, { "epoch": 8.54555836528265, "grad_norm": 16.29172706604004, "learning_rate": 3.1536711824037114e-06, "loss": 0.4531, "num_input_tokens_seen": 100037312, "step": 104760 }, { "epoch": 8.545966228893057, "grad_norm": 21.482580184936523, "learning_rate": 3.151940916946969e-06, "loss": 0.3986, "num_input_tokens_seen": 100041200, "step": 104765 }, { "epoch": 8.546374092503466, "grad_norm": 14.734209060668945, "learning_rate": 3.1502110943411045e-06, "loss": 0.3526, "num_input_tokens_seen": 100045712, "step": 104770 }, { "epoch": 8.546781956113875, "grad_norm": 25.806133270263672, "learning_rate": 3.1484817146211767e-06, "loss": 0.3968, "num_input_tokens_seen": 100051312, "step": 104775 }, { "epoch": 8.547189819724284, "grad_norm": 2.8354744911193848, "learning_rate": 3.146752777822237e-06, "loss": 0.2853, "num_input_tokens_seen": 100056640, "step": 104780 }, { "epoch": 8.547597683334693, "grad_norm": 2.959486722946167, "learning_rate": 3.1450242839793336e-06, "loss": 0.5353, "num_input_tokens_seen": 100061392, "step": 104785 }, { "epoch": 8.548005546945102, "grad_norm": 15.497318267822266, "learning_rate": 3.143296233127496e-06, "loss": 0.3562, "num_input_tokens_seen": 100066832, "step": 104790 }, { "epoch": 8.54841341055551, "grad_norm": 7.68605375289917, "learning_rate": 3.1415686253017632e-06, "loss": 0.2307, "num_input_tokens_seen": 100071552, "step": 104795 }, { "epoch": 8.548821274165919, "grad_norm": 18.977853775024414, "learning_rate": 3.1398414605371464e-06, "loss": 0.2757, "num_input_tokens_seen": 100076256, "step": 104800 }, { "epoch": 8.549229137776328, "grad_norm": 52.564064025878906, "learning_rate": 3.1381147388686526e-06, "loss": 0.3727, "num_input_tokens_seen": 100080576, "step": 104805 }, { "epoch": 8.549637001386737, "grad_norm": 34.801475524902344, "learning_rate": 3.136388460331277e-06, "loss": 0.4369, "num_input_tokens_seen": 100084432, "step": 104810 }, { "epoch": 8.550044864997146, "grad_norm": 15.478340148925781, "learning_rate": 3.134662624960025e-06, "loss": 0.3462, "num_input_tokens_seen": 100089312, "step": 104815 }, { "epoch": 8.550452728607553, "grad_norm": 16.053504943847656, "learning_rate": 3.1329372327898676e-06, "loss": 0.4188, "num_input_tokens_seen": 100094240, "step": 104820 }, { "epoch": 8.550860592217962, "grad_norm": 2.522437334060669, "learning_rate": 3.131212283855783e-06, "loss": 0.3705, "num_input_tokens_seen": 100099360, "step": 104825 }, { "epoch": 8.551268455828371, "grad_norm": 29.44768714904785, "learning_rate": 3.129487778192733e-06, "loss": 0.418, "num_input_tokens_seen": 100104048, "step": 104830 }, { "epoch": 8.55167631943878, "grad_norm": 17.397705078125, "learning_rate": 3.127763715835666e-06, "loss": 0.4378, "num_input_tokens_seen": 100108592, "step": 104835 }, { "epoch": 8.552084183049189, "grad_norm": 7.23462438583374, "learning_rate": 3.126040096819538e-06, "loss": 0.2743, "num_input_tokens_seen": 100112592, "step": 104840 }, { "epoch": 8.552492046659598, "grad_norm": 6.316290378570557, "learning_rate": 3.124316921179285e-06, "loss": 0.3856, "num_input_tokens_seen": 100117728, "step": 104845 }, { "epoch": 8.552899910270005, "grad_norm": 8.420992851257324, "learning_rate": 3.122594188949832e-06, "loss": 0.3501, "num_input_tokens_seen": 100122320, "step": 104850 }, { "epoch": 8.553307773880414, "grad_norm": 10.903813362121582, "learning_rate": 3.120871900166092e-06, "loss": 0.3864, "num_input_tokens_seen": 100126624, "step": 104855 }, { "epoch": 8.553715637490823, "grad_norm": 32.69192886352539, "learning_rate": 3.1191500548629877e-06, "loss": 0.4095, "num_input_tokens_seen": 100131056, "step": 104860 }, { "epoch": 8.554123501101232, "grad_norm": 19.592811584472656, "learning_rate": 3.117428653075413e-06, "loss": 0.3112, "num_input_tokens_seen": 100136240, "step": 104865 }, { "epoch": 8.554531364711641, "grad_norm": 17.245407104492188, "learning_rate": 3.115707694838263e-06, "loss": 0.3784, "num_input_tokens_seen": 100141696, "step": 104870 }, { "epoch": 8.554939228322048, "grad_norm": 3.5484769344329834, "learning_rate": 3.1139871801864174e-06, "loss": 0.2813, "num_input_tokens_seen": 100145232, "step": 104875 }, { "epoch": 8.555347091932457, "grad_norm": 10.025890350341797, "learning_rate": 3.112267109154754e-06, "loss": 0.1609, "num_input_tokens_seen": 100149344, "step": 104880 }, { "epoch": 8.555754955542866, "grad_norm": 6.293038845062256, "learning_rate": 3.1105474817781284e-06, "loss": 0.3249, "num_input_tokens_seen": 100153408, "step": 104885 }, { "epoch": 8.556162819153275, "grad_norm": 3.8439457416534424, "learning_rate": 3.10882829809141e-06, "loss": 0.526, "num_input_tokens_seen": 100157456, "step": 104890 }, { "epoch": 8.556570682763684, "grad_norm": 9.199447631835938, "learning_rate": 3.1071095581294398e-06, "loss": 0.404, "num_input_tokens_seen": 100162080, "step": 104895 }, { "epoch": 8.556978546374092, "grad_norm": 38.361732482910156, "learning_rate": 3.105391261927054e-06, "loss": 0.3477, "num_input_tokens_seen": 100166384, "step": 104900 }, { "epoch": 8.5573864099845, "grad_norm": 8.43587875366211, "learning_rate": 3.103673409519084e-06, "loss": 0.3583, "num_input_tokens_seen": 100171280, "step": 104905 }, { "epoch": 8.55779427359491, "grad_norm": 9.81403923034668, "learning_rate": 3.1019560009403507e-06, "loss": 0.3017, "num_input_tokens_seen": 100175280, "step": 104910 }, { "epoch": 8.558202137205319, "grad_norm": 3.09733247756958, "learning_rate": 3.1002390362256662e-06, "loss": 0.3236, "num_input_tokens_seen": 100179920, "step": 104915 }, { "epoch": 8.558610000815728, "grad_norm": 4.622043609619141, "learning_rate": 3.09852251540983e-06, "loss": 0.2372, "num_input_tokens_seen": 100184960, "step": 104920 }, { "epoch": 8.559017864426135, "grad_norm": 14.335326194763184, "learning_rate": 3.0968064385276274e-06, "loss": 0.4354, "num_input_tokens_seen": 100189856, "step": 104925 }, { "epoch": 8.559425728036544, "grad_norm": 2.9209511280059814, "learning_rate": 3.0950908056138594e-06, "loss": 0.2918, "num_input_tokens_seen": 100194384, "step": 104930 }, { "epoch": 8.559833591646953, "grad_norm": 6.532103061676025, "learning_rate": 3.0933756167032923e-06, "loss": 0.5601, "num_input_tokens_seen": 100199344, "step": 104935 }, { "epoch": 8.560241455257362, "grad_norm": 15.281208992004395, "learning_rate": 3.091660871830693e-06, "loss": 0.2367, "num_input_tokens_seen": 100204560, "step": 104940 }, { "epoch": 8.560649318867771, "grad_norm": 19.548080444335938, "learning_rate": 3.0899465710308107e-06, "loss": 0.4274, "num_input_tokens_seen": 100209392, "step": 104945 }, { "epoch": 8.56105718247818, "grad_norm": 12.485597610473633, "learning_rate": 3.08823271433841e-06, "loss": 0.4625, "num_input_tokens_seen": 100214656, "step": 104950 }, { "epoch": 8.561465046088587, "grad_norm": 5.819498538970947, "learning_rate": 3.0865193017882184e-06, "loss": 0.3149, "num_input_tokens_seen": 100219616, "step": 104955 }, { "epoch": 8.561872909698996, "grad_norm": 20.060739517211914, "learning_rate": 3.084806333414969e-06, "loss": 0.3586, "num_input_tokens_seen": 100222912, "step": 104960 }, { "epoch": 8.562280773309405, "grad_norm": 12.73177433013916, "learning_rate": 3.083093809253379e-06, "loss": 0.4356, "num_input_tokens_seen": 100228160, "step": 104965 }, { "epoch": 8.562688636919814, "grad_norm": 26.44117546081543, "learning_rate": 3.0813817293381702e-06, "loss": 0.3217, "num_input_tokens_seen": 100233136, "step": 104970 }, { "epoch": 8.563096500530223, "grad_norm": 4.750014305114746, "learning_rate": 3.07967009370404e-06, "loss": 0.3661, "num_input_tokens_seen": 100237456, "step": 104975 }, { "epoch": 8.56350436414063, "grad_norm": 1.1460764408111572, "learning_rate": 3.07795890238568e-06, "loss": 0.4186, "num_input_tokens_seen": 100241456, "step": 104980 }, { "epoch": 8.56391222775104, "grad_norm": 21.038820266723633, "learning_rate": 3.0762481554177797e-06, "loss": 0.3534, "num_input_tokens_seen": 100245712, "step": 104985 }, { "epoch": 8.564320091361449, "grad_norm": 29.744047164916992, "learning_rate": 3.0745378528350077e-06, "loss": 0.2403, "num_input_tokens_seen": 100251104, "step": 104990 }, { "epoch": 8.564727954971858, "grad_norm": 10.683629989624023, "learning_rate": 3.0728279946720424e-06, "loss": 0.3287, "num_input_tokens_seen": 100255856, "step": 104995 }, { "epoch": 8.565135818582267, "grad_norm": 27.90363121032715, "learning_rate": 3.071118580963536e-06, "loss": 0.4819, "num_input_tokens_seen": 100261200, "step": 105000 }, { "epoch": 8.565543682192676, "grad_norm": 13.93986988067627, "learning_rate": 3.0694096117441396e-06, "loss": 0.3266, "num_input_tokens_seen": 100265824, "step": 105005 }, { "epoch": 8.565951545803083, "grad_norm": 29.702735900878906, "learning_rate": 3.067701087048483e-06, "loss": 0.4239, "num_input_tokens_seen": 100271344, "step": 105010 }, { "epoch": 8.566359409413492, "grad_norm": 20.727567672729492, "learning_rate": 3.0659930069112137e-06, "loss": 0.2943, "num_input_tokens_seen": 100275824, "step": 105015 }, { "epoch": 8.5667672730239, "grad_norm": 25.350130081176758, "learning_rate": 3.0642853713669427e-06, "loss": 0.4811, "num_input_tokens_seen": 100280384, "step": 105020 }, { "epoch": 8.56717513663431, "grad_norm": 2.4092791080474854, "learning_rate": 3.06257818045029e-06, "loss": 0.1936, "num_input_tokens_seen": 100285056, "step": 105025 }, { "epoch": 8.567583000244719, "grad_norm": 14.264383316040039, "learning_rate": 3.0608714341958528e-06, "loss": 0.3511, "num_input_tokens_seen": 100289232, "step": 105030 }, { "epoch": 8.567990863855126, "grad_norm": 33.70417785644531, "learning_rate": 3.059165132638231e-06, "loss": 0.179, "num_input_tokens_seen": 100293744, "step": 105035 }, { "epoch": 8.568398727465535, "grad_norm": 15.699570655822754, "learning_rate": 3.057459275812008e-06, "loss": 0.3376, "num_input_tokens_seen": 100298704, "step": 105040 }, { "epoch": 8.568806591075944, "grad_norm": 10.164290428161621, "learning_rate": 3.0557538637517617e-06, "loss": 0.4044, "num_input_tokens_seen": 100302592, "step": 105045 }, { "epoch": 8.569214454686353, "grad_norm": 5.059525966644287, "learning_rate": 3.0540488964920595e-06, "loss": 0.3575, "num_input_tokens_seen": 100306576, "step": 105050 }, { "epoch": 8.569622318296762, "grad_norm": 15.70946216583252, "learning_rate": 3.0523443740674562e-06, "loss": 0.3899, "num_input_tokens_seen": 100311744, "step": 105055 }, { "epoch": 8.570030181907171, "grad_norm": 3.6126532554626465, "learning_rate": 3.050640296512511e-06, "loss": 0.372, "num_input_tokens_seen": 100316352, "step": 105060 }, { "epoch": 8.570438045517578, "grad_norm": 19.757848739624023, "learning_rate": 3.0489366638617593e-06, "loss": 0.3249, "num_input_tokens_seen": 100320624, "step": 105065 }, { "epoch": 8.570845909127987, "grad_norm": 14.733009338378906, "learning_rate": 3.0472334761497356e-06, "loss": 0.4559, "num_input_tokens_seen": 100325072, "step": 105070 }, { "epoch": 8.571253772738396, "grad_norm": 12.392315864562988, "learning_rate": 3.045530733410959e-06, "loss": 0.2806, "num_input_tokens_seen": 100330336, "step": 105075 }, { "epoch": 8.571661636348805, "grad_norm": 14.178609848022461, "learning_rate": 3.0438284356799433e-06, "loss": 0.2767, "num_input_tokens_seen": 100335872, "step": 105080 }, { "epoch": 8.572069499959214, "grad_norm": 16.814510345458984, "learning_rate": 3.0421265829911977e-06, "loss": 0.5472, "num_input_tokens_seen": 100341056, "step": 105085 }, { "epoch": 8.572477363569622, "grad_norm": 4.571710586547852, "learning_rate": 3.0404251753792163e-06, "loss": 0.3196, "num_input_tokens_seen": 100346352, "step": 105090 }, { "epoch": 8.57288522718003, "grad_norm": 6.621116638183594, "learning_rate": 3.0387242128784854e-06, "loss": 0.3864, "num_input_tokens_seen": 100350576, "step": 105095 }, { "epoch": 8.57329309079044, "grad_norm": 39.98945617675781, "learning_rate": 3.0370236955234803e-06, "loss": 0.3958, "num_input_tokens_seen": 100354992, "step": 105100 }, { "epoch": 8.573700954400849, "grad_norm": 21.25461196899414, "learning_rate": 3.0353236233486737e-06, "loss": 0.3409, "num_input_tokens_seen": 100360112, "step": 105105 }, { "epoch": 8.574108818011258, "grad_norm": 2.66855788230896, "learning_rate": 3.0336239963885267e-06, "loss": 0.2007, "num_input_tokens_seen": 100365968, "step": 105110 }, { "epoch": 8.574516681621665, "grad_norm": 34.21811294555664, "learning_rate": 3.0319248146774863e-06, "loss": 0.3818, "num_input_tokens_seen": 100370368, "step": 105115 }, { "epoch": 8.574924545232074, "grad_norm": 3.251718759536743, "learning_rate": 3.030226078249995e-06, "loss": 0.2425, "num_input_tokens_seen": 100375056, "step": 105120 }, { "epoch": 8.575332408842483, "grad_norm": 3.9637227058410645, "learning_rate": 3.0285277871404783e-06, "loss": 0.2629, "num_input_tokens_seen": 100379408, "step": 105125 }, { "epoch": 8.575740272452892, "grad_norm": 27.530229568481445, "learning_rate": 3.0268299413833744e-06, "loss": 0.3801, "num_input_tokens_seen": 100384816, "step": 105130 }, { "epoch": 8.576148136063301, "grad_norm": 15.321866989135742, "learning_rate": 3.0251325410130924e-06, "loss": 0.2662, "num_input_tokens_seen": 100389776, "step": 105135 }, { "epoch": 8.576555999673708, "grad_norm": 7.083590030670166, "learning_rate": 3.023435586064033e-06, "loss": 0.4115, "num_input_tokens_seen": 100395072, "step": 105140 }, { "epoch": 8.576963863284117, "grad_norm": 5.370985507965088, "learning_rate": 3.021739076570593e-06, "loss": 0.3271, "num_input_tokens_seen": 100399744, "step": 105145 }, { "epoch": 8.577371726894526, "grad_norm": 18.79300880432129, "learning_rate": 3.020043012567167e-06, "loss": 0.5941, "num_input_tokens_seen": 100404848, "step": 105150 }, { "epoch": 8.577779590504935, "grad_norm": 5.385833263397217, "learning_rate": 3.018347394088128e-06, "loss": 0.2217, "num_input_tokens_seen": 100409744, "step": 105155 }, { "epoch": 8.578187454115344, "grad_norm": 22.46414566040039, "learning_rate": 3.016652221167848e-06, "loss": 0.353, "num_input_tokens_seen": 100414720, "step": 105160 }, { "epoch": 8.578595317725753, "grad_norm": 43.61202621459961, "learning_rate": 3.0149574938406868e-06, "loss": 0.3726, "num_input_tokens_seen": 100419648, "step": 105165 }, { "epoch": 8.57900318133616, "grad_norm": 2.6168248653411865, "learning_rate": 3.0132632121409933e-06, "loss": 0.4879, "num_input_tokens_seen": 100424240, "step": 105170 }, { "epoch": 8.57941104494657, "grad_norm": 3.9299988746643066, "learning_rate": 3.0115693761031127e-06, "loss": 0.2874, "num_input_tokens_seen": 100429024, "step": 105175 }, { "epoch": 8.579818908556978, "grad_norm": 32.02215576171875, "learning_rate": 3.009875985761379e-06, "loss": 0.4729, "num_input_tokens_seen": 100434192, "step": 105180 }, { "epoch": 8.580226772167387, "grad_norm": 3.734239339828491, "learning_rate": 3.008183041150114e-06, "loss": 0.3966, "num_input_tokens_seen": 100438480, "step": 105185 }, { "epoch": 8.580634635777797, "grad_norm": 2.8557918071746826, "learning_rate": 3.0064905423036276e-06, "loss": 0.2837, "num_input_tokens_seen": 100443344, "step": 105190 }, { "epoch": 8.581042499388204, "grad_norm": 19.232921600341797, "learning_rate": 3.004798489256239e-06, "loss": 0.2691, "num_input_tokens_seen": 100448640, "step": 105195 }, { "epoch": 8.581450362998613, "grad_norm": 13.612062454223633, "learning_rate": 3.0031068820422363e-06, "loss": 0.31, "num_input_tokens_seen": 100453136, "step": 105200 }, { "epoch": 8.581858226609022, "grad_norm": 29.576343536376953, "learning_rate": 3.0014157206959104e-06, "loss": 0.2581, "num_input_tokens_seen": 100457744, "step": 105205 }, { "epoch": 8.58226609021943, "grad_norm": 2.1537740230560303, "learning_rate": 2.9997250052515385e-06, "loss": 0.3257, "num_input_tokens_seen": 100463280, "step": 105210 }, { "epoch": 8.58267395382984, "grad_norm": 0.8035215735435486, "learning_rate": 2.9980347357433936e-06, "loss": 0.3318, "num_input_tokens_seen": 100467712, "step": 105215 }, { "epoch": 8.583081817440249, "grad_norm": 30.855613708496094, "learning_rate": 2.996344912205737e-06, "loss": 0.3951, "num_input_tokens_seen": 100471712, "step": 105220 }, { "epoch": 8.583489681050656, "grad_norm": 7.156824588775635, "learning_rate": 2.9946555346728188e-06, "loss": 0.3065, "num_input_tokens_seen": 100475984, "step": 105225 }, { "epoch": 8.583897544661065, "grad_norm": 27.815780639648438, "learning_rate": 2.9929666031788816e-06, "loss": 0.4213, "num_input_tokens_seen": 100481264, "step": 105230 }, { "epoch": 8.584305408271474, "grad_norm": 2.477774143218994, "learning_rate": 2.991278117758156e-06, "loss": 0.323, "num_input_tokens_seen": 100486000, "step": 105235 }, { "epoch": 8.584713271881883, "grad_norm": 30.318578720092773, "learning_rate": 2.989590078444876e-06, "loss": 0.2715, "num_input_tokens_seen": 100491328, "step": 105240 }, { "epoch": 8.585121135492292, "grad_norm": 7.409231662750244, "learning_rate": 2.987902485273253e-06, "loss": 0.3153, "num_input_tokens_seen": 100495312, "step": 105245 }, { "epoch": 8.5855289991027, "grad_norm": 3.77089524269104, "learning_rate": 2.98621533827749e-06, "loss": 0.3826, "num_input_tokens_seen": 100500080, "step": 105250 }, { "epoch": 8.585936862713108, "grad_norm": 1.6461549997329712, "learning_rate": 2.9845286374917848e-06, "loss": 0.2322, "num_input_tokens_seen": 100505568, "step": 105255 }, { "epoch": 8.586344726323517, "grad_norm": 4.282711982727051, "learning_rate": 2.982842382950335e-06, "loss": 0.3991, "num_input_tokens_seen": 100510720, "step": 105260 }, { "epoch": 8.586752589933926, "grad_norm": 2.372847557067871, "learning_rate": 2.981156574687313e-06, "loss": 0.3317, "num_input_tokens_seen": 100515792, "step": 105265 }, { "epoch": 8.587160453544335, "grad_norm": 2.1918914318084717, "learning_rate": 2.979471212736892e-06, "loss": 0.2691, "num_input_tokens_seen": 100521328, "step": 105270 }, { "epoch": 8.587568317154744, "grad_norm": 18.10023307800293, "learning_rate": 2.977786297133234e-06, "loss": 0.3089, "num_input_tokens_seen": 100525696, "step": 105275 }, { "epoch": 8.587976180765152, "grad_norm": 15.879868507385254, "learning_rate": 2.976101827910482e-06, "loss": 0.3508, "num_input_tokens_seen": 100530336, "step": 105280 }, { "epoch": 8.58838404437556, "grad_norm": 27.117088317871094, "learning_rate": 2.9744178051027916e-06, "loss": 0.2604, "num_input_tokens_seen": 100534576, "step": 105285 }, { "epoch": 8.58879190798597, "grad_norm": 9.017622947692871, "learning_rate": 2.972734228744295e-06, "loss": 0.38, "num_input_tokens_seen": 100538992, "step": 105290 }, { "epoch": 8.589199771596379, "grad_norm": 3.670973539352417, "learning_rate": 2.971051098869115e-06, "loss": 0.313, "num_input_tokens_seen": 100543584, "step": 105295 }, { "epoch": 8.589607635206788, "grad_norm": 10.34970760345459, "learning_rate": 2.96936841551137e-06, "loss": 0.2866, "num_input_tokens_seen": 100547808, "step": 105300 }, { "epoch": 8.590015498817195, "grad_norm": 6.283286094665527, "learning_rate": 2.9676861787051647e-06, "loss": 0.3972, "num_input_tokens_seen": 100552352, "step": 105305 }, { "epoch": 8.590423362427604, "grad_norm": 59.213348388671875, "learning_rate": 2.9660043884846e-06, "loss": 0.3466, "num_input_tokens_seen": 100557120, "step": 105310 }, { "epoch": 8.590831226038013, "grad_norm": 6.765159606933594, "learning_rate": 2.9643230448837627e-06, "loss": 0.2914, "num_input_tokens_seen": 100562384, "step": 105315 }, { "epoch": 8.591239089648422, "grad_norm": 9.163992881774902, "learning_rate": 2.962642147936731e-06, "loss": 0.3058, "num_input_tokens_seen": 100567200, "step": 105320 }, { "epoch": 8.59164695325883, "grad_norm": 8.418837547302246, "learning_rate": 2.9609616976775776e-06, "loss": 0.4287, "num_input_tokens_seen": 100572080, "step": 105325 }, { "epoch": 8.592054816869238, "grad_norm": 28.88774871826172, "learning_rate": 2.9592816941403674e-06, "loss": 0.4365, "num_input_tokens_seen": 100577200, "step": 105330 }, { "epoch": 8.592462680479647, "grad_norm": 25.810733795166016, "learning_rate": 2.9576021373591554e-06, "loss": 0.3126, "num_input_tokens_seen": 100582112, "step": 105335 }, { "epoch": 8.592870544090056, "grad_norm": 35.3116455078125, "learning_rate": 2.9559230273679796e-06, "loss": 0.426, "num_input_tokens_seen": 100586304, "step": 105340 }, { "epoch": 8.593278407700465, "grad_norm": 20.756013870239258, "learning_rate": 2.95424436420087e-06, "loss": 0.2985, "num_input_tokens_seen": 100590960, "step": 105345 }, { "epoch": 8.593686271310874, "grad_norm": 2.4989285469055176, "learning_rate": 2.9525661478918664e-06, "loss": 0.3586, "num_input_tokens_seen": 100595360, "step": 105350 }, { "epoch": 8.594094134921281, "grad_norm": 4.78420352935791, "learning_rate": 2.950888378474978e-06, "loss": 0.2829, "num_input_tokens_seen": 100599760, "step": 105355 }, { "epoch": 8.59450199853169, "grad_norm": 1.1586930751800537, "learning_rate": 2.9492110559842134e-06, "loss": 0.3788, "num_input_tokens_seen": 100604592, "step": 105360 }, { "epoch": 8.5949098621421, "grad_norm": 2.290990114212036, "learning_rate": 2.947534180453565e-06, "loss": 0.2834, "num_input_tokens_seen": 100609008, "step": 105365 }, { "epoch": 8.595317725752508, "grad_norm": 1.5313855409622192, "learning_rate": 2.9458577519170332e-06, "loss": 0.1665, "num_input_tokens_seen": 100614240, "step": 105370 }, { "epoch": 8.595725589362917, "grad_norm": 26.257036209106445, "learning_rate": 2.9441817704085937e-06, "loss": 0.3551, "num_input_tokens_seen": 100618944, "step": 105375 }, { "epoch": 8.596133452973326, "grad_norm": 5.220107078552246, "learning_rate": 2.9425062359622197e-06, "loss": 0.4283, "num_input_tokens_seen": 100624176, "step": 105380 }, { "epoch": 8.596541316583734, "grad_norm": 2.8355793952941895, "learning_rate": 2.9408311486118693e-06, "loss": 0.4437, "num_input_tokens_seen": 100628752, "step": 105385 }, { "epoch": 8.596949180194143, "grad_norm": 5.195011615753174, "learning_rate": 2.9391565083914914e-06, "loss": 0.3525, "num_input_tokens_seen": 100632976, "step": 105390 }, { "epoch": 8.597357043804552, "grad_norm": 2.0769143104553223, "learning_rate": 2.937482315335044e-06, "loss": 0.2005, "num_input_tokens_seen": 100637872, "step": 105395 }, { "epoch": 8.59776490741496, "grad_norm": 13.103598594665527, "learning_rate": 2.935808569476456e-06, "loss": 0.4468, "num_input_tokens_seen": 100642704, "step": 105400 }, { "epoch": 8.59817277102537, "grad_norm": 24.040685653686523, "learning_rate": 2.93413527084965e-06, "loss": 0.3386, "num_input_tokens_seen": 100647536, "step": 105405 }, { "epoch": 8.598580634635777, "grad_norm": 17.72942352294922, "learning_rate": 2.9324624194885436e-06, "loss": 0.386, "num_input_tokens_seen": 100651904, "step": 105410 }, { "epoch": 8.598988498246186, "grad_norm": 2.1524839401245117, "learning_rate": 2.9307900154270484e-06, "loss": 0.3075, "num_input_tokens_seen": 100656352, "step": 105415 }, { "epoch": 8.599396361856595, "grad_norm": 4.036128520965576, "learning_rate": 2.9291180586990653e-06, "loss": 0.1988, "num_input_tokens_seen": 100661728, "step": 105420 }, { "epoch": 8.599804225467004, "grad_norm": 4.746057510375977, "learning_rate": 2.9274465493384806e-06, "loss": 0.2731, "num_input_tokens_seen": 100666080, "step": 105425 }, { "epoch": 8.600212089077413, "grad_norm": 3.3448731899261475, "learning_rate": 2.925775487379173e-06, "loss": 0.3847, "num_input_tokens_seen": 100671120, "step": 105430 }, { "epoch": 8.600619952687822, "grad_norm": 4.145793437957764, "learning_rate": 2.9241048728550158e-06, "loss": 0.3428, "num_input_tokens_seen": 100676192, "step": 105435 }, { "epoch": 8.60102781629823, "grad_norm": 10.630144119262695, "learning_rate": 2.922434705799876e-06, "loss": 0.4427, "num_input_tokens_seen": 100680880, "step": 105440 }, { "epoch": 8.601435679908638, "grad_norm": 37.915748596191406, "learning_rate": 2.920764986247598e-06, "loss": 0.3312, "num_input_tokens_seen": 100685760, "step": 105445 }, { "epoch": 8.601843543519047, "grad_norm": 2.9602925777435303, "learning_rate": 2.9190957142320367e-06, "loss": 0.255, "num_input_tokens_seen": 100690896, "step": 105450 }, { "epoch": 8.602251407129456, "grad_norm": 12.012838363647461, "learning_rate": 2.9174268897870146e-06, "loss": 0.4317, "num_input_tokens_seen": 100695040, "step": 105455 }, { "epoch": 8.602659270739865, "grad_norm": 8.735466957092285, "learning_rate": 2.9157585129463733e-06, "loss": 0.1669, "num_input_tokens_seen": 100700640, "step": 105460 }, { "epoch": 8.603067134350272, "grad_norm": 3.8279871940612793, "learning_rate": 2.91409058374392e-06, "loss": 0.4416, "num_input_tokens_seen": 100705632, "step": 105465 }, { "epoch": 8.603474997960681, "grad_norm": 22.641324996948242, "learning_rate": 2.9124231022134684e-06, "loss": 0.4271, "num_input_tokens_seen": 100710480, "step": 105470 }, { "epoch": 8.60388286157109, "grad_norm": 18.13330841064453, "learning_rate": 2.910756068388812e-06, "loss": 0.3955, "num_input_tokens_seen": 100715408, "step": 105475 }, { "epoch": 8.6042907251815, "grad_norm": 9.085278511047363, "learning_rate": 2.9090894823037396e-06, "loss": 0.2991, "num_input_tokens_seen": 100721056, "step": 105480 }, { "epoch": 8.604698588791909, "grad_norm": 8.549026489257812, "learning_rate": 2.907423343992044e-06, "loss": 0.3721, "num_input_tokens_seen": 100726240, "step": 105485 }, { "epoch": 8.605106452402318, "grad_norm": 57.229522705078125, "learning_rate": 2.905757653487487e-06, "loss": 0.3995, "num_input_tokens_seen": 100731312, "step": 105490 }, { "epoch": 8.605514316012725, "grad_norm": 14.222803115844727, "learning_rate": 2.904092410823833e-06, "loss": 0.2682, "num_input_tokens_seen": 100736160, "step": 105495 }, { "epoch": 8.605922179623134, "grad_norm": 5.13090705871582, "learning_rate": 2.9024276160348336e-06, "loss": 0.2294, "num_input_tokens_seen": 100740528, "step": 105500 }, { "epoch": 8.606330043233543, "grad_norm": 1.8879632949829102, "learning_rate": 2.900763269154241e-06, "loss": 0.2531, "num_input_tokens_seen": 100745200, "step": 105505 }, { "epoch": 8.606737906843952, "grad_norm": 42.97294998168945, "learning_rate": 2.899099370215788e-06, "loss": 0.3309, "num_input_tokens_seen": 100749952, "step": 105510 }, { "epoch": 8.60714577045436, "grad_norm": 6.727669715881348, "learning_rate": 2.8974359192531997e-06, "loss": 0.341, "num_input_tokens_seen": 100754784, "step": 105515 }, { "epoch": 8.607553634064768, "grad_norm": 24.563919067382812, "learning_rate": 2.895772916300185e-06, "loss": 0.3497, "num_input_tokens_seen": 100760064, "step": 105520 }, { "epoch": 8.607961497675177, "grad_norm": 3.8567357063293457, "learning_rate": 2.8941103613904675e-06, "loss": 0.3029, "num_input_tokens_seen": 100764960, "step": 105525 }, { "epoch": 8.608369361285586, "grad_norm": 3.646350860595703, "learning_rate": 2.892448254557739e-06, "loss": 0.349, "num_input_tokens_seen": 100769648, "step": 105530 }, { "epoch": 8.608777224895995, "grad_norm": 9.276671409606934, "learning_rate": 2.890786595835693e-06, "loss": 0.1994, "num_input_tokens_seen": 100773744, "step": 105535 }, { "epoch": 8.609185088506404, "grad_norm": 1.53314208984375, "learning_rate": 2.889125385258004e-06, "loss": 0.3361, "num_input_tokens_seen": 100778768, "step": 105540 }, { "epoch": 8.609592952116813, "grad_norm": 32.4929313659668, "learning_rate": 2.8874646228583464e-06, "loss": 0.3574, "num_input_tokens_seen": 100782928, "step": 105545 }, { "epoch": 8.61000081572722, "grad_norm": 16.26817512512207, "learning_rate": 2.8858043086703867e-06, "loss": 0.3049, "num_input_tokens_seen": 100788464, "step": 105550 }, { "epoch": 8.61040867933763, "grad_norm": 12.907211303710938, "learning_rate": 2.8841444427277793e-06, "loss": 0.3076, "num_input_tokens_seen": 100792816, "step": 105555 }, { "epoch": 8.610816542948038, "grad_norm": 4.171265125274658, "learning_rate": 2.8824850250641663e-06, "loss": 0.298, "num_input_tokens_seen": 100797488, "step": 105560 }, { "epoch": 8.611224406558447, "grad_norm": 7.323258876800537, "learning_rate": 2.8808260557131816e-06, "loss": 0.4093, "num_input_tokens_seen": 100802736, "step": 105565 }, { "epoch": 8.611632270168856, "grad_norm": 18.831642150878906, "learning_rate": 2.8791675347084546e-06, "loss": 0.3531, "num_input_tokens_seen": 100807952, "step": 105570 }, { "epoch": 8.612040133779264, "grad_norm": 12.556021690368652, "learning_rate": 2.877509462083597e-06, "loss": 0.32, "num_input_tokens_seen": 100812880, "step": 105575 }, { "epoch": 8.612447997389673, "grad_norm": 2.9122915267944336, "learning_rate": 2.8758518378722287e-06, "loss": 0.2896, "num_input_tokens_seen": 100818192, "step": 105580 }, { "epoch": 8.612855861000082, "grad_norm": 36.57927322387695, "learning_rate": 2.8741946621079402e-06, "loss": 0.406, "num_input_tokens_seen": 100822368, "step": 105585 }, { "epoch": 8.61326372461049, "grad_norm": 11.2621488571167, "learning_rate": 2.8725379348243263e-06, "loss": 0.3251, "num_input_tokens_seen": 100826576, "step": 105590 }, { "epoch": 8.6136715882209, "grad_norm": 2.100067138671875, "learning_rate": 2.8708816560549634e-06, "loss": 0.2439, "num_input_tokens_seen": 100831920, "step": 105595 }, { "epoch": 8.614079451831307, "grad_norm": 6.513971328735352, "learning_rate": 2.869225825833427e-06, "loss": 0.2802, "num_input_tokens_seen": 100836448, "step": 105600 }, { "epoch": 8.614487315441716, "grad_norm": 5.550389289855957, "learning_rate": 2.867570444193279e-06, "loss": 0.3878, "num_input_tokens_seen": 100840672, "step": 105605 }, { "epoch": 8.614895179052125, "grad_norm": 5.696445941925049, "learning_rate": 2.8659155111680712e-06, "loss": 0.2657, "num_input_tokens_seen": 100846112, "step": 105610 }, { "epoch": 8.615303042662534, "grad_norm": 2.9157874584198, "learning_rate": 2.8642610267913538e-06, "loss": 0.2715, "num_input_tokens_seen": 100850800, "step": 105615 }, { "epoch": 8.615710906272943, "grad_norm": 5.696296215057373, "learning_rate": 2.8626069910966587e-06, "loss": 0.3383, "num_input_tokens_seen": 100854992, "step": 105620 }, { "epoch": 8.61611876988335, "grad_norm": 14.491329193115234, "learning_rate": 2.860953404117514e-06, "loss": 0.1579, "num_input_tokens_seen": 100859760, "step": 105625 }, { "epoch": 8.61652663349376, "grad_norm": 1.6584644317626953, "learning_rate": 2.859300265887435e-06, "loss": 0.3455, "num_input_tokens_seen": 100864752, "step": 105630 }, { "epoch": 8.616934497104168, "grad_norm": 1.1550114154815674, "learning_rate": 2.857647576439926e-06, "loss": 0.2249, "num_input_tokens_seen": 100870160, "step": 105635 }, { "epoch": 8.617342360714577, "grad_norm": 12.028351783752441, "learning_rate": 2.855995335808498e-06, "loss": 0.1736, "num_input_tokens_seen": 100875728, "step": 105640 }, { "epoch": 8.617750224324986, "grad_norm": 25.135456085205078, "learning_rate": 2.8543435440266357e-06, "loss": 0.2171, "num_input_tokens_seen": 100880016, "step": 105645 }, { "epoch": 8.618158087935395, "grad_norm": 2.193660020828247, "learning_rate": 2.8526922011278183e-06, "loss": 0.4039, "num_input_tokens_seen": 100885168, "step": 105650 }, { "epoch": 8.618565951545802, "grad_norm": 4.345657825469971, "learning_rate": 2.851041307145516e-06, "loss": 0.4741, "num_input_tokens_seen": 100889808, "step": 105655 }, { "epoch": 8.618973815156211, "grad_norm": 4.250270843505859, "learning_rate": 2.8493908621131965e-06, "loss": 0.3891, "num_input_tokens_seen": 100894192, "step": 105660 }, { "epoch": 8.61938167876662, "grad_norm": 40.313148498535156, "learning_rate": 2.8477408660643136e-06, "loss": 0.4093, "num_input_tokens_seen": 100899264, "step": 105665 }, { "epoch": 8.61978954237703, "grad_norm": 17.168052673339844, "learning_rate": 2.8460913190323123e-06, "loss": 0.3468, "num_input_tokens_seen": 100903840, "step": 105670 }, { "epoch": 8.620197405987438, "grad_norm": 1.9104493856430054, "learning_rate": 2.844442221050622e-06, "loss": 0.1945, "num_input_tokens_seen": 100909536, "step": 105675 }, { "epoch": 8.620605269597846, "grad_norm": 38.5084228515625, "learning_rate": 2.8427935721526714e-06, "loss": 0.4043, "num_input_tokens_seen": 100914848, "step": 105680 }, { "epoch": 8.621013133208255, "grad_norm": 8.840928077697754, "learning_rate": 2.841145372371884e-06, "loss": 0.1793, "num_input_tokens_seen": 100919312, "step": 105685 }, { "epoch": 8.621420996818664, "grad_norm": 3.178041934967041, "learning_rate": 2.8394976217416657e-06, "loss": 0.332, "num_input_tokens_seen": 100924144, "step": 105690 }, { "epoch": 8.621828860429073, "grad_norm": 34.62815856933594, "learning_rate": 2.8378503202954127e-06, "loss": 0.4272, "num_input_tokens_seen": 100929600, "step": 105695 }, { "epoch": 8.622236724039482, "grad_norm": 1.716785192489624, "learning_rate": 2.836203468066509e-06, "loss": 0.2225, "num_input_tokens_seen": 100933856, "step": 105700 }, { "epoch": 8.62264458764989, "grad_norm": 28.196762084960938, "learning_rate": 2.83455706508835e-06, "loss": 0.3728, "num_input_tokens_seen": 100938768, "step": 105705 }, { "epoch": 8.623052451260298, "grad_norm": 3.5378048419952393, "learning_rate": 2.8329111113943012e-06, "loss": 0.2541, "num_input_tokens_seen": 100943776, "step": 105710 }, { "epoch": 8.623460314870707, "grad_norm": 2.8862736225128174, "learning_rate": 2.8312656070177248e-06, "loss": 0.2015, "num_input_tokens_seen": 100947488, "step": 105715 }, { "epoch": 8.623868178481116, "grad_norm": 4.453917980194092, "learning_rate": 2.829620551991974e-06, "loss": 0.5121, "num_input_tokens_seen": 100951888, "step": 105720 }, { "epoch": 8.624276042091525, "grad_norm": 44.76291275024414, "learning_rate": 2.8279759463503947e-06, "loss": 0.5213, "num_input_tokens_seen": 100956320, "step": 105725 }, { "epoch": 8.624683905701934, "grad_norm": 30.188642501831055, "learning_rate": 2.8263317901263213e-06, "loss": 0.4318, "num_input_tokens_seen": 100959968, "step": 105730 }, { "epoch": 8.625091769312341, "grad_norm": 9.609269142150879, "learning_rate": 2.8246880833530805e-06, "loss": 0.3762, "num_input_tokens_seen": 100965136, "step": 105735 }, { "epoch": 8.62549963292275, "grad_norm": 7.083739280700684, "learning_rate": 2.823044826063989e-06, "loss": 0.4924, "num_input_tokens_seen": 100969248, "step": 105740 }, { "epoch": 8.62590749653316, "grad_norm": 36.21683883666992, "learning_rate": 2.821402018292349e-06, "loss": 0.308, "num_input_tokens_seen": 100973856, "step": 105745 }, { "epoch": 8.626315360143568, "grad_norm": 4.3134846687316895, "learning_rate": 2.819759660071472e-06, "loss": 0.3794, "num_input_tokens_seen": 100977904, "step": 105750 }, { "epoch": 8.626723223753977, "grad_norm": 1.040696144104004, "learning_rate": 2.818117751434643e-06, "loss": 0.241, "num_input_tokens_seen": 100982704, "step": 105755 }, { "epoch": 8.627131087364386, "grad_norm": 6.0997185707092285, "learning_rate": 2.8164762924151377e-06, "loss": 0.4645, "num_input_tokens_seen": 100987008, "step": 105760 }, { "epoch": 8.627538950974794, "grad_norm": 4.352069854736328, "learning_rate": 2.8148352830462297e-06, "loss": 0.2025, "num_input_tokens_seen": 100991344, "step": 105765 }, { "epoch": 8.627946814585203, "grad_norm": 3.330211877822876, "learning_rate": 2.81319472336119e-06, "loss": 0.2636, "num_input_tokens_seen": 100996464, "step": 105770 }, { "epoch": 8.628354678195612, "grad_norm": 9.05460262298584, "learning_rate": 2.8115546133932608e-06, "loss": 0.3839, "num_input_tokens_seen": 101000544, "step": 105775 }, { "epoch": 8.62876254180602, "grad_norm": 3.6563472747802734, "learning_rate": 2.809914953175696e-06, "loss": 0.4659, "num_input_tokens_seen": 101006144, "step": 105780 }, { "epoch": 8.62917040541643, "grad_norm": 3.0955393314361572, "learning_rate": 2.8082757427417228e-06, "loss": 0.4263, "num_input_tokens_seen": 101011072, "step": 105785 }, { "epoch": 8.629578269026837, "grad_norm": 16.866003036499023, "learning_rate": 2.8066369821245664e-06, "loss": 0.31, "num_input_tokens_seen": 101016400, "step": 105790 }, { "epoch": 8.629986132637246, "grad_norm": 31.812763214111328, "learning_rate": 2.804998671357453e-06, "loss": 0.285, "num_input_tokens_seen": 101020688, "step": 105795 }, { "epoch": 8.630393996247655, "grad_norm": 18.26180076599121, "learning_rate": 2.8033608104735846e-06, "loss": 0.1389, "num_input_tokens_seen": 101025504, "step": 105800 }, { "epoch": 8.630801859858064, "grad_norm": 17.23478889465332, "learning_rate": 2.801723399506162e-06, "loss": 0.2892, "num_input_tokens_seen": 101030416, "step": 105805 }, { "epoch": 8.631209723468473, "grad_norm": 25.1433048248291, "learning_rate": 2.800086438488367e-06, "loss": 0.4744, "num_input_tokens_seen": 101035376, "step": 105810 }, { "epoch": 8.63161758707888, "grad_norm": 19.05306625366211, "learning_rate": 2.798449927453392e-06, "loss": 0.2697, "num_input_tokens_seen": 101041456, "step": 105815 }, { "epoch": 8.632025450689289, "grad_norm": 2.6409785747528076, "learning_rate": 2.7968138664344027e-06, "loss": 0.1577, "num_input_tokens_seen": 101045744, "step": 105820 }, { "epoch": 8.632433314299698, "grad_norm": 18.93153953552246, "learning_rate": 2.7951782554645615e-06, "loss": 0.2949, "num_input_tokens_seen": 101050352, "step": 105825 }, { "epoch": 8.632841177910107, "grad_norm": 10.26545238494873, "learning_rate": 2.7935430945770224e-06, "loss": 0.3253, "num_input_tokens_seen": 101055904, "step": 105830 }, { "epoch": 8.633249041520516, "grad_norm": 18.85254669189453, "learning_rate": 2.791908383804925e-06, "loss": 0.3963, "num_input_tokens_seen": 101061248, "step": 105835 }, { "epoch": 8.633656905130923, "grad_norm": 51.324581146240234, "learning_rate": 2.7902741231814108e-06, "loss": 0.2898, "num_input_tokens_seen": 101065920, "step": 105840 }, { "epoch": 8.634064768741332, "grad_norm": 32.12541961669922, "learning_rate": 2.788640312739604e-06, "loss": 0.2782, "num_input_tokens_seen": 101069648, "step": 105845 }, { "epoch": 8.634472632351741, "grad_norm": 1.174517273902893, "learning_rate": 2.7870069525126192e-06, "loss": 0.3374, "num_input_tokens_seen": 101074224, "step": 105850 }, { "epoch": 8.63488049596215, "grad_norm": 3.114851474761963, "learning_rate": 2.7853740425335644e-06, "loss": 0.2991, "num_input_tokens_seen": 101078688, "step": 105855 }, { "epoch": 8.63528835957256, "grad_norm": 3.053295135498047, "learning_rate": 2.783741582835539e-06, "loss": 0.2841, "num_input_tokens_seen": 101082720, "step": 105860 }, { "epoch": 8.635696223182968, "grad_norm": 14.489771842956543, "learning_rate": 2.7821095734516302e-06, "loss": 0.2907, "num_input_tokens_seen": 101088096, "step": 105865 }, { "epoch": 8.636104086793376, "grad_norm": 11.145923614501953, "learning_rate": 2.7804780144149195e-06, "loss": 0.412, "num_input_tokens_seen": 101093488, "step": 105870 }, { "epoch": 8.636511950403785, "grad_norm": 1.828563928604126, "learning_rate": 2.7788469057584784e-06, "loss": 0.4534, "num_input_tokens_seen": 101098080, "step": 105875 }, { "epoch": 8.636919814014194, "grad_norm": 28.39286994934082, "learning_rate": 2.777216247515363e-06, "loss": 0.3077, "num_input_tokens_seen": 101102304, "step": 105880 }, { "epoch": 8.637327677624603, "grad_norm": 5.845297336578369, "learning_rate": 2.7755860397186363e-06, "loss": 0.4089, "num_input_tokens_seen": 101106160, "step": 105885 }, { "epoch": 8.637735541235012, "grad_norm": 14.784444808959961, "learning_rate": 2.7739562824013353e-06, "loss": 0.3344, "num_input_tokens_seen": 101111040, "step": 105890 }, { "epoch": 8.638143404845419, "grad_norm": 9.855878829956055, "learning_rate": 2.7723269755964982e-06, "loss": 0.3441, "num_input_tokens_seen": 101116080, "step": 105895 }, { "epoch": 8.638551268455828, "grad_norm": 16.118894577026367, "learning_rate": 2.7706981193371425e-06, "loss": 0.4666, "num_input_tokens_seen": 101121184, "step": 105900 }, { "epoch": 8.638959132066237, "grad_norm": 8.32442855834961, "learning_rate": 2.7690697136562943e-06, "loss": 0.4082, "num_input_tokens_seen": 101125040, "step": 105905 }, { "epoch": 8.639366995676646, "grad_norm": 19.436744689941406, "learning_rate": 2.7674417585869583e-06, "loss": 0.3952, "num_input_tokens_seen": 101129888, "step": 105910 }, { "epoch": 8.639774859287055, "grad_norm": 7.898425579071045, "learning_rate": 2.7658142541621302e-06, "loss": 0.2641, "num_input_tokens_seen": 101134560, "step": 105915 }, { "epoch": 8.640182722897464, "grad_norm": 12.019031524658203, "learning_rate": 2.7641872004147945e-06, "loss": 0.3417, "num_input_tokens_seen": 101139024, "step": 105920 }, { "epoch": 8.640590586507871, "grad_norm": 7.846596717834473, "learning_rate": 2.762560597377939e-06, "loss": 0.4554, "num_input_tokens_seen": 101143792, "step": 105925 }, { "epoch": 8.64099845011828, "grad_norm": 0.7826513051986694, "learning_rate": 2.7609344450845314e-06, "loss": 0.3938, "num_input_tokens_seen": 101148352, "step": 105930 }, { "epoch": 8.64140631372869, "grad_norm": 23.102218627929688, "learning_rate": 2.7593087435675346e-06, "loss": 0.2834, "num_input_tokens_seen": 101153488, "step": 105935 }, { "epoch": 8.641814177339098, "grad_norm": 3.992441177368164, "learning_rate": 2.757683492859897e-06, "loss": 0.3693, "num_input_tokens_seen": 101158048, "step": 105940 }, { "epoch": 8.642222040949507, "grad_norm": 3.2523038387298584, "learning_rate": 2.7560586929945592e-06, "loss": 0.3683, "num_input_tokens_seen": 101162736, "step": 105945 }, { "epoch": 8.642629904559914, "grad_norm": 71.17810821533203, "learning_rate": 2.754434344004464e-06, "loss": 0.2606, "num_input_tokens_seen": 101167744, "step": 105950 }, { "epoch": 8.643037768170323, "grad_norm": 6.196043014526367, "learning_rate": 2.752810445922532e-06, "loss": 0.3381, "num_input_tokens_seen": 101172352, "step": 105955 }, { "epoch": 8.643445631780732, "grad_norm": 16.20943260192871, "learning_rate": 2.7511869987816798e-06, "loss": 0.3594, "num_input_tokens_seen": 101176624, "step": 105960 }, { "epoch": 8.643853495391141, "grad_norm": 34.916419982910156, "learning_rate": 2.7495640026148074e-06, "loss": 0.6151, "num_input_tokens_seen": 101181520, "step": 105965 }, { "epoch": 8.64426135900155, "grad_norm": 31.670854568481445, "learning_rate": 2.7479414574548253e-06, "loss": 0.4153, "num_input_tokens_seen": 101186240, "step": 105970 }, { "epoch": 8.64466922261196, "grad_norm": 3.48002290725708, "learning_rate": 2.7463193633346122e-06, "loss": 0.2056, "num_input_tokens_seen": 101190672, "step": 105975 }, { "epoch": 8.645077086222367, "grad_norm": 6.4150776863098145, "learning_rate": 2.7446977202870484e-06, "loss": 0.2908, "num_input_tokens_seen": 101195312, "step": 105980 }, { "epoch": 8.645484949832776, "grad_norm": 8.777250289916992, "learning_rate": 2.7430765283450067e-06, "loss": 0.4036, "num_input_tokens_seen": 101200112, "step": 105985 }, { "epoch": 8.645892813443185, "grad_norm": 9.346780776977539, "learning_rate": 2.7414557875413444e-06, "loss": 0.2968, "num_input_tokens_seen": 101204320, "step": 105990 }, { "epoch": 8.646300677053594, "grad_norm": 5.576620101928711, "learning_rate": 2.739835497908916e-06, "loss": 0.241, "num_input_tokens_seen": 101208560, "step": 105995 }, { "epoch": 8.646708540664003, "grad_norm": 5.154089450836182, "learning_rate": 2.738215659480564e-06, "loss": 0.2964, "num_input_tokens_seen": 101213728, "step": 106000 }, { "epoch": 8.64711640427441, "grad_norm": 35.51169204711914, "learning_rate": 2.7365962722891214e-06, "loss": 0.3047, "num_input_tokens_seen": 101217792, "step": 106005 }, { "epoch": 8.647524267884819, "grad_norm": 7.408905029296875, "learning_rate": 2.7349773363674056e-06, "loss": 0.2019, "num_input_tokens_seen": 101222784, "step": 106010 }, { "epoch": 8.647932131495228, "grad_norm": 36.854251861572266, "learning_rate": 2.7333588517482438e-06, "loss": 0.3391, "num_input_tokens_seen": 101228416, "step": 106015 }, { "epoch": 8.648339995105637, "grad_norm": 3.521674394607544, "learning_rate": 2.7317408184644346e-06, "loss": 0.3687, "num_input_tokens_seen": 101232800, "step": 106020 }, { "epoch": 8.648747858716046, "grad_norm": 1.9186991453170776, "learning_rate": 2.730123236548779e-06, "loss": 0.3968, "num_input_tokens_seen": 101237888, "step": 106025 }, { "epoch": 8.649155722326453, "grad_norm": 8.169920921325684, "learning_rate": 2.7285061060340623e-06, "loss": 0.2224, "num_input_tokens_seen": 101242224, "step": 106030 }, { "epoch": 8.649563585936862, "grad_norm": 1.461584448814392, "learning_rate": 2.7268894269530582e-06, "loss": 0.2846, "num_input_tokens_seen": 101246832, "step": 106035 }, { "epoch": 8.649971449547271, "grad_norm": 3.045335054397583, "learning_rate": 2.7252731993385432e-06, "loss": 0.403, "num_input_tokens_seen": 101250576, "step": 106040 }, { "epoch": 8.65037931315768, "grad_norm": 8.974105834960938, "learning_rate": 2.7236574232232776e-06, "loss": 0.4668, "num_input_tokens_seen": 101255952, "step": 106045 }, { "epoch": 8.65078717676809, "grad_norm": 2.826265335083008, "learning_rate": 2.7220420986400096e-06, "loss": 0.5185, "num_input_tokens_seen": 101261584, "step": 106050 }, { "epoch": 8.651195040378497, "grad_norm": 27.333576202392578, "learning_rate": 2.7204272256214776e-06, "loss": 0.5191, "num_input_tokens_seen": 101266128, "step": 106055 }, { "epoch": 8.651602903988906, "grad_norm": 64.11355590820312, "learning_rate": 2.718812804200424e-06, "loss": 0.4004, "num_input_tokens_seen": 101270464, "step": 106060 }, { "epoch": 8.652010767599315, "grad_norm": 7.674990653991699, "learning_rate": 2.7171988344095678e-06, "loss": 0.3499, "num_input_tokens_seen": 101275616, "step": 106065 }, { "epoch": 8.652418631209724, "grad_norm": 82.5406265258789, "learning_rate": 2.7155853162816185e-06, "loss": 0.3922, "num_input_tokens_seen": 101281120, "step": 106070 }, { "epoch": 8.652826494820133, "grad_norm": 13.2673978805542, "learning_rate": 2.7139722498492837e-06, "loss": 0.2615, "num_input_tokens_seen": 101286656, "step": 106075 }, { "epoch": 8.653234358430542, "grad_norm": 12.917817115783691, "learning_rate": 2.7123596351452673e-06, "loss": 0.3982, "num_input_tokens_seen": 101291632, "step": 106080 }, { "epoch": 8.653642222040949, "grad_norm": 3.864781379699707, "learning_rate": 2.710747472202249e-06, "loss": 0.1799, "num_input_tokens_seen": 101296096, "step": 106085 }, { "epoch": 8.654050085651358, "grad_norm": 2.1221351623535156, "learning_rate": 2.7091357610529083e-06, "loss": 0.2927, "num_input_tokens_seen": 101301808, "step": 106090 }, { "epoch": 8.654457949261767, "grad_norm": 2.615551710128784, "learning_rate": 2.7075245017299134e-06, "loss": 0.2557, "num_input_tokens_seen": 101307344, "step": 106095 }, { "epoch": 8.654865812872176, "grad_norm": 3.4646143913269043, "learning_rate": 2.7059136942659185e-06, "loss": 0.1995, "num_input_tokens_seen": 101313072, "step": 106100 }, { "epoch": 8.655273676482585, "grad_norm": 0.7176905870437622, "learning_rate": 2.7043033386935867e-06, "loss": 0.2782, "num_input_tokens_seen": 101317664, "step": 106105 }, { "epoch": 8.655681540092992, "grad_norm": 12.199434280395508, "learning_rate": 2.702693435045553e-06, "loss": 0.3389, "num_input_tokens_seen": 101322864, "step": 106110 }, { "epoch": 8.656089403703401, "grad_norm": 6.440886974334717, "learning_rate": 2.7010839833544467e-06, "loss": 0.5015, "num_input_tokens_seen": 101327472, "step": 106115 }, { "epoch": 8.65649726731381, "grad_norm": 17.47525978088379, "learning_rate": 2.6994749836528914e-06, "loss": 0.4498, "num_input_tokens_seen": 101332608, "step": 106120 }, { "epoch": 8.656905130924219, "grad_norm": 7.390317440032959, "learning_rate": 2.6978664359735034e-06, "loss": 0.28, "num_input_tokens_seen": 101337440, "step": 106125 }, { "epoch": 8.657312994534628, "grad_norm": 4.188887596130371, "learning_rate": 2.6962583403488865e-06, "loss": 0.5447, "num_input_tokens_seen": 101341440, "step": 106130 }, { "epoch": 8.657720858145037, "grad_norm": 18.644380569458008, "learning_rate": 2.6946506968116347e-06, "loss": 0.2878, "num_input_tokens_seen": 101346384, "step": 106135 }, { "epoch": 8.658128721755444, "grad_norm": 13.977474212646484, "learning_rate": 2.6930435053943383e-06, "loss": 0.6086, "num_input_tokens_seen": 101351104, "step": 106140 }, { "epoch": 8.658536585365853, "grad_norm": 11.91537857055664, "learning_rate": 2.6914367661295625e-06, "loss": 0.4498, "num_input_tokens_seen": 101356608, "step": 106145 }, { "epoch": 8.658944448976262, "grad_norm": 94.18411254882812, "learning_rate": 2.68983047904989e-06, "loss": 0.5588, "num_input_tokens_seen": 101361312, "step": 106150 }, { "epoch": 8.659352312586671, "grad_norm": 3.4355294704437256, "learning_rate": 2.6882246441878757e-06, "loss": 0.3909, "num_input_tokens_seen": 101365824, "step": 106155 }, { "epoch": 8.65976017619708, "grad_norm": 3.9306890964508057, "learning_rate": 2.686619261576065e-06, "loss": 0.3672, "num_input_tokens_seen": 101370416, "step": 106160 }, { "epoch": 8.660168039807488, "grad_norm": 13.379647254943848, "learning_rate": 2.6850143312469932e-06, "loss": 0.269, "num_input_tokens_seen": 101375600, "step": 106165 }, { "epoch": 8.660575903417897, "grad_norm": 2.7288758754730225, "learning_rate": 2.683409853233207e-06, "loss": 0.3412, "num_input_tokens_seen": 101380512, "step": 106170 }, { "epoch": 8.660983767028306, "grad_norm": 22.197341918945312, "learning_rate": 2.6818058275672185e-06, "loss": 0.4847, "num_input_tokens_seen": 101385264, "step": 106175 }, { "epoch": 8.661391630638715, "grad_norm": 3.557152271270752, "learning_rate": 2.680202254281544e-06, "loss": 0.3264, "num_input_tokens_seen": 101390336, "step": 106180 }, { "epoch": 8.661799494249124, "grad_norm": 12.339669227600098, "learning_rate": 2.6785991334086826e-06, "loss": 0.3123, "num_input_tokens_seen": 101395440, "step": 106185 }, { "epoch": 8.662207357859533, "grad_norm": 27.768354415893555, "learning_rate": 2.6769964649811295e-06, "loss": 0.2481, "num_input_tokens_seen": 101400016, "step": 106190 }, { "epoch": 8.66261522146994, "grad_norm": 12.358369827270508, "learning_rate": 2.6753942490313767e-06, "loss": 0.2875, "num_input_tokens_seen": 101404304, "step": 106195 }, { "epoch": 8.663023085080349, "grad_norm": 34.33892822265625, "learning_rate": 2.673792485591897e-06, "loss": 0.451, "num_input_tokens_seen": 101408912, "step": 106200 }, { "epoch": 8.663430948690758, "grad_norm": 16.35283851623535, "learning_rate": 2.6721911746951544e-06, "loss": 0.3416, "num_input_tokens_seen": 101413776, "step": 106205 }, { "epoch": 8.663838812301167, "grad_norm": 2.6965692043304443, "learning_rate": 2.670590316373606e-06, "loss": 0.5297, "num_input_tokens_seen": 101419040, "step": 106210 }, { "epoch": 8.664246675911576, "grad_norm": 3.6266613006591797, "learning_rate": 2.6689899106597085e-06, "loss": 0.3596, "num_input_tokens_seen": 101423232, "step": 106215 }, { "epoch": 8.664654539521983, "grad_norm": 17.22270393371582, "learning_rate": 2.6673899575858948e-06, "loss": 0.385, "num_input_tokens_seen": 101427808, "step": 106220 }, { "epoch": 8.665062403132392, "grad_norm": 32.058284759521484, "learning_rate": 2.6657904571845998e-06, "loss": 0.3685, "num_input_tokens_seen": 101432800, "step": 106225 }, { "epoch": 8.665470266742801, "grad_norm": 2.151099920272827, "learning_rate": 2.66419140948824e-06, "loss": 0.4054, "num_input_tokens_seen": 101437472, "step": 106230 }, { "epoch": 8.66587813035321, "grad_norm": 2.789733648300171, "learning_rate": 2.6625928145292246e-06, "loss": 0.3974, "num_input_tokens_seen": 101441472, "step": 106235 }, { "epoch": 8.66628599396362, "grad_norm": 1.454933762550354, "learning_rate": 2.660994672339964e-06, "loss": 0.4083, "num_input_tokens_seen": 101446176, "step": 106240 }, { "epoch": 8.666693857574026, "grad_norm": 15.047487258911133, "learning_rate": 2.6593969829528525e-06, "loss": 0.4442, "num_input_tokens_seen": 101450816, "step": 106245 }, { "epoch": 8.667101721184435, "grad_norm": 8.730843544006348, "learning_rate": 2.657799746400269e-06, "loss": 0.377, "num_input_tokens_seen": 101455696, "step": 106250 }, { "epoch": 8.667509584794844, "grad_norm": 0.4181903302669525, "learning_rate": 2.6562029627145905e-06, "loss": 0.222, "num_input_tokens_seen": 101460832, "step": 106255 }, { "epoch": 8.667917448405253, "grad_norm": 4.31063175201416, "learning_rate": 2.6546066319281833e-06, "loss": 0.3359, "num_input_tokens_seen": 101466016, "step": 106260 }, { "epoch": 8.668325312015662, "grad_norm": 1.7274441719055176, "learning_rate": 2.6530107540734065e-06, "loss": 0.4064, "num_input_tokens_seen": 101470864, "step": 106265 }, { "epoch": 8.66873317562607, "grad_norm": 10.022416114807129, "learning_rate": 2.6514153291825993e-06, "loss": 0.3758, "num_input_tokens_seen": 101476560, "step": 106270 }, { "epoch": 8.669141039236479, "grad_norm": 8.17855167388916, "learning_rate": 2.6498203572881132e-06, "loss": 0.3756, "num_input_tokens_seen": 101481600, "step": 106275 }, { "epoch": 8.669548902846888, "grad_norm": 20.996826171875, "learning_rate": 2.648225838422272e-06, "loss": 0.4385, "num_input_tokens_seen": 101486496, "step": 106280 }, { "epoch": 8.669956766457297, "grad_norm": 2.9042179584503174, "learning_rate": 2.6466317726173916e-06, "loss": 0.2504, "num_input_tokens_seen": 101491424, "step": 106285 }, { "epoch": 8.670364630067706, "grad_norm": 2.8051376342773438, "learning_rate": 2.645038159905791e-06, "loss": 0.2926, "num_input_tokens_seen": 101496816, "step": 106290 }, { "epoch": 8.670772493678115, "grad_norm": 0.7826647162437439, "learning_rate": 2.6434450003197636e-06, "loss": 0.4335, "num_input_tokens_seen": 101501760, "step": 106295 }, { "epoch": 8.671180357288522, "grad_norm": 20.893814086914062, "learning_rate": 2.641852293891603e-06, "loss": 0.3636, "num_input_tokens_seen": 101506992, "step": 106300 }, { "epoch": 8.671588220898931, "grad_norm": 8.180767059326172, "learning_rate": 2.6402600406536026e-06, "loss": 0.2114, "num_input_tokens_seen": 101512080, "step": 106305 }, { "epoch": 8.67199608450934, "grad_norm": 7.042655944824219, "learning_rate": 2.6386682406380287e-06, "loss": 0.2319, "num_input_tokens_seen": 101517056, "step": 106310 }, { "epoch": 8.672403948119749, "grad_norm": 12.199871063232422, "learning_rate": 2.6370768938771467e-06, "loss": 0.1592, "num_input_tokens_seen": 101521584, "step": 106315 }, { "epoch": 8.672811811730158, "grad_norm": 3.4912636280059814, "learning_rate": 2.635486000403209e-06, "loss": 0.382, "num_input_tokens_seen": 101525072, "step": 106320 }, { "epoch": 8.673219675340565, "grad_norm": 4.747876167297363, "learning_rate": 2.6338955602484728e-06, "loss": 0.3076, "num_input_tokens_seen": 101530528, "step": 106325 }, { "epoch": 8.673627538950974, "grad_norm": 3.452732801437378, "learning_rate": 2.632305573445168e-06, "loss": 0.3562, "num_input_tokens_seen": 101534848, "step": 106330 }, { "epoch": 8.674035402561383, "grad_norm": 2.4959230422973633, "learning_rate": 2.6307160400255243e-06, "loss": 0.2667, "num_input_tokens_seen": 101539808, "step": 106335 }, { "epoch": 8.674443266171792, "grad_norm": 31.5606632232666, "learning_rate": 2.629126960021763e-06, "loss": 0.3972, "num_input_tokens_seen": 101545648, "step": 106340 }, { "epoch": 8.674851129782201, "grad_norm": 2.749925374984741, "learning_rate": 2.627538333466087e-06, "loss": 0.3112, "num_input_tokens_seen": 101550656, "step": 106345 }, { "epoch": 8.67525899339261, "grad_norm": 17.798154830932617, "learning_rate": 2.6259501603907054e-06, "loss": 0.5191, "num_input_tokens_seen": 101555952, "step": 106350 }, { "epoch": 8.675666857003018, "grad_norm": 2.4824092388153076, "learning_rate": 2.62436244082781e-06, "loss": 0.2243, "num_input_tokens_seen": 101561296, "step": 106355 }, { "epoch": 8.676074720613427, "grad_norm": 1.673254370689392, "learning_rate": 2.6227751748095776e-06, "loss": 0.2878, "num_input_tokens_seen": 101565616, "step": 106360 }, { "epoch": 8.676482584223836, "grad_norm": 6.644129753112793, "learning_rate": 2.6211883623681793e-06, "loss": 0.3026, "num_input_tokens_seen": 101569984, "step": 106365 }, { "epoch": 8.676890447834245, "grad_norm": 7.183223724365234, "learning_rate": 2.619602003535787e-06, "loss": 0.3114, "num_input_tokens_seen": 101574096, "step": 106370 }, { "epoch": 8.677298311444654, "grad_norm": 36.35786437988281, "learning_rate": 2.6180160983445556e-06, "loss": 0.2089, "num_input_tokens_seen": 101579472, "step": 106375 }, { "epoch": 8.67770617505506, "grad_norm": 3.3062775135040283, "learning_rate": 2.616430646826623e-06, "loss": 0.327, "num_input_tokens_seen": 101584224, "step": 106380 }, { "epoch": 8.67811403866547, "grad_norm": 9.715578079223633, "learning_rate": 2.614845649014133e-06, "loss": 0.371, "num_input_tokens_seen": 101588560, "step": 106385 }, { "epoch": 8.678521902275879, "grad_norm": 33.97730255126953, "learning_rate": 2.613261104939205e-06, "loss": 0.3023, "num_input_tokens_seen": 101593424, "step": 106390 }, { "epoch": 8.678929765886288, "grad_norm": 13.271937370300293, "learning_rate": 2.611677014633965e-06, "loss": 0.2791, "num_input_tokens_seen": 101598736, "step": 106395 }, { "epoch": 8.679337629496697, "grad_norm": 31.551057815551758, "learning_rate": 2.610093378130518e-06, "loss": 0.483, "num_input_tokens_seen": 101604144, "step": 106400 }, { "epoch": 8.679745493107106, "grad_norm": 2.3484668731689453, "learning_rate": 2.608510195460964e-06, "loss": 0.431, "num_input_tokens_seen": 101608784, "step": 106405 }, { "epoch": 8.680153356717513, "grad_norm": 29.671897888183594, "learning_rate": 2.6069274666573965e-06, "loss": 0.2763, "num_input_tokens_seen": 101613840, "step": 106410 }, { "epoch": 8.680561220327922, "grad_norm": 4.809405326843262, "learning_rate": 2.605345191751893e-06, "loss": 0.424, "num_input_tokens_seen": 101619248, "step": 106415 }, { "epoch": 8.680969083938331, "grad_norm": 16.291519165039062, "learning_rate": 2.603763370776524e-06, "loss": 0.3612, "num_input_tokens_seen": 101623360, "step": 106420 }, { "epoch": 8.68137694754874, "grad_norm": 20.083606719970703, "learning_rate": 2.6021820037633566e-06, "loss": 0.3567, "num_input_tokens_seen": 101628496, "step": 106425 }, { "epoch": 8.68178481115915, "grad_norm": 23.07811164855957, "learning_rate": 2.600601090744442e-06, "loss": 0.5123, "num_input_tokens_seen": 101633824, "step": 106430 }, { "epoch": 8.682192674769556, "grad_norm": 2.98006534576416, "learning_rate": 2.5990206317518197e-06, "loss": 0.2893, "num_input_tokens_seen": 101638384, "step": 106435 }, { "epoch": 8.682600538379965, "grad_norm": 3.498063087463379, "learning_rate": 2.5974406268175377e-06, "loss": 0.2826, "num_input_tokens_seen": 101643744, "step": 106440 }, { "epoch": 8.683008401990374, "grad_norm": 7.171572685241699, "learning_rate": 2.595861075973613e-06, "loss": 0.3825, "num_input_tokens_seen": 101647488, "step": 106445 }, { "epoch": 8.683416265600783, "grad_norm": 16.945154190063477, "learning_rate": 2.594281979252064e-06, "loss": 0.3733, "num_input_tokens_seen": 101652080, "step": 106450 }, { "epoch": 8.683824129211192, "grad_norm": 4.7838311195373535, "learning_rate": 2.592703336684893e-06, "loss": 0.2153, "num_input_tokens_seen": 101655776, "step": 106455 }, { "epoch": 8.6842319928216, "grad_norm": 30.648666381835938, "learning_rate": 2.5911251483041105e-06, "loss": 0.4348, "num_input_tokens_seen": 101661136, "step": 106460 }, { "epoch": 8.684639856432009, "grad_norm": 10.259551048278809, "learning_rate": 2.589547414141699e-06, "loss": 0.3222, "num_input_tokens_seen": 101666736, "step": 106465 }, { "epoch": 8.685047720042418, "grad_norm": 3.807878017425537, "learning_rate": 2.587970134229639e-06, "loss": 0.3377, "num_input_tokens_seen": 101672224, "step": 106470 }, { "epoch": 8.685455583652827, "grad_norm": 24.31081199645996, "learning_rate": 2.5863933085998935e-06, "loss": 0.2882, "num_input_tokens_seen": 101677792, "step": 106475 }, { "epoch": 8.685863447263236, "grad_norm": 5.7398576736450195, "learning_rate": 2.58481693728444e-06, "loss": 0.2248, "num_input_tokens_seen": 101683040, "step": 106480 }, { "epoch": 8.686271310873645, "grad_norm": 5.259401321411133, "learning_rate": 2.583241020315219e-06, "loss": 0.4011, "num_input_tokens_seen": 101688128, "step": 106485 }, { "epoch": 8.686679174484052, "grad_norm": 27.809587478637695, "learning_rate": 2.5816655577241777e-06, "loss": 0.4143, "num_input_tokens_seen": 101693840, "step": 106490 }, { "epoch": 8.687087038094461, "grad_norm": 2.6628341674804688, "learning_rate": 2.5800905495432514e-06, "loss": 0.3058, "num_input_tokens_seen": 101699696, "step": 106495 }, { "epoch": 8.68749490170487, "grad_norm": 4.084568977355957, "learning_rate": 2.5785159958043565e-06, "loss": 0.3332, "num_input_tokens_seen": 101704416, "step": 106500 }, { "epoch": 8.687902765315279, "grad_norm": 7.739561557769775, "learning_rate": 2.57694189653942e-06, "loss": 0.2728, "num_input_tokens_seen": 101708704, "step": 106505 }, { "epoch": 8.688310628925688, "grad_norm": 8.079378128051758, "learning_rate": 2.575368251780344e-06, "loss": 0.3977, "num_input_tokens_seen": 101713168, "step": 106510 }, { "epoch": 8.688718492536095, "grad_norm": 22.926895141601562, "learning_rate": 2.5737950615590227e-06, "loss": 0.5185, "num_input_tokens_seen": 101718672, "step": 106515 }, { "epoch": 8.689126356146504, "grad_norm": 17.021270751953125, "learning_rate": 2.5722223259073448e-06, "loss": 0.2877, "num_input_tokens_seen": 101722848, "step": 106520 }, { "epoch": 8.689534219756913, "grad_norm": 48.82431411743164, "learning_rate": 2.5706500448571925e-06, "loss": 0.3896, "num_input_tokens_seen": 101727536, "step": 106525 }, { "epoch": 8.689942083367322, "grad_norm": 31.071186065673828, "learning_rate": 2.5690782184404322e-06, "loss": 0.4115, "num_input_tokens_seen": 101732608, "step": 106530 }, { "epoch": 8.690349946977731, "grad_norm": 15.148077964782715, "learning_rate": 2.5675068466889245e-06, "loss": 0.4882, "num_input_tokens_seen": 101737664, "step": 106535 }, { "epoch": 8.690757810588138, "grad_norm": 34.69830322265625, "learning_rate": 2.5659359296345247e-06, "loss": 0.4206, "num_input_tokens_seen": 101742224, "step": 106540 }, { "epoch": 8.691165674198547, "grad_norm": 4.1169257164001465, "learning_rate": 2.5643654673090685e-06, "loss": 0.2597, "num_input_tokens_seen": 101747232, "step": 106545 }, { "epoch": 8.691573537808956, "grad_norm": 28.38301658630371, "learning_rate": 2.5627954597443908e-06, "loss": 0.5498, "num_input_tokens_seen": 101751088, "step": 106550 }, { "epoch": 8.691981401419365, "grad_norm": 20.861286163330078, "learning_rate": 2.5612259069723145e-06, "loss": 0.5617, "num_input_tokens_seen": 101756480, "step": 106555 }, { "epoch": 8.692389265029774, "grad_norm": 7.146592617034912, "learning_rate": 2.5596568090246548e-06, "loss": 0.2569, "num_input_tokens_seen": 101760672, "step": 106560 }, { "epoch": 8.692797128640184, "grad_norm": 17.382226943969727, "learning_rate": 2.558088165933209e-06, "loss": 0.2894, "num_input_tokens_seen": 101765392, "step": 106565 }, { "epoch": 8.69320499225059, "grad_norm": 20.969799041748047, "learning_rate": 2.556519977729788e-06, "loss": 0.5141, "num_input_tokens_seen": 101769088, "step": 106570 }, { "epoch": 8.693612855861, "grad_norm": 13.896598815917969, "learning_rate": 2.5549522444461683e-06, "loss": 0.303, "num_input_tokens_seen": 101774176, "step": 106575 }, { "epoch": 8.694020719471409, "grad_norm": 11.622611999511719, "learning_rate": 2.5533849661141308e-06, "loss": 0.4492, "num_input_tokens_seen": 101779680, "step": 106580 }, { "epoch": 8.694428583081818, "grad_norm": 26.286558151245117, "learning_rate": 2.5518181427654415e-06, "loss": 0.3536, "num_input_tokens_seen": 101784496, "step": 106585 }, { "epoch": 8.694836446692227, "grad_norm": 2.3157694339752197, "learning_rate": 2.5502517744318527e-06, "loss": 0.2427, "num_input_tokens_seen": 101789024, "step": 106590 }, { "epoch": 8.695244310302634, "grad_norm": 1.9448927640914917, "learning_rate": 2.548685861145128e-06, "loss": 0.2721, "num_input_tokens_seen": 101793920, "step": 106595 }, { "epoch": 8.695652173913043, "grad_norm": 11.356274604797363, "learning_rate": 2.5471204029370004e-06, "loss": 0.2374, "num_input_tokens_seen": 101798672, "step": 106600 }, { "epoch": 8.696060037523452, "grad_norm": 36.515560150146484, "learning_rate": 2.545555399839203e-06, "loss": 0.406, "num_input_tokens_seen": 101803680, "step": 106605 }, { "epoch": 8.696467901133861, "grad_norm": 12.679990768432617, "learning_rate": 2.5439908518834487e-06, "loss": 0.389, "num_input_tokens_seen": 101808736, "step": 106610 }, { "epoch": 8.69687576474427, "grad_norm": 26.70828628540039, "learning_rate": 2.542426759101463e-06, "loss": 0.4888, "num_input_tokens_seen": 101813280, "step": 106615 }, { "epoch": 8.697283628354679, "grad_norm": 6.271064758300781, "learning_rate": 2.5408631215249445e-06, "loss": 0.4461, "num_input_tokens_seen": 101818208, "step": 106620 }, { "epoch": 8.697691491965086, "grad_norm": 5.721041679382324, "learning_rate": 2.539299939185585e-06, "loss": 0.2582, "num_input_tokens_seen": 101821872, "step": 106625 }, { "epoch": 8.698099355575495, "grad_norm": 25.576915740966797, "learning_rate": 2.5377372121150713e-06, "loss": 0.3364, "num_input_tokens_seen": 101827104, "step": 106630 }, { "epoch": 8.698507219185904, "grad_norm": 3.216470956802368, "learning_rate": 2.5361749403450818e-06, "loss": 0.3179, "num_input_tokens_seen": 101831568, "step": 106635 }, { "epoch": 8.698915082796313, "grad_norm": 5.517298221588135, "learning_rate": 2.5346131239072812e-06, "loss": 0.3086, "num_input_tokens_seen": 101836816, "step": 106640 }, { "epoch": 8.699322946406722, "grad_norm": 3.5471882820129395, "learning_rate": 2.5330517628333246e-06, "loss": 0.2626, "num_input_tokens_seen": 101841120, "step": 106645 }, { "epoch": 8.69973081001713, "grad_norm": 5.136291980743408, "learning_rate": 2.531490857154864e-06, "loss": 0.4601, "num_input_tokens_seen": 101846256, "step": 106650 }, { "epoch": 8.700138673627539, "grad_norm": 7.6601667404174805, "learning_rate": 2.52993040690353e-06, "loss": 0.4178, "num_input_tokens_seen": 101851824, "step": 106655 }, { "epoch": 8.700546537237948, "grad_norm": 25.20052719116211, "learning_rate": 2.5283704121109643e-06, "loss": 0.3126, "num_input_tokens_seen": 101857088, "step": 106660 }, { "epoch": 8.700954400848357, "grad_norm": 4.762247085571289, "learning_rate": 2.5268108728087824e-06, "loss": 0.2584, "num_input_tokens_seen": 101861232, "step": 106665 }, { "epoch": 8.701362264458766, "grad_norm": 11.7481050491333, "learning_rate": 2.525251789028593e-06, "loss": 0.372, "num_input_tokens_seen": 101865840, "step": 106670 }, { "epoch": 8.701770128069175, "grad_norm": 26.01104736328125, "learning_rate": 2.5236931608020015e-06, "loss": 0.3633, "num_input_tokens_seen": 101870240, "step": 106675 }, { "epoch": 8.702177991679582, "grad_norm": 14.119110107421875, "learning_rate": 2.522134988160599e-06, "loss": 0.3826, "num_input_tokens_seen": 101874912, "step": 106680 }, { "epoch": 8.70258585528999, "grad_norm": 34.86362075805664, "learning_rate": 2.520577271135968e-06, "loss": 0.3027, "num_input_tokens_seen": 101879408, "step": 106685 }, { "epoch": 8.7029937189004, "grad_norm": 3.4159011840820312, "learning_rate": 2.519020009759682e-06, "loss": 0.2187, "num_input_tokens_seen": 101884032, "step": 106690 }, { "epoch": 8.703401582510809, "grad_norm": 8.958894729614258, "learning_rate": 2.5174632040633085e-06, "loss": 0.1899, "num_input_tokens_seen": 101888528, "step": 106695 }, { "epoch": 8.703809446121218, "grad_norm": 4.379201889038086, "learning_rate": 2.515906854078398e-06, "loss": 0.4575, "num_input_tokens_seen": 101893328, "step": 106700 }, { "epoch": 8.704217309731625, "grad_norm": 27.615028381347656, "learning_rate": 2.514350959836509e-06, "loss": 0.4728, "num_input_tokens_seen": 101898144, "step": 106705 }, { "epoch": 8.704625173342034, "grad_norm": 23.45978546142578, "learning_rate": 2.5127955213691684e-06, "loss": 0.2116, "num_input_tokens_seen": 101902848, "step": 106710 }, { "epoch": 8.705033036952443, "grad_norm": 1.224482774734497, "learning_rate": 2.511240538707907e-06, "loss": 0.3479, "num_input_tokens_seen": 101906816, "step": 106715 }, { "epoch": 8.705440900562852, "grad_norm": 19.176687240600586, "learning_rate": 2.5096860118842406e-06, "loss": 0.5106, "num_input_tokens_seen": 101911936, "step": 106720 }, { "epoch": 8.705848764173261, "grad_norm": 2.648005723953247, "learning_rate": 2.508131940929684e-06, "loss": 0.2701, "num_input_tokens_seen": 101916304, "step": 106725 }, { "epoch": 8.706256627783668, "grad_norm": 4.442667007446289, "learning_rate": 2.506578325875739e-06, "loss": 0.4506, "num_input_tokens_seen": 101921616, "step": 106730 }, { "epoch": 8.706664491394077, "grad_norm": 8.47016429901123, "learning_rate": 2.505025166753891e-06, "loss": 0.2614, "num_input_tokens_seen": 101925792, "step": 106735 }, { "epoch": 8.707072355004486, "grad_norm": 12.199898719787598, "learning_rate": 2.5034724635956242e-06, "loss": 0.1958, "num_input_tokens_seen": 101929200, "step": 106740 }, { "epoch": 8.707480218614895, "grad_norm": 8.70538330078125, "learning_rate": 2.5019202164324076e-06, "loss": 0.3636, "num_input_tokens_seen": 101933184, "step": 106745 }, { "epoch": 8.707888082225304, "grad_norm": 11.483537673950195, "learning_rate": 2.5003684252957127e-06, "loss": 0.371, "num_input_tokens_seen": 101937856, "step": 106750 }, { "epoch": 8.708295945835712, "grad_norm": 9.329018592834473, "learning_rate": 2.49881709021699e-06, "loss": 0.3357, "num_input_tokens_seen": 101943296, "step": 106755 }, { "epoch": 8.70870380944612, "grad_norm": 15.840486526489258, "learning_rate": 2.4972662112276805e-06, "loss": 0.1993, "num_input_tokens_seen": 101947664, "step": 106760 }, { "epoch": 8.70911167305653, "grad_norm": 2.347074270248413, "learning_rate": 2.495715788359221e-06, "loss": 0.2136, "num_input_tokens_seen": 101952688, "step": 106765 }, { "epoch": 8.709519536666939, "grad_norm": 26.999385833740234, "learning_rate": 2.4941658216430435e-06, "loss": 0.4726, "num_input_tokens_seen": 101956864, "step": 106770 }, { "epoch": 8.709927400277348, "grad_norm": 10.572456359863281, "learning_rate": 2.492616311110563e-06, "loss": 0.6261, "num_input_tokens_seen": 101961808, "step": 106775 }, { "epoch": 8.710335263887757, "grad_norm": 7.750582695007324, "learning_rate": 2.4910672567931837e-06, "loss": 0.3219, "num_input_tokens_seen": 101965776, "step": 106780 }, { "epoch": 8.710743127498164, "grad_norm": 5.608170509338379, "learning_rate": 2.4895186587223092e-06, "loss": 0.2068, "num_input_tokens_seen": 101970640, "step": 106785 }, { "epoch": 8.711150991108573, "grad_norm": 5.089609622955322, "learning_rate": 2.4879705169293196e-06, "loss": 0.4788, "num_input_tokens_seen": 101975264, "step": 106790 }, { "epoch": 8.711558854718982, "grad_norm": 37.645015716552734, "learning_rate": 2.486422831445606e-06, "loss": 0.3558, "num_input_tokens_seen": 101980400, "step": 106795 }, { "epoch": 8.711966718329391, "grad_norm": 22.265390396118164, "learning_rate": 2.484875602302536e-06, "loss": 0.2544, "num_input_tokens_seen": 101985616, "step": 106800 }, { "epoch": 8.7123745819398, "grad_norm": 6.177062034606934, "learning_rate": 2.48332882953147e-06, "loss": 0.361, "num_input_tokens_seen": 101989696, "step": 106805 }, { "epoch": 8.712782445550207, "grad_norm": 11.734046936035156, "learning_rate": 2.4817825131637608e-06, "loss": 0.4591, "num_input_tokens_seen": 101994128, "step": 106810 }, { "epoch": 8.713190309160616, "grad_norm": 7.496705055236816, "learning_rate": 2.48023665323075e-06, "loss": 0.3319, "num_input_tokens_seen": 101999056, "step": 106815 }, { "epoch": 8.713598172771025, "grad_norm": 22.309619903564453, "learning_rate": 2.478691249763773e-06, "loss": 0.4364, "num_input_tokens_seen": 102004400, "step": 106820 }, { "epoch": 8.714006036381434, "grad_norm": 2.6141226291656494, "learning_rate": 2.477146302794156e-06, "loss": 0.2643, "num_input_tokens_seen": 102008896, "step": 106825 }, { "epoch": 8.714413899991843, "grad_norm": 5.452387809753418, "learning_rate": 2.4756018123532092e-06, "loss": 0.5217, "num_input_tokens_seen": 102013984, "step": 106830 }, { "epoch": 8.714821763602252, "grad_norm": 21.34805679321289, "learning_rate": 2.474057778472247e-06, "loss": 0.3667, "num_input_tokens_seen": 102019024, "step": 106835 }, { "epoch": 8.71522962721266, "grad_norm": 2.3466880321502686, "learning_rate": 2.4725142011825604e-06, "loss": 0.348, "num_input_tokens_seen": 102023888, "step": 106840 }, { "epoch": 8.715637490823068, "grad_norm": 15.28549861907959, "learning_rate": 2.4709710805154385e-06, "loss": 0.4212, "num_input_tokens_seen": 102028336, "step": 106845 }, { "epoch": 8.716045354433477, "grad_norm": 40.99810028076172, "learning_rate": 2.469428416502159e-06, "loss": 0.4496, "num_input_tokens_seen": 102032992, "step": 106850 }, { "epoch": 8.716453218043887, "grad_norm": 1.0670040845870972, "learning_rate": 2.467886209173989e-06, "loss": 0.2954, "num_input_tokens_seen": 102038288, "step": 106855 }, { "epoch": 8.716861081654296, "grad_norm": 7.6504106521606445, "learning_rate": 2.466344458562192e-06, "loss": 0.3494, "num_input_tokens_seen": 102043104, "step": 106860 }, { "epoch": 8.717268945264703, "grad_norm": 6.153103828430176, "learning_rate": 2.464803164698021e-06, "loss": 0.2854, "num_input_tokens_seen": 102048400, "step": 106865 }, { "epoch": 8.717676808875112, "grad_norm": 1.8830342292785645, "learning_rate": 2.4632623276127144e-06, "loss": 0.3401, "num_input_tokens_seen": 102053744, "step": 106870 }, { "epoch": 8.71808467248552, "grad_norm": 26.063968658447266, "learning_rate": 2.4617219473374975e-06, "loss": 0.4466, "num_input_tokens_seen": 102058560, "step": 106875 }, { "epoch": 8.71849253609593, "grad_norm": 17.543289184570312, "learning_rate": 2.4601820239036066e-06, "loss": 0.3596, "num_input_tokens_seen": 102063632, "step": 106880 }, { "epoch": 8.718900399706339, "grad_norm": 12.78616714477539, "learning_rate": 2.4586425573422447e-06, "loss": 0.1815, "num_input_tokens_seen": 102069376, "step": 106885 }, { "epoch": 8.719308263316748, "grad_norm": 2.811269760131836, "learning_rate": 2.457103547684622e-06, "loss": 0.4404, "num_input_tokens_seen": 102074304, "step": 106890 }, { "epoch": 8.719716126927155, "grad_norm": 12.30138111114502, "learning_rate": 2.4555649949619314e-06, "loss": 0.3895, "num_input_tokens_seen": 102079264, "step": 106895 }, { "epoch": 8.720123990537564, "grad_norm": 11.898932456970215, "learning_rate": 2.454026899205353e-06, "loss": 0.4269, "num_input_tokens_seen": 102083792, "step": 106900 }, { "epoch": 8.720531854147973, "grad_norm": 4.031135559082031, "learning_rate": 2.4524892604460753e-06, "loss": 0.3719, "num_input_tokens_seen": 102088672, "step": 106905 }, { "epoch": 8.720939717758382, "grad_norm": 13.80726432800293, "learning_rate": 2.4509520787152574e-06, "loss": 0.421, "num_input_tokens_seen": 102093296, "step": 106910 }, { "epoch": 8.721347581368791, "grad_norm": 8.561914443969727, "learning_rate": 2.44941535404406e-06, "loss": 0.2088, "num_input_tokens_seen": 102097504, "step": 106915 }, { "epoch": 8.721755444979198, "grad_norm": 22.27991485595703, "learning_rate": 2.447879086463628e-06, "loss": 0.1389, "num_input_tokens_seen": 102102560, "step": 106920 }, { "epoch": 8.722163308589607, "grad_norm": 0.8914002776145935, "learning_rate": 2.446343276005106e-06, "loss": 0.2391, "num_input_tokens_seen": 102107872, "step": 106925 }, { "epoch": 8.722571172200016, "grad_norm": 2.595735788345337, "learning_rate": 2.444807922699624e-06, "loss": 0.3173, "num_input_tokens_seen": 102112256, "step": 106930 }, { "epoch": 8.722979035810425, "grad_norm": 5.20474910736084, "learning_rate": 2.443273026578302e-06, "loss": 0.2065, "num_input_tokens_seen": 102117136, "step": 106935 }, { "epoch": 8.723386899420834, "grad_norm": 10.743101119995117, "learning_rate": 2.4417385876722487e-06, "loss": 0.4176, "num_input_tokens_seen": 102122352, "step": 106940 }, { "epoch": 8.723794763031242, "grad_norm": 1.124500036239624, "learning_rate": 2.4402046060125695e-06, "loss": 0.4108, "num_input_tokens_seen": 102126752, "step": 106945 }, { "epoch": 8.72420262664165, "grad_norm": 5.26539945602417, "learning_rate": 2.438671081630356e-06, "loss": 0.3859, "num_input_tokens_seen": 102131280, "step": 106950 }, { "epoch": 8.72461049025206, "grad_norm": 1.8317148685455322, "learning_rate": 2.437138014556692e-06, "loss": 0.104, "num_input_tokens_seen": 102136272, "step": 106955 }, { "epoch": 8.725018353862469, "grad_norm": 39.13044357299805, "learning_rate": 2.435605404822655e-06, "loss": 0.3362, "num_input_tokens_seen": 102141488, "step": 106960 }, { "epoch": 8.725426217472878, "grad_norm": 10.796704292297363, "learning_rate": 2.434073252459301e-06, "loss": 0.3495, "num_input_tokens_seen": 102145840, "step": 106965 }, { "epoch": 8.725834081083285, "grad_norm": 17.667991638183594, "learning_rate": 2.4325415574977e-06, "loss": 0.4949, "num_input_tokens_seen": 102150672, "step": 106970 }, { "epoch": 8.726241944693694, "grad_norm": 21.100934982299805, "learning_rate": 2.4310103199688934e-06, "loss": 0.4989, "num_input_tokens_seen": 102155088, "step": 106975 }, { "epoch": 8.726649808304103, "grad_norm": 59.15983581542969, "learning_rate": 2.4294795399039144e-06, "loss": 0.4038, "num_input_tokens_seen": 102159952, "step": 106980 }, { "epoch": 8.727057671914512, "grad_norm": 27.437767028808594, "learning_rate": 2.427949217333797e-06, "loss": 0.38, "num_input_tokens_seen": 102164800, "step": 106985 }, { "epoch": 8.72746553552492, "grad_norm": 16.197860717773438, "learning_rate": 2.4264193522895524e-06, "loss": 0.3509, "num_input_tokens_seen": 102169472, "step": 106990 }, { "epoch": 8.72787339913533, "grad_norm": 17.63105010986328, "learning_rate": 2.4248899448022e-06, "loss": 0.3027, "num_input_tokens_seen": 102174656, "step": 106995 }, { "epoch": 8.728281262745737, "grad_norm": 14.736625671386719, "learning_rate": 2.423360994902735e-06, "loss": 0.1915, "num_input_tokens_seen": 102179360, "step": 107000 }, { "epoch": 8.728689126356146, "grad_norm": 0.5990437865257263, "learning_rate": 2.4218325026221485e-06, "loss": 0.2105, "num_input_tokens_seen": 102184992, "step": 107005 }, { "epoch": 8.729096989966555, "grad_norm": 2.57975172996521, "learning_rate": 2.4203044679914188e-06, "loss": 0.2901, "num_input_tokens_seen": 102189424, "step": 107010 }, { "epoch": 8.729504853576964, "grad_norm": 4.294280052185059, "learning_rate": 2.4187768910415272e-06, "loss": 0.1967, "num_input_tokens_seen": 102193440, "step": 107015 }, { "epoch": 8.729912717187373, "grad_norm": 2.716799259185791, "learning_rate": 2.417249771803434e-06, "loss": 0.3605, "num_input_tokens_seen": 102198464, "step": 107020 }, { "epoch": 8.73032058079778, "grad_norm": 3.4513936042785645, "learning_rate": 2.4157231103080906e-06, "loss": 0.4533, "num_input_tokens_seen": 102202576, "step": 107025 }, { "epoch": 8.73072844440819, "grad_norm": 20.13607406616211, "learning_rate": 2.4141969065864413e-06, "loss": 0.4302, "num_input_tokens_seen": 102207424, "step": 107030 }, { "epoch": 8.731136308018598, "grad_norm": 12.976991653442383, "learning_rate": 2.4126711606694247e-06, "loss": 0.3988, "num_input_tokens_seen": 102211792, "step": 107035 }, { "epoch": 8.731544171629007, "grad_norm": 10.466802597045898, "learning_rate": 2.4111458725879667e-06, "loss": 0.2384, "num_input_tokens_seen": 102215952, "step": 107040 }, { "epoch": 8.731952035239416, "grad_norm": 26.772815704345703, "learning_rate": 2.409621042372984e-06, "loss": 0.2381, "num_input_tokens_seen": 102221136, "step": 107045 }, { "epoch": 8.732359898849825, "grad_norm": 3.029750347137451, "learning_rate": 2.408096670055382e-06, "loss": 0.158, "num_input_tokens_seen": 102226224, "step": 107050 }, { "epoch": 8.732767762460233, "grad_norm": 16.592559814453125, "learning_rate": 2.4065727556660588e-06, "loss": 0.2527, "num_input_tokens_seen": 102230592, "step": 107055 }, { "epoch": 8.733175626070642, "grad_norm": 1.4855923652648926, "learning_rate": 2.4050492992359093e-06, "loss": 0.0977, "num_input_tokens_seen": 102234976, "step": 107060 }, { "epoch": 8.73358348968105, "grad_norm": 1.6453368663787842, "learning_rate": 2.4035263007958083e-06, "loss": 0.453, "num_input_tokens_seen": 102239056, "step": 107065 }, { "epoch": 8.73399135329146, "grad_norm": 3.4414353370666504, "learning_rate": 2.4020037603766253e-06, "loss": 0.3226, "num_input_tokens_seen": 102244272, "step": 107070 }, { "epoch": 8.734399216901869, "grad_norm": 24.743579864501953, "learning_rate": 2.4004816780092254e-06, "loss": 0.3124, "num_input_tokens_seen": 102248832, "step": 107075 }, { "epoch": 8.734807080512276, "grad_norm": 3.5451979637145996, "learning_rate": 2.398960053724461e-06, "loss": 0.338, "num_input_tokens_seen": 102254240, "step": 107080 }, { "epoch": 8.735214944122685, "grad_norm": 5.760693550109863, "learning_rate": 2.397438887553166e-06, "loss": 0.3054, "num_input_tokens_seen": 102259040, "step": 107085 }, { "epoch": 8.735622807733094, "grad_norm": 27.23957633972168, "learning_rate": 2.3959181795261826e-06, "loss": 0.4128, "num_input_tokens_seen": 102263472, "step": 107090 }, { "epoch": 8.736030671343503, "grad_norm": 10.965132713317871, "learning_rate": 2.3943979296743355e-06, "loss": 0.2158, "num_input_tokens_seen": 102268032, "step": 107095 }, { "epoch": 8.736438534953912, "grad_norm": 16.36212921142578, "learning_rate": 2.392878138028437e-06, "loss": 0.5138, "num_input_tokens_seen": 102272128, "step": 107100 }, { "epoch": 8.736846398564321, "grad_norm": 2.7571587562561035, "learning_rate": 2.3913588046192926e-06, "loss": 0.3033, "num_input_tokens_seen": 102277632, "step": 107105 }, { "epoch": 8.737254262174728, "grad_norm": 2.59035325050354, "learning_rate": 2.389839929477697e-06, "loss": 0.2661, "num_input_tokens_seen": 102281568, "step": 107110 }, { "epoch": 8.737662125785137, "grad_norm": 9.187079429626465, "learning_rate": 2.3883215126344368e-06, "loss": 0.2821, "num_input_tokens_seen": 102285824, "step": 107115 }, { "epoch": 8.738069989395546, "grad_norm": 25.853322982788086, "learning_rate": 2.38680355412029e-06, "loss": 0.5186, "num_input_tokens_seen": 102290576, "step": 107120 }, { "epoch": 8.738477853005955, "grad_norm": 32.66566848754883, "learning_rate": 2.3852860539660266e-06, "loss": 0.3698, "num_input_tokens_seen": 102294784, "step": 107125 }, { "epoch": 8.738885716616364, "grad_norm": 1.31606924533844, "learning_rate": 2.383769012202408e-06, "loss": 0.2876, "num_input_tokens_seen": 102299312, "step": 107130 }, { "epoch": 8.739293580226771, "grad_norm": 22.105960845947266, "learning_rate": 2.382252428860182e-06, "loss": 0.3028, "num_input_tokens_seen": 102304400, "step": 107135 }, { "epoch": 8.73970144383718, "grad_norm": 14.036971092224121, "learning_rate": 2.380736303970088e-06, "loss": 0.3361, "num_input_tokens_seen": 102308768, "step": 107140 }, { "epoch": 8.74010930744759, "grad_norm": 11.61271858215332, "learning_rate": 2.379220637562851e-06, "loss": 0.1289, "num_input_tokens_seen": 102313600, "step": 107145 }, { "epoch": 8.740517171057999, "grad_norm": 0.9633728861808777, "learning_rate": 2.377705429669208e-06, "loss": 0.3865, "num_input_tokens_seen": 102318784, "step": 107150 }, { "epoch": 8.740925034668408, "grad_norm": 31.061866760253906, "learning_rate": 2.376190680319859e-06, "loss": 0.408, "num_input_tokens_seen": 102323328, "step": 107155 }, { "epoch": 8.741332898278815, "grad_norm": 18.31764793395996, "learning_rate": 2.374676389545513e-06, "loss": 0.4162, "num_input_tokens_seen": 102328368, "step": 107160 }, { "epoch": 8.741740761889224, "grad_norm": 19.20397186279297, "learning_rate": 2.3731625573768594e-06, "loss": 0.3609, "num_input_tokens_seen": 102333632, "step": 107165 }, { "epoch": 8.742148625499633, "grad_norm": 15.184399604797363, "learning_rate": 2.37164918384459e-06, "loss": 0.3954, "num_input_tokens_seen": 102338816, "step": 107170 }, { "epoch": 8.742556489110042, "grad_norm": 2.3650548458099365, "learning_rate": 2.3701362689793753e-06, "loss": 0.3097, "num_input_tokens_seen": 102343792, "step": 107175 }, { "epoch": 8.74296435272045, "grad_norm": 2.9467039108276367, "learning_rate": 2.368623812811882e-06, "loss": 0.5699, "num_input_tokens_seen": 102347872, "step": 107180 }, { "epoch": 8.743372216330858, "grad_norm": 2.663309335708618, "learning_rate": 2.3671118153727635e-06, "loss": 0.3185, "num_input_tokens_seen": 102352544, "step": 107185 }, { "epoch": 8.743780079941267, "grad_norm": 8.108922004699707, "learning_rate": 2.3656002766926756e-06, "loss": 0.4376, "num_input_tokens_seen": 102357536, "step": 107190 }, { "epoch": 8.744187943551676, "grad_norm": 25.555316925048828, "learning_rate": 2.364089196802252e-06, "loss": 0.3097, "num_input_tokens_seen": 102361952, "step": 107195 }, { "epoch": 8.744595807162085, "grad_norm": 16.925451278686523, "learning_rate": 2.3625785757321244e-06, "loss": 0.4314, "num_input_tokens_seen": 102367696, "step": 107200 }, { "epoch": 8.745003670772494, "grad_norm": 17.96031951904297, "learning_rate": 2.361068413512907e-06, "loss": 0.4475, "num_input_tokens_seen": 102372912, "step": 107205 }, { "epoch": 8.745411534382903, "grad_norm": 1.8545336723327637, "learning_rate": 2.359558710175208e-06, "loss": 0.3049, "num_input_tokens_seen": 102378000, "step": 107210 }, { "epoch": 8.74581939799331, "grad_norm": 30.977685928344727, "learning_rate": 2.3580494657496397e-06, "loss": 0.3382, "num_input_tokens_seen": 102382608, "step": 107215 }, { "epoch": 8.74622726160372, "grad_norm": 1.0806142091751099, "learning_rate": 2.356540680266786e-06, "loss": 0.3832, "num_input_tokens_seen": 102387472, "step": 107220 }, { "epoch": 8.746635125214128, "grad_norm": 4.9106764793396, "learning_rate": 2.3550323537572333e-06, "loss": 0.3805, "num_input_tokens_seen": 102392288, "step": 107225 }, { "epoch": 8.747042988824537, "grad_norm": 16.526931762695312, "learning_rate": 2.3535244862515515e-06, "loss": 0.2123, "num_input_tokens_seen": 102396848, "step": 107230 }, { "epoch": 8.747450852434946, "grad_norm": 17.866907119750977, "learning_rate": 2.352017077780305e-06, "loss": 0.5256, "num_input_tokens_seen": 102401088, "step": 107235 }, { "epoch": 8.747858716045354, "grad_norm": 67.12124633789062, "learning_rate": 2.3505101283740477e-06, "loss": 0.4767, "num_input_tokens_seen": 102406336, "step": 107240 }, { "epoch": 8.748266579655763, "grad_norm": 0.9094327688217163, "learning_rate": 2.349003638063327e-06, "loss": 0.2592, "num_input_tokens_seen": 102411088, "step": 107245 }, { "epoch": 8.748674443266172, "grad_norm": 5.141645908355713, "learning_rate": 2.3474976068786764e-06, "loss": 0.4187, "num_input_tokens_seen": 102415952, "step": 107250 }, { "epoch": 8.74908230687658, "grad_norm": 2.709989070892334, "learning_rate": 2.345992034850619e-06, "loss": 0.3502, "num_input_tokens_seen": 102420736, "step": 107255 }, { "epoch": 8.74949017048699, "grad_norm": 28.87004852294922, "learning_rate": 2.3444869220096837e-06, "loss": 0.4875, "num_input_tokens_seen": 102424752, "step": 107260 }, { "epoch": 8.749898034097399, "grad_norm": 5.252894401550293, "learning_rate": 2.3429822683863706e-06, "loss": 0.2906, "num_input_tokens_seen": 102429680, "step": 107265 }, { "epoch": 8.750305897707806, "grad_norm": 34.60877227783203, "learning_rate": 2.3414780740111773e-06, "loss": 0.2329, "num_input_tokens_seen": 102435168, "step": 107270 }, { "epoch": 8.750713761318215, "grad_norm": 5.10886287689209, "learning_rate": 2.3399743389145914e-06, "loss": 0.5352, "num_input_tokens_seen": 102440368, "step": 107275 }, { "epoch": 8.751121624928624, "grad_norm": 46.70104217529297, "learning_rate": 2.338471063127104e-06, "loss": 0.4237, "num_input_tokens_seen": 102445200, "step": 107280 }, { "epoch": 8.751529488539033, "grad_norm": 4.9956231117248535, "learning_rate": 2.3369682466791748e-06, "loss": 0.3037, "num_input_tokens_seen": 102449952, "step": 107285 }, { "epoch": 8.751937352149442, "grad_norm": 7.295062065124512, "learning_rate": 2.335465889601271e-06, "loss": 0.4276, "num_input_tokens_seen": 102455904, "step": 107290 }, { "epoch": 8.75234521575985, "grad_norm": 0.660779595375061, "learning_rate": 2.333963991923843e-06, "loss": 0.4472, "num_input_tokens_seen": 102460064, "step": 107295 }, { "epoch": 8.752753079370258, "grad_norm": 22.223657608032227, "learning_rate": 2.332462553677331e-06, "loss": 0.3316, "num_input_tokens_seen": 102464336, "step": 107300 }, { "epoch": 8.753160942980667, "grad_norm": 3.7802486419677734, "learning_rate": 2.330961574892174e-06, "loss": 0.4995, "num_input_tokens_seen": 102469280, "step": 107305 }, { "epoch": 8.753568806591076, "grad_norm": 1.1594377756118774, "learning_rate": 2.3294610555987915e-06, "loss": 0.2461, "num_input_tokens_seen": 102474944, "step": 107310 }, { "epoch": 8.753976670201485, "grad_norm": 15.312318801879883, "learning_rate": 2.327960995827602e-06, "loss": 0.1778, "num_input_tokens_seen": 102480064, "step": 107315 }, { "epoch": 8.754384533811894, "grad_norm": 13.41583251953125, "learning_rate": 2.326461395609003e-06, "loss": 0.3929, "num_input_tokens_seen": 102485328, "step": 107320 }, { "epoch": 8.754792397422301, "grad_norm": 0.792622447013855, "learning_rate": 2.3249622549734033e-06, "loss": 0.4668, "num_input_tokens_seen": 102489696, "step": 107325 }, { "epoch": 8.75520026103271, "grad_norm": 12.805374145507812, "learning_rate": 2.3234635739511818e-06, "loss": 0.3572, "num_input_tokens_seen": 102495280, "step": 107330 }, { "epoch": 8.75560812464312, "grad_norm": 12.185647964477539, "learning_rate": 2.3219653525727193e-06, "loss": 0.3327, "num_input_tokens_seen": 102500464, "step": 107335 }, { "epoch": 8.756015988253528, "grad_norm": 9.64728832244873, "learning_rate": 2.3204675908683803e-06, "loss": 0.3529, "num_input_tokens_seen": 102505056, "step": 107340 }, { "epoch": 8.756423851863937, "grad_norm": 1.7310147285461426, "learning_rate": 2.3189702888685242e-06, "loss": 0.144, "num_input_tokens_seen": 102510144, "step": 107345 }, { "epoch": 8.756831715474345, "grad_norm": 44.77577590942383, "learning_rate": 2.317473446603505e-06, "loss": 0.3226, "num_input_tokens_seen": 102514016, "step": 107350 }, { "epoch": 8.757239579084754, "grad_norm": 6.88189697265625, "learning_rate": 2.315977064103664e-06, "loss": 0.3398, "num_input_tokens_seen": 102519536, "step": 107355 }, { "epoch": 8.757647442695163, "grad_norm": 5.577188968658447, "learning_rate": 2.3144811413993245e-06, "loss": 0.4353, "num_input_tokens_seen": 102524320, "step": 107360 }, { "epoch": 8.758055306305572, "grad_norm": 40.27577590942383, "learning_rate": 2.312985678520815e-06, "loss": 0.2782, "num_input_tokens_seen": 102529744, "step": 107365 }, { "epoch": 8.75846316991598, "grad_norm": 2.1435515880584717, "learning_rate": 2.3114906754984454e-06, "loss": 0.2727, "num_input_tokens_seen": 102534384, "step": 107370 }, { "epoch": 8.758871033526388, "grad_norm": 32.318233489990234, "learning_rate": 2.3099961323625185e-06, "loss": 0.4705, "num_input_tokens_seen": 102539248, "step": 107375 }, { "epoch": 8.759278897136797, "grad_norm": 32.947811126708984, "learning_rate": 2.3085020491433295e-06, "loss": 0.4759, "num_input_tokens_seen": 102543952, "step": 107380 }, { "epoch": 8.759686760747206, "grad_norm": 4.874349117279053, "learning_rate": 2.3070084258711574e-06, "loss": 0.2347, "num_input_tokens_seen": 102548096, "step": 107385 }, { "epoch": 8.760094624357615, "grad_norm": 3.37585711479187, "learning_rate": 2.3055152625762855e-06, "loss": 0.3162, "num_input_tokens_seen": 102552640, "step": 107390 }, { "epoch": 8.760502487968024, "grad_norm": 3.2774245738983154, "learning_rate": 2.3040225592889734e-06, "loss": 0.2733, "num_input_tokens_seen": 102558656, "step": 107395 }, { "epoch": 8.760910351578431, "grad_norm": 27.178293228149414, "learning_rate": 2.3025303160394834e-06, "loss": 0.5113, "num_input_tokens_seen": 102563184, "step": 107400 }, { "epoch": 8.76131821518884, "grad_norm": 4.8833417892456055, "learning_rate": 2.3010385328580576e-06, "loss": 0.5407, "num_input_tokens_seen": 102567856, "step": 107405 }, { "epoch": 8.76172607879925, "grad_norm": 12.252097129821777, "learning_rate": 2.29954720977493e-06, "loss": 0.3323, "num_input_tokens_seen": 102572304, "step": 107410 }, { "epoch": 8.762133942409658, "grad_norm": 3.8993148803710938, "learning_rate": 2.2980563468203403e-06, "loss": 0.4171, "num_input_tokens_seen": 102576576, "step": 107415 }, { "epoch": 8.762541806020067, "grad_norm": 14.868839263916016, "learning_rate": 2.2965659440245037e-06, "loss": 0.2243, "num_input_tokens_seen": 102581328, "step": 107420 }, { "epoch": 8.762949669630476, "grad_norm": 2.883493185043335, "learning_rate": 2.2950760014176257e-06, "loss": 0.2272, "num_input_tokens_seen": 102585664, "step": 107425 }, { "epoch": 8.763357533240884, "grad_norm": 9.848518371582031, "learning_rate": 2.2935865190299027e-06, "loss": 0.4436, "num_input_tokens_seen": 102590528, "step": 107430 }, { "epoch": 8.763765396851293, "grad_norm": 15.87203311920166, "learning_rate": 2.2920974968915406e-06, "loss": 0.3045, "num_input_tokens_seen": 102595936, "step": 107435 }, { "epoch": 8.764173260461702, "grad_norm": 2.1792399883270264, "learning_rate": 2.290608935032712e-06, "loss": 0.1353, "num_input_tokens_seen": 102599856, "step": 107440 }, { "epoch": 8.76458112407211, "grad_norm": 2.559454917907715, "learning_rate": 2.2891208334835905e-06, "loss": 0.3323, "num_input_tokens_seen": 102604592, "step": 107445 }, { "epoch": 8.76498898768252, "grad_norm": 17.695119857788086, "learning_rate": 2.2876331922743383e-06, "loss": 0.321, "num_input_tokens_seen": 102608880, "step": 107450 }, { "epoch": 8.765396851292927, "grad_norm": 1.8875805139541626, "learning_rate": 2.2861460114351057e-06, "loss": 0.3189, "num_input_tokens_seen": 102613536, "step": 107455 }, { "epoch": 8.765804714903336, "grad_norm": 6.30343770980835, "learning_rate": 2.2846592909960466e-06, "loss": 0.4888, "num_input_tokens_seen": 102618160, "step": 107460 }, { "epoch": 8.766212578513745, "grad_norm": 12.779667854309082, "learning_rate": 2.283173030987293e-06, "loss": 0.4974, "num_input_tokens_seen": 102622880, "step": 107465 }, { "epoch": 8.766620442124154, "grad_norm": 39.13511657714844, "learning_rate": 2.281687231438967e-06, "loss": 0.6178, "num_input_tokens_seen": 102627120, "step": 107470 }, { "epoch": 8.767028305734563, "grad_norm": 13.794837951660156, "learning_rate": 2.280201892381184e-06, "loss": 0.6356, "num_input_tokens_seen": 102631824, "step": 107475 }, { "epoch": 8.767436169344972, "grad_norm": 3.5878982543945312, "learning_rate": 2.278717013844059e-06, "loss": 0.4204, "num_input_tokens_seen": 102636976, "step": 107480 }, { "epoch": 8.767844032955379, "grad_norm": 15.031736373901367, "learning_rate": 2.277232595857684e-06, "loss": 0.3743, "num_input_tokens_seen": 102642000, "step": 107485 }, { "epoch": 8.768251896565788, "grad_norm": 5.3286824226379395, "learning_rate": 2.2757486384521493e-06, "loss": 0.2839, "num_input_tokens_seen": 102646592, "step": 107490 }, { "epoch": 8.768659760176197, "grad_norm": 3.500559091567993, "learning_rate": 2.2742651416575335e-06, "loss": 0.2965, "num_input_tokens_seen": 102651392, "step": 107495 }, { "epoch": 8.769067623786606, "grad_norm": 3.272735118865967, "learning_rate": 2.2727821055039065e-06, "loss": 0.2097, "num_input_tokens_seen": 102656352, "step": 107500 }, { "epoch": 8.769475487397015, "grad_norm": 6.666053295135498, "learning_rate": 2.2712995300213287e-06, "loss": 0.368, "num_input_tokens_seen": 102661616, "step": 107505 }, { "epoch": 8.769883351007422, "grad_norm": 1.763026475906372, "learning_rate": 2.2698174152398527e-06, "loss": 0.1875, "num_input_tokens_seen": 102666672, "step": 107510 }, { "epoch": 8.770291214617831, "grad_norm": 39.95838165283203, "learning_rate": 2.268335761189519e-06, "loss": 0.5085, "num_input_tokens_seen": 102672016, "step": 107515 }, { "epoch": 8.77069907822824, "grad_norm": 5.945542812347412, "learning_rate": 2.266854567900353e-06, "loss": 0.4967, "num_input_tokens_seen": 102676768, "step": 107520 }, { "epoch": 8.77110694183865, "grad_norm": 22.505306243896484, "learning_rate": 2.2653738354023927e-06, "loss": 0.3342, "num_input_tokens_seen": 102682080, "step": 107525 }, { "epoch": 8.771514805449058, "grad_norm": 7.167877197265625, "learning_rate": 2.2638935637256414e-06, "loss": 0.3908, "num_input_tokens_seen": 102686416, "step": 107530 }, { "epoch": 8.771922669059467, "grad_norm": 39.05689239501953, "learning_rate": 2.262413752900108e-06, "loss": 0.2819, "num_input_tokens_seen": 102691616, "step": 107535 }, { "epoch": 8.772330532669875, "grad_norm": 9.173908233642578, "learning_rate": 2.260934402955786e-06, "loss": 0.521, "num_input_tokens_seen": 102696880, "step": 107540 }, { "epoch": 8.772738396280284, "grad_norm": 4.403298854827881, "learning_rate": 2.2594555139226565e-06, "loss": 0.3956, "num_input_tokens_seen": 102701712, "step": 107545 }, { "epoch": 8.773146259890693, "grad_norm": 2.897578477859497, "learning_rate": 2.2579770858307064e-06, "loss": 0.4358, "num_input_tokens_seen": 102706400, "step": 107550 }, { "epoch": 8.773554123501102, "grad_norm": 9.030709266662598, "learning_rate": 2.2564991187098957e-06, "loss": 0.4325, "num_input_tokens_seen": 102711088, "step": 107555 }, { "epoch": 8.77396198711151, "grad_norm": 2.3776183128356934, "learning_rate": 2.255021612590183e-06, "loss": 0.338, "num_input_tokens_seen": 102715824, "step": 107560 }, { "epoch": 8.774369850721918, "grad_norm": 3.087571620941162, "learning_rate": 2.2535445675015143e-06, "loss": 0.1805, "num_input_tokens_seen": 102720288, "step": 107565 }, { "epoch": 8.774777714332327, "grad_norm": 31.24903678894043, "learning_rate": 2.252067983473835e-06, "loss": 0.5377, "num_input_tokens_seen": 102725104, "step": 107570 }, { "epoch": 8.775185577942736, "grad_norm": 22.44270896911621, "learning_rate": 2.250591860537074e-06, "loss": 0.4525, "num_input_tokens_seen": 102728672, "step": 107575 }, { "epoch": 8.775593441553145, "grad_norm": 5.383939743041992, "learning_rate": 2.2491161987211464e-06, "loss": 0.1785, "num_input_tokens_seen": 102732944, "step": 107580 }, { "epoch": 8.776001305163554, "grad_norm": 28.843795776367188, "learning_rate": 2.247640998055961e-06, "loss": 0.294, "num_input_tokens_seen": 102738576, "step": 107585 }, { "epoch": 8.776409168773961, "grad_norm": 20.645469665527344, "learning_rate": 2.2461662585714304e-06, "loss": 0.3529, "num_input_tokens_seen": 102743056, "step": 107590 }, { "epoch": 8.77681703238437, "grad_norm": 11.197982788085938, "learning_rate": 2.244691980297442e-06, "loss": 0.4347, "num_input_tokens_seen": 102747712, "step": 107595 }, { "epoch": 8.77722489599478, "grad_norm": 57.976539611816406, "learning_rate": 2.2432181632638773e-06, "loss": 0.2564, "num_input_tokens_seen": 102753200, "step": 107600 }, { "epoch": 8.777632759605188, "grad_norm": 25.07088851928711, "learning_rate": 2.2417448075006093e-06, "loss": 0.313, "num_input_tokens_seen": 102757840, "step": 107605 }, { "epoch": 8.778040623215597, "grad_norm": 18.39459800720215, "learning_rate": 2.2402719130375003e-06, "loss": 0.4791, "num_input_tokens_seen": 102761728, "step": 107610 }, { "epoch": 8.778448486826006, "grad_norm": 3.3769073486328125, "learning_rate": 2.238799479904413e-06, "loss": 0.3901, "num_input_tokens_seen": 102766944, "step": 107615 }, { "epoch": 8.778856350436413, "grad_norm": 10.878584861755371, "learning_rate": 2.237327508131187e-06, "loss": 0.2592, "num_input_tokens_seen": 102771536, "step": 107620 }, { "epoch": 8.779264214046822, "grad_norm": 25.76712989807129, "learning_rate": 2.23585599774766e-06, "loss": 0.3299, "num_input_tokens_seen": 102775824, "step": 107625 }, { "epoch": 8.779672077657231, "grad_norm": 15.434066772460938, "learning_rate": 2.2343849487836602e-06, "loss": 0.4796, "num_input_tokens_seen": 102780192, "step": 107630 }, { "epoch": 8.78007994126764, "grad_norm": 21.74760627746582, "learning_rate": 2.2329143612690034e-06, "loss": 0.3036, "num_input_tokens_seen": 102785024, "step": 107635 }, { "epoch": 8.78048780487805, "grad_norm": 2.506075859069824, "learning_rate": 2.2314442352334986e-06, "loss": 0.4429, "num_input_tokens_seen": 102790112, "step": 107640 }, { "epoch": 8.780895668488457, "grad_norm": 1.5310570001602173, "learning_rate": 2.2299745707069414e-06, "loss": 0.1975, "num_input_tokens_seen": 102794960, "step": 107645 }, { "epoch": 8.781303532098866, "grad_norm": 60.05959701538086, "learning_rate": 2.2285053677191276e-06, "loss": 0.4031, "num_input_tokens_seen": 102799968, "step": 107650 }, { "epoch": 8.781711395709275, "grad_norm": 0.9644725322723389, "learning_rate": 2.2270366262998278e-06, "loss": 0.3176, "num_input_tokens_seen": 102805184, "step": 107655 }, { "epoch": 8.782119259319684, "grad_norm": 3.230910539627075, "learning_rate": 2.2255683464788236e-06, "loss": 0.2601, "num_input_tokens_seen": 102809072, "step": 107660 }, { "epoch": 8.782527122930093, "grad_norm": 10.62717056274414, "learning_rate": 2.2241005282858717e-06, "loss": 0.3039, "num_input_tokens_seen": 102812640, "step": 107665 }, { "epoch": 8.7829349865405, "grad_norm": 19.94153594970703, "learning_rate": 2.222633171750724e-06, "loss": 0.4208, "num_input_tokens_seen": 102817696, "step": 107670 }, { "epoch": 8.783342850150909, "grad_norm": 13.211833953857422, "learning_rate": 2.2211662769031168e-06, "loss": 0.4058, "num_input_tokens_seen": 102821648, "step": 107675 }, { "epoch": 8.783750713761318, "grad_norm": 2.586909532546997, "learning_rate": 2.219699843772796e-06, "loss": 0.5124, "num_input_tokens_seen": 102826848, "step": 107680 }, { "epoch": 8.784158577371727, "grad_norm": 2.581352472305298, "learning_rate": 2.2182338723894804e-06, "loss": 0.2393, "num_input_tokens_seen": 102830512, "step": 107685 }, { "epoch": 8.784566440982136, "grad_norm": 0.3257848918437958, "learning_rate": 2.216768362782881e-06, "loss": 0.2728, "num_input_tokens_seen": 102834400, "step": 107690 }, { "epoch": 8.784974304592545, "grad_norm": 1.9529184103012085, "learning_rate": 2.2153033149827083e-06, "loss": 0.2975, "num_input_tokens_seen": 102838704, "step": 107695 }, { "epoch": 8.785382168202952, "grad_norm": 2.1851413249969482, "learning_rate": 2.2138387290186496e-06, "loss": 0.1233, "num_input_tokens_seen": 102843744, "step": 107700 }, { "epoch": 8.785790031813361, "grad_norm": 2.3566792011260986, "learning_rate": 2.212374604920403e-06, "loss": 0.4607, "num_input_tokens_seen": 102849024, "step": 107705 }, { "epoch": 8.78619789542377, "grad_norm": 4.248885631561279, "learning_rate": 2.2109109427176393e-06, "loss": 0.3677, "num_input_tokens_seen": 102854176, "step": 107710 }, { "epoch": 8.78660575903418, "grad_norm": 3.966956853866577, "learning_rate": 2.2094477424400295e-06, "loss": 0.245, "num_input_tokens_seen": 102858496, "step": 107715 }, { "epoch": 8.787013622644588, "grad_norm": 2.3942408561706543, "learning_rate": 2.207985004117222e-06, "loss": 0.3649, "num_input_tokens_seen": 102864080, "step": 107720 }, { "epoch": 8.787421486254996, "grad_norm": 13.186457633972168, "learning_rate": 2.2065227277788785e-06, "loss": 0.4146, "num_input_tokens_seen": 102868224, "step": 107725 }, { "epoch": 8.787829349865405, "grad_norm": 3.076871395111084, "learning_rate": 2.2050609134546346e-06, "loss": 0.5292, "num_input_tokens_seen": 102872704, "step": 107730 }, { "epoch": 8.788237213475814, "grad_norm": 8.104390144348145, "learning_rate": 2.203599561174119e-06, "loss": 0.4238, "num_input_tokens_seen": 102877824, "step": 107735 }, { "epoch": 8.788645077086223, "grad_norm": 1.3119498491287231, "learning_rate": 2.202138670966955e-06, "loss": 0.2026, "num_input_tokens_seen": 102883488, "step": 107740 }, { "epoch": 8.789052940696632, "grad_norm": 3.275768518447876, "learning_rate": 2.200678242862747e-06, "loss": 0.2544, "num_input_tokens_seen": 102887152, "step": 107745 }, { "epoch": 8.78946080430704, "grad_norm": 2.6292014122009277, "learning_rate": 2.1992182768911073e-06, "loss": 0.2857, "num_input_tokens_seen": 102892368, "step": 107750 }, { "epoch": 8.789868667917448, "grad_norm": 1.6461315155029297, "learning_rate": 2.1977587730816262e-06, "loss": 0.2828, "num_input_tokens_seen": 102897536, "step": 107755 }, { "epoch": 8.790276531527857, "grad_norm": 6.063393592834473, "learning_rate": 2.1962997314638857e-06, "loss": 0.2799, "num_input_tokens_seen": 102902800, "step": 107760 }, { "epoch": 8.790684395138266, "grad_norm": 36.548500061035156, "learning_rate": 2.194841152067456e-06, "loss": 0.2532, "num_input_tokens_seen": 102907280, "step": 107765 }, { "epoch": 8.791092258748675, "grad_norm": 1.4927642345428467, "learning_rate": 2.1933830349219087e-06, "loss": 0.4959, "num_input_tokens_seen": 102912064, "step": 107770 }, { "epoch": 8.791500122359084, "grad_norm": 1.6528981924057007, "learning_rate": 2.1919253800567917e-06, "loss": 0.2578, "num_input_tokens_seen": 102917264, "step": 107775 }, { "epoch": 8.791907985969491, "grad_norm": 3.0521020889282227, "learning_rate": 2.19046818750166e-06, "loss": 0.3101, "num_input_tokens_seen": 102922352, "step": 107780 }, { "epoch": 8.7923158495799, "grad_norm": 2.930819272994995, "learning_rate": 2.1890114572860444e-06, "loss": 0.3776, "num_input_tokens_seen": 102928240, "step": 107785 }, { "epoch": 8.792723713190309, "grad_norm": 37.652896881103516, "learning_rate": 2.1875551894394746e-06, "loss": 0.6584, "num_input_tokens_seen": 102933056, "step": 107790 }, { "epoch": 8.793131576800718, "grad_norm": 9.873344421386719, "learning_rate": 2.186099383991466e-06, "loss": 0.2619, "num_input_tokens_seen": 102937648, "step": 107795 }, { "epoch": 8.793539440411127, "grad_norm": 3.1222786903381348, "learning_rate": 2.1846440409715314e-06, "loss": 0.1402, "num_input_tokens_seen": 102942048, "step": 107800 }, { "epoch": 8.793947304021536, "grad_norm": 2.5387966632843018, "learning_rate": 2.183189160409166e-06, "loss": 0.3856, "num_input_tokens_seen": 102947008, "step": 107805 }, { "epoch": 8.794355167631943, "grad_norm": 23.1016788482666, "learning_rate": 2.1817347423338545e-06, "loss": 0.3811, "num_input_tokens_seen": 102951840, "step": 107810 }, { "epoch": 8.794763031242352, "grad_norm": 6.9880828857421875, "learning_rate": 2.1802807867750907e-06, "loss": 0.377, "num_input_tokens_seen": 102956928, "step": 107815 }, { "epoch": 8.795170894852761, "grad_norm": 6.168269157409668, "learning_rate": 2.1788272937623394e-06, "loss": 0.5644, "num_input_tokens_seen": 102961952, "step": 107820 }, { "epoch": 8.79557875846317, "grad_norm": 12.279511451721191, "learning_rate": 2.177374263325058e-06, "loss": 0.3806, "num_input_tokens_seen": 102968240, "step": 107825 }, { "epoch": 8.79598662207358, "grad_norm": 9.180262565612793, "learning_rate": 2.1759216954927e-06, "loss": 0.3319, "num_input_tokens_seen": 102973120, "step": 107830 }, { "epoch": 8.796394485683987, "grad_norm": 30.30339813232422, "learning_rate": 2.174469590294717e-06, "loss": 0.3986, "num_input_tokens_seen": 102977520, "step": 107835 }, { "epoch": 8.796802349294396, "grad_norm": 46.37225341796875, "learning_rate": 2.1730179477605333e-06, "loss": 0.4402, "num_input_tokens_seen": 102980832, "step": 107840 }, { "epoch": 8.797210212904805, "grad_norm": 23.50660514831543, "learning_rate": 2.171566767919578e-06, "loss": 0.4777, "num_input_tokens_seen": 102985600, "step": 107845 }, { "epoch": 8.797618076515214, "grad_norm": 2.8376309871673584, "learning_rate": 2.170116050801263e-06, "loss": 0.3917, "num_input_tokens_seen": 102990144, "step": 107850 }, { "epoch": 8.798025940125623, "grad_norm": 0.9058136940002441, "learning_rate": 2.1686657964349875e-06, "loss": 0.4395, "num_input_tokens_seen": 102995088, "step": 107855 }, { "epoch": 8.79843380373603, "grad_norm": 2.642343521118164, "learning_rate": 2.1672160048501642e-06, "loss": 0.4476, "num_input_tokens_seen": 103000400, "step": 107860 }, { "epoch": 8.798841667346439, "grad_norm": 33.815799713134766, "learning_rate": 2.165766676076167e-06, "loss": 0.4026, "num_input_tokens_seen": 103005856, "step": 107865 }, { "epoch": 8.799249530956848, "grad_norm": 8.83061408996582, "learning_rate": 2.164317810142377e-06, "loss": 0.2859, "num_input_tokens_seen": 103011488, "step": 107870 }, { "epoch": 8.799657394567257, "grad_norm": 51.34899139404297, "learning_rate": 2.162869407078158e-06, "loss": 0.3278, "num_input_tokens_seen": 103015440, "step": 107875 }, { "epoch": 8.800065258177666, "grad_norm": 2.2635202407836914, "learning_rate": 2.1614214669128744e-06, "loss": 0.422, "num_input_tokens_seen": 103021008, "step": 107880 }, { "epoch": 8.800473121788073, "grad_norm": 25.876373291015625, "learning_rate": 2.159973989675876e-06, "loss": 0.4028, "num_input_tokens_seen": 103025456, "step": 107885 }, { "epoch": 8.800880985398482, "grad_norm": 14.73037052154541, "learning_rate": 2.1585269753964964e-06, "loss": 0.4186, "num_input_tokens_seen": 103030256, "step": 107890 }, { "epoch": 8.801288849008891, "grad_norm": 2.46622633934021, "learning_rate": 2.1570804241040713e-06, "loss": 0.4233, "num_input_tokens_seen": 103035504, "step": 107895 }, { "epoch": 8.8016967126193, "grad_norm": 9.752272605895996, "learning_rate": 2.1556343358279136e-06, "loss": 0.3832, "num_input_tokens_seen": 103040256, "step": 107900 }, { "epoch": 8.80210457622971, "grad_norm": 3.8928730487823486, "learning_rate": 2.1541887105973462e-06, "loss": 0.3078, "num_input_tokens_seen": 103045232, "step": 107905 }, { "epoch": 8.802512439840118, "grad_norm": 6.629335880279541, "learning_rate": 2.152743548441666e-06, "loss": 0.3778, "num_input_tokens_seen": 103050096, "step": 107910 }, { "epoch": 8.802920303450525, "grad_norm": 3.9911601543426514, "learning_rate": 2.1512988493901654e-06, "loss": 0.1475, "num_input_tokens_seen": 103055392, "step": 107915 }, { "epoch": 8.803328167060934, "grad_norm": 38.81910705566406, "learning_rate": 2.1498546134721272e-06, "loss": 0.2247, "num_input_tokens_seen": 103059056, "step": 107920 }, { "epoch": 8.803736030671343, "grad_norm": 23.743276596069336, "learning_rate": 2.1484108407168274e-06, "loss": 0.4064, "num_input_tokens_seen": 103063424, "step": 107925 }, { "epoch": 8.804143894281752, "grad_norm": 28.130130767822266, "learning_rate": 2.1469675311535294e-06, "loss": 0.2557, "num_input_tokens_seen": 103068208, "step": 107930 }, { "epoch": 8.804551757892161, "grad_norm": 2.5619959831237793, "learning_rate": 2.1455246848114895e-06, "loss": 0.2253, "num_input_tokens_seen": 103072400, "step": 107935 }, { "epoch": 8.804959621502569, "grad_norm": 16.182697296142578, "learning_rate": 2.1440823017199462e-06, "loss": 0.3718, "num_input_tokens_seen": 103077184, "step": 107940 }, { "epoch": 8.805367485112978, "grad_norm": 22.184389114379883, "learning_rate": 2.142640381908148e-06, "loss": 0.3684, "num_input_tokens_seen": 103082848, "step": 107945 }, { "epoch": 8.805775348723387, "grad_norm": 7.320306301116943, "learning_rate": 2.141198925405316e-06, "loss": 0.3605, "num_input_tokens_seen": 103086576, "step": 107950 }, { "epoch": 8.806183212333796, "grad_norm": 5.249170303344727, "learning_rate": 2.1397579322406714e-06, "loss": 0.4485, "num_input_tokens_seen": 103091664, "step": 107955 }, { "epoch": 8.806591075944205, "grad_norm": 13.300293922424316, "learning_rate": 2.138317402443418e-06, "loss": 0.3876, "num_input_tokens_seen": 103095664, "step": 107960 }, { "epoch": 8.806998939554614, "grad_norm": 41.18682098388672, "learning_rate": 2.1368773360427504e-06, "loss": 0.2627, "num_input_tokens_seen": 103100640, "step": 107965 }, { "epoch": 8.807406803165021, "grad_norm": 41.89704895019531, "learning_rate": 2.135437733067869e-06, "loss": 0.3999, "num_input_tokens_seen": 103106240, "step": 107970 }, { "epoch": 8.80781466677543, "grad_norm": 2.2657063007354736, "learning_rate": 2.133998593547951e-06, "loss": 0.1947, "num_input_tokens_seen": 103110688, "step": 107975 }, { "epoch": 8.808222530385839, "grad_norm": 4.073027610778809, "learning_rate": 2.1325599175121643e-06, "loss": 0.3479, "num_input_tokens_seen": 103115536, "step": 107980 }, { "epoch": 8.808630393996248, "grad_norm": 1.5104749202728271, "learning_rate": 2.1311217049896637e-06, "loss": 0.3496, "num_input_tokens_seen": 103119936, "step": 107985 }, { "epoch": 8.809038257606657, "grad_norm": 7.114727973937988, "learning_rate": 2.1296839560096176e-06, "loss": 0.3221, "num_input_tokens_seen": 103124208, "step": 107990 }, { "epoch": 8.809446121217064, "grad_norm": 18.240598678588867, "learning_rate": 2.1282466706011557e-06, "loss": 0.3786, "num_input_tokens_seen": 103128560, "step": 107995 }, { "epoch": 8.809853984827473, "grad_norm": 11.434941291809082, "learning_rate": 2.126809848793418e-06, "loss": 0.2208, "num_input_tokens_seen": 103132896, "step": 108000 }, { "epoch": 8.810261848437882, "grad_norm": 2.6485846042633057, "learning_rate": 2.125373490615523e-06, "loss": 0.405, "num_input_tokens_seen": 103138096, "step": 108005 }, { "epoch": 8.810669712048291, "grad_norm": 36.62828826904297, "learning_rate": 2.123937596096584e-06, "loss": 0.3829, "num_input_tokens_seen": 103144128, "step": 108010 }, { "epoch": 8.8110775756587, "grad_norm": 9.619998931884766, "learning_rate": 2.1225021652657135e-06, "loss": 0.2518, "num_input_tokens_seen": 103148928, "step": 108015 }, { "epoch": 8.81148543926911, "grad_norm": 9.564017295837402, "learning_rate": 2.121067198152002e-06, "loss": 0.2612, "num_input_tokens_seen": 103153856, "step": 108020 }, { "epoch": 8.811893302879517, "grad_norm": 7.211920738220215, "learning_rate": 2.1196326947845374e-06, "loss": 0.4306, "num_input_tokens_seen": 103158576, "step": 108025 }, { "epoch": 8.812301166489926, "grad_norm": 1.7688610553741455, "learning_rate": 2.118198655192391e-06, "loss": 0.3308, "num_input_tokens_seen": 103163248, "step": 108030 }, { "epoch": 8.812709030100335, "grad_norm": 27.667387008666992, "learning_rate": 2.1167650794046393e-06, "loss": 0.3405, "num_input_tokens_seen": 103167968, "step": 108035 }, { "epoch": 8.813116893710744, "grad_norm": 2.053987503051758, "learning_rate": 2.1153319674503373e-06, "loss": 0.283, "num_input_tokens_seen": 103173040, "step": 108040 }, { "epoch": 8.813524757321153, "grad_norm": 9.64345645904541, "learning_rate": 2.1138993193585312e-06, "loss": 0.4962, "num_input_tokens_seen": 103177072, "step": 108045 }, { "epoch": 8.81393262093156, "grad_norm": 8.557814598083496, "learning_rate": 2.112467135158261e-06, "loss": 0.4757, "num_input_tokens_seen": 103181424, "step": 108050 }, { "epoch": 8.814340484541969, "grad_norm": 3.6158807277679443, "learning_rate": 2.1110354148785537e-06, "loss": 0.2293, "num_input_tokens_seen": 103185488, "step": 108055 }, { "epoch": 8.814748348152378, "grad_norm": 34.29574966430664, "learning_rate": 2.1096041585484366e-06, "loss": 0.3488, "num_input_tokens_seen": 103191056, "step": 108060 }, { "epoch": 8.815156211762787, "grad_norm": 5.621246814727783, "learning_rate": 2.1081733661969137e-06, "loss": 0.3958, "num_input_tokens_seen": 103195328, "step": 108065 }, { "epoch": 8.815564075373196, "grad_norm": 17.928075790405273, "learning_rate": 2.10674303785299e-06, "loss": 0.2709, "num_input_tokens_seen": 103200336, "step": 108070 }, { "epoch": 8.815971938983603, "grad_norm": 25.820676803588867, "learning_rate": 2.1053131735456533e-06, "loss": 0.3186, "num_input_tokens_seen": 103204880, "step": 108075 }, { "epoch": 8.816379802594012, "grad_norm": 3.5489985942840576, "learning_rate": 2.103883773303894e-06, "loss": 0.313, "num_input_tokens_seen": 103209472, "step": 108080 }, { "epoch": 8.816787666204421, "grad_norm": 22.785831451416016, "learning_rate": 2.102454837156681e-06, "loss": 0.5202, "num_input_tokens_seen": 103214352, "step": 108085 }, { "epoch": 8.81719552981483, "grad_norm": 33.00645065307617, "learning_rate": 2.1010263651329796e-06, "loss": 0.4398, "num_input_tokens_seen": 103219392, "step": 108090 }, { "epoch": 8.81760339342524, "grad_norm": 46.90361785888672, "learning_rate": 2.099598357261742e-06, "loss": 0.2908, "num_input_tokens_seen": 103223744, "step": 108095 }, { "epoch": 8.818011257035646, "grad_norm": 19.704864501953125, "learning_rate": 2.098170813571912e-06, "loss": 0.3851, "num_input_tokens_seen": 103229104, "step": 108100 }, { "epoch": 8.818419120646055, "grad_norm": 5.2641777992248535, "learning_rate": 2.0967437340924297e-06, "loss": 0.1194, "num_input_tokens_seen": 103234352, "step": 108105 }, { "epoch": 8.818826984256464, "grad_norm": 5.821273326873779, "learning_rate": 2.095317118852222e-06, "loss": 0.3396, "num_input_tokens_seen": 103239376, "step": 108110 }, { "epoch": 8.819234847866873, "grad_norm": 14.442896842956543, "learning_rate": 2.093890967880205e-06, "loss": 0.4477, "num_input_tokens_seen": 103243792, "step": 108115 }, { "epoch": 8.819642711477282, "grad_norm": 10.58993911743164, "learning_rate": 2.0924652812052776e-06, "loss": 0.4021, "num_input_tokens_seen": 103249136, "step": 108120 }, { "epoch": 8.820050575087691, "grad_norm": 1.7103676795959473, "learning_rate": 2.09104005885635e-06, "loss": 0.284, "num_input_tokens_seen": 103253520, "step": 108125 }, { "epoch": 8.820458438698099, "grad_norm": 19.401975631713867, "learning_rate": 2.089615300862305e-06, "loss": 0.3588, "num_input_tokens_seen": 103258528, "step": 108130 }, { "epoch": 8.820866302308508, "grad_norm": 25.61678695678711, "learning_rate": 2.0881910072520243e-06, "loss": 0.2648, "num_input_tokens_seen": 103263536, "step": 108135 }, { "epoch": 8.821274165918917, "grad_norm": 42.75261688232422, "learning_rate": 2.0867671780543717e-06, "loss": 0.2926, "num_input_tokens_seen": 103268448, "step": 108140 }, { "epoch": 8.821682029529326, "grad_norm": 0.8759035468101501, "learning_rate": 2.0853438132982155e-06, "loss": 0.3137, "num_input_tokens_seen": 103272800, "step": 108145 }, { "epoch": 8.822089893139735, "grad_norm": 1.8411866426467896, "learning_rate": 2.0839209130124023e-06, "loss": 0.2836, "num_input_tokens_seen": 103276432, "step": 108150 }, { "epoch": 8.822497756750142, "grad_norm": 7.1011152267456055, "learning_rate": 2.0824984772257777e-06, "loss": 0.2174, "num_input_tokens_seen": 103281344, "step": 108155 }, { "epoch": 8.822905620360551, "grad_norm": 4.587558269500732, "learning_rate": 2.0810765059671664e-06, "loss": 0.3879, "num_input_tokens_seen": 103286080, "step": 108160 }, { "epoch": 8.82331348397096, "grad_norm": 42.9061279296875, "learning_rate": 2.079654999265396e-06, "loss": 0.5007, "num_input_tokens_seen": 103290368, "step": 108165 }, { "epoch": 8.823721347581369, "grad_norm": 14.649500846862793, "learning_rate": 2.0782339571492808e-06, "loss": 0.3905, "num_input_tokens_seen": 103295088, "step": 108170 }, { "epoch": 8.824129211191778, "grad_norm": 1.899431586265564, "learning_rate": 2.076813379647621e-06, "loss": 0.3786, "num_input_tokens_seen": 103300464, "step": 108175 }, { "epoch": 8.824537074802187, "grad_norm": 15.763468742370605, "learning_rate": 2.075393266789216e-06, "loss": 0.296, "num_input_tokens_seen": 103305264, "step": 108180 }, { "epoch": 8.824944938412594, "grad_norm": 21.583173751831055, "learning_rate": 2.073973618602848e-06, "loss": 0.4732, "num_input_tokens_seen": 103310144, "step": 108185 }, { "epoch": 8.825352802023003, "grad_norm": 23.492067337036133, "learning_rate": 2.0725544351172936e-06, "loss": 0.4113, "num_input_tokens_seen": 103315440, "step": 108190 }, { "epoch": 8.825760665633412, "grad_norm": 13.587035179138184, "learning_rate": 2.071135716361319e-06, "loss": 0.5238, "num_input_tokens_seen": 103320096, "step": 108195 }, { "epoch": 8.826168529243821, "grad_norm": 31.806032180786133, "learning_rate": 2.0697174623636794e-06, "loss": 0.4545, "num_input_tokens_seen": 103325296, "step": 108200 }, { "epoch": 8.82657639285423, "grad_norm": 0.6878485083580017, "learning_rate": 2.068299673153121e-06, "loss": 0.1792, "num_input_tokens_seen": 103330480, "step": 108205 }, { "epoch": 8.826984256464637, "grad_norm": 2.10546875, "learning_rate": 2.066882348758381e-06, "loss": 0.3004, "num_input_tokens_seen": 103335040, "step": 108210 }, { "epoch": 8.827392120075046, "grad_norm": 3.2466087341308594, "learning_rate": 2.065465489208196e-06, "loss": 0.1113, "num_input_tokens_seen": 103339840, "step": 108215 }, { "epoch": 8.827799983685455, "grad_norm": 43.782196044921875, "learning_rate": 2.064049094531281e-06, "loss": 0.4613, "num_input_tokens_seen": 103345008, "step": 108220 }, { "epoch": 8.828207847295864, "grad_norm": 14.279403686523438, "learning_rate": 2.0626331647563445e-06, "loss": 0.2303, "num_input_tokens_seen": 103350496, "step": 108225 }, { "epoch": 8.828615710906274, "grad_norm": 9.036111831665039, "learning_rate": 2.0612176999120798e-06, "loss": 0.4603, "num_input_tokens_seen": 103355024, "step": 108230 }, { "epoch": 8.829023574516683, "grad_norm": 26.373777389526367, "learning_rate": 2.0598027000271912e-06, "loss": 0.177, "num_input_tokens_seen": 103360704, "step": 108235 }, { "epoch": 8.82943143812709, "grad_norm": 2.2878572940826416, "learning_rate": 2.058388165130354e-06, "loss": 0.33, "num_input_tokens_seen": 103365040, "step": 108240 }, { "epoch": 8.829839301737499, "grad_norm": 0.6447942852973938, "learning_rate": 2.0569740952502416e-06, "loss": 0.2867, "num_input_tokens_seen": 103369056, "step": 108245 }, { "epoch": 8.830247165347908, "grad_norm": 23.750633239746094, "learning_rate": 2.0555604904155144e-06, "loss": 0.3611, "num_input_tokens_seen": 103373792, "step": 108250 }, { "epoch": 8.830655028958317, "grad_norm": 6.702017307281494, "learning_rate": 2.05414735065482e-06, "loss": 0.4073, "num_input_tokens_seen": 103378224, "step": 108255 }, { "epoch": 8.831062892568726, "grad_norm": 7.08266019821167, "learning_rate": 2.0527346759968146e-06, "loss": 0.4215, "num_input_tokens_seen": 103382704, "step": 108260 }, { "epoch": 8.831470756179133, "grad_norm": 5.97360897064209, "learning_rate": 2.0513224664701263e-06, "loss": 0.3165, "num_input_tokens_seen": 103387392, "step": 108265 }, { "epoch": 8.831878619789542, "grad_norm": 22.4205379486084, "learning_rate": 2.049910722103382e-06, "loss": 0.4039, "num_input_tokens_seen": 103391632, "step": 108270 }, { "epoch": 8.832286483399951, "grad_norm": 2.458895683288574, "learning_rate": 2.048499442925189e-06, "loss": 0.2103, "num_input_tokens_seen": 103396128, "step": 108275 }, { "epoch": 8.83269434701036, "grad_norm": 26.031496047973633, "learning_rate": 2.0470886289641637e-06, "loss": 0.4775, "num_input_tokens_seen": 103401568, "step": 108280 }, { "epoch": 8.833102210620769, "grad_norm": 20.103336334228516, "learning_rate": 2.0456782802488974e-06, "loss": 0.2431, "num_input_tokens_seen": 103405984, "step": 108285 }, { "epoch": 8.833510074231176, "grad_norm": 8.524582862854004, "learning_rate": 2.0442683968079803e-06, "loss": 0.2305, "num_input_tokens_seen": 103410608, "step": 108290 }, { "epoch": 8.833917937841585, "grad_norm": 16.873985290527344, "learning_rate": 2.04285897866999e-06, "loss": 0.5889, "num_input_tokens_seen": 103415120, "step": 108295 }, { "epoch": 8.834325801451994, "grad_norm": 2.5832371711730957, "learning_rate": 2.0414500258634867e-06, "loss": 0.3716, "num_input_tokens_seen": 103420432, "step": 108300 }, { "epoch": 8.834733665062403, "grad_norm": 11.346923828125, "learning_rate": 2.0400415384170397e-06, "loss": 0.182, "num_input_tokens_seen": 103424592, "step": 108305 }, { "epoch": 8.835141528672812, "grad_norm": 2.349592685699463, "learning_rate": 2.038633516359195e-06, "loss": 0.4675, "num_input_tokens_seen": 103429856, "step": 108310 }, { "epoch": 8.83554939228322, "grad_norm": 5.2922821044921875, "learning_rate": 2.037225959718492e-06, "loss": 0.3897, "num_input_tokens_seen": 103434592, "step": 108315 }, { "epoch": 8.835957255893629, "grad_norm": 2.303765058517456, "learning_rate": 2.035818868523459e-06, "loss": 0.1737, "num_input_tokens_seen": 103439216, "step": 108320 }, { "epoch": 8.836365119504038, "grad_norm": 49.277530670166016, "learning_rate": 2.034412242802622e-06, "loss": 0.5334, "num_input_tokens_seen": 103444256, "step": 108325 }, { "epoch": 8.836772983114447, "grad_norm": 3.3084475994110107, "learning_rate": 2.0330060825844905e-06, "loss": 0.2969, "num_input_tokens_seen": 103449584, "step": 108330 }, { "epoch": 8.837180846724856, "grad_norm": 11.812684059143066, "learning_rate": 2.031600387897567e-06, "loss": 0.1715, "num_input_tokens_seen": 103454528, "step": 108335 }, { "epoch": 8.837588710335265, "grad_norm": 2.2128872871398926, "learning_rate": 2.030195158770337e-06, "loss": 0.3895, "num_input_tokens_seen": 103460016, "step": 108340 }, { "epoch": 8.837996573945672, "grad_norm": 47.807918548583984, "learning_rate": 2.0287903952312975e-06, "loss": 0.4445, "num_input_tokens_seen": 103465040, "step": 108345 }, { "epoch": 8.83840443755608, "grad_norm": 2.158787488937378, "learning_rate": 2.0273860973089143e-06, "loss": 0.4972, "num_input_tokens_seen": 103469424, "step": 108350 }, { "epoch": 8.83881230116649, "grad_norm": 17.31421661376953, "learning_rate": 2.0259822650316533e-06, "loss": 0.6182, "num_input_tokens_seen": 103475056, "step": 108355 }, { "epoch": 8.839220164776899, "grad_norm": 18.766904830932617, "learning_rate": 2.024578898427967e-06, "loss": 0.3389, "num_input_tokens_seen": 103479840, "step": 108360 }, { "epoch": 8.839628028387308, "grad_norm": 38.74946975708008, "learning_rate": 2.0231759975263027e-06, "loss": 0.4804, "num_input_tokens_seen": 103484624, "step": 108365 }, { "epoch": 8.840035891997715, "grad_norm": 3.647404193878174, "learning_rate": 2.0217735623551002e-06, "loss": 0.3356, "num_input_tokens_seen": 103489664, "step": 108370 }, { "epoch": 8.840443755608124, "grad_norm": 14.899405479431152, "learning_rate": 2.0203715929427854e-06, "loss": 0.446, "num_input_tokens_seen": 103494288, "step": 108375 }, { "epoch": 8.840851619218533, "grad_norm": 17.865299224853516, "learning_rate": 2.018970089317773e-06, "loss": 0.4595, "num_input_tokens_seen": 103499376, "step": 108380 }, { "epoch": 8.841259482828942, "grad_norm": 21.972379684448242, "learning_rate": 2.0175690515084667e-06, "loss": 0.3506, "num_input_tokens_seen": 103503680, "step": 108385 }, { "epoch": 8.841667346439351, "grad_norm": 7.4046759605407715, "learning_rate": 2.0161684795432763e-06, "loss": 0.3728, "num_input_tokens_seen": 103509408, "step": 108390 }, { "epoch": 8.84207521004976, "grad_norm": 24.81574249267578, "learning_rate": 2.014768373450582e-06, "loss": 0.3675, "num_input_tokens_seen": 103513696, "step": 108395 }, { "epoch": 8.842483073660167, "grad_norm": 3.560511589050293, "learning_rate": 2.0133687332587665e-06, "loss": 0.2942, "num_input_tokens_seen": 103518272, "step": 108400 }, { "epoch": 8.842890937270576, "grad_norm": 4.546351909637451, "learning_rate": 2.011969558996199e-06, "loss": 0.4172, "num_input_tokens_seen": 103523024, "step": 108405 }, { "epoch": 8.843298800880985, "grad_norm": 9.953814506530762, "learning_rate": 2.0105708506912346e-06, "loss": 0.3117, "num_input_tokens_seen": 103528064, "step": 108410 }, { "epoch": 8.843706664491394, "grad_norm": 7.736817836761475, "learning_rate": 2.0091726083722366e-06, "loss": 0.3772, "num_input_tokens_seen": 103532560, "step": 108415 }, { "epoch": 8.844114528101803, "grad_norm": 14.332013130187988, "learning_rate": 2.007774832067541e-06, "loss": 0.3249, "num_input_tokens_seen": 103537696, "step": 108420 }, { "epoch": 8.84452239171221, "grad_norm": 2.2537620067596436, "learning_rate": 2.0063775218054776e-06, "loss": 0.3107, "num_input_tokens_seen": 103541616, "step": 108425 }, { "epoch": 8.84493025532262, "grad_norm": 2.755760431289673, "learning_rate": 2.004980677614368e-06, "loss": 0.3452, "num_input_tokens_seen": 103546128, "step": 108430 }, { "epoch": 8.845338118933029, "grad_norm": 9.45134449005127, "learning_rate": 2.0035842995225317e-06, "loss": 0.3396, "num_input_tokens_seen": 103549872, "step": 108435 }, { "epoch": 8.845745982543438, "grad_norm": 34.42272186279297, "learning_rate": 2.002188387558271e-06, "loss": 0.2708, "num_input_tokens_seen": 103555360, "step": 108440 }, { "epoch": 8.846153846153847, "grad_norm": 11.146798133850098, "learning_rate": 2.000792941749879e-06, "loss": 0.5221, "num_input_tokens_seen": 103560416, "step": 108445 }, { "epoch": 8.846561709764256, "grad_norm": 38.47454071044922, "learning_rate": 1.9993979621256432e-06, "loss": 0.2649, "num_input_tokens_seen": 103564992, "step": 108450 }, { "epoch": 8.846969573374663, "grad_norm": 10.877341270446777, "learning_rate": 1.9980034487138366e-06, "loss": 0.2597, "num_input_tokens_seen": 103570112, "step": 108455 }, { "epoch": 8.847377436985072, "grad_norm": 1.347562313079834, "learning_rate": 1.9966094015427234e-06, "loss": 0.3117, "num_input_tokens_seen": 103575248, "step": 108460 }, { "epoch": 8.847785300595481, "grad_norm": 44.07620620727539, "learning_rate": 1.995215820640564e-06, "loss": 0.3043, "num_input_tokens_seen": 103579488, "step": 108465 }, { "epoch": 8.84819316420589, "grad_norm": 6.647546768188477, "learning_rate": 1.9938227060356002e-06, "loss": 0.2218, "num_input_tokens_seen": 103582816, "step": 108470 }, { "epoch": 8.848601027816299, "grad_norm": 12.352885246276855, "learning_rate": 1.9924300577560814e-06, "loss": 0.325, "num_input_tokens_seen": 103587664, "step": 108475 }, { "epoch": 8.849008891426706, "grad_norm": 1.9556615352630615, "learning_rate": 1.991037875830226e-06, "loss": 0.2763, "num_input_tokens_seen": 103592736, "step": 108480 }, { "epoch": 8.849416755037115, "grad_norm": 44.863319396972656, "learning_rate": 1.989646160286254e-06, "loss": 0.2196, "num_input_tokens_seen": 103596304, "step": 108485 }, { "epoch": 8.849824618647524, "grad_norm": 2.9877402782440186, "learning_rate": 1.9882549111523792e-06, "loss": 0.3849, "num_input_tokens_seen": 103601088, "step": 108490 }, { "epoch": 8.850232482257933, "grad_norm": 31.067180633544922, "learning_rate": 1.9868641284567946e-06, "loss": 0.3686, "num_input_tokens_seen": 103606704, "step": 108495 }, { "epoch": 8.850640345868342, "grad_norm": 1.531752586364746, "learning_rate": 1.9854738122277004e-06, "loss": 0.3962, "num_input_tokens_seen": 103610992, "step": 108500 }, { "epoch": 8.85104820947875, "grad_norm": 12.315597534179688, "learning_rate": 1.9840839624932716e-06, "loss": 0.6528, "num_input_tokens_seen": 103616000, "step": 108505 }, { "epoch": 8.851456073089158, "grad_norm": 3.743938446044922, "learning_rate": 1.98269457928168e-06, "loss": 0.3632, "num_input_tokens_seen": 103619280, "step": 108510 }, { "epoch": 8.851863936699568, "grad_norm": 9.476252555847168, "learning_rate": 1.9813056626210886e-06, "loss": 0.3031, "num_input_tokens_seen": 103623952, "step": 108515 }, { "epoch": 8.852271800309977, "grad_norm": 32.779266357421875, "learning_rate": 1.9799172125396477e-06, "loss": 0.3452, "num_input_tokens_seen": 103627872, "step": 108520 }, { "epoch": 8.852679663920386, "grad_norm": 25.7829532623291, "learning_rate": 1.978529229065507e-06, "loss": 0.377, "num_input_tokens_seen": 103632768, "step": 108525 }, { "epoch": 8.853087527530795, "grad_norm": 5.199161529541016, "learning_rate": 1.977141712226796e-06, "loss": 0.423, "num_input_tokens_seen": 103637856, "step": 108530 }, { "epoch": 8.853495391141202, "grad_norm": 36.54241180419922, "learning_rate": 1.9757546620516375e-06, "loss": 0.423, "num_input_tokens_seen": 103642688, "step": 108535 }, { "epoch": 8.85390325475161, "grad_norm": 35.70207214355469, "learning_rate": 1.974368078568148e-06, "loss": 0.3643, "num_input_tokens_seen": 103647376, "step": 108540 }, { "epoch": 8.85431111836202, "grad_norm": 11.834300994873047, "learning_rate": 1.972981961804435e-06, "loss": 0.3896, "num_input_tokens_seen": 103652464, "step": 108545 }, { "epoch": 8.854718981972429, "grad_norm": 8.596368789672852, "learning_rate": 1.971596311788593e-06, "loss": 0.4563, "num_input_tokens_seen": 103657424, "step": 108550 }, { "epoch": 8.855126845582838, "grad_norm": 4.375720977783203, "learning_rate": 1.9702111285487074e-06, "loss": 0.3859, "num_input_tokens_seen": 103663648, "step": 108555 }, { "epoch": 8.855534709193245, "grad_norm": 34.37803268432617, "learning_rate": 1.9688264121128593e-06, "loss": 0.5175, "num_input_tokens_seen": 103667648, "step": 108560 }, { "epoch": 8.855942572803654, "grad_norm": 1.465160608291626, "learning_rate": 1.967442162509106e-06, "loss": 0.2333, "num_input_tokens_seen": 103672800, "step": 108565 }, { "epoch": 8.856350436414063, "grad_norm": 7.670046329498291, "learning_rate": 1.96605837976552e-06, "loss": 0.353, "num_input_tokens_seen": 103677520, "step": 108570 }, { "epoch": 8.856758300024472, "grad_norm": 31.867019653320312, "learning_rate": 1.96467506391014e-06, "loss": 0.4321, "num_input_tokens_seen": 103681408, "step": 108575 }, { "epoch": 8.857166163634881, "grad_norm": 0.46908074617385864, "learning_rate": 1.9632922149710104e-06, "loss": 0.2765, "num_input_tokens_seen": 103686272, "step": 108580 }, { "epoch": 8.857574027245288, "grad_norm": 3.5161328315734863, "learning_rate": 1.9619098329761584e-06, "loss": 0.3366, "num_input_tokens_seen": 103691008, "step": 108585 }, { "epoch": 8.857981890855697, "grad_norm": 17.674392700195312, "learning_rate": 1.9605279179536003e-06, "loss": 0.5158, "num_input_tokens_seen": 103696688, "step": 108590 }, { "epoch": 8.858389754466106, "grad_norm": 6.126110553741455, "learning_rate": 1.959146469931353e-06, "loss": 0.5147, "num_input_tokens_seen": 103701472, "step": 108595 }, { "epoch": 8.858797618076515, "grad_norm": 22.243410110473633, "learning_rate": 1.9577654889374184e-06, "loss": 0.2476, "num_input_tokens_seen": 103705952, "step": 108600 }, { "epoch": 8.859205481686924, "grad_norm": 7.328158378601074, "learning_rate": 1.9563849749997864e-06, "loss": 0.2956, "num_input_tokens_seen": 103710480, "step": 108605 }, { "epoch": 8.859613345297333, "grad_norm": 2.3260037899017334, "learning_rate": 1.9550049281464393e-06, "loss": 0.361, "num_input_tokens_seen": 103715072, "step": 108610 }, { "epoch": 8.86002120890774, "grad_norm": 9.506709098815918, "learning_rate": 1.9536253484053496e-06, "loss": 0.5674, "num_input_tokens_seen": 103720384, "step": 108615 }, { "epoch": 8.86042907251815, "grad_norm": 2.522087812423706, "learning_rate": 1.952246235804481e-06, "loss": 0.373, "num_input_tokens_seen": 103724656, "step": 108620 }, { "epoch": 8.860836936128559, "grad_norm": 1.036344289779663, "learning_rate": 1.9508675903717854e-06, "loss": 0.2505, "num_input_tokens_seen": 103729280, "step": 108625 }, { "epoch": 8.861244799738968, "grad_norm": 7.031239986419678, "learning_rate": 1.949489412135208e-06, "loss": 0.6633, "num_input_tokens_seen": 103733776, "step": 108630 }, { "epoch": 8.861652663349377, "grad_norm": 14.386134147644043, "learning_rate": 1.94811170112269e-06, "loss": 0.4071, "num_input_tokens_seen": 103738720, "step": 108635 }, { "epoch": 8.862060526959784, "grad_norm": 9.56823444366455, "learning_rate": 1.946734457362151e-06, "loss": 0.3994, "num_input_tokens_seen": 103744016, "step": 108640 }, { "epoch": 8.862468390570193, "grad_norm": 3.106975793838501, "learning_rate": 1.94535768088151e-06, "loss": 0.2567, "num_input_tokens_seen": 103748544, "step": 108645 }, { "epoch": 8.862876254180602, "grad_norm": 1.4987728595733643, "learning_rate": 1.9439813717086695e-06, "loss": 0.2555, "num_input_tokens_seen": 103752560, "step": 108650 }, { "epoch": 8.86328411779101, "grad_norm": 6.534255027770996, "learning_rate": 1.9426055298715273e-06, "loss": 0.3412, "num_input_tokens_seen": 103757840, "step": 108655 }, { "epoch": 8.86369198140142, "grad_norm": 9.618826866149902, "learning_rate": 1.941230155397977e-06, "loss": 0.3991, "num_input_tokens_seen": 103762512, "step": 108660 }, { "epoch": 8.864099845011829, "grad_norm": 13.411861419677734, "learning_rate": 1.9398552483158943e-06, "loss": 0.5506, "num_input_tokens_seen": 103767072, "step": 108665 }, { "epoch": 8.864507708622236, "grad_norm": 11.753742218017578, "learning_rate": 1.9384808086531454e-06, "loss": 0.3517, "num_input_tokens_seen": 103771296, "step": 108670 }, { "epoch": 8.864915572232645, "grad_norm": 24.143333435058594, "learning_rate": 1.937106836437588e-06, "loss": 0.3411, "num_input_tokens_seen": 103776352, "step": 108675 }, { "epoch": 8.865323435843054, "grad_norm": 17.320045471191406, "learning_rate": 1.9357333316970787e-06, "loss": 0.4113, "num_input_tokens_seen": 103780672, "step": 108680 }, { "epoch": 8.865731299453463, "grad_norm": 12.720763206481934, "learning_rate": 1.934360294459456e-06, "loss": 0.344, "num_input_tokens_seen": 103785328, "step": 108685 }, { "epoch": 8.866139163063872, "grad_norm": 5.906503677368164, "learning_rate": 1.9329877247525467e-06, "loss": 0.344, "num_input_tokens_seen": 103789648, "step": 108690 }, { "epoch": 8.86654702667428, "grad_norm": 19.659992218017578, "learning_rate": 1.9316156226041687e-06, "loss": 0.3454, "num_input_tokens_seen": 103794896, "step": 108695 }, { "epoch": 8.866954890284688, "grad_norm": 24.77082633972168, "learning_rate": 1.9302439880421463e-06, "loss": 0.4549, "num_input_tokens_seen": 103799680, "step": 108700 }, { "epoch": 8.867362753895097, "grad_norm": 34.57133483886719, "learning_rate": 1.9288728210942767e-06, "loss": 0.3101, "num_input_tokens_seen": 103804224, "step": 108705 }, { "epoch": 8.867770617505506, "grad_norm": 9.827077865600586, "learning_rate": 1.9275021217883494e-06, "loss": 0.2391, "num_input_tokens_seen": 103809424, "step": 108710 }, { "epoch": 8.868178481115915, "grad_norm": 12.791791915893555, "learning_rate": 1.9261318901521495e-06, "loss": 0.4184, "num_input_tokens_seen": 103814960, "step": 108715 }, { "epoch": 8.868586344726324, "grad_norm": 22.68103790283203, "learning_rate": 1.9247621262134475e-06, "loss": 0.4224, "num_input_tokens_seen": 103819024, "step": 108720 }, { "epoch": 8.868994208336732, "grad_norm": 9.300902366638184, "learning_rate": 1.9233928300000175e-06, "loss": 0.5033, "num_input_tokens_seen": 103823376, "step": 108725 }, { "epoch": 8.86940207194714, "grad_norm": 6.196560859680176, "learning_rate": 1.92202400153961e-06, "loss": 0.2972, "num_input_tokens_seen": 103827584, "step": 108730 }, { "epoch": 8.86980993555755, "grad_norm": 18.101896286010742, "learning_rate": 1.920655640859967e-06, "loss": 0.2364, "num_input_tokens_seen": 103833392, "step": 108735 }, { "epoch": 8.870217799167959, "grad_norm": 5.6637043952941895, "learning_rate": 1.919287747988827e-06, "loss": 0.3348, "num_input_tokens_seen": 103837664, "step": 108740 }, { "epoch": 8.870625662778368, "grad_norm": 16.04564666748047, "learning_rate": 1.91792032295392e-06, "loss": 0.3149, "num_input_tokens_seen": 103842064, "step": 108745 }, { "epoch": 8.871033526388775, "grad_norm": 9.976119995117188, "learning_rate": 1.9165533657829555e-06, "loss": 0.3432, "num_input_tokens_seen": 103847216, "step": 108750 }, { "epoch": 8.871441389999184, "grad_norm": 2.686220169067383, "learning_rate": 1.9151868765036497e-06, "loss": 0.5208, "num_input_tokens_seen": 103851568, "step": 108755 }, { "epoch": 8.871849253609593, "grad_norm": 8.951343536376953, "learning_rate": 1.9138208551436942e-06, "loss": 0.2536, "num_input_tokens_seen": 103855744, "step": 108760 }, { "epoch": 8.872257117220002, "grad_norm": 5.832307815551758, "learning_rate": 1.9124553017307754e-06, "loss": 0.2396, "num_input_tokens_seen": 103860192, "step": 108765 }, { "epoch": 8.872664980830411, "grad_norm": 41.0195426940918, "learning_rate": 1.9110902162925824e-06, "loss": 0.5173, "num_input_tokens_seen": 103863552, "step": 108770 }, { "epoch": 8.873072844440818, "grad_norm": 9.114982604980469, "learning_rate": 1.9097255988567793e-06, "loss": 0.3863, "num_input_tokens_seen": 103868096, "step": 108775 }, { "epoch": 8.873480708051227, "grad_norm": 19.146825790405273, "learning_rate": 1.9083614494510264e-06, "loss": 0.317, "num_input_tokens_seen": 103873184, "step": 108780 }, { "epoch": 8.873888571661636, "grad_norm": 3.5178465843200684, "learning_rate": 1.9069977681029722e-06, "loss": 0.4793, "num_input_tokens_seen": 103877488, "step": 108785 }, { "epoch": 8.874296435272045, "grad_norm": 34.43178939819336, "learning_rate": 1.9056345548402632e-06, "loss": 0.2959, "num_input_tokens_seen": 103881600, "step": 108790 }, { "epoch": 8.874704298882454, "grad_norm": 16.574626922607422, "learning_rate": 1.9042718096905276e-06, "loss": 0.4309, "num_input_tokens_seen": 103886416, "step": 108795 }, { "epoch": 8.875112162492862, "grad_norm": 2.6948082447052, "learning_rate": 1.9029095326813907e-06, "loss": 0.3176, "num_input_tokens_seen": 103890704, "step": 108800 }, { "epoch": 8.87552002610327, "grad_norm": 13.713279724121094, "learning_rate": 1.9015477238404605e-06, "loss": 0.3914, "num_input_tokens_seen": 103895568, "step": 108805 }, { "epoch": 8.87592788971368, "grad_norm": 0.8234747052192688, "learning_rate": 1.9001863831953403e-06, "loss": 0.4472, "num_input_tokens_seen": 103899968, "step": 108810 }, { "epoch": 8.876335753324089, "grad_norm": 12.941644668579102, "learning_rate": 1.8988255107736303e-06, "loss": 0.4065, "num_input_tokens_seen": 103905232, "step": 108815 }, { "epoch": 8.876743616934498, "grad_norm": 9.091936111450195, "learning_rate": 1.897465106602911e-06, "loss": 0.4765, "num_input_tokens_seen": 103910448, "step": 108820 }, { "epoch": 8.877151480544907, "grad_norm": 4.3739142417907715, "learning_rate": 1.8961051707107547e-06, "loss": 0.3631, "num_input_tokens_seen": 103914608, "step": 108825 }, { "epoch": 8.877559344155314, "grad_norm": 7.90826416015625, "learning_rate": 1.8947457031247285e-06, "loss": 0.4057, "num_input_tokens_seen": 103919952, "step": 108830 }, { "epoch": 8.877967207765723, "grad_norm": 0.8050577640533447, "learning_rate": 1.8933867038723907e-06, "loss": 0.3065, "num_input_tokens_seen": 103925376, "step": 108835 }, { "epoch": 8.878375071376132, "grad_norm": 3.11574125289917, "learning_rate": 1.8920281729812862e-06, "loss": 0.1765, "num_input_tokens_seen": 103930144, "step": 108840 }, { "epoch": 8.87878293498654, "grad_norm": 1.608042597770691, "learning_rate": 1.8906701104789537e-06, "loss": 0.2706, "num_input_tokens_seen": 103934800, "step": 108845 }, { "epoch": 8.87919079859695, "grad_norm": 8.85161018371582, "learning_rate": 1.8893125163929159e-06, "loss": 0.3079, "num_input_tokens_seen": 103939248, "step": 108850 }, { "epoch": 8.879598662207357, "grad_norm": 6.099125862121582, "learning_rate": 1.8879553907506897e-06, "loss": 0.5027, "num_input_tokens_seen": 103943520, "step": 108855 }, { "epoch": 8.880006525817766, "grad_norm": 2.440964698791504, "learning_rate": 1.886598733579792e-06, "loss": 0.2685, "num_input_tokens_seen": 103947808, "step": 108860 }, { "epoch": 8.880414389428175, "grad_norm": 35.20002365112305, "learning_rate": 1.8852425449077143e-06, "loss": 0.4295, "num_input_tokens_seen": 103952304, "step": 108865 }, { "epoch": 8.880822253038584, "grad_norm": 2.4970204830169678, "learning_rate": 1.883886824761949e-06, "loss": 0.3216, "num_input_tokens_seen": 103957568, "step": 108870 }, { "epoch": 8.881230116648993, "grad_norm": 2.9464340209960938, "learning_rate": 1.8825315731699767e-06, "loss": 0.397, "num_input_tokens_seen": 103962480, "step": 108875 }, { "epoch": 8.881637980259402, "grad_norm": 1.467846155166626, "learning_rate": 1.8811767901592642e-06, "loss": 0.3129, "num_input_tokens_seen": 103967120, "step": 108880 }, { "epoch": 8.88204584386981, "grad_norm": 1.1364669799804688, "learning_rate": 1.8798224757572757e-06, "loss": 0.4341, "num_input_tokens_seen": 103971552, "step": 108885 }, { "epoch": 8.882453707480218, "grad_norm": 6.993063926696777, "learning_rate": 1.878468629991459e-06, "loss": 0.3562, "num_input_tokens_seen": 103975840, "step": 108890 }, { "epoch": 8.882861571090627, "grad_norm": 7.267539978027344, "learning_rate": 1.8771152528892555e-06, "loss": 0.2049, "num_input_tokens_seen": 103980496, "step": 108895 }, { "epoch": 8.883269434701036, "grad_norm": 18.20572853088379, "learning_rate": 1.8757623444781047e-06, "loss": 0.5114, "num_input_tokens_seen": 103985216, "step": 108900 }, { "epoch": 8.883677298311445, "grad_norm": 2.901268482208252, "learning_rate": 1.8744099047854263e-06, "loss": 0.264, "num_input_tokens_seen": 103990304, "step": 108905 }, { "epoch": 8.884085161921853, "grad_norm": 48.21882247924805, "learning_rate": 1.8730579338386317e-06, "loss": 0.4308, "num_input_tokens_seen": 103995088, "step": 108910 }, { "epoch": 8.884493025532262, "grad_norm": 14.8076171875, "learning_rate": 1.8717064316651239e-06, "loss": 0.4426, "num_input_tokens_seen": 103999360, "step": 108915 }, { "epoch": 8.88490088914267, "grad_norm": 3.214370012283325, "learning_rate": 1.8703553982922978e-06, "loss": 0.4152, "num_input_tokens_seen": 104003360, "step": 108920 }, { "epoch": 8.88530875275308, "grad_norm": 1.8329542875289917, "learning_rate": 1.8690048337475397e-06, "loss": 0.2449, "num_input_tokens_seen": 104007376, "step": 108925 }, { "epoch": 8.885716616363489, "grad_norm": 13.715445518493652, "learning_rate": 1.8676547380582276e-06, "loss": 0.4629, "num_input_tokens_seen": 104011728, "step": 108930 }, { "epoch": 8.886124479973898, "grad_norm": 16.413610458374023, "learning_rate": 1.8663051112517232e-06, "loss": 0.2689, "num_input_tokens_seen": 104016672, "step": 108935 }, { "epoch": 8.886532343584305, "grad_norm": 45.93177032470703, "learning_rate": 1.8649559533553796e-06, "loss": 0.3768, "num_input_tokens_seen": 104020912, "step": 108940 }, { "epoch": 8.886940207194714, "grad_norm": 8.993273735046387, "learning_rate": 1.8636072643965551e-06, "loss": 0.2775, "num_input_tokens_seen": 104024768, "step": 108945 }, { "epoch": 8.887348070805123, "grad_norm": 4.097333908081055, "learning_rate": 1.8622590444025783e-06, "loss": 0.3893, "num_input_tokens_seen": 104029632, "step": 108950 }, { "epoch": 8.887755934415532, "grad_norm": 4.944906234741211, "learning_rate": 1.8609112934007773e-06, "loss": 0.4721, "num_input_tokens_seen": 104034384, "step": 108955 }, { "epoch": 8.888163798025941, "grad_norm": 3.5440027713775635, "learning_rate": 1.8595640114184742e-06, "loss": 0.2307, "num_input_tokens_seen": 104039808, "step": 108960 }, { "epoch": 8.888571661636348, "grad_norm": 1.7993459701538086, "learning_rate": 1.85821719848297e-06, "loss": 0.3122, "num_input_tokens_seen": 104044480, "step": 108965 }, { "epoch": 8.888979525246757, "grad_norm": 7.329512596130371, "learning_rate": 1.856870854621573e-06, "loss": 0.4416, "num_input_tokens_seen": 104049840, "step": 108970 }, { "epoch": 8.889387388857166, "grad_norm": 1.4933472871780396, "learning_rate": 1.8555249798615726e-06, "loss": 0.4141, "num_input_tokens_seen": 104055024, "step": 108975 }, { "epoch": 8.889795252467575, "grad_norm": 26.60417366027832, "learning_rate": 1.8541795742302443e-06, "loss": 0.1746, "num_input_tokens_seen": 104059296, "step": 108980 }, { "epoch": 8.890203116077984, "grad_norm": 14.119250297546387, "learning_rate": 1.852834637754855e-06, "loss": 0.4885, "num_input_tokens_seen": 104064112, "step": 108985 }, { "epoch": 8.890610979688391, "grad_norm": 1.1612157821655273, "learning_rate": 1.8514901704626802e-06, "loss": 0.1834, "num_input_tokens_seen": 104068576, "step": 108990 }, { "epoch": 8.8910188432988, "grad_norm": 28.790197372436523, "learning_rate": 1.8501461723809593e-06, "loss": 0.1759, "num_input_tokens_seen": 104073152, "step": 108995 }, { "epoch": 8.89142670690921, "grad_norm": 11.693382263183594, "learning_rate": 1.8488026435369398e-06, "loss": 0.407, "num_input_tokens_seen": 104078016, "step": 109000 }, { "epoch": 8.891834570519618, "grad_norm": 34.02867126464844, "learning_rate": 1.847459583957853e-06, "loss": 0.4673, "num_input_tokens_seen": 104083600, "step": 109005 }, { "epoch": 8.892242434130027, "grad_norm": 1.3334063291549683, "learning_rate": 1.846116993670924e-06, "loss": 0.4077, "num_input_tokens_seen": 104088096, "step": 109010 }, { "epoch": 8.892650297740435, "grad_norm": 15.339778900146484, "learning_rate": 1.8447748727033643e-06, "loss": 0.4319, "num_input_tokens_seen": 104093344, "step": 109015 }, { "epoch": 8.893058161350844, "grad_norm": 7.272938251495361, "learning_rate": 1.8434332210823775e-06, "loss": 0.3838, "num_input_tokens_seen": 104097520, "step": 109020 }, { "epoch": 8.893466024961253, "grad_norm": 7.182590484619141, "learning_rate": 1.8420920388351582e-06, "loss": 0.2566, "num_input_tokens_seen": 104102960, "step": 109025 }, { "epoch": 8.893873888571662, "grad_norm": 2.7039825916290283, "learning_rate": 1.8407513259888904e-06, "loss": 0.3242, "num_input_tokens_seen": 104108464, "step": 109030 }, { "epoch": 8.89428175218207, "grad_norm": 9.57785701751709, "learning_rate": 1.8394110825707578e-06, "loss": 0.2764, "num_input_tokens_seen": 104112880, "step": 109035 }, { "epoch": 8.89468961579248, "grad_norm": 45.19483947753906, "learning_rate": 1.8380713086079193e-06, "loss": 0.4013, "num_input_tokens_seen": 104117264, "step": 109040 }, { "epoch": 8.895097479402887, "grad_norm": 2.6895737648010254, "learning_rate": 1.8367320041275337e-06, "loss": 0.3719, "num_input_tokens_seen": 104122560, "step": 109045 }, { "epoch": 8.895505343013296, "grad_norm": 3.7202138900756836, "learning_rate": 1.8353931691567434e-06, "loss": 0.4455, "num_input_tokens_seen": 104127008, "step": 109050 }, { "epoch": 8.895913206623705, "grad_norm": 5.691122531890869, "learning_rate": 1.8340548037226957e-06, "loss": 0.2424, "num_input_tokens_seen": 104132592, "step": 109055 }, { "epoch": 8.896321070234114, "grad_norm": 15.627301216125488, "learning_rate": 1.8327169078525108e-06, "loss": 0.2467, "num_input_tokens_seen": 104136352, "step": 109060 }, { "epoch": 8.896728933844523, "grad_norm": 15.083399772644043, "learning_rate": 1.8313794815733114e-06, "loss": 0.3036, "num_input_tokens_seen": 104142112, "step": 109065 }, { "epoch": 8.89713679745493, "grad_norm": 8.713423728942871, "learning_rate": 1.8300425249122066e-06, "loss": 0.295, "num_input_tokens_seen": 104146944, "step": 109070 }, { "epoch": 8.89754466106534, "grad_norm": 26.104536056518555, "learning_rate": 1.8287060378962883e-06, "loss": 0.4585, "num_input_tokens_seen": 104152112, "step": 109075 }, { "epoch": 8.897952524675748, "grad_norm": 10.246734619140625, "learning_rate": 1.8273700205526544e-06, "loss": 0.2768, "num_input_tokens_seen": 104157568, "step": 109080 }, { "epoch": 8.898360388286157, "grad_norm": 30.39059066772461, "learning_rate": 1.8260344729083862e-06, "loss": 0.4431, "num_input_tokens_seen": 104162000, "step": 109085 }, { "epoch": 8.898768251896566, "grad_norm": 10.339616775512695, "learning_rate": 1.8246993949905534e-06, "loss": 0.5043, "num_input_tokens_seen": 104166704, "step": 109090 }, { "epoch": 8.899176115506975, "grad_norm": 7.541053771972656, "learning_rate": 1.8233647868262094e-06, "loss": 0.3339, "num_input_tokens_seen": 104171440, "step": 109095 }, { "epoch": 8.899583979117383, "grad_norm": 1.865134835243225, "learning_rate": 1.8220306484424187e-06, "loss": 0.3533, "num_input_tokens_seen": 104176016, "step": 109100 }, { "epoch": 8.899991842727792, "grad_norm": 7.805665969848633, "learning_rate": 1.8206969798662154e-06, "loss": 0.3816, "num_input_tokens_seen": 104180752, "step": 109105 }, { "epoch": 8.9003997063382, "grad_norm": 9.585586547851562, "learning_rate": 1.8193637811246361e-06, "loss": 0.3869, "num_input_tokens_seen": 104186128, "step": 109110 }, { "epoch": 8.90080756994861, "grad_norm": 1.2196011543273926, "learning_rate": 1.8180310522447036e-06, "loss": 0.4029, "num_input_tokens_seen": 104191280, "step": 109115 }, { "epoch": 8.901215433559019, "grad_norm": 22.403051376342773, "learning_rate": 1.8166987932534268e-06, "loss": 0.3208, "num_input_tokens_seen": 104195536, "step": 109120 }, { "epoch": 8.901623297169426, "grad_norm": 37.51948547363281, "learning_rate": 1.8153670041778176e-06, "loss": 0.5012, "num_input_tokens_seen": 104200320, "step": 109125 }, { "epoch": 8.902031160779835, "grad_norm": 3.742280960083008, "learning_rate": 1.814035685044868e-06, "loss": 0.3206, "num_input_tokens_seen": 104203904, "step": 109130 }, { "epoch": 8.902439024390244, "grad_norm": 3.1484689712524414, "learning_rate": 1.812704835881565e-06, "loss": 0.3236, "num_input_tokens_seen": 104208384, "step": 109135 }, { "epoch": 8.902846888000653, "grad_norm": 37.74723815917969, "learning_rate": 1.8113744567148789e-06, "loss": 0.2444, "num_input_tokens_seen": 104213136, "step": 109140 }, { "epoch": 8.903254751611062, "grad_norm": 5.149608135223389, "learning_rate": 1.8100445475717792e-06, "loss": 0.1897, "num_input_tokens_seen": 104218448, "step": 109145 }, { "epoch": 8.90366261522147, "grad_norm": 4.6379475593566895, "learning_rate": 1.8087151084792253e-06, "loss": 0.2804, "num_input_tokens_seen": 104222720, "step": 109150 }, { "epoch": 8.904070478831878, "grad_norm": 16.39967918395996, "learning_rate": 1.8073861394641595e-06, "loss": 0.2105, "num_input_tokens_seen": 104227792, "step": 109155 }, { "epoch": 8.904478342442287, "grad_norm": 11.311018943786621, "learning_rate": 1.8060576405535212e-06, "loss": 0.3891, "num_input_tokens_seen": 104232432, "step": 109160 }, { "epoch": 8.904886206052696, "grad_norm": 2.739841938018799, "learning_rate": 1.8047296117742334e-06, "loss": 0.2162, "num_input_tokens_seen": 104237088, "step": 109165 }, { "epoch": 8.905294069663105, "grad_norm": 32.357749938964844, "learning_rate": 1.8034020531532275e-06, "loss": 0.373, "num_input_tokens_seen": 104241120, "step": 109170 }, { "epoch": 8.905701933273514, "grad_norm": 20.84423065185547, "learning_rate": 1.802074964717404e-06, "loss": 0.2131, "num_input_tokens_seen": 104245632, "step": 109175 }, { "epoch": 8.906109796883921, "grad_norm": 28.99855613708496, "learning_rate": 1.8007483464936637e-06, "loss": 0.3423, "num_input_tokens_seen": 104250032, "step": 109180 }, { "epoch": 8.90651766049433, "grad_norm": 38.908363342285156, "learning_rate": 1.7994221985088905e-06, "loss": 0.2339, "num_input_tokens_seen": 104254496, "step": 109185 }, { "epoch": 8.90692552410474, "grad_norm": 25.316831588745117, "learning_rate": 1.7980965207899769e-06, "loss": 0.2801, "num_input_tokens_seen": 104259440, "step": 109190 }, { "epoch": 8.907333387715148, "grad_norm": 8.151199340820312, "learning_rate": 1.7967713133637876e-06, "loss": 0.2927, "num_input_tokens_seen": 104263632, "step": 109195 }, { "epoch": 8.907741251325557, "grad_norm": 14.782256126403809, "learning_rate": 1.7954465762571844e-06, "loss": 0.4246, "num_input_tokens_seen": 104267984, "step": 109200 }, { "epoch": 8.908149114935965, "grad_norm": 13.965200424194336, "learning_rate": 1.7941223094970178e-06, "loss": 0.2917, "num_input_tokens_seen": 104272768, "step": 109205 }, { "epoch": 8.908556978546374, "grad_norm": 1.3078709840774536, "learning_rate": 1.7927985131101276e-06, "loss": 0.3251, "num_input_tokens_seen": 104277104, "step": 109210 }, { "epoch": 8.908964842156783, "grad_norm": 9.464371681213379, "learning_rate": 1.7914751871233536e-06, "loss": 0.3195, "num_input_tokens_seen": 104281792, "step": 109215 }, { "epoch": 8.909372705767192, "grad_norm": 2.590078353881836, "learning_rate": 1.790152331563516e-06, "loss": 0.4236, "num_input_tokens_seen": 104286896, "step": 109220 }, { "epoch": 8.9097805693776, "grad_norm": 33.020751953125, "learning_rate": 1.7888299464574293e-06, "loss": 0.4964, "num_input_tokens_seen": 104291840, "step": 109225 }, { "epoch": 8.910188432988008, "grad_norm": 2.3377251625061035, "learning_rate": 1.7875080318318914e-06, "loss": 0.4803, "num_input_tokens_seen": 104295760, "step": 109230 }, { "epoch": 8.910596296598417, "grad_norm": 2.81169056892395, "learning_rate": 1.7861865877137062e-06, "loss": 0.3768, "num_input_tokens_seen": 104301408, "step": 109235 }, { "epoch": 8.911004160208826, "grad_norm": 6.590303421020508, "learning_rate": 1.784865614129655e-06, "loss": 0.2131, "num_input_tokens_seen": 104305824, "step": 109240 }, { "epoch": 8.911412023819235, "grad_norm": 30.42841339111328, "learning_rate": 1.7835451111065131e-06, "loss": 0.4307, "num_input_tokens_seen": 104310704, "step": 109245 }, { "epoch": 8.911819887429644, "grad_norm": 24.035173416137695, "learning_rate": 1.7822250786710404e-06, "loss": 0.2586, "num_input_tokens_seen": 104315648, "step": 109250 }, { "epoch": 8.912227751040053, "grad_norm": 11.043082237243652, "learning_rate": 1.7809055168500065e-06, "loss": 0.2146, "num_input_tokens_seen": 104320272, "step": 109255 }, { "epoch": 8.91263561465046, "grad_norm": 14.523189544677734, "learning_rate": 1.7795864256701516e-06, "loss": 0.4917, "num_input_tokens_seen": 104323968, "step": 109260 }, { "epoch": 8.91304347826087, "grad_norm": 2.740912914276123, "learning_rate": 1.7782678051582096e-06, "loss": 0.3401, "num_input_tokens_seen": 104328800, "step": 109265 }, { "epoch": 8.913451341871278, "grad_norm": 1.636289358139038, "learning_rate": 1.7769496553409148e-06, "loss": 0.1811, "num_input_tokens_seen": 104333120, "step": 109270 }, { "epoch": 8.913859205481687, "grad_norm": 41.06982421875, "learning_rate": 1.775631976244982e-06, "loss": 0.4936, "num_input_tokens_seen": 104338272, "step": 109275 }, { "epoch": 8.914267069092096, "grad_norm": 27.317625045776367, "learning_rate": 1.7743147678971201e-06, "loss": 0.4773, "num_input_tokens_seen": 104343600, "step": 109280 }, { "epoch": 8.914674932702503, "grad_norm": 2.70821213722229, "learning_rate": 1.7729980303240245e-06, "loss": 0.4822, "num_input_tokens_seen": 104348656, "step": 109285 }, { "epoch": 8.915082796312912, "grad_norm": 11.324933052062988, "learning_rate": 1.7716817635523908e-06, "loss": 0.3171, "num_input_tokens_seen": 104353040, "step": 109290 }, { "epoch": 8.915490659923321, "grad_norm": 22.63779640197754, "learning_rate": 1.7703659676089001e-06, "loss": 0.5369, "num_input_tokens_seen": 104357424, "step": 109295 }, { "epoch": 8.91589852353373, "grad_norm": 26.208600997924805, "learning_rate": 1.7690506425202201e-06, "loss": 0.3043, "num_input_tokens_seen": 104363040, "step": 109300 }, { "epoch": 8.91630638714414, "grad_norm": 11.455493927001953, "learning_rate": 1.767735788313013e-06, "loss": 0.402, "num_input_tokens_seen": 104367984, "step": 109305 }, { "epoch": 8.916714250754548, "grad_norm": 6.853171348571777, "learning_rate": 1.7664214050139267e-06, "loss": 0.4547, "num_input_tokens_seen": 104373136, "step": 109310 }, { "epoch": 8.917122114364956, "grad_norm": 22.60442352294922, "learning_rate": 1.7651074926496092e-06, "loss": 0.2802, "num_input_tokens_seen": 104378240, "step": 109315 }, { "epoch": 8.917529977975365, "grad_norm": 3.2513458728790283, "learning_rate": 1.7637940512466838e-06, "loss": 0.2924, "num_input_tokens_seen": 104382752, "step": 109320 }, { "epoch": 8.917937841585774, "grad_norm": 38.194740295410156, "learning_rate": 1.7624810808317822e-06, "loss": 0.4094, "num_input_tokens_seen": 104386720, "step": 109325 }, { "epoch": 8.918345705196183, "grad_norm": 3.831254720687866, "learning_rate": 1.7611685814315188e-06, "loss": 0.4766, "num_input_tokens_seen": 104392016, "step": 109330 }, { "epoch": 8.918753568806592, "grad_norm": 1.0544286966323853, "learning_rate": 1.7598565530724892e-06, "loss": 0.4062, "num_input_tokens_seen": 104396432, "step": 109335 }, { "epoch": 8.919161432416999, "grad_norm": 14.898114204406738, "learning_rate": 1.7585449957812916e-06, "loss": 0.3123, "num_input_tokens_seen": 104401328, "step": 109340 }, { "epoch": 8.919569296027408, "grad_norm": 27.45931625366211, "learning_rate": 1.757233909584513e-06, "loss": 0.3039, "num_input_tokens_seen": 104406016, "step": 109345 }, { "epoch": 8.919977159637817, "grad_norm": 30.975954055786133, "learning_rate": 1.7559232945087267e-06, "loss": 0.3235, "num_input_tokens_seen": 104410800, "step": 109350 }, { "epoch": 8.920385023248226, "grad_norm": 10.145147323608398, "learning_rate": 1.7546131505804975e-06, "loss": 0.323, "num_input_tokens_seen": 104415056, "step": 109355 }, { "epoch": 8.920792886858635, "grad_norm": 13.920097351074219, "learning_rate": 1.7533034778263819e-06, "loss": 0.3016, "num_input_tokens_seen": 104418848, "step": 109360 }, { "epoch": 8.921200750469044, "grad_norm": 27.318851470947266, "learning_rate": 1.7519942762729251e-06, "loss": 0.6008, "num_input_tokens_seen": 104423376, "step": 109365 }, { "epoch": 8.921608614079451, "grad_norm": 7.986074924468994, "learning_rate": 1.7506855459466675e-06, "loss": 0.35, "num_input_tokens_seen": 104428176, "step": 109370 }, { "epoch": 8.92201647768986, "grad_norm": 5.374252796173096, "learning_rate": 1.7493772868741377e-06, "loss": 0.2615, "num_input_tokens_seen": 104433216, "step": 109375 }, { "epoch": 8.92242434130027, "grad_norm": 18.78190040588379, "learning_rate": 1.7480694990818476e-06, "loss": 0.398, "num_input_tokens_seen": 104438672, "step": 109380 }, { "epoch": 8.922832204910678, "grad_norm": 12.09681224822998, "learning_rate": 1.7467621825963038e-06, "loss": 0.596, "num_input_tokens_seen": 104443344, "step": 109385 }, { "epoch": 8.923240068521087, "grad_norm": 14.784706115722656, "learning_rate": 1.7454553374440159e-06, "loss": 0.2816, "num_input_tokens_seen": 104448800, "step": 109390 }, { "epoch": 8.923647932131495, "grad_norm": 25.106998443603516, "learning_rate": 1.7441489636514651e-06, "loss": 0.3704, "num_input_tokens_seen": 104453776, "step": 109395 }, { "epoch": 8.924055795741904, "grad_norm": 16.59272575378418, "learning_rate": 1.7428430612451336e-06, "loss": 0.317, "num_input_tokens_seen": 104459008, "step": 109400 }, { "epoch": 8.924463659352313, "grad_norm": 1.2062273025512695, "learning_rate": 1.7415376302514914e-06, "loss": 0.2575, "num_input_tokens_seen": 104463808, "step": 109405 }, { "epoch": 8.924871522962722, "grad_norm": 1.6875503063201904, "learning_rate": 1.7402326706969923e-06, "loss": 0.3031, "num_input_tokens_seen": 104469456, "step": 109410 }, { "epoch": 8.92527938657313, "grad_norm": 2.125715970993042, "learning_rate": 1.7389281826080989e-06, "loss": 0.3526, "num_input_tokens_seen": 104474224, "step": 109415 }, { "epoch": 8.925687250183538, "grad_norm": 37.586299896240234, "learning_rate": 1.737624166011248e-06, "loss": 0.4921, "num_input_tokens_seen": 104478608, "step": 109420 }, { "epoch": 8.926095113793947, "grad_norm": 34.040931701660156, "learning_rate": 1.7363206209328686e-06, "loss": 0.4024, "num_input_tokens_seen": 104483008, "step": 109425 }, { "epoch": 8.926502977404356, "grad_norm": 34.70817947387695, "learning_rate": 1.7350175473993868e-06, "loss": 0.3793, "num_input_tokens_seen": 104487824, "step": 109430 }, { "epoch": 8.926910841014765, "grad_norm": 2.809434413909912, "learning_rate": 1.733714945437212e-06, "loss": 0.4357, "num_input_tokens_seen": 104492032, "step": 109435 }, { "epoch": 8.927318704625174, "grad_norm": 3.0232717990875244, "learning_rate": 1.7324128150727482e-06, "loss": 0.2302, "num_input_tokens_seen": 104496720, "step": 109440 }, { "epoch": 8.927726568235581, "grad_norm": 7.165178298950195, "learning_rate": 1.7311111563323906e-06, "loss": 0.4412, "num_input_tokens_seen": 104502096, "step": 109445 }, { "epoch": 8.92813443184599, "grad_norm": 1.8133783340454102, "learning_rate": 1.7298099692425184e-06, "loss": 0.3264, "num_input_tokens_seen": 104506704, "step": 109450 }, { "epoch": 8.928542295456399, "grad_norm": 2.8445191383361816, "learning_rate": 1.7285092538295134e-06, "loss": 0.252, "num_input_tokens_seen": 104511488, "step": 109455 }, { "epoch": 8.928950159066808, "grad_norm": 16.8725643157959, "learning_rate": 1.7272090101197375e-06, "loss": 0.4707, "num_input_tokens_seen": 104515136, "step": 109460 }, { "epoch": 8.929358022677217, "grad_norm": 7.435752868652344, "learning_rate": 1.7259092381395448e-06, "loss": 0.4293, "num_input_tokens_seen": 104520800, "step": 109465 }, { "epoch": 8.929765886287626, "grad_norm": 10.535928726196289, "learning_rate": 1.7246099379152836e-06, "loss": 0.3783, "num_input_tokens_seen": 104525888, "step": 109470 }, { "epoch": 8.930173749898033, "grad_norm": 17.402795791625977, "learning_rate": 1.7233111094732829e-06, "loss": 0.4464, "num_input_tokens_seen": 104530400, "step": 109475 }, { "epoch": 8.930581613508442, "grad_norm": 14.6692533493042, "learning_rate": 1.7220127528398828e-06, "loss": 0.2536, "num_input_tokens_seen": 104535536, "step": 109480 }, { "epoch": 8.930989477118851, "grad_norm": 22.513320922851562, "learning_rate": 1.7207148680413899e-06, "loss": 0.2917, "num_input_tokens_seen": 104540640, "step": 109485 }, { "epoch": 8.93139734072926, "grad_norm": 45.47037124633789, "learning_rate": 1.7194174551041136e-06, "loss": 0.3028, "num_input_tokens_seen": 104545856, "step": 109490 }, { "epoch": 8.93180520433967, "grad_norm": 7.687917709350586, "learning_rate": 1.7181205140543499e-06, "loss": 0.4233, "num_input_tokens_seen": 104549328, "step": 109495 }, { "epoch": 8.932213067950077, "grad_norm": 74.1647720336914, "learning_rate": 1.716824044918397e-06, "loss": 0.3573, "num_input_tokens_seen": 104554192, "step": 109500 }, { "epoch": 8.932620931560486, "grad_norm": 4.6051554679870605, "learning_rate": 1.7155280477225228e-06, "loss": 0.3351, "num_input_tokens_seen": 104559168, "step": 109505 }, { "epoch": 8.933028795170895, "grad_norm": 13.384501457214355, "learning_rate": 1.7142325224930033e-06, "loss": 0.5441, "num_input_tokens_seen": 104563840, "step": 109510 }, { "epoch": 8.933436658781304, "grad_norm": 15.493426322937012, "learning_rate": 1.7129374692560956e-06, "loss": 0.3328, "num_input_tokens_seen": 104568816, "step": 109515 }, { "epoch": 8.933844522391713, "grad_norm": 15.120070457458496, "learning_rate": 1.7116428880380453e-06, "loss": 0.4351, "num_input_tokens_seen": 104573552, "step": 109520 }, { "epoch": 8.934252386002122, "grad_norm": 19.424379348754883, "learning_rate": 1.7103487788651034e-06, "loss": 0.4167, "num_input_tokens_seen": 104579264, "step": 109525 }, { "epoch": 8.934660249612529, "grad_norm": 2.5751705169677734, "learning_rate": 1.7090551417634965e-06, "loss": 0.2523, "num_input_tokens_seen": 104583616, "step": 109530 }, { "epoch": 8.935068113222938, "grad_norm": 2.8744375705718994, "learning_rate": 1.7077619767594422e-06, "loss": 0.4669, "num_input_tokens_seen": 104588496, "step": 109535 }, { "epoch": 8.935475976833347, "grad_norm": 22.43335723876953, "learning_rate": 1.7064692838791503e-06, "loss": 0.3851, "num_input_tokens_seen": 104592832, "step": 109540 }, { "epoch": 8.935883840443756, "grad_norm": 1.2224441766738892, "learning_rate": 1.7051770631488357e-06, "loss": 0.3196, "num_input_tokens_seen": 104598096, "step": 109545 }, { "epoch": 8.936291704054165, "grad_norm": 0.7869859933853149, "learning_rate": 1.7038853145946804e-06, "loss": 0.2206, "num_input_tokens_seen": 104602336, "step": 109550 }, { "epoch": 8.936699567664572, "grad_norm": 9.778326034545898, "learning_rate": 1.7025940382428719e-06, "loss": 0.3402, "num_input_tokens_seen": 104607744, "step": 109555 }, { "epoch": 8.937107431274981, "grad_norm": 12.279704093933105, "learning_rate": 1.7013032341195833e-06, "loss": 0.3507, "num_input_tokens_seen": 104612496, "step": 109560 }, { "epoch": 8.93751529488539, "grad_norm": 3.800410747528076, "learning_rate": 1.7000129022509776e-06, "loss": 0.3185, "num_input_tokens_seen": 104616720, "step": 109565 }, { "epoch": 8.9379231584958, "grad_norm": 2.5949859619140625, "learning_rate": 1.6987230426632085e-06, "loss": 0.3278, "num_input_tokens_seen": 104621216, "step": 109570 }, { "epoch": 8.938331022106208, "grad_norm": 0.5859152674674988, "learning_rate": 1.6974336553824217e-06, "loss": 0.4209, "num_input_tokens_seen": 104625152, "step": 109575 }, { "epoch": 8.938738885716617, "grad_norm": 16.091419219970703, "learning_rate": 1.6961447404347547e-06, "loss": 0.5029, "num_input_tokens_seen": 104630016, "step": 109580 }, { "epoch": 8.939146749327024, "grad_norm": 26.69537925720215, "learning_rate": 1.6948562978463255e-06, "loss": 0.1423, "num_input_tokens_seen": 104635072, "step": 109585 }, { "epoch": 8.939554612937433, "grad_norm": 1.6954363584518433, "learning_rate": 1.6935683276432607e-06, "loss": 0.1699, "num_input_tokens_seen": 104638752, "step": 109590 }, { "epoch": 8.939962476547842, "grad_norm": 2.9479544162750244, "learning_rate": 1.692280829851664e-06, "loss": 0.2502, "num_input_tokens_seen": 104643072, "step": 109595 }, { "epoch": 8.940370340158251, "grad_norm": 22.86945915222168, "learning_rate": 1.6909938044976286e-06, "loss": 0.4151, "num_input_tokens_seen": 104647184, "step": 109600 }, { "epoch": 8.94077820376866, "grad_norm": 3.210732936859131, "learning_rate": 1.689707251607245e-06, "loss": 0.2893, "num_input_tokens_seen": 104652496, "step": 109605 }, { "epoch": 8.941186067379068, "grad_norm": 15.554512023925781, "learning_rate": 1.688421171206586e-06, "loss": 0.367, "num_input_tokens_seen": 104656704, "step": 109610 }, { "epoch": 8.941593930989477, "grad_norm": 10.732807159423828, "learning_rate": 1.687135563321729e-06, "loss": 0.2929, "num_input_tokens_seen": 104661600, "step": 109615 }, { "epoch": 8.942001794599886, "grad_norm": 6.939029216766357, "learning_rate": 1.6858504279787274e-06, "loss": 0.433, "num_input_tokens_seen": 104666336, "step": 109620 }, { "epoch": 8.942409658210295, "grad_norm": 1.4359179735183716, "learning_rate": 1.6845657652036328e-06, "loss": 0.3175, "num_input_tokens_seen": 104671104, "step": 109625 }, { "epoch": 8.942817521820704, "grad_norm": 36.3540153503418, "learning_rate": 1.6832815750224746e-06, "loss": 0.3012, "num_input_tokens_seen": 104675072, "step": 109630 }, { "epoch": 8.943225385431111, "grad_norm": 7.605679512023926, "learning_rate": 1.6819978574612983e-06, "loss": 0.3885, "num_input_tokens_seen": 104679184, "step": 109635 }, { "epoch": 8.94363324904152, "grad_norm": 2.995422601699829, "learning_rate": 1.6807146125461166e-06, "loss": 0.293, "num_input_tokens_seen": 104682832, "step": 109640 }, { "epoch": 8.944041112651929, "grad_norm": 5.9846649169921875, "learning_rate": 1.679431840302939e-06, "loss": 0.3443, "num_input_tokens_seen": 104687792, "step": 109645 }, { "epoch": 8.944448976262338, "grad_norm": 20.208826065063477, "learning_rate": 1.6781495407577673e-06, "loss": 0.4645, "num_input_tokens_seen": 104692512, "step": 109650 }, { "epoch": 8.944856839872747, "grad_norm": 4.7362446784973145, "learning_rate": 1.6768677139365974e-06, "loss": 0.2671, "num_input_tokens_seen": 104697824, "step": 109655 }, { "epoch": 8.945264703483156, "grad_norm": 14.7485933303833, "learning_rate": 1.6755863598654054e-06, "loss": 0.7243, "num_input_tokens_seen": 104702592, "step": 109660 }, { "epoch": 8.945672567093563, "grad_norm": 25.582366943359375, "learning_rate": 1.6743054785701707e-06, "loss": 0.5348, "num_input_tokens_seen": 104707792, "step": 109665 }, { "epoch": 8.946080430703972, "grad_norm": 23.046995162963867, "learning_rate": 1.6730250700768502e-06, "loss": 0.2946, "num_input_tokens_seen": 104712704, "step": 109670 }, { "epoch": 8.946488294314381, "grad_norm": 15.27338981628418, "learning_rate": 1.6717451344113955e-06, "loss": 0.4093, "num_input_tokens_seen": 104717888, "step": 109675 }, { "epoch": 8.94689615792479, "grad_norm": 7.996420860290527, "learning_rate": 1.670465671599758e-06, "loss": 0.4271, "num_input_tokens_seen": 104722848, "step": 109680 }, { "epoch": 8.9473040215352, "grad_norm": 2.022299289703369, "learning_rate": 1.6691866816678697e-06, "loss": 0.2283, "num_input_tokens_seen": 104727216, "step": 109685 }, { "epoch": 8.947711885145607, "grad_norm": 37.553192138671875, "learning_rate": 1.6679081646416517e-06, "loss": 0.3871, "num_input_tokens_seen": 104731360, "step": 109690 }, { "epoch": 8.948119748756016, "grad_norm": 7.685275554656982, "learning_rate": 1.666630120547022e-06, "loss": 0.4222, "num_input_tokens_seen": 104735840, "step": 109695 }, { "epoch": 8.948527612366425, "grad_norm": 6.041115760803223, "learning_rate": 1.665352549409882e-06, "loss": 0.3032, "num_input_tokens_seen": 104741408, "step": 109700 }, { "epoch": 8.948935475976834, "grad_norm": 0.9133862257003784, "learning_rate": 1.6640754512561335e-06, "loss": 0.3998, "num_input_tokens_seen": 104746176, "step": 109705 }, { "epoch": 8.949343339587243, "grad_norm": 25.41129493713379, "learning_rate": 1.6627988261116584e-06, "loss": 0.464, "num_input_tokens_seen": 104750768, "step": 109710 }, { "epoch": 8.94975120319765, "grad_norm": 12.708456039428711, "learning_rate": 1.6615226740023359e-06, "loss": 0.5844, "num_input_tokens_seen": 104756112, "step": 109715 }, { "epoch": 8.950159066808059, "grad_norm": 1.0251919031143188, "learning_rate": 1.6602469949540261e-06, "loss": 0.2414, "num_input_tokens_seen": 104760320, "step": 109720 }, { "epoch": 8.950566930418468, "grad_norm": 11.653436660766602, "learning_rate": 1.6589717889925944e-06, "loss": 0.316, "num_input_tokens_seen": 104764880, "step": 109725 }, { "epoch": 8.950974794028877, "grad_norm": 2.8579440116882324, "learning_rate": 1.6576970561438892e-06, "loss": 0.2021, "num_input_tokens_seen": 104769328, "step": 109730 }, { "epoch": 8.951382657639286, "grad_norm": 4.859091281890869, "learning_rate": 1.656422796433746e-06, "loss": 0.3724, "num_input_tokens_seen": 104773792, "step": 109735 }, { "epoch": 8.951790521249695, "grad_norm": 14.479450225830078, "learning_rate": 1.6551490098879879e-06, "loss": 0.3407, "num_input_tokens_seen": 104778288, "step": 109740 }, { "epoch": 8.952198384860102, "grad_norm": 14.022912979125977, "learning_rate": 1.6538756965324448e-06, "loss": 0.3851, "num_input_tokens_seen": 104783472, "step": 109745 }, { "epoch": 8.952606248470511, "grad_norm": 26.77170181274414, "learning_rate": 1.652602856392918e-06, "loss": 0.313, "num_input_tokens_seen": 104788416, "step": 109750 }, { "epoch": 8.95301411208092, "grad_norm": 2.0050220489501953, "learning_rate": 1.6513304894952148e-06, "loss": 0.1936, "num_input_tokens_seen": 104792624, "step": 109755 }, { "epoch": 8.95342197569133, "grad_norm": 14.166030883789062, "learning_rate": 1.650058595865117e-06, "loss": 0.3541, "num_input_tokens_seen": 104797904, "step": 109760 }, { "epoch": 8.953829839301738, "grad_norm": 12.003133773803711, "learning_rate": 1.6487871755284073e-06, "loss": 0.4243, "num_input_tokens_seen": 104803232, "step": 109765 }, { "epoch": 8.954237702912145, "grad_norm": 7.95346212387085, "learning_rate": 1.6475162285108647e-06, "loss": 0.5413, "num_input_tokens_seen": 104807952, "step": 109770 }, { "epoch": 8.954645566522554, "grad_norm": 6.370889663696289, "learning_rate": 1.6462457548382437e-06, "loss": 0.3352, "num_input_tokens_seen": 104812064, "step": 109775 }, { "epoch": 8.955053430132963, "grad_norm": 16.41486167907715, "learning_rate": 1.644975754536296e-06, "loss": 0.4829, "num_input_tokens_seen": 104817024, "step": 109780 }, { "epoch": 8.955461293743372, "grad_norm": 2.6207525730133057, "learning_rate": 1.6437062276307648e-06, "loss": 0.4134, "num_input_tokens_seen": 104822032, "step": 109785 }, { "epoch": 8.955869157353781, "grad_norm": 2.3998019695281982, "learning_rate": 1.642437174147385e-06, "loss": 0.1547, "num_input_tokens_seen": 104827072, "step": 109790 }, { "epoch": 8.95627702096419, "grad_norm": 14.52812385559082, "learning_rate": 1.6411685941118804e-06, "loss": 0.4039, "num_input_tokens_seen": 104832240, "step": 109795 }, { "epoch": 8.956684884574598, "grad_norm": 22.177539825439453, "learning_rate": 1.6399004875499613e-06, "loss": 0.4058, "num_input_tokens_seen": 104836688, "step": 109800 }, { "epoch": 8.957092748185007, "grad_norm": 11.981832504272461, "learning_rate": 1.6386328544873292e-06, "loss": 0.5021, "num_input_tokens_seen": 104841280, "step": 109805 }, { "epoch": 8.957500611795416, "grad_norm": 3.863412618637085, "learning_rate": 1.6373656949496858e-06, "loss": 0.3218, "num_input_tokens_seen": 104846608, "step": 109810 }, { "epoch": 8.957908475405825, "grad_norm": 6.150936603546143, "learning_rate": 1.6360990089627132e-06, "loss": 0.5025, "num_input_tokens_seen": 104851392, "step": 109815 }, { "epoch": 8.958316339016234, "grad_norm": 35.898529052734375, "learning_rate": 1.6348327965520854e-06, "loss": 0.4453, "num_input_tokens_seen": 104856304, "step": 109820 }, { "epoch": 8.958724202626641, "grad_norm": 49.07426452636719, "learning_rate": 1.633567057743468e-06, "loss": 0.4974, "num_input_tokens_seen": 104861888, "step": 109825 }, { "epoch": 8.95913206623705, "grad_norm": 23.0277099609375, "learning_rate": 1.6323017925625184e-06, "loss": 0.3511, "num_input_tokens_seen": 104865920, "step": 109830 }, { "epoch": 8.959539929847459, "grad_norm": 27.989810943603516, "learning_rate": 1.6310370010348825e-06, "loss": 0.3137, "num_input_tokens_seen": 104871040, "step": 109835 }, { "epoch": 8.959947793457868, "grad_norm": 4.581576347351074, "learning_rate": 1.6297726831861954e-06, "loss": 0.2317, "num_input_tokens_seen": 104875584, "step": 109840 }, { "epoch": 8.960355657068277, "grad_norm": 9.142644882202148, "learning_rate": 1.628508839042084e-06, "loss": 0.2573, "num_input_tokens_seen": 104880288, "step": 109845 }, { "epoch": 8.960763520678686, "grad_norm": 13.07916259765625, "learning_rate": 1.6272454686281635e-06, "loss": 0.5675, "num_input_tokens_seen": 104885168, "step": 109850 }, { "epoch": 8.961171384289093, "grad_norm": 3.219517469406128, "learning_rate": 1.6259825719700528e-06, "loss": 0.2209, "num_input_tokens_seen": 104889424, "step": 109855 }, { "epoch": 8.961579247899502, "grad_norm": 4.575126647949219, "learning_rate": 1.6247201490933394e-06, "loss": 0.251, "num_input_tokens_seen": 104894208, "step": 109860 }, { "epoch": 8.961987111509911, "grad_norm": 20.35185432434082, "learning_rate": 1.6234582000236197e-06, "loss": 0.506, "num_input_tokens_seen": 104899184, "step": 109865 }, { "epoch": 8.96239497512032, "grad_norm": 27.23800277709961, "learning_rate": 1.622196724786465e-06, "loss": 0.3613, "num_input_tokens_seen": 104903856, "step": 109870 }, { "epoch": 8.96280283873073, "grad_norm": 6.534718990325928, "learning_rate": 1.620935723407449e-06, "loss": 0.3002, "num_input_tokens_seen": 104907952, "step": 109875 }, { "epoch": 8.963210702341136, "grad_norm": 11.242704391479492, "learning_rate": 1.619675195912132e-06, "loss": 0.3162, "num_input_tokens_seen": 104913216, "step": 109880 }, { "epoch": 8.963618565951545, "grad_norm": 6.562804222106934, "learning_rate": 1.6184151423260657e-06, "loss": 0.4579, "num_input_tokens_seen": 104917648, "step": 109885 }, { "epoch": 8.964026429561955, "grad_norm": 33.50341796875, "learning_rate": 1.617155562674788e-06, "loss": 0.441, "num_input_tokens_seen": 104922208, "step": 109890 }, { "epoch": 8.964434293172364, "grad_norm": 9.132067680358887, "learning_rate": 1.6158964569838287e-06, "loss": 0.4819, "num_input_tokens_seen": 104927088, "step": 109895 }, { "epoch": 8.964842156782773, "grad_norm": 6.816127300262451, "learning_rate": 1.6146378252787141e-06, "loss": 0.2822, "num_input_tokens_seen": 104932256, "step": 109900 }, { "epoch": 8.96525002039318, "grad_norm": 8.927118301391602, "learning_rate": 1.6133796675849522e-06, "loss": 0.2961, "num_input_tokens_seen": 104937088, "step": 109905 }, { "epoch": 8.965657884003589, "grad_norm": 2.7142534255981445, "learning_rate": 1.61212198392805e-06, "loss": 0.2242, "num_input_tokens_seen": 104942464, "step": 109910 }, { "epoch": 8.966065747613998, "grad_norm": 13.756637573242188, "learning_rate": 1.6108647743334953e-06, "loss": 0.2902, "num_input_tokens_seen": 104947232, "step": 109915 }, { "epoch": 8.966473611224407, "grad_norm": 1.3187288045883179, "learning_rate": 1.6096080388267708e-06, "loss": 0.3189, "num_input_tokens_seen": 104951728, "step": 109920 }, { "epoch": 8.966881474834816, "grad_norm": 13.444839477539062, "learning_rate": 1.6083517774333535e-06, "loss": 0.4948, "num_input_tokens_seen": 104957312, "step": 109925 }, { "epoch": 8.967289338445223, "grad_norm": 14.848045349121094, "learning_rate": 1.6070959901787086e-06, "loss": 0.3716, "num_input_tokens_seen": 104962608, "step": 109930 }, { "epoch": 8.967697202055632, "grad_norm": 5.1923675537109375, "learning_rate": 1.6058406770882856e-06, "loss": 0.223, "num_input_tokens_seen": 104967104, "step": 109935 }, { "epoch": 8.968105065666041, "grad_norm": 29.979108810424805, "learning_rate": 1.6045858381875307e-06, "loss": 0.3906, "num_input_tokens_seen": 104972912, "step": 109940 }, { "epoch": 8.96851292927645, "grad_norm": 21.145008087158203, "learning_rate": 1.603331473501879e-06, "loss": 0.301, "num_input_tokens_seen": 104977648, "step": 109945 }, { "epoch": 8.968920792886859, "grad_norm": 5.994073867797852, "learning_rate": 1.6020775830567603e-06, "loss": 0.542, "num_input_tokens_seen": 104981776, "step": 109950 }, { "epoch": 8.969328656497268, "grad_norm": 24.062124252319336, "learning_rate": 1.6008241668775848e-06, "loss": 0.453, "num_input_tokens_seen": 104987504, "step": 109955 }, { "epoch": 8.969736520107675, "grad_norm": 5.90571403503418, "learning_rate": 1.5995712249897598e-06, "loss": 0.2679, "num_input_tokens_seen": 104991984, "step": 109960 }, { "epoch": 8.970144383718084, "grad_norm": 16.71891975402832, "learning_rate": 1.5983187574186848e-06, "loss": 0.5843, "num_input_tokens_seen": 104996640, "step": 109965 }, { "epoch": 8.970552247328493, "grad_norm": 5.360071182250977, "learning_rate": 1.5970667641897418e-06, "loss": 0.4151, "num_input_tokens_seen": 105002112, "step": 109970 }, { "epoch": 8.970960110938902, "grad_norm": 8.596988677978516, "learning_rate": 1.5958152453283082e-06, "loss": 0.3316, "num_input_tokens_seen": 105005872, "step": 109975 }, { "epoch": 8.971367974549311, "grad_norm": 21.69080352783203, "learning_rate": 1.5945642008597606e-06, "loss": 0.4578, "num_input_tokens_seen": 105011200, "step": 109980 }, { "epoch": 8.971775838159719, "grad_norm": 4.692798614501953, "learning_rate": 1.5933136308094481e-06, "loss": 0.1856, "num_input_tokens_seen": 105015056, "step": 109985 }, { "epoch": 8.972183701770128, "grad_norm": 7.481375694274902, "learning_rate": 1.592063535202723e-06, "loss": 0.4172, "num_input_tokens_seen": 105020112, "step": 109990 }, { "epoch": 8.972591565380537, "grad_norm": 9.606297492980957, "learning_rate": 1.5908139140649231e-06, "loss": 0.1599, "num_input_tokens_seen": 105025296, "step": 109995 }, { "epoch": 8.972999428990946, "grad_norm": 3.932435989379883, "learning_rate": 1.5895647674213809e-06, "loss": 0.4972, "num_input_tokens_seen": 105030208, "step": 110000 }, { "epoch": 8.973407292601355, "grad_norm": 4.268985271453857, "learning_rate": 1.588316095297407e-06, "loss": 0.2383, "num_input_tokens_seen": 105035696, "step": 110005 }, { "epoch": 8.973815156211764, "grad_norm": 7.5406928062438965, "learning_rate": 1.5870678977183194e-06, "loss": 0.3937, "num_input_tokens_seen": 105039376, "step": 110010 }, { "epoch": 8.97422301982217, "grad_norm": 2.3660480976104736, "learning_rate": 1.5858201747094208e-06, "loss": 0.1363, "num_input_tokens_seen": 105044640, "step": 110015 }, { "epoch": 8.97463088343258, "grad_norm": 19.053260803222656, "learning_rate": 1.5845729262959964e-06, "loss": 0.5378, "num_input_tokens_seen": 105048848, "step": 110020 }, { "epoch": 8.975038747042989, "grad_norm": 1.4950159788131714, "learning_rate": 1.5833261525033283e-06, "loss": 0.3421, "num_input_tokens_seen": 105053984, "step": 110025 }, { "epoch": 8.975446610653398, "grad_norm": 0.964677095413208, "learning_rate": 1.5820798533566855e-06, "loss": 0.1478, "num_input_tokens_seen": 105058944, "step": 110030 }, { "epoch": 8.975854474263807, "grad_norm": 5.585527420043945, "learning_rate": 1.5808340288813367e-06, "loss": 0.3919, "num_input_tokens_seen": 105063664, "step": 110035 }, { "epoch": 8.976262337874214, "grad_norm": 1.8027968406677246, "learning_rate": 1.579588679102531e-06, "loss": 0.3685, "num_input_tokens_seen": 105067728, "step": 110040 }, { "epoch": 8.976670201484623, "grad_norm": 1.933010458946228, "learning_rate": 1.5783438040455095e-06, "loss": 0.3235, "num_input_tokens_seen": 105071568, "step": 110045 }, { "epoch": 8.977078065095032, "grad_norm": 25.460859298706055, "learning_rate": 1.5770994037355047e-06, "loss": 0.2767, "num_input_tokens_seen": 105077232, "step": 110050 }, { "epoch": 8.977485928705441, "grad_norm": 2.2984187602996826, "learning_rate": 1.5758554781977435e-06, "loss": 0.5653, "num_input_tokens_seen": 105082912, "step": 110055 }, { "epoch": 8.97789379231585, "grad_norm": 6.062098026275635, "learning_rate": 1.5746120274574393e-06, "loss": 0.1952, "num_input_tokens_seen": 105087680, "step": 110060 }, { "epoch": 8.97830165592626, "grad_norm": 44.319252014160156, "learning_rate": 1.573369051539797e-06, "loss": 0.2934, "num_input_tokens_seen": 105091808, "step": 110065 }, { "epoch": 8.978709519536666, "grad_norm": 5.979466915130615, "learning_rate": 1.5721265504700073e-06, "loss": 0.2173, "num_input_tokens_seen": 105096432, "step": 110070 }, { "epoch": 8.979117383147075, "grad_norm": 10.704071998596191, "learning_rate": 1.5708845242732556e-06, "loss": 0.3237, "num_input_tokens_seen": 105101536, "step": 110075 }, { "epoch": 8.979525246757484, "grad_norm": 39.437347412109375, "learning_rate": 1.5696429729747219e-06, "loss": 0.3233, "num_input_tokens_seen": 105106240, "step": 110080 }, { "epoch": 8.979933110367893, "grad_norm": 15.499784469604492, "learning_rate": 1.5684018965995668e-06, "loss": 0.5483, "num_input_tokens_seen": 105110368, "step": 110085 }, { "epoch": 8.980340973978302, "grad_norm": 5.613301753997803, "learning_rate": 1.5671612951729503e-06, "loss": 0.3071, "num_input_tokens_seen": 105115888, "step": 110090 }, { "epoch": 8.98074883758871, "grad_norm": 1.5360231399536133, "learning_rate": 1.5659211687200136e-06, "loss": 0.3065, "num_input_tokens_seen": 105120768, "step": 110095 }, { "epoch": 8.981156701199119, "grad_norm": 23.90366554260254, "learning_rate": 1.5646815172659008e-06, "loss": 0.4503, "num_input_tokens_seen": 105124896, "step": 110100 }, { "epoch": 8.981564564809528, "grad_norm": 19.186992645263672, "learning_rate": 1.5634423408357356e-06, "loss": 0.2424, "num_input_tokens_seen": 105127952, "step": 110105 }, { "epoch": 8.981972428419937, "grad_norm": 2.1089704036712646, "learning_rate": 1.5622036394546346e-06, "loss": 0.4051, "num_input_tokens_seen": 105132400, "step": 110110 }, { "epoch": 8.982380292030346, "grad_norm": 7.429603576660156, "learning_rate": 1.5609654131477081e-06, "loss": 0.3969, "num_input_tokens_seen": 105136784, "step": 110115 }, { "epoch": 8.982788155640753, "grad_norm": 3.483671188354492, "learning_rate": 1.5597276619400526e-06, "loss": 0.5096, "num_input_tokens_seen": 105141952, "step": 110120 }, { "epoch": 8.983196019251162, "grad_norm": 3.0800154209136963, "learning_rate": 1.5584903858567562e-06, "loss": 0.2421, "num_input_tokens_seen": 105146800, "step": 110125 }, { "epoch": 8.983603882861571, "grad_norm": 1.6304843425750732, "learning_rate": 1.5572535849228965e-06, "loss": 0.4816, "num_input_tokens_seen": 105152176, "step": 110130 }, { "epoch": 8.98401174647198, "grad_norm": 2.084604024887085, "learning_rate": 1.5560172591635475e-06, "loss": 0.1197, "num_input_tokens_seen": 105156800, "step": 110135 }, { "epoch": 8.984419610082389, "grad_norm": 21.068323135375977, "learning_rate": 1.5547814086037643e-06, "loss": 0.4086, "num_input_tokens_seen": 105160400, "step": 110140 }, { "epoch": 8.984827473692796, "grad_norm": 22.581012725830078, "learning_rate": 1.553546033268602e-06, "loss": 0.4633, "num_input_tokens_seen": 105164448, "step": 110145 }, { "epoch": 8.985235337303205, "grad_norm": 2.0005240440368652, "learning_rate": 1.5523111331830987e-06, "loss": 0.3142, "num_input_tokens_seen": 105170224, "step": 110150 }, { "epoch": 8.985643200913614, "grad_norm": 3.7944862842559814, "learning_rate": 1.5510767083722844e-06, "loss": 0.2585, "num_input_tokens_seen": 105175264, "step": 110155 }, { "epoch": 8.986051064524023, "grad_norm": 3.659571409225464, "learning_rate": 1.5498427588611808e-06, "loss": 0.1837, "num_input_tokens_seen": 105179920, "step": 110160 }, { "epoch": 8.986458928134432, "grad_norm": 4.536130428314209, "learning_rate": 1.5486092846747984e-06, "loss": 0.5566, "num_input_tokens_seen": 105185184, "step": 110165 }, { "epoch": 8.986866791744841, "grad_norm": 31.53923225402832, "learning_rate": 1.5473762858381423e-06, "loss": 0.2444, "num_input_tokens_seen": 105189552, "step": 110170 }, { "epoch": 8.987274655355248, "grad_norm": 1.0841002464294434, "learning_rate": 1.5461437623762038e-06, "loss": 0.2323, "num_input_tokens_seen": 105194000, "step": 110175 }, { "epoch": 8.987682518965658, "grad_norm": 16.394384384155273, "learning_rate": 1.5449117143139652e-06, "loss": 0.5114, "num_input_tokens_seen": 105199088, "step": 110180 }, { "epoch": 8.988090382576067, "grad_norm": 25.106918334960938, "learning_rate": 1.5436801416763958e-06, "loss": 0.3653, "num_input_tokens_seen": 105203184, "step": 110185 }, { "epoch": 8.988498246186476, "grad_norm": 7.0961384773254395, "learning_rate": 1.542449044488467e-06, "loss": 0.2902, "num_input_tokens_seen": 105207376, "step": 110190 }, { "epoch": 8.988906109796885, "grad_norm": 1.8203294277191162, "learning_rate": 1.5412184227751259e-06, "loss": 0.2919, "num_input_tokens_seen": 105212160, "step": 110195 }, { "epoch": 8.989313973407292, "grad_norm": 12.566667556762695, "learning_rate": 1.5399882765613216e-06, "loss": 0.1843, "num_input_tokens_seen": 105216480, "step": 110200 }, { "epoch": 8.9897218370177, "grad_norm": 23.612592697143555, "learning_rate": 1.5387586058719815e-06, "loss": 0.3385, "num_input_tokens_seen": 105221168, "step": 110205 }, { "epoch": 8.99012970062811, "grad_norm": 2.7723183631896973, "learning_rate": 1.5375294107320415e-06, "loss": 0.3929, "num_input_tokens_seen": 105225632, "step": 110210 }, { "epoch": 8.990537564238519, "grad_norm": 18.572490692138672, "learning_rate": 1.5363006911664063e-06, "loss": 0.4229, "num_input_tokens_seen": 105230960, "step": 110215 }, { "epoch": 8.990945427848928, "grad_norm": 7.557192325592041, "learning_rate": 1.5350724471999894e-06, "loss": 0.3053, "num_input_tokens_seen": 105236176, "step": 110220 }, { "epoch": 8.991353291459337, "grad_norm": 8.041545867919922, "learning_rate": 1.5338446788576822e-06, "loss": 0.3458, "num_input_tokens_seen": 105240192, "step": 110225 }, { "epoch": 8.991761155069744, "grad_norm": 27.105484008789062, "learning_rate": 1.5326173861643673e-06, "loss": 0.34, "num_input_tokens_seen": 105244560, "step": 110230 }, { "epoch": 8.992169018680153, "grad_norm": 14.835923194885254, "learning_rate": 1.5313905691449304e-06, "loss": 0.4029, "num_input_tokens_seen": 105249504, "step": 110235 }, { "epoch": 8.992576882290562, "grad_norm": 30.596529006958008, "learning_rate": 1.5301642278242322e-06, "loss": 0.2663, "num_input_tokens_seen": 105254320, "step": 110240 }, { "epoch": 8.992984745900971, "grad_norm": 9.36587905883789, "learning_rate": 1.5289383622271337e-06, "loss": 0.3459, "num_input_tokens_seen": 105258800, "step": 110245 }, { "epoch": 8.99339260951138, "grad_norm": 32.9352912902832, "learning_rate": 1.527712972378481e-06, "loss": 0.2699, "num_input_tokens_seen": 105264192, "step": 110250 }, { "epoch": 8.993800473121787, "grad_norm": 0.5175116658210754, "learning_rate": 1.5264880583031133e-06, "loss": 0.3893, "num_input_tokens_seen": 105269328, "step": 110255 }, { "epoch": 8.994208336732196, "grad_norm": 17.50804901123047, "learning_rate": 1.5252636200258574e-06, "loss": 0.2232, "num_input_tokens_seen": 105274384, "step": 110260 }, { "epoch": 8.994616200342605, "grad_norm": 2.2899656295776367, "learning_rate": 1.5240396575715326e-06, "loss": 0.3562, "num_input_tokens_seen": 105278240, "step": 110265 }, { "epoch": 8.995024063953014, "grad_norm": 33.218894958496094, "learning_rate": 1.5228161709649497e-06, "loss": 0.4625, "num_input_tokens_seen": 105282976, "step": 110270 }, { "epoch": 8.995431927563423, "grad_norm": 10.137137413024902, "learning_rate": 1.5215931602309024e-06, "loss": 0.3302, "num_input_tokens_seen": 105287504, "step": 110275 }, { "epoch": 8.995839791173832, "grad_norm": 5.36222505569458, "learning_rate": 1.5203706253941906e-06, "loss": 0.1306, "num_input_tokens_seen": 105292784, "step": 110280 }, { "epoch": 8.99624765478424, "grad_norm": 18.088253021240234, "learning_rate": 1.5191485664795862e-06, "loss": 0.558, "num_input_tokens_seen": 105297952, "step": 110285 }, { "epoch": 8.996655518394649, "grad_norm": 9.307246208190918, "learning_rate": 1.5179269835118638e-06, "loss": 0.3621, "num_input_tokens_seen": 105302656, "step": 110290 }, { "epoch": 8.997063382005058, "grad_norm": 6.074688911437988, "learning_rate": 1.516705876515781e-06, "loss": 0.3448, "num_input_tokens_seen": 105307376, "step": 110295 }, { "epoch": 8.997471245615467, "grad_norm": 20.106632232666016, "learning_rate": 1.5154852455160935e-06, "loss": 0.1597, "num_input_tokens_seen": 105312320, "step": 110300 }, { "epoch": 8.997879109225876, "grad_norm": 28.71381187438965, "learning_rate": 1.5142650905375422e-06, "loss": 0.6052, "num_input_tokens_seen": 105317680, "step": 110305 }, { "epoch": 8.998286972836283, "grad_norm": 2.2634174823760986, "learning_rate": 1.5130454116048548e-06, "loss": 0.3195, "num_input_tokens_seen": 105322496, "step": 110310 }, { "epoch": 8.998694836446692, "grad_norm": 3.5584139823913574, "learning_rate": 1.5118262087427588e-06, "loss": 0.4471, "num_input_tokens_seen": 105327536, "step": 110315 }, { "epoch": 8.999102700057101, "grad_norm": 15.80735969543457, "learning_rate": 1.5106074819759619e-06, "loss": 0.3754, "num_input_tokens_seen": 105331936, "step": 110320 }, { "epoch": 8.99951056366751, "grad_norm": 23.737001419067383, "learning_rate": 1.5093892313291724e-06, "loss": 0.4946, "num_input_tokens_seen": 105337168, "step": 110325 }, { "epoch": 8.999918427277919, "grad_norm": 5.047572612762451, "learning_rate": 1.5081714568270815e-06, "loss": 0.3102, "num_input_tokens_seen": 105342736, "step": 110330 }, { "epoch": 9.000326290888326, "grad_norm": 24.319028854370117, "learning_rate": 1.5069541584943753e-06, "loss": 0.2489, "num_input_tokens_seen": 105346352, "step": 110335 }, { "epoch": 9.000734154498735, "grad_norm": 1.8703604936599731, "learning_rate": 1.5057373363557197e-06, "loss": 0.4701, "num_input_tokens_seen": 105351168, "step": 110340 }, { "epoch": 9.000734154498735, "eval_loss": 0.3481733500957489, "eval_runtime": 570.9339, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 105351168, "step": 110340 }, { "epoch": 9.001142018109144, "grad_norm": 2.6773810386657715, "learning_rate": 1.5045209904357898e-06, "loss": 0.3022, "num_input_tokens_seen": 105356400, "step": 110345 }, { "epoch": 9.001549881719553, "grad_norm": 47.708675384521484, "learning_rate": 1.5033051207592325e-06, "loss": 0.3488, "num_input_tokens_seen": 105360304, "step": 110350 }, { "epoch": 9.001957745329962, "grad_norm": 6.030847072601318, "learning_rate": 1.5020897273507001e-06, "loss": 0.3283, "num_input_tokens_seen": 105365344, "step": 110355 }, { "epoch": 9.002365608940371, "grad_norm": 13.146097183227539, "learning_rate": 1.5008748102348174e-06, "loss": 0.3104, "num_input_tokens_seen": 105369200, "step": 110360 }, { "epoch": 9.002773472550778, "grad_norm": 3.146608591079712, "learning_rate": 1.4996603694362232e-06, "loss": 0.2367, "num_input_tokens_seen": 105374288, "step": 110365 }, { "epoch": 9.003181336161187, "grad_norm": 10.6716947555542, "learning_rate": 1.4984464049795255e-06, "loss": 0.5227, "num_input_tokens_seen": 105378752, "step": 110370 }, { "epoch": 9.003589199771596, "grad_norm": 3.4359090328216553, "learning_rate": 1.4972329168893323e-06, "loss": 0.3628, "num_input_tokens_seen": 105383936, "step": 110375 }, { "epoch": 9.003997063382005, "grad_norm": 24.18544578552246, "learning_rate": 1.4960199051902434e-06, "loss": 0.5024, "num_input_tokens_seen": 105388336, "step": 110380 }, { "epoch": 9.004404926992414, "grad_norm": 2.205094337463379, "learning_rate": 1.4948073699068422e-06, "loss": 0.243, "num_input_tokens_seen": 105392624, "step": 110385 }, { "epoch": 9.004812790602822, "grad_norm": 27.736217498779297, "learning_rate": 1.4935953110637058e-06, "loss": 0.2872, "num_input_tokens_seen": 105398000, "step": 110390 }, { "epoch": 9.00522065421323, "grad_norm": 12.067534446716309, "learning_rate": 1.4923837286854065e-06, "loss": 0.2141, "num_input_tokens_seen": 105403200, "step": 110395 }, { "epoch": 9.00562851782364, "grad_norm": 11.86270523071289, "learning_rate": 1.4911726227964994e-06, "loss": 0.6692, "num_input_tokens_seen": 105408272, "step": 110400 }, { "epoch": 9.006036381434049, "grad_norm": 48.4768180847168, "learning_rate": 1.4899619934215291e-06, "loss": 0.3579, "num_input_tokens_seen": 105413616, "step": 110405 }, { "epoch": 9.006444245044458, "grad_norm": 23.82143783569336, "learning_rate": 1.4887518405850425e-06, "loss": 0.2425, "num_input_tokens_seen": 105417440, "step": 110410 }, { "epoch": 9.006852108654865, "grad_norm": 3.378594160079956, "learning_rate": 1.4875421643115673e-06, "loss": 0.2905, "num_input_tokens_seen": 105422368, "step": 110415 }, { "epoch": 9.007259972265274, "grad_norm": 15.311163902282715, "learning_rate": 1.4863329646256223e-06, "loss": 0.2359, "num_input_tokens_seen": 105427104, "step": 110420 }, { "epoch": 9.007667835875683, "grad_norm": 15.438767433166504, "learning_rate": 1.4851242415517136e-06, "loss": 0.4023, "num_input_tokens_seen": 105431584, "step": 110425 }, { "epoch": 9.008075699486092, "grad_norm": 3.599780559539795, "learning_rate": 1.4839159951143433e-06, "loss": 0.4434, "num_input_tokens_seen": 105436144, "step": 110430 }, { "epoch": 9.008483563096501, "grad_norm": 5.460829734802246, "learning_rate": 1.4827082253380058e-06, "loss": 0.3388, "num_input_tokens_seen": 105440704, "step": 110435 }, { "epoch": 9.00889142670691, "grad_norm": 2.5377275943756104, "learning_rate": 1.481500932247179e-06, "loss": 0.267, "num_input_tokens_seen": 105446240, "step": 110440 }, { "epoch": 9.009299290317317, "grad_norm": 37.217491149902344, "learning_rate": 1.4802941158663347e-06, "loss": 0.2062, "num_input_tokens_seen": 105451328, "step": 110445 }, { "epoch": 9.009707153927726, "grad_norm": 13.8912992477417, "learning_rate": 1.4790877762199311e-06, "loss": 0.3411, "num_input_tokens_seen": 105456176, "step": 110450 }, { "epoch": 9.010115017538135, "grad_norm": 30.482316970825195, "learning_rate": 1.4778819133324268e-06, "loss": 0.2657, "num_input_tokens_seen": 105461152, "step": 110455 }, { "epoch": 9.010522881148544, "grad_norm": 14.800559997558594, "learning_rate": 1.476676527228263e-06, "loss": 0.5654, "num_input_tokens_seen": 105465984, "step": 110460 }, { "epoch": 9.010930744758953, "grad_norm": 1.411240816116333, "learning_rate": 1.4754716179318673e-06, "loss": 0.3601, "num_input_tokens_seen": 105471008, "step": 110465 }, { "epoch": 9.01133860836936, "grad_norm": 21.84136199951172, "learning_rate": 1.4742671854676675e-06, "loss": 0.2702, "num_input_tokens_seen": 105476032, "step": 110470 }, { "epoch": 9.01174647197977, "grad_norm": 17.769563674926758, "learning_rate": 1.4730632298600722e-06, "loss": 0.2825, "num_input_tokens_seen": 105480800, "step": 110475 }, { "epoch": 9.012154335590179, "grad_norm": 0.9472105503082275, "learning_rate": 1.471859751133492e-06, "loss": 0.3328, "num_input_tokens_seen": 105485552, "step": 110480 }, { "epoch": 9.012562199200588, "grad_norm": 4.863049030303955, "learning_rate": 1.4706567493123157e-06, "loss": 0.2552, "num_input_tokens_seen": 105490448, "step": 110485 }, { "epoch": 9.012970062810997, "grad_norm": 2.9940366744995117, "learning_rate": 1.469454224420927e-06, "loss": 0.2585, "num_input_tokens_seen": 105494736, "step": 110490 }, { "epoch": 9.013377926421406, "grad_norm": 1.8032640218734741, "learning_rate": 1.4682521764837004e-06, "loss": 0.3768, "num_input_tokens_seen": 105500320, "step": 110495 }, { "epoch": 9.013785790031813, "grad_norm": 11.835538864135742, "learning_rate": 1.4670506055250084e-06, "loss": 0.359, "num_input_tokens_seen": 105505568, "step": 110500 }, { "epoch": 9.014193653642222, "grad_norm": 2.2113430500030518, "learning_rate": 1.4658495115691978e-06, "loss": 0.2354, "num_input_tokens_seen": 105509984, "step": 110505 }, { "epoch": 9.01460151725263, "grad_norm": 9.731566429138184, "learning_rate": 1.4646488946406189e-06, "loss": 0.2995, "num_input_tokens_seen": 105514352, "step": 110510 }, { "epoch": 9.01500938086304, "grad_norm": 3.3382115364074707, "learning_rate": 1.4634487547636077e-06, "loss": 0.4424, "num_input_tokens_seen": 105519920, "step": 110515 }, { "epoch": 9.015417244473449, "grad_norm": 3.6371471881866455, "learning_rate": 1.4622490919624865e-06, "loss": 0.3652, "num_input_tokens_seen": 105524528, "step": 110520 }, { "epoch": 9.015825108083856, "grad_norm": 40.49025344848633, "learning_rate": 1.4610499062615774e-06, "loss": 0.4117, "num_input_tokens_seen": 105529152, "step": 110525 }, { "epoch": 9.016232971694265, "grad_norm": 33.84339141845703, "learning_rate": 1.4598511976851803e-06, "loss": 0.4294, "num_input_tokens_seen": 105534368, "step": 110530 }, { "epoch": 9.016640835304674, "grad_norm": 27.283884048461914, "learning_rate": 1.4586529662575981e-06, "loss": 0.2847, "num_input_tokens_seen": 105538752, "step": 110535 }, { "epoch": 9.017048698915083, "grad_norm": 8.038209915161133, "learning_rate": 1.4574552120031143e-06, "loss": 0.3385, "num_input_tokens_seen": 105542544, "step": 110540 }, { "epoch": 9.017456562525492, "grad_norm": 25.147846221923828, "learning_rate": 1.456257934946015e-06, "loss": 0.3736, "num_input_tokens_seen": 105547104, "step": 110545 }, { "epoch": 9.0178644261359, "grad_norm": 6.264885902404785, "learning_rate": 1.4550611351105581e-06, "loss": 0.4559, "num_input_tokens_seen": 105551648, "step": 110550 }, { "epoch": 9.018272289746308, "grad_norm": 18.11133575439453, "learning_rate": 1.4538648125210108e-06, "loss": 0.6235, "num_input_tokens_seen": 105556128, "step": 110555 }, { "epoch": 9.018680153356717, "grad_norm": 5.102461338043213, "learning_rate": 1.4526689672016147e-06, "loss": 0.3861, "num_input_tokens_seen": 105561472, "step": 110560 }, { "epoch": 9.019088016967126, "grad_norm": 5.983560085296631, "learning_rate": 1.4514735991766142e-06, "loss": 0.2773, "num_input_tokens_seen": 105565760, "step": 110565 }, { "epoch": 9.019495880577535, "grad_norm": 8.711015701293945, "learning_rate": 1.4502787084702403e-06, "loss": 0.1716, "num_input_tokens_seen": 105571296, "step": 110570 }, { "epoch": 9.019903744187944, "grad_norm": 1.732320785522461, "learning_rate": 1.4490842951067064e-06, "loss": 0.4164, "num_input_tokens_seen": 105575936, "step": 110575 }, { "epoch": 9.020311607798352, "grad_norm": 42.94304656982422, "learning_rate": 1.4478903591102295e-06, "loss": 0.3989, "num_input_tokens_seen": 105580080, "step": 110580 }, { "epoch": 9.02071947140876, "grad_norm": 8.263998985290527, "learning_rate": 1.4466969005050013e-06, "loss": 0.2947, "num_input_tokens_seen": 105585184, "step": 110585 }, { "epoch": 9.02112733501917, "grad_norm": 37.06916427612305, "learning_rate": 1.4455039193152247e-06, "loss": 0.5667, "num_input_tokens_seen": 105590480, "step": 110590 }, { "epoch": 9.021535198629579, "grad_norm": 15.714320182800293, "learning_rate": 1.4443114155650723e-06, "loss": 0.322, "num_input_tokens_seen": 105595680, "step": 110595 }, { "epoch": 9.021943062239988, "grad_norm": 3.4391396045684814, "learning_rate": 1.4431193892787187e-06, "loss": 0.3802, "num_input_tokens_seen": 105600704, "step": 110600 }, { "epoch": 9.022350925850395, "grad_norm": 1.2831933498382568, "learning_rate": 1.4419278404803226e-06, "loss": 0.3144, "num_input_tokens_seen": 105604784, "step": 110605 }, { "epoch": 9.022758789460804, "grad_norm": 14.238603591918945, "learning_rate": 1.4407367691940427e-06, "loss": 0.4307, "num_input_tokens_seen": 105609360, "step": 110610 }, { "epoch": 9.023166653071213, "grad_norm": 14.543034553527832, "learning_rate": 1.4395461754440177e-06, "loss": 0.2001, "num_input_tokens_seen": 105613696, "step": 110615 }, { "epoch": 9.023574516681622, "grad_norm": 24.626785278320312, "learning_rate": 1.4383560592543782e-06, "loss": 0.2512, "num_input_tokens_seen": 105619056, "step": 110620 }, { "epoch": 9.023982380292031, "grad_norm": 27.106473922729492, "learning_rate": 1.4371664206492496e-06, "loss": 0.2998, "num_input_tokens_seen": 105624624, "step": 110625 }, { "epoch": 9.024390243902438, "grad_norm": 24.534669876098633, "learning_rate": 1.4359772596527432e-06, "loss": 0.3815, "num_input_tokens_seen": 105629936, "step": 110630 }, { "epoch": 9.024798107512847, "grad_norm": 2.1753134727478027, "learning_rate": 1.4347885762889673e-06, "loss": 0.3243, "num_input_tokens_seen": 105633952, "step": 110635 }, { "epoch": 9.025205971123256, "grad_norm": 36.2579460144043, "learning_rate": 1.433600370582014e-06, "loss": 0.4137, "num_input_tokens_seen": 105638864, "step": 110640 }, { "epoch": 9.025613834733665, "grad_norm": 2.6938822269439697, "learning_rate": 1.4324126425559663e-06, "loss": 0.2631, "num_input_tokens_seen": 105643104, "step": 110645 }, { "epoch": 9.026021698344074, "grad_norm": 34.59735107421875, "learning_rate": 1.4312253922349e-06, "loss": 0.3903, "num_input_tokens_seen": 105647408, "step": 110650 }, { "epoch": 9.026429561954483, "grad_norm": 18.063859939575195, "learning_rate": 1.4300386196428816e-06, "loss": 0.2478, "num_input_tokens_seen": 105652144, "step": 110655 }, { "epoch": 9.02683742556489, "grad_norm": 18.660938262939453, "learning_rate": 1.4288523248039643e-06, "loss": 0.3834, "num_input_tokens_seen": 105656800, "step": 110660 }, { "epoch": 9.0272452891753, "grad_norm": 5.472995281219482, "learning_rate": 1.4276665077421953e-06, "loss": 0.3586, "num_input_tokens_seen": 105661584, "step": 110665 }, { "epoch": 9.027653152785708, "grad_norm": 55.9412841796875, "learning_rate": 1.4264811684816027e-06, "loss": 0.4859, "num_input_tokens_seen": 105665440, "step": 110670 }, { "epoch": 9.028061016396117, "grad_norm": 10.996487617492676, "learning_rate": 1.4252963070462255e-06, "loss": 0.3727, "num_input_tokens_seen": 105670864, "step": 110675 }, { "epoch": 9.028468880006526, "grad_norm": 6.2808756828308105, "learning_rate": 1.4241119234600753e-06, "loss": 0.3303, "num_input_tokens_seen": 105676016, "step": 110680 }, { "epoch": 9.028876743616934, "grad_norm": 4.586552143096924, "learning_rate": 1.4229280177471577e-06, "loss": 0.3342, "num_input_tokens_seen": 105681808, "step": 110685 }, { "epoch": 9.029284607227343, "grad_norm": 25.329421997070312, "learning_rate": 1.4217445899314702e-06, "loss": 0.3339, "num_input_tokens_seen": 105686880, "step": 110690 }, { "epoch": 9.029692470837752, "grad_norm": 9.313051223754883, "learning_rate": 1.4205616400369992e-06, "loss": 0.4622, "num_input_tokens_seen": 105691360, "step": 110695 }, { "epoch": 9.03010033444816, "grad_norm": 1.4011993408203125, "learning_rate": 1.4193791680877254e-06, "loss": 0.3114, "num_input_tokens_seen": 105695984, "step": 110700 }, { "epoch": 9.03050819805857, "grad_norm": 4.101881980895996, "learning_rate": 1.4181971741076189e-06, "loss": 0.3675, "num_input_tokens_seen": 105700432, "step": 110705 }, { "epoch": 9.030916061668979, "grad_norm": 26.204620361328125, "learning_rate": 1.4170156581206323e-06, "loss": 0.4261, "num_input_tokens_seen": 105704816, "step": 110710 }, { "epoch": 9.031323925279386, "grad_norm": 11.053388595581055, "learning_rate": 1.4158346201507188e-06, "loss": 0.3466, "num_input_tokens_seen": 105709632, "step": 110715 }, { "epoch": 9.031731788889795, "grad_norm": 15.956233978271484, "learning_rate": 1.4146540602218123e-06, "loss": 0.2811, "num_input_tokens_seen": 105715024, "step": 110720 }, { "epoch": 9.032139652500204, "grad_norm": 10.21658706665039, "learning_rate": 1.4134739783578487e-06, "loss": 0.5364, "num_input_tokens_seen": 105720128, "step": 110725 }, { "epoch": 9.032547516110613, "grad_norm": 12.51203441619873, "learning_rate": 1.4122943745827455e-06, "loss": 0.2515, "num_input_tokens_seen": 105724752, "step": 110730 }, { "epoch": 9.032955379721022, "grad_norm": 8.801898956298828, "learning_rate": 1.411115248920411e-06, "loss": 0.4347, "num_input_tokens_seen": 105729216, "step": 110735 }, { "epoch": 9.03336324333143, "grad_norm": 24.68821144104004, "learning_rate": 1.4099366013947457e-06, "loss": 0.3894, "num_input_tokens_seen": 105733920, "step": 110740 }, { "epoch": 9.033771106941838, "grad_norm": 17.31641960144043, "learning_rate": 1.4087584320296414e-06, "loss": 0.3797, "num_input_tokens_seen": 105738672, "step": 110745 }, { "epoch": 9.034178970552247, "grad_norm": 17.922945022583008, "learning_rate": 1.407580740848982e-06, "loss": 0.4683, "num_input_tokens_seen": 105743008, "step": 110750 }, { "epoch": 9.034586834162656, "grad_norm": 3.017432689666748, "learning_rate": 1.4064035278766374e-06, "loss": 0.3752, "num_input_tokens_seen": 105748112, "step": 110755 }, { "epoch": 9.034994697773065, "grad_norm": 0.5311194062232971, "learning_rate": 1.4052267931364604e-06, "loss": 0.1498, "num_input_tokens_seen": 105753168, "step": 110760 }, { "epoch": 9.035402561383473, "grad_norm": 3.86181378364563, "learning_rate": 1.4040505366523182e-06, "loss": 0.3829, "num_input_tokens_seen": 105758960, "step": 110765 }, { "epoch": 9.035810424993882, "grad_norm": 25.77412223815918, "learning_rate": 1.4028747584480417e-06, "loss": 0.2401, "num_input_tokens_seen": 105764016, "step": 110770 }, { "epoch": 9.03621828860429, "grad_norm": 21.510135650634766, "learning_rate": 1.4016994585474675e-06, "loss": 0.3829, "num_input_tokens_seen": 105769040, "step": 110775 }, { "epoch": 9.0366261522147, "grad_norm": 4.14849853515625, "learning_rate": 1.400524636974418e-06, "loss": 0.2558, "num_input_tokens_seen": 105773744, "step": 110780 }, { "epoch": 9.037034015825109, "grad_norm": 37.298038482666016, "learning_rate": 1.3993502937527075e-06, "loss": 0.5164, "num_input_tokens_seen": 105779536, "step": 110785 }, { "epoch": 9.037441879435518, "grad_norm": 14.819355010986328, "learning_rate": 1.3981764289061311e-06, "loss": 0.3464, "num_input_tokens_seen": 105784448, "step": 110790 }, { "epoch": 9.037849743045925, "grad_norm": 60.022422790527344, "learning_rate": 1.3970030424584974e-06, "loss": 0.2661, "num_input_tokens_seen": 105789744, "step": 110795 }, { "epoch": 9.038257606656334, "grad_norm": 5.254289627075195, "learning_rate": 1.3958301344335789e-06, "loss": 0.4736, "num_input_tokens_seen": 105793712, "step": 110800 }, { "epoch": 9.038665470266743, "grad_norm": 4.164790153503418, "learning_rate": 1.394657704855154e-06, "loss": 0.2514, "num_input_tokens_seen": 105798960, "step": 110805 }, { "epoch": 9.039073333877152, "grad_norm": 16.77020263671875, "learning_rate": 1.3934857537469899e-06, "loss": 0.4954, "num_input_tokens_seen": 105804480, "step": 110810 }, { "epoch": 9.03948119748756, "grad_norm": 2.6906509399414062, "learning_rate": 1.3923142811328366e-06, "loss": 0.2401, "num_input_tokens_seen": 105809152, "step": 110815 }, { "epoch": 9.039889061097968, "grad_norm": 5.045943737030029, "learning_rate": 1.3911432870364393e-06, "loss": 0.4546, "num_input_tokens_seen": 105813808, "step": 110820 }, { "epoch": 9.040296924708377, "grad_norm": 3.2885947227478027, "learning_rate": 1.3899727714815374e-06, "loss": 0.4139, "num_input_tokens_seen": 105819568, "step": 110825 }, { "epoch": 9.040704788318786, "grad_norm": 2.801497220993042, "learning_rate": 1.3888027344918508e-06, "loss": 0.2678, "num_input_tokens_seen": 105824128, "step": 110830 }, { "epoch": 9.041112651929195, "grad_norm": 31.261533737182617, "learning_rate": 1.3876331760911049e-06, "loss": 0.3157, "num_input_tokens_seen": 105829920, "step": 110835 }, { "epoch": 9.041520515539604, "grad_norm": 46.571231842041016, "learning_rate": 1.3864640963030002e-06, "loss": 0.4226, "num_input_tokens_seen": 105834912, "step": 110840 }, { "epoch": 9.041928379150011, "grad_norm": 13.001720428466797, "learning_rate": 1.3852954951512343e-06, "loss": 0.4446, "num_input_tokens_seen": 105838880, "step": 110845 }, { "epoch": 9.04233624276042, "grad_norm": 7.673785209655762, "learning_rate": 1.3841273726594912e-06, "loss": 0.6044, "num_input_tokens_seen": 105843248, "step": 110850 }, { "epoch": 9.04274410637083, "grad_norm": 31.103519439697266, "learning_rate": 1.3829597288514517e-06, "loss": 0.422, "num_input_tokens_seen": 105847184, "step": 110855 }, { "epoch": 9.043151969981238, "grad_norm": 3.8592848777770996, "learning_rate": 1.3817925637507861e-06, "loss": 0.2269, "num_input_tokens_seen": 105851456, "step": 110860 }, { "epoch": 9.043559833591647, "grad_norm": 14.086812019348145, "learning_rate": 1.3806258773811476e-06, "loss": 0.3388, "num_input_tokens_seen": 105855440, "step": 110865 }, { "epoch": 9.043967697202056, "grad_norm": 7.38339900970459, "learning_rate": 1.3794596697661894e-06, "loss": 0.3515, "num_input_tokens_seen": 105859888, "step": 110870 }, { "epoch": 9.044375560812464, "grad_norm": 33.65972137451172, "learning_rate": 1.3782939409295397e-06, "loss": 0.4747, "num_input_tokens_seen": 105864496, "step": 110875 }, { "epoch": 9.044783424422873, "grad_norm": 24.983734130859375, "learning_rate": 1.3771286908948382e-06, "loss": 0.3711, "num_input_tokens_seen": 105869120, "step": 110880 }, { "epoch": 9.045191288033282, "grad_norm": 28.840312957763672, "learning_rate": 1.375963919685702e-06, "loss": 0.3063, "num_input_tokens_seen": 105873280, "step": 110885 }, { "epoch": 9.04559915164369, "grad_norm": 10.503544807434082, "learning_rate": 1.3747996273257375e-06, "loss": 0.2182, "num_input_tokens_seen": 105877664, "step": 110890 }, { "epoch": 9.0460070152541, "grad_norm": 11.161566734313965, "learning_rate": 1.373635813838542e-06, "loss": 0.211, "num_input_tokens_seen": 105883280, "step": 110895 }, { "epoch": 9.046414878864507, "grad_norm": 5.688350200653076, "learning_rate": 1.3724724792477133e-06, "loss": 0.3452, "num_input_tokens_seen": 105888256, "step": 110900 }, { "epoch": 9.046822742474916, "grad_norm": 5.539332389831543, "learning_rate": 1.3713096235768247e-06, "loss": 0.3528, "num_input_tokens_seen": 105892096, "step": 110905 }, { "epoch": 9.047230606085325, "grad_norm": 2.016728162765503, "learning_rate": 1.370147246849454e-06, "loss": 0.4221, "num_input_tokens_seen": 105896352, "step": 110910 }, { "epoch": 9.047638469695734, "grad_norm": 53.89524459838867, "learning_rate": 1.3689853490891524e-06, "loss": 0.3199, "num_input_tokens_seen": 105900880, "step": 110915 }, { "epoch": 9.048046333306143, "grad_norm": 39.29887390136719, "learning_rate": 1.3678239303194784e-06, "loss": 0.4008, "num_input_tokens_seen": 105905440, "step": 110920 }, { "epoch": 9.048454196916552, "grad_norm": 2.4177587032318115, "learning_rate": 1.3666629905639717e-06, "loss": 0.2222, "num_input_tokens_seen": 105909152, "step": 110925 }, { "epoch": 9.04886206052696, "grad_norm": 4.935035228729248, "learning_rate": 1.365502529846166e-06, "loss": 0.3151, "num_input_tokens_seen": 105914048, "step": 110930 }, { "epoch": 9.049269924137368, "grad_norm": 19.2337703704834, "learning_rate": 1.3643425481895789e-06, "loss": 0.4113, "num_input_tokens_seen": 105917872, "step": 110935 }, { "epoch": 9.049677787747777, "grad_norm": 5.932989597320557, "learning_rate": 1.3631830456177248e-06, "loss": 0.5431, "num_input_tokens_seen": 105922144, "step": 110940 }, { "epoch": 9.050085651358186, "grad_norm": 4.612043857574463, "learning_rate": 1.3620240221541098e-06, "loss": 0.4345, "num_input_tokens_seen": 105926288, "step": 110945 }, { "epoch": 9.050493514968595, "grad_norm": 0.5280805826187134, "learning_rate": 1.3608654778222208e-06, "loss": 0.3006, "num_input_tokens_seen": 105930832, "step": 110950 }, { "epoch": 9.050901378579002, "grad_norm": 12.809676170349121, "learning_rate": 1.3597074126455417e-06, "loss": 0.3222, "num_input_tokens_seen": 105935248, "step": 110955 }, { "epoch": 9.051309242189411, "grad_norm": 17.59526252746582, "learning_rate": 1.3585498266475483e-06, "loss": 0.4053, "num_input_tokens_seen": 105940368, "step": 110960 }, { "epoch": 9.05171710579982, "grad_norm": 7.051308631896973, "learning_rate": 1.357392719851705e-06, "loss": 0.2438, "num_input_tokens_seen": 105945136, "step": 110965 }, { "epoch": 9.05212496941023, "grad_norm": 7.686849594116211, "learning_rate": 1.3562360922814653e-06, "loss": 0.4157, "num_input_tokens_seen": 105949680, "step": 110970 }, { "epoch": 9.052532833020638, "grad_norm": 43.56723403930664, "learning_rate": 1.3550799439602746e-06, "loss": 0.3622, "num_input_tokens_seen": 105955168, "step": 110975 }, { "epoch": 9.052940696631046, "grad_norm": 0.579724133014679, "learning_rate": 1.3539242749115665e-06, "loss": 0.1967, "num_input_tokens_seen": 105959568, "step": 110980 }, { "epoch": 9.053348560241455, "grad_norm": 5.5985107421875, "learning_rate": 1.3527690851587587e-06, "loss": 0.3655, "num_input_tokens_seen": 105965392, "step": 110985 }, { "epoch": 9.053756423851864, "grad_norm": 9.274060249328613, "learning_rate": 1.3516143747252795e-06, "loss": 0.3976, "num_input_tokens_seen": 105970352, "step": 110990 }, { "epoch": 9.054164287462273, "grad_norm": 2.631312370300293, "learning_rate": 1.3504601436345272e-06, "loss": 0.1572, "num_input_tokens_seen": 105975856, "step": 110995 }, { "epoch": 9.054572151072682, "grad_norm": 36.880191802978516, "learning_rate": 1.3493063919098992e-06, "loss": 0.3961, "num_input_tokens_seen": 105980320, "step": 111000 }, { "epoch": 9.05498001468309, "grad_norm": 37.4228515625, "learning_rate": 1.348153119574777e-06, "loss": 0.2688, "num_input_tokens_seen": 105984544, "step": 111005 }, { "epoch": 9.055387878293498, "grad_norm": 11.482223510742188, "learning_rate": 1.347000326652545e-06, "loss": 0.2012, "num_input_tokens_seen": 105989184, "step": 111010 }, { "epoch": 9.055795741903907, "grad_norm": 15.272364616394043, "learning_rate": 1.3458480131665647e-06, "loss": 0.2843, "num_input_tokens_seen": 105993776, "step": 111015 }, { "epoch": 9.056203605514316, "grad_norm": 8.286396026611328, "learning_rate": 1.3446961791401924e-06, "loss": 0.1628, "num_input_tokens_seen": 105998960, "step": 111020 }, { "epoch": 9.056611469124725, "grad_norm": 26.35280418395996, "learning_rate": 1.343544824596779e-06, "loss": 0.1787, "num_input_tokens_seen": 106004080, "step": 111025 }, { "epoch": 9.057019332735134, "grad_norm": 1.0945048332214355, "learning_rate": 1.342393949559656e-06, "loss": 0.1738, "num_input_tokens_seen": 106008704, "step": 111030 }, { "epoch": 9.057427196345541, "grad_norm": 1.0484877824783325, "learning_rate": 1.3412435540521573e-06, "loss": 0.3454, "num_input_tokens_seen": 106013536, "step": 111035 }, { "epoch": 9.05783505995595, "grad_norm": 23.905214309692383, "learning_rate": 1.3400936380976003e-06, "loss": 0.3077, "num_input_tokens_seen": 106019184, "step": 111040 }, { "epoch": 9.05824292356636, "grad_norm": 15.047605514526367, "learning_rate": 1.3389442017192916e-06, "loss": 0.4858, "num_input_tokens_seen": 106024400, "step": 111045 }, { "epoch": 9.058650787176768, "grad_norm": 2.4879403114318848, "learning_rate": 1.3377952449405234e-06, "loss": 0.1751, "num_input_tokens_seen": 106029664, "step": 111050 }, { "epoch": 9.059058650787177, "grad_norm": 16.603479385375977, "learning_rate": 1.3366467677845968e-06, "loss": 0.6077, "num_input_tokens_seen": 106035408, "step": 111055 }, { "epoch": 9.059466514397586, "grad_norm": 11.665702819824219, "learning_rate": 1.3354987702747846e-06, "loss": 0.4152, "num_input_tokens_seen": 106039776, "step": 111060 }, { "epoch": 9.059874378007994, "grad_norm": 0.8588429093360901, "learning_rate": 1.334351252434357e-06, "loss": 0.2756, "num_input_tokens_seen": 106043712, "step": 111065 }, { "epoch": 9.060282241618403, "grad_norm": 41.38840866088867, "learning_rate": 1.3332042142865735e-06, "loss": 0.4504, "num_input_tokens_seen": 106048544, "step": 111070 }, { "epoch": 9.060690105228812, "grad_norm": 1.5234267711639404, "learning_rate": 1.332057655854682e-06, "loss": 0.2433, "num_input_tokens_seen": 106053232, "step": 111075 }, { "epoch": 9.06109796883922, "grad_norm": 5.59054708480835, "learning_rate": 1.330911577161928e-06, "loss": 0.3873, "num_input_tokens_seen": 106057872, "step": 111080 }, { "epoch": 9.06150583244963, "grad_norm": 8.233907699584961, "learning_rate": 1.3297659782315369e-06, "loss": 0.5697, "num_input_tokens_seen": 106062848, "step": 111085 }, { "epoch": 9.061913696060037, "grad_norm": 14.307645797729492, "learning_rate": 1.3286208590867294e-06, "loss": 0.3418, "num_input_tokens_seen": 106067776, "step": 111090 }, { "epoch": 9.062321559670446, "grad_norm": 2.7836272716522217, "learning_rate": 1.3274762197507173e-06, "loss": 0.4081, "num_input_tokens_seen": 106072304, "step": 111095 }, { "epoch": 9.062729423280855, "grad_norm": 18.601951599121094, "learning_rate": 1.3263320602467072e-06, "loss": 0.3363, "num_input_tokens_seen": 106077376, "step": 111100 }, { "epoch": 9.063137286891264, "grad_norm": 67.14615631103516, "learning_rate": 1.325188380597886e-06, "loss": 0.4724, "num_input_tokens_seen": 106082704, "step": 111105 }, { "epoch": 9.063545150501673, "grad_norm": 5.641860485076904, "learning_rate": 1.324045180827435e-06, "loss": 0.3956, "num_input_tokens_seen": 106088080, "step": 111110 }, { "epoch": 9.06395301411208, "grad_norm": 29.48127555847168, "learning_rate": 1.322902460958525e-06, "loss": 0.4981, "num_input_tokens_seen": 106093472, "step": 111115 }, { "epoch": 9.064360877722489, "grad_norm": 36.08110046386719, "learning_rate": 1.3217602210143259e-06, "loss": 0.2865, "num_input_tokens_seen": 106097792, "step": 111120 }, { "epoch": 9.064768741332898, "grad_norm": 32.75868606567383, "learning_rate": 1.3206184610179834e-06, "loss": 0.4791, "num_input_tokens_seen": 106103584, "step": 111125 }, { "epoch": 9.065176604943307, "grad_norm": 10.070280075073242, "learning_rate": 1.3194771809926426e-06, "loss": 0.3322, "num_input_tokens_seen": 106108560, "step": 111130 }, { "epoch": 9.065584468553716, "grad_norm": 9.5296630859375, "learning_rate": 1.318336380961438e-06, "loss": 0.5338, "num_input_tokens_seen": 106114000, "step": 111135 }, { "epoch": 9.065992332164125, "grad_norm": 2.4567041397094727, "learning_rate": 1.3171960609474897e-06, "loss": 0.4607, "num_input_tokens_seen": 106117824, "step": 111140 }, { "epoch": 9.066400195774532, "grad_norm": 11.907526016235352, "learning_rate": 1.3160562209739157e-06, "loss": 0.3654, "num_input_tokens_seen": 106122400, "step": 111145 }, { "epoch": 9.066808059384941, "grad_norm": 13.273133277893066, "learning_rate": 1.3149168610638197e-06, "loss": 0.2077, "num_input_tokens_seen": 106128032, "step": 111150 }, { "epoch": 9.06721592299535, "grad_norm": 15.587933540344238, "learning_rate": 1.313777981240294e-06, "loss": 0.3048, "num_input_tokens_seen": 106132320, "step": 111155 }, { "epoch": 9.06762378660576, "grad_norm": 8.075145721435547, "learning_rate": 1.3126395815264175e-06, "loss": 0.3106, "num_input_tokens_seen": 106136800, "step": 111160 }, { "epoch": 9.068031650216168, "grad_norm": 39.801753997802734, "learning_rate": 1.3115016619452774e-06, "loss": 0.5665, "num_input_tokens_seen": 106141136, "step": 111165 }, { "epoch": 9.068439513826576, "grad_norm": 15.246111869812012, "learning_rate": 1.310364222519933e-06, "loss": 0.3641, "num_input_tokens_seen": 106146240, "step": 111170 }, { "epoch": 9.068847377436985, "grad_norm": 29.60818862915039, "learning_rate": 1.3092272632734381e-06, "loss": 0.3834, "num_input_tokens_seen": 106151152, "step": 111175 }, { "epoch": 9.069255241047394, "grad_norm": 4.457292556762695, "learning_rate": 1.3080907842288408e-06, "loss": 0.2338, "num_input_tokens_seen": 106155600, "step": 111180 }, { "epoch": 9.069663104657803, "grad_norm": 23.440475463867188, "learning_rate": 1.306954785409173e-06, "loss": 0.4821, "num_input_tokens_seen": 106160656, "step": 111185 }, { "epoch": 9.070070968268212, "grad_norm": 12.893878936767578, "learning_rate": 1.3058192668374658e-06, "loss": 0.3548, "num_input_tokens_seen": 106165120, "step": 111190 }, { "epoch": 9.070478831878619, "grad_norm": 9.507519721984863, "learning_rate": 1.3046842285367344e-06, "loss": 0.3983, "num_input_tokens_seen": 106169040, "step": 111195 }, { "epoch": 9.070886695489028, "grad_norm": 2.3159892559051514, "learning_rate": 1.3035496705299854e-06, "loss": 0.3564, "num_input_tokens_seen": 106173120, "step": 111200 }, { "epoch": 9.071294559099437, "grad_norm": 8.59460735321045, "learning_rate": 1.3024155928402171e-06, "loss": 0.5944, "num_input_tokens_seen": 106177392, "step": 111205 }, { "epoch": 9.071702422709846, "grad_norm": 22.22426986694336, "learning_rate": 1.301281995490411e-06, "loss": 0.4849, "num_input_tokens_seen": 106182288, "step": 111210 }, { "epoch": 9.072110286320255, "grad_norm": 24.93241310119629, "learning_rate": 1.3001488785035515e-06, "loss": 0.2788, "num_input_tokens_seen": 106187184, "step": 111215 }, { "epoch": 9.072518149930664, "grad_norm": 35.38848114013672, "learning_rate": 1.2990162419026037e-06, "loss": 0.3217, "num_input_tokens_seen": 106191536, "step": 111220 }, { "epoch": 9.072926013541071, "grad_norm": 10.210853576660156, "learning_rate": 1.297884085710524e-06, "loss": 0.4362, "num_input_tokens_seen": 106196320, "step": 111225 }, { "epoch": 9.07333387715148, "grad_norm": 2.119997262954712, "learning_rate": 1.2967524099502582e-06, "loss": 0.2498, "num_input_tokens_seen": 106201280, "step": 111230 }, { "epoch": 9.07374174076189, "grad_norm": 21.681167602539062, "learning_rate": 1.2956212146447543e-06, "loss": 0.491, "num_input_tokens_seen": 106205632, "step": 111235 }, { "epoch": 9.074149604372298, "grad_norm": 4.000537395477295, "learning_rate": 1.294490499816936e-06, "loss": 0.385, "num_input_tokens_seen": 106210096, "step": 111240 }, { "epoch": 9.074557467982707, "grad_norm": 15.849632263183594, "learning_rate": 1.293360265489721e-06, "loss": 0.3519, "num_input_tokens_seen": 106215248, "step": 111245 }, { "epoch": 9.074965331593114, "grad_norm": 33.51810073852539, "learning_rate": 1.2922305116860157e-06, "loss": 0.1843, "num_input_tokens_seen": 106220176, "step": 111250 }, { "epoch": 9.075373195203523, "grad_norm": 4.403530597686768, "learning_rate": 1.2911012384287296e-06, "loss": 0.2648, "num_input_tokens_seen": 106225296, "step": 111255 }, { "epoch": 9.075781058813932, "grad_norm": 2.078812837600708, "learning_rate": 1.2899724457407447e-06, "loss": 0.3681, "num_input_tokens_seen": 106230896, "step": 111260 }, { "epoch": 9.076188922424341, "grad_norm": 1.3985620737075806, "learning_rate": 1.2888441336449452e-06, "loss": 0.3117, "num_input_tokens_seen": 106234320, "step": 111265 }, { "epoch": 9.07659678603475, "grad_norm": 9.135945320129395, "learning_rate": 1.2877163021641963e-06, "loss": 0.3328, "num_input_tokens_seen": 106238864, "step": 111270 }, { "epoch": 9.07700464964516, "grad_norm": 5.3788862228393555, "learning_rate": 1.286588951321363e-06, "loss": 0.1512, "num_input_tokens_seen": 106243392, "step": 111275 }, { "epoch": 9.077412513255567, "grad_norm": 6.426348686218262, "learning_rate": 1.2854620811392965e-06, "loss": 0.3631, "num_input_tokens_seen": 106248032, "step": 111280 }, { "epoch": 9.077820376865976, "grad_norm": 3.8457727432250977, "learning_rate": 1.2843356916408367e-06, "loss": 0.3721, "num_input_tokens_seen": 106252784, "step": 111285 }, { "epoch": 9.078228240476385, "grad_norm": 31.23651695251465, "learning_rate": 1.2832097828488127e-06, "loss": 0.2524, "num_input_tokens_seen": 106256960, "step": 111290 }, { "epoch": 9.078636104086794, "grad_norm": 11.04989242553711, "learning_rate": 1.2820843547860478e-06, "loss": 0.4195, "num_input_tokens_seen": 106261696, "step": 111295 }, { "epoch": 9.079043967697203, "grad_norm": 26.56450653076172, "learning_rate": 1.2809594074753573e-06, "loss": 0.4083, "num_input_tokens_seen": 106265408, "step": 111300 }, { "epoch": 9.07945183130761, "grad_norm": 28.92422103881836, "learning_rate": 1.2798349409395422e-06, "loss": 0.4039, "num_input_tokens_seen": 106269680, "step": 111305 }, { "epoch": 9.079859694918019, "grad_norm": 5.5012102127075195, "learning_rate": 1.2787109552013926e-06, "loss": 0.3735, "num_input_tokens_seen": 106275008, "step": 111310 }, { "epoch": 9.080267558528428, "grad_norm": 21.094242095947266, "learning_rate": 1.2775874502836877e-06, "loss": 0.4819, "num_input_tokens_seen": 106278624, "step": 111315 }, { "epoch": 9.080675422138837, "grad_norm": 34.258392333984375, "learning_rate": 1.276464426209209e-06, "loss": 0.3731, "num_input_tokens_seen": 106283248, "step": 111320 }, { "epoch": 9.081083285749246, "grad_norm": 0.8233363032341003, "learning_rate": 1.2753418830007163e-06, "loss": 0.3497, "num_input_tokens_seen": 106288112, "step": 111325 }, { "epoch": 9.081491149359653, "grad_norm": 3.3544089794158936, "learning_rate": 1.2742198206809636e-06, "loss": 0.2774, "num_input_tokens_seen": 106293456, "step": 111330 }, { "epoch": 9.081899012970062, "grad_norm": 4.0029754638671875, "learning_rate": 1.2730982392726908e-06, "loss": 0.491, "num_input_tokens_seen": 106298688, "step": 111335 }, { "epoch": 9.082306876580471, "grad_norm": 8.213215827941895, "learning_rate": 1.2719771387986357e-06, "loss": 0.148, "num_input_tokens_seen": 106303392, "step": 111340 }, { "epoch": 9.08271474019088, "grad_norm": 15.768068313598633, "learning_rate": 1.2708565192815215e-06, "loss": 0.4241, "num_input_tokens_seen": 106308848, "step": 111345 }, { "epoch": 9.08312260380129, "grad_norm": 2.7780208587646484, "learning_rate": 1.2697363807440631e-06, "loss": 0.2071, "num_input_tokens_seen": 106313120, "step": 111350 }, { "epoch": 9.083530467411698, "grad_norm": 1.8191531896591187, "learning_rate": 1.2686167232089624e-06, "loss": 0.363, "num_input_tokens_seen": 106317520, "step": 111355 }, { "epoch": 9.083938331022106, "grad_norm": 21.645950317382812, "learning_rate": 1.2674975466989148e-06, "loss": 0.2703, "num_input_tokens_seen": 106321584, "step": 111360 }, { "epoch": 9.084346194632515, "grad_norm": 1.5089168548583984, "learning_rate": 1.2663788512366103e-06, "loss": 0.3424, "num_input_tokens_seen": 106326944, "step": 111365 }, { "epoch": 9.084754058242924, "grad_norm": 15.06002426147461, "learning_rate": 1.2652606368447228e-06, "loss": 0.3055, "num_input_tokens_seen": 106331824, "step": 111370 }, { "epoch": 9.085161921853333, "grad_norm": 5.483792781829834, "learning_rate": 1.2641429035459174e-06, "loss": 0.4245, "num_input_tokens_seen": 106336448, "step": 111375 }, { "epoch": 9.085569785463742, "grad_norm": 1.3913904428482056, "learning_rate": 1.2630256513628452e-06, "loss": 0.3665, "num_input_tokens_seen": 106342016, "step": 111380 }, { "epoch": 9.085977649074149, "grad_norm": 9.705788612365723, "learning_rate": 1.2619088803181578e-06, "loss": 0.29, "num_input_tokens_seen": 106346624, "step": 111385 }, { "epoch": 9.086385512684558, "grad_norm": 7.126750469207764, "learning_rate": 1.2607925904344897e-06, "loss": 0.2197, "num_input_tokens_seen": 106351344, "step": 111390 }, { "epoch": 9.086793376294967, "grad_norm": 35.971702575683594, "learning_rate": 1.259676781734473e-06, "loss": 0.5291, "num_input_tokens_seen": 106356592, "step": 111395 }, { "epoch": 9.087201239905376, "grad_norm": 2.4012506008148193, "learning_rate": 1.258561454240717e-06, "loss": 0.2833, "num_input_tokens_seen": 106361472, "step": 111400 }, { "epoch": 9.087609103515785, "grad_norm": 13.883545875549316, "learning_rate": 1.257446607975829e-06, "loss": 0.4145, "num_input_tokens_seen": 106366480, "step": 111405 }, { "epoch": 9.088016967126194, "grad_norm": 12.122325897216797, "learning_rate": 1.256332242962413e-06, "loss": 0.4765, "num_input_tokens_seen": 106371296, "step": 111410 }, { "epoch": 9.088424830736601, "grad_norm": 8.420172691345215, "learning_rate": 1.2552183592230537e-06, "loss": 0.4146, "num_input_tokens_seen": 106376944, "step": 111415 }, { "epoch": 9.08883269434701, "grad_norm": 14.625268936157227, "learning_rate": 1.2541049567803276e-06, "loss": 0.233, "num_input_tokens_seen": 106381328, "step": 111420 }, { "epoch": 9.08924055795742, "grad_norm": 6.147273063659668, "learning_rate": 1.2529920356568026e-06, "loss": 0.276, "num_input_tokens_seen": 106385920, "step": 111425 }, { "epoch": 9.089648421567828, "grad_norm": 7.154266357421875, "learning_rate": 1.2518795958750357e-06, "loss": 0.2566, "num_input_tokens_seen": 106390288, "step": 111430 }, { "epoch": 9.090056285178237, "grad_norm": 7.381341934204102, "learning_rate": 1.250767637457584e-06, "loss": 0.2381, "num_input_tokens_seen": 106394000, "step": 111435 }, { "epoch": 9.090464148788644, "grad_norm": 1.0891273021697998, "learning_rate": 1.249656160426979e-06, "loss": 0.2931, "num_input_tokens_seen": 106398656, "step": 111440 }, { "epoch": 9.090872012399053, "grad_norm": 26.565370559692383, "learning_rate": 1.2485451648057506e-06, "loss": 0.3632, "num_input_tokens_seen": 106403856, "step": 111445 }, { "epoch": 9.091279876009462, "grad_norm": 13.620762825012207, "learning_rate": 1.2474346506164164e-06, "loss": 0.4365, "num_input_tokens_seen": 106409328, "step": 111450 }, { "epoch": 9.091687739619871, "grad_norm": 4.712001800537109, "learning_rate": 1.2463246178814918e-06, "loss": 0.4576, "num_input_tokens_seen": 106414416, "step": 111455 }, { "epoch": 9.09209560323028, "grad_norm": 3.877794027328491, "learning_rate": 1.2452150666234757e-06, "loss": 0.2877, "num_input_tokens_seen": 106418528, "step": 111460 }, { "epoch": 9.092503466840688, "grad_norm": 3.3912527561187744, "learning_rate": 1.2441059968648526e-06, "loss": 0.3037, "num_input_tokens_seen": 106423120, "step": 111465 }, { "epoch": 9.092911330451097, "grad_norm": 27.628990173339844, "learning_rate": 1.2429974086281104e-06, "loss": 0.2232, "num_input_tokens_seen": 106428544, "step": 111470 }, { "epoch": 9.093319194061506, "grad_norm": 9.035723686218262, "learning_rate": 1.241889301935714e-06, "loss": 0.5946, "num_input_tokens_seen": 106433344, "step": 111475 }, { "epoch": 9.093727057671915, "grad_norm": 22.003541946411133, "learning_rate": 1.2407816768101238e-06, "loss": 0.4207, "num_input_tokens_seen": 106438128, "step": 111480 }, { "epoch": 9.094134921282324, "grad_norm": 7.160031795501709, "learning_rate": 1.2396745332737937e-06, "loss": 0.3045, "num_input_tokens_seen": 106442944, "step": 111485 }, { "epoch": 9.094542784892733, "grad_norm": 37.6967887878418, "learning_rate": 1.2385678713491638e-06, "loss": 0.3705, "num_input_tokens_seen": 106447232, "step": 111490 }, { "epoch": 9.09495064850314, "grad_norm": 18.589622497558594, "learning_rate": 1.2374616910586696e-06, "loss": 0.1873, "num_input_tokens_seen": 106452112, "step": 111495 }, { "epoch": 9.095358512113549, "grad_norm": 4.318237781524658, "learning_rate": 1.2363559924247287e-06, "loss": 0.451, "num_input_tokens_seen": 106456496, "step": 111500 }, { "epoch": 9.095766375723958, "grad_norm": 20.451740264892578, "learning_rate": 1.235250775469754e-06, "loss": 0.2485, "num_input_tokens_seen": 106461104, "step": 111505 }, { "epoch": 9.096174239334367, "grad_norm": 8.585819244384766, "learning_rate": 1.234146040216147e-06, "loss": 0.3662, "num_input_tokens_seen": 106465936, "step": 111510 }, { "epoch": 9.096582102944776, "grad_norm": 44.23064422607422, "learning_rate": 1.2330417866862981e-06, "loss": 0.242, "num_input_tokens_seen": 106471152, "step": 111515 }, { "epoch": 9.096989966555183, "grad_norm": 41.959625244140625, "learning_rate": 1.2319380149025977e-06, "loss": 0.2919, "num_input_tokens_seen": 106475072, "step": 111520 }, { "epoch": 9.097397830165592, "grad_norm": 4.031904220581055, "learning_rate": 1.2308347248874113e-06, "loss": 0.3388, "num_input_tokens_seen": 106479152, "step": 111525 }, { "epoch": 9.097805693776001, "grad_norm": 1.8719005584716797, "learning_rate": 1.2297319166631072e-06, "loss": 0.4707, "num_input_tokens_seen": 106483616, "step": 111530 }, { "epoch": 9.09821355738641, "grad_norm": 21.079288482666016, "learning_rate": 1.2286295902520367e-06, "loss": 0.4357, "num_input_tokens_seen": 106488912, "step": 111535 }, { "epoch": 9.09862142099682, "grad_norm": 16.659440994262695, "learning_rate": 1.2275277456765378e-06, "loss": 0.229, "num_input_tokens_seen": 106494304, "step": 111540 }, { "epoch": 9.099029284607226, "grad_norm": 53.12660598754883, "learning_rate": 1.2264263829589533e-06, "loss": 0.3031, "num_input_tokens_seen": 106499520, "step": 111545 }, { "epoch": 9.099437148217635, "grad_norm": 6.6635894775390625, "learning_rate": 1.2253255021216076e-06, "loss": 0.5047, "num_input_tokens_seen": 106504000, "step": 111550 }, { "epoch": 9.099845011828045, "grad_norm": 71.9163818359375, "learning_rate": 1.2242251031868074e-06, "loss": 0.2654, "num_input_tokens_seen": 106508496, "step": 111555 }, { "epoch": 9.100252875438454, "grad_norm": 16.956544876098633, "learning_rate": 1.22312518617686e-06, "loss": 0.2243, "num_input_tokens_seen": 106513264, "step": 111560 }, { "epoch": 9.100660739048863, "grad_norm": 21.448911666870117, "learning_rate": 1.2220257511140642e-06, "loss": 0.5011, "num_input_tokens_seen": 106517728, "step": 111565 }, { "epoch": 9.101068602659272, "grad_norm": 9.86502456665039, "learning_rate": 1.2209267980207051e-06, "loss": 0.3671, "num_input_tokens_seen": 106522752, "step": 111570 }, { "epoch": 9.101476466269679, "grad_norm": 46.07709884643555, "learning_rate": 1.2198283269190536e-06, "loss": 0.433, "num_input_tokens_seen": 106527984, "step": 111575 }, { "epoch": 9.101884329880088, "grad_norm": 5.951733112335205, "learning_rate": 1.218730337831378e-06, "loss": 0.2691, "num_input_tokens_seen": 106532144, "step": 111580 }, { "epoch": 9.102292193490497, "grad_norm": 15.109625816345215, "learning_rate": 1.2176328307799273e-06, "loss": 0.5409, "num_input_tokens_seen": 106536656, "step": 111585 }, { "epoch": 9.102700057100906, "grad_norm": 1.551106572151184, "learning_rate": 1.2165358057869585e-06, "loss": 0.4559, "num_input_tokens_seen": 106542064, "step": 111590 }, { "epoch": 9.103107920711315, "grad_norm": 16.061697006225586, "learning_rate": 1.2154392628747042e-06, "loss": 0.3492, "num_input_tokens_seen": 106547120, "step": 111595 }, { "epoch": 9.103515784321722, "grad_norm": 26.06583023071289, "learning_rate": 1.2143432020653877e-06, "loss": 0.3326, "num_input_tokens_seen": 106551296, "step": 111600 }, { "epoch": 9.103923647932131, "grad_norm": 4.938784122467041, "learning_rate": 1.2132476233812224e-06, "loss": 0.3861, "num_input_tokens_seen": 106556976, "step": 111605 }, { "epoch": 9.10433151154254, "grad_norm": 10.098310470581055, "learning_rate": 1.2121525268444262e-06, "loss": 0.3603, "num_input_tokens_seen": 106561648, "step": 111610 }, { "epoch": 9.104739375152949, "grad_norm": 16.31244468688965, "learning_rate": 1.2110579124771897e-06, "loss": 0.2755, "num_input_tokens_seen": 106565824, "step": 111615 }, { "epoch": 9.105147238763358, "grad_norm": 4.5359416007995605, "learning_rate": 1.2099637803016983e-06, "loss": 0.2272, "num_input_tokens_seen": 106570256, "step": 111620 }, { "epoch": 9.105555102373767, "grad_norm": 14.47602367401123, "learning_rate": 1.208870130340134e-06, "loss": 0.5121, "num_input_tokens_seen": 106575504, "step": 111625 }, { "epoch": 9.105962965984174, "grad_norm": 4.612565994262695, "learning_rate": 1.207776962614665e-06, "loss": 0.3907, "num_input_tokens_seen": 106580192, "step": 111630 }, { "epoch": 9.106370829594583, "grad_norm": 9.840307235717773, "learning_rate": 1.2066842771474435e-06, "loss": 0.2236, "num_input_tokens_seen": 106585184, "step": 111635 }, { "epoch": 9.106778693204992, "grad_norm": 15.178853988647461, "learning_rate": 1.2055920739606235e-06, "loss": 0.46, "num_input_tokens_seen": 106589952, "step": 111640 }, { "epoch": 9.107186556815401, "grad_norm": 42.584720611572266, "learning_rate": 1.2045003530763405e-06, "loss": 0.3514, "num_input_tokens_seen": 106594784, "step": 111645 }, { "epoch": 9.10759442042581, "grad_norm": 9.088616371154785, "learning_rate": 1.2034091145167208e-06, "loss": 0.4107, "num_input_tokens_seen": 106599200, "step": 111650 }, { "epoch": 9.108002284036218, "grad_norm": 3.738858699798584, "learning_rate": 1.2023183583038887e-06, "loss": 0.2998, "num_input_tokens_seen": 106603760, "step": 111655 }, { "epoch": 9.108410147646627, "grad_norm": 34.64190673828125, "learning_rate": 1.2012280844599517e-06, "loss": 0.3554, "num_input_tokens_seen": 106609072, "step": 111660 }, { "epoch": 9.108818011257036, "grad_norm": 5.976853370666504, "learning_rate": 1.200138293007011e-06, "loss": 0.4157, "num_input_tokens_seen": 106612880, "step": 111665 }, { "epoch": 9.109225874867445, "grad_norm": 20.505735397338867, "learning_rate": 1.1990489839671465e-06, "loss": 0.4472, "num_input_tokens_seen": 106617648, "step": 111670 }, { "epoch": 9.109633738477854, "grad_norm": 17.646806716918945, "learning_rate": 1.1979601573624515e-06, "loss": 0.5085, "num_input_tokens_seen": 106622384, "step": 111675 }, { "epoch": 9.11004160208826, "grad_norm": 35.1692008972168, "learning_rate": 1.1968718132149893e-06, "loss": 0.4559, "num_input_tokens_seen": 106627216, "step": 111680 }, { "epoch": 9.11044946569867, "grad_norm": 23.659198760986328, "learning_rate": 1.1957839515468222e-06, "loss": 0.3979, "num_input_tokens_seen": 106632288, "step": 111685 }, { "epoch": 9.110857329309079, "grad_norm": 14.42733383178711, "learning_rate": 1.1946965723799998e-06, "loss": 0.3539, "num_input_tokens_seen": 106637376, "step": 111690 }, { "epoch": 9.111265192919488, "grad_norm": 7.968183517456055, "learning_rate": 1.1936096757365567e-06, "loss": 0.4814, "num_input_tokens_seen": 106641776, "step": 111695 }, { "epoch": 9.111673056529897, "grad_norm": 16.218658447265625, "learning_rate": 1.192523261638534e-06, "loss": 0.3031, "num_input_tokens_seen": 106646784, "step": 111700 }, { "epoch": 9.112080920140306, "grad_norm": 3.9779231548309326, "learning_rate": 1.1914373301079474e-06, "loss": 0.2131, "num_input_tokens_seen": 106652032, "step": 111705 }, { "epoch": 9.112488783750713, "grad_norm": 30.682010650634766, "learning_rate": 1.1903518811668097e-06, "loss": 0.3863, "num_input_tokens_seen": 106656976, "step": 111710 }, { "epoch": 9.112896647361122, "grad_norm": 27.911235809326172, "learning_rate": 1.18926691483712e-06, "loss": 0.4035, "num_input_tokens_seen": 106660736, "step": 111715 }, { "epoch": 9.113304510971531, "grad_norm": 16.0253963470459, "learning_rate": 1.1881824311408745e-06, "loss": 0.3924, "num_input_tokens_seen": 106665840, "step": 111720 }, { "epoch": 9.11371237458194, "grad_norm": 3.0969278812408447, "learning_rate": 1.187098430100053e-06, "loss": 0.3123, "num_input_tokens_seen": 106670416, "step": 111725 }, { "epoch": 9.11412023819235, "grad_norm": 4.342897415161133, "learning_rate": 1.186014911736627e-06, "loss": 0.3721, "num_input_tokens_seen": 106675872, "step": 111730 }, { "epoch": 9.114528101802756, "grad_norm": 23.66040802001953, "learning_rate": 1.1849318760725593e-06, "loss": 0.3445, "num_input_tokens_seen": 106681008, "step": 111735 }, { "epoch": 9.114935965413165, "grad_norm": 1.00176203250885, "learning_rate": 1.1838493231298015e-06, "loss": 0.2405, "num_input_tokens_seen": 106685632, "step": 111740 }, { "epoch": 9.115343829023574, "grad_norm": 1.9558794498443604, "learning_rate": 1.1827672529303002e-06, "loss": 0.2765, "num_input_tokens_seen": 106690880, "step": 111745 }, { "epoch": 9.115751692633983, "grad_norm": 0.6079480051994324, "learning_rate": 1.181685665495985e-06, "loss": 0.2452, "num_input_tokens_seen": 106695712, "step": 111750 }, { "epoch": 9.116159556244392, "grad_norm": 37.96359634399414, "learning_rate": 1.1806045608487804e-06, "loss": 0.5001, "num_input_tokens_seen": 106700352, "step": 111755 }, { "epoch": 9.1165674198548, "grad_norm": 2.1619086265563965, "learning_rate": 1.1795239390106017e-06, "loss": 0.5314, "num_input_tokens_seen": 106704784, "step": 111760 }, { "epoch": 9.116975283465209, "grad_norm": 18.132192611694336, "learning_rate": 1.178443800003351e-06, "loss": 0.3469, "num_input_tokens_seen": 106709360, "step": 111765 }, { "epoch": 9.117383147075618, "grad_norm": 11.55582332611084, "learning_rate": 1.177364143848919e-06, "loss": 0.4902, "num_input_tokens_seen": 106714368, "step": 111770 }, { "epoch": 9.117791010686027, "grad_norm": 11.03935432434082, "learning_rate": 1.176284970569197e-06, "loss": 0.3266, "num_input_tokens_seen": 106719088, "step": 111775 }, { "epoch": 9.118198874296436, "grad_norm": 10.377195358276367, "learning_rate": 1.175206280186053e-06, "loss": 0.3439, "num_input_tokens_seen": 106723552, "step": 111780 }, { "epoch": 9.118606737906845, "grad_norm": 27.234556198120117, "learning_rate": 1.1741280727213505e-06, "loss": 0.4443, "num_input_tokens_seen": 106728800, "step": 111785 }, { "epoch": 9.119014601517252, "grad_norm": 0.9916041493415833, "learning_rate": 1.1730503481969495e-06, "loss": 0.2739, "num_input_tokens_seen": 106734112, "step": 111790 }, { "epoch": 9.119422465127661, "grad_norm": 4.339323043823242, "learning_rate": 1.1719731066346966e-06, "loss": 0.3645, "num_input_tokens_seen": 106738464, "step": 111795 }, { "epoch": 9.11983032873807, "grad_norm": 54.80268096923828, "learning_rate": 1.1708963480564217e-06, "loss": 0.416, "num_input_tokens_seen": 106743856, "step": 111800 }, { "epoch": 9.120238192348479, "grad_norm": 3.478092908859253, "learning_rate": 1.1698200724839486e-06, "loss": 0.343, "num_input_tokens_seen": 106748560, "step": 111805 }, { "epoch": 9.120646055958888, "grad_norm": 10.33715534210205, "learning_rate": 1.1687442799391018e-06, "loss": 0.2825, "num_input_tokens_seen": 106752816, "step": 111810 }, { "epoch": 9.121053919569295, "grad_norm": 3.8548262119293213, "learning_rate": 1.1676689704436805e-06, "loss": 0.2768, "num_input_tokens_seen": 106756832, "step": 111815 }, { "epoch": 9.121461783179704, "grad_norm": 1.799216866493225, "learning_rate": 1.166594144019481e-06, "loss": 0.4003, "num_input_tokens_seen": 106761392, "step": 111820 }, { "epoch": 9.121869646790113, "grad_norm": 15.562150001525879, "learning_rate": 1.1655198006882945e-06, "loss": 0.281, "num_input_tokens_seen": 106766032, "step": 111825 }, { "epoch": 9.122277510400522, "grad_norm": 38.159568786621094, "learning_rate": 1.1644459404718865e-06, "loss": 0.3327, "num_input_tokens_seen": 106771360, "step": 111830 }, { "epoch": 9.122685374010931, "grad_norm": 38.02171325683594, "learning_rate": 1.1633725633920372e-06, "loss": 0.3479, "num_input_tokens_seen": 106776752, "step": 111835 }, { "epoch": 9.12309323762134, "grad_norm": 61.693321228027344, "learning_rate": 1.1622996694704957e-06, "loss": 0.4925, "num_input_tokens_seen": 106782240, "step": 111840 }, { "epoch": 9.123501101231748, "grad_norm": 22.853721618652344, "learning_rate": 1.1612272587290114e-06, "loss": 0.1758, "num_input_tokens_seen": 106787472, "step": 111845 }, { "epoch": 9.123908964842157, "grad_norm": 18.81492805480957, "learning_rate": 1.1601553311893165e-06, "loss": 0.5058, "num_input_tokens_seen": 106791952, "step": 111850 }, { "epoch": 9.124316828452566, "grad_norm": 14.242537498474121, "learning_rate": 1.1590838868731468e-06, "loss": 0.4899, "num_input_tokens_seen": 106796832, "step": 111855 }, { "epoch": 9.124724692062975, "grad_norm": 9.302511215209961, "learning_rate": 1.1580129258022155e-06, "loss": 0.2314, "num_input_tokens_seen": 106801872, "step": 111860 }, { "epoch": 9.125132555673384, "grad_norm": 4.331729412078857, "learning_rate": 1.1569424479982328e-06, "loss": 0.3591, "num_input_tokens_seen": 106807024, "step": 111865 }, { "epoch": 9.12554041928379, "grad_norm": 2.5998027324676514, "learning_rate": 1.1558724534828896e-06, "loss": 0.3097, "num_input_tokens_seen": 106812336, "step": 111870 }, { "epoch": 9.1259482828942, "grad_norm": 14.20494556427002, "learning_rate": 1.1548029422778855e-06, "loss": 0.4097, "num_input_tokens_seen": 106817408, "step": 111875 }, { "epoch": 9.126356146504609, "grad_norm": 1.5672284364700317, "learning_rate": 1.153733914404892e-06, "loss": 0.356, "num_input_tokens_seen": 106822512, "step": 111880 }, { "epoch": 9.126764010115018, "grad_norm": 1.8855509757995605, "learning_rate": 1.1526653698855804e-06, "loss": 0.3672, "num_input_tokens_seen": 106827328, "step": 111885 }, { "epoch": 9.127171873725427, "grad_norm": 28.5335750579834, "learning_rate": 1.151597308741606e-06, "loss": 0.3749, "num_input_tokens_seen": 106832512, "step": 111890 }, { "epoch": 9.127579737335834, "grad_norm": 16.113142013549805, "learning_rate": 1.1505297309946205e-06, "loss": 0.3042, "num_input_tokens_seen": 106837488, "step": 111895 }, { "epoch": 9.127987600946243, "grad_norm": 8.930002212524414, "learning_rate": 1.1494626366662653e-06, "loss": 0.2678, "num_input_tokens_seen": 106841952, "step": 111900 }, { "epoch": 9.128395464556652, "grad_norm": 20.577415466308594, "learning_rate": 1.1483960257781672e-06, "loss": 0.1815, "num_input_tokens_seen": 106846928, "step": 111905 }, { "epoch": 9.128803328167061, "grad_norm": 5.007158279418945, "learning_rate": 1.1473298983519454e-06, "loss": 0.3955, "num_input_tokens_seen": 106851552, "step": 111910 }, { "epoch": 9.12921119177747, "grad_norm": 0.9436996579170227, "learning_rate": 1.1462642544092072e-06, "loss": 0.3721, "num_input_tokens_seen": 106856000, "step": 111915 }, { "epoch": 9.129619055387879, "grad_norm": 14.079584121704102, "learning_rate": 1.1451990939715606e-06, "loss": 0.2957, "num_input_tokens_seen": 106860368, "step": 111920 }, { "epoch": 9.130026918998286, "grad_norm": 36.4052734375, "learning_rate": 1.144134417060594e-06, "loss": 0.4926, "num_input_tokens_seen": 106865600, "step": 111925 }, { "epoch": 9.130434782608695, "grad_norm": 2.555763006210327, "learning_rate": 1.143070223697884e-06, "loss": 0.4925, "num_input_tokens_seen": 106870848, "step": 111930 }, { "epoch": 9.130842646219104, "grad_norm": 3.1475327014923096, "learning_rate": 1.1420065139050029e-06, "loss": 0.2715, "num_input_tokens_seen": 106875408, "step": 111935 }, { "epoch": 9.131250509829513, "grad_norm": 49.13080978393555, "learning_rate": 1.140943287703508e-06, "loss": 0.4448, "num_input_tokens_seen": 106880496, "step": 111940 }, { "epoch": 9.131658373439922, "grad_norm": 10.28637981414795, "learning_rate": 1.1398805451149575e-06, "loss": 0.3384, "num_input_tokens_seen": 106885088, "step": 111945 }, { "epoch": 9.13206623705033, "grad_norm": 0.7942911386489868, "learning_rate": 1.138818286160892e-06, "loss": 0.3739, "num_input_tokens_seen": 106890480, "step": 111950 }, { "epoch": 9.132474100660739, "grad_norm": 40.0300407409668, "learning_rate": 1.1377565108628391e-06, "loss": 0.1668, "num_input_tokens_seen": 106894592, "step": 111955 }, { "epoch": 9.132881964271148, "grad_norm": 30.990041732788086, "learning_rate": 1.1366952192423175e-06, "loss": 0.5314, "num_input_tokens_seen": 106899456, "step": 111960 }, { "epoch": 9.133289827881557, "grad_norm": 15.526357650756836, "learning_rate": 1.1356344113208489e-06, "loss": 0.3117, "num_input_tokens_seen": 106904256, "step": 111965 }, { "epoch": 9.133697691491966, "grad_norm": 2.457106828689575, "learning_rate": 1.1345740871199274e-06, "loss": 0.4411, "num_input_tokens_seen": 106908464, "step": 111970 }, { "epoch": 9.134105555102373, "grad_norm": 9.820060729980469, "learning_rate": 1.133514246661052e-06, "loss": 0.3892, "num_input_tokens_seen": 106913248, "step": 111975 }, { "epoch": 9.134513418712782, "grad_norm": 3.4510068893432617, "learning_rate": 1.1324548899656977e-06, "loss": 0.2397, "num_input_tokens_seen": 106918640, "step": 111980 }, { "epoch": 9.134921282323191, "grad_norm": 13.83370590209961, "learning_rate": 1.1313960170553417e-06, "loss": 0.4439, "num_input_tokens_seen": 106923872, "step": 111985 }, { "epoch": 9.1353291459336, "grad_norm": 10.441911697387695, "learning_rate": 1.1303376279514444e-06, "loss": 0.4837, "num_input_tokens_seen": 106928304, "step": 111990 }, { "epoch": 9.135737009544009, "grad_norm": 8.059250831604004, "learning_rate": 1.1292797226754637e-06, "loss": 0.3379, "num_input_tokens_seen": 106933760, "step": 111995 }, { "epoch": 9.136144873154418, "grad_norm": 5.99697208404541, "learning_rate": 1.1282223012488407e-06, "loss": 0.3892, "num_input_tokens_seen": 106939520, "step": 112000 }, { "epoch": 9.136552736764825, "grad_norm": 13.562263488769531, "learning_rate": 1.127165363693003e-06, "loss": 0.3192, "num_input_tokens_seen": 106943872, "step": 112005 }, { "epoch": 9.136960600375234, "grad_norm": 3.1983306407928467, "learning_rate": 1.1261089100293804e-06, "loss": 0.3249, "num_input_tokens_seen": 106948336, "step": 112010 }, { "epoch": 9.137368463985643, "grad_norm": 15.173093795776367, "learning_rate": 1.1250529402793892e-06, "loss": 0.2521, "num_input_tokens_seen": 106953072, "step": 112015 }, { "epoch": 9.137776327596052, "grad_norm": 18.02916145324707, "learning_rate": 1.1239974544644288e-06, "loss": 0.182, "num_input_tokens_seen": 106958608, "step": 112020 }, { "epoch": 9.138184191206461, "grad_norm": 6.4994425773620605, "learning_rate": 1.1229424526058934e-06, "loss": 0.2357, "num_input_tokens_seen": 106963760, "step": 112025 }, { "epoch": 9.138592054816868, "grad_norm": 21.32158660888672, "learning_rate": 1.1218879347251687e-06, "loss": 0.3959, "num_input_tokens_seen": 106967728, "step": 112030 }, { "epoch": 9.138999918427277, "grad_norm": 57.67494583129883, "learning_rate": 1.1208339008436292e-06, "loss": 0.3845, "num_input_tokens_seen": 106972256, "step": 112035 }, { "epoch": 9.139407782037686, "grad_norm": 8.976009368896484, "learning_rate": 1.119780350982641e-06, "loss": 0.4129, "num_input_tokens_seen": 106977120, "step": 112040 }, { "epoch": 9.139815645648095, "grad_norm": 4.809865951538086, "learning_rate": 1.1187272851635567e-06, "loss": 0.3876, "num_input_tokens_seen": 106982192, "step": 112045 }, { "epoch": 9.140223509258504, "grad_norm": 3.629328966140747, "learning_rate": 1.11767470340772e-06, "loss": 0.3497, "num_input_tokens_seen": 106987216, "step": 112050 }, { "epoch": 9.140631372868913, "grad_norm": 2.2888071537017822, "learning_rate": 1.1166226057364697e-06, "loss": 0.3231, "num_input_tokens_seen": 106991600, "step": 112055 }, { "epoch": 9.14103923647932, "grad_norm": 5.8626861572265625, "learning_rate": 1.1155709921711332e-06, "loss": 0.3729, "num_input_tokens_seen": 106995776, "step": 112060 }, { "epoch": 9.14144710008973, "grad_norm": 22.724489212036133, "learning_rate": 1.114519862733024e-06, "loss": 0.3493, "num_input_tokens_seen": 107001200, "step": 112065 }, { "epoch": 9.141854963700139, "grad_norm": 44.869258880615234, "learning_rate": 1.1134692174434413e-06, "loss": 0.4792, "num_input_tokens_seen": 107005040, "step": 112070 }, { "epoch": 9.142262827310548, "grad_norm": 30.072145462036133, "learning_rate": 1.1124190563236937e-06, "loss": 0.312, "num_input_tokens_seen": 107010016, "step": 112075 }, { "epoch": 9.142670690920957, "grad_norm": 31.460247039794922, "learning_rate": 1.1113693793950609e-06, "loss": 0.5171, "num_input_tokens_seen": 107014944, "step": 112080 }, { "epoch": 9.143078554531364, "grad_norm": 21.1409969329834, "learning_rate": 1.1103201866788176e-06, "loss": 0.3713, "num_input_tokens_seen": 107019456, "step": 112085 }, { "epoch": 9.143486418141773, "grad_norm": 6.479165077209473, "learning_rate": 1.109271478196236e-06, "loss": 0.4386, "num_input_tokens_seen": 107023808, "step": 112090 }, { "epoch": 9.143894281752182, "grad_norm": 14.177695274353027, "learning_rate": 1.1082232539685627e-06, "loss": 0.1788, "num_input_tokens_seen": 107029152, "step": 112095 }, { "epoch": 9.144302145362591, "grad_norm": 15.307609558105469, "learning_rate": 1.1071755140170587e-06, "loss": 0.242, "num_input_tokens_seen": 107034160, "step": 112100 }, { "epoch": 9.144710008973, "grad_norm": 1.6498072147369385, "learning_rate": 1.1061282583629512e-06, "loss": 0.3575, "num_input_tokens_seen": 107038160, "step": 112105 }, { "epoch": 9.145117872583407, "grad_norm": 22.649126052856445, "learning_rate": 1.1050814870274734e-06, "loss": 0.4533, "num_input_tokens_seen": 107042832, "step": 112110 }, { "epoch": 9.145525736193816, "grad_norm": 3.0503971576690674, "learning_rate": 1.104035200031836e-06, "loss": 0.2009, "num_input_tokens_seen": 107047232, "step": 112115 }, { "epoch": 9.145933599804225, "grad_norm": 4.498297691345215, "learning_rate": 1.1029893973972555e-06, "loss": 0.3355, "num_input_tokens_seen": 107052416, "step": 112120 }, { "epoch": 9.146341463414634, "grad_norm": 48.68854904174805, "learning_rate": 1.1019440791449232e-06, "loss": 0.4941, "num_input_tokens_seen": 107056544, "step": 112125 }, { "epoch": 9.146749327025043, "grad_norm": 15.473414421081543, "learning_rate": 1.1008992452960304e-06, "loss": 0.337, "num_input_tokens_seen": 107061088, "step": 112130 }, { "epoch": 9.147157190635452, "grad_norm": 4.776795864105225, "learning_rate": 1.0998548958717547e-06, "loss": 0.4128, "num_input_tokens_seen": 107066160, "step": 112135 }, { "epoch": 9.14756505424586, "grad_norm": 5.705031871795654, "learning_rate": 1.0988110308932625e-06, "loss": 0.3258, "num_input_tokens_seen": 107070944, "step": 112140 }, { "epoch": 9.147972917856269, "grad_norm": 9.83434772491455, "learning_rate": 1.0977676503817174e-06, "loss": 0.2003, "num_input_tokens_seen": 107075856, "step": 112145 }, { "epoch": 9.148380781466678, "grad_norm": 17.186243057250977, "learning_rate": 1.0967247543582636e-06, "loss": 0.3105, "num_input_tokens_seen": 107079328, "step": 112150 }, { "epoch": 9.148788645077087, "grad_norm": 17.841510772705078, "learning_rate": 1.0956823428440428e-06, "loss": 0.3706, "num_input_tokens_seen": 107083360, "step": 112155 }, { "epoch": 9.149196508687496, "grad_norm": 9.628990173339844, "learning_rate": 1.0946404158601847e-06, "loss": 0.3532, "num_input_tokens_seen": 107087984, "step": 112160 }, { "epoch": 9.149604372297903, "grad_norm": 24.393993377685547, "learning_rate": 1.0935989734278063e-06, "loss": 0.3985, "num_input_tokens_seen": 107093488, "step": 112165 }, { "epoch": 9.150012235908312, "grad_norm": 12.406829833984375, "learning_rate": 1.0925580155680182e-06, "loss": 0.3604, "num_input_tokens_seen": 107098384, "step": 112170 }, { "epoch": 9.15042009951872, "grad_norm": 1.7560616731643677, "learning_rate": 1.0915175423019174e-06, "loss": 0.2072, "num_input_tokens_seen": 107102960, "step": 112175 }, { "epoch": 9.15082796312913, "grad_norm": 6.011899471282959, "learning_rate": 1.0904775536505984e-06, "loss": 0.2688, "num_input_tokens_seen": 107108384, "step": 112180 }, { "epoch": 9.151235826739539, "grad_norm": 31.846647262573242, "learning_rate": 1.0894380496351415e-06, "loss": 0.309, "num_input_tokens_seen": 107113008, "step": 112185 }, { "epoch": 9.151643690349948, "grad_norm": 7.1311750411987305, "learning_rate": 1.088399030276613e-06, "loss": 0.5311, "num_input_tokens_seen": 107117456, "step": 112190 }, { "epoch": 9.152051553960355, "grad_norm": 9.55527114868164, "learning_rate": 1.0873604955960765e-06, "loss": 0.4195, "num_input_tokens_seen": 107123104, "step": 112195 }, { "epoch": 9.152459417570764, "grad_norm": 10.548645973205566, "learning_rate": 1.086322445614582e-06, "loss": 0.3874, "num_input_tokens_seen": 107127504, "step": 112200 }, { "epoch": 9.152867281181173, "grad_norm": 33.69185256958008, "learning_rate": 1.0852848803531685e-06, "loss": 0.1573, "num_input_tokens_seen": 107132448, "step": 112205 }, { "epoch": 9.153275144791582, "grad_norm": 70.04236602783203, "learning_rate": 1.0842477998328686e-06, "loss": 0.3783, "num_input_tokens_seen": 107136624, "step": 112210 }, { "epoch": 9.153683008401991, "grad_norm": 40.452308654785156, "learning_rate": 1.083211204074705e-06, "loss": 0.3712, "num_input_tokens_seen": 107141152, "step": 112215 }, { "epoch": 9.154090872012398, "grad_norm": 23.117923736572266, "learning_rate": 1.0821750930996883e-06, "loss": 0.4032, "num_input_tokens_seen": 107145536, "step": 112220 }, { "epoch": 9.154498735622807, "grad_norm": 29.326358795166016, "learning_rate": 1.0811394669288128e-06, "loss": 0.3542, "num_input_tokens_seen": 107150864, "step": 112225 }, { "epoch": 9.154906599233216, "grad_norm": 9.357514381408691, "learning_rate": 1.0801043255830811e-06, "loss": 0.5012, "num_input_tokens_seen": 107155792, "step": 112230 }, { "epoch": 9.155314462843625, "grad_norm": 9.868033409118652, "learning_rate": 1.0790696690834712e-06, "loss": 0.4655, "num_input_tokens_seen": 107160848, "step": 112235 }, { "epoch": 9.155722326454034, "grad_norm": 4.03169584274292, "learning_rate": 1.0780354974509548e-06, "loss": 0.2958, "num_input_tokens_seen": 107165152, "step": 112240 }, { "epoch": 9.156130190064442, "grad_norm": 13.286050796508789, "learning_rate": 1.0770018107064933e-06, "loss": 0.5729, "num_input_tokens_seen": 107170480, "step": 112245 }, { "epoch": 9.15653805367485, "grad_norm": 19.54829216003418, "learning_rate": 1.0759686088710364e-06, "loss": 0.2135, "num_input_tokens_seen": 107175408, "step": 112250 }, { "epoch": 9.15694591728526, "grad_norm": 26.830209732055664, "learning_rate": 1.0749358919655338e-06, "loss": 0.3302, "num_input_tokens_seen": 107180064, "step": 112255 }, { "epoch": 9.157353780895669, "grad_norm": 5.443314552307129, "learning_rate": 1.0739036600109137e-06, "loss": 0.4157, "num_input_tokens_seen": 107185504, "step": 112260 }, { "epoch": 9.157761644506078, "grad_norm": 18.295866012573242, "learning_rate": 1.0728719130281007e-06, "loss": 0.3187, "num_input_tokens_seen": 107188992, "step": 112265 }, { "epoch": 9.158169508116487, "grad_norm": 1.3130015134811401, "learning_rate": 1.071840651038003e-06, "loss": 0.3799, "num_input_tokens_seen": 107193088, "step": 112270 }, { "epoch": 9.158577371726894, "grad_norm": 19.647483825683594, "learning_rate": 1.0708098740615291e-06, "loss": 0.3548, "num_input_tokens_seen": 107197632, "step": 112275 }, { "epoch": 9.158985235337303, "grad_norm": 9.117501258850098, "learning_rate": 1.0697795821195733e-06, "loss": 0.3005, "num_input_tokens_seen": 107201808, "step": 112280 }, { "epoch": 9.159393098947712, "grad_norm": 15.865735054016113, "learning_rate": 1.068749775233016e-06, "loss": 0.1606, "num_input_tokens_seen": 107206544, "step": 112285 }, { "epoch": 9.159800962558121, "grad_norm": 1.3683738708496094, "learning_rate": 1.0677204534227297e-06, "loss": 0.1861, "num_input_tokens_seen": 107211616, "step": 112290 }, { "epoch": 9.16020882616853, "grad_norm": 7.400764465332031, "learning_rate": 1.0666916167095836e-06, "loss": 0.4734, "num_input_tokens_seen": 107215936, "step": 112295 }, { "epoch": 9.160616689778937, "grad_norm": 15.73423957824707, "learning_rate": 1.0656632651144221e-06, "loss": 0.3715, "num_input_tokens_seen": 107221440, "step": 112300 }, { "epoch": 9.161024553389346, "grad_norm": 3.66487455368042, "learning_rate": 1.064635398658101e-06, "loss": 0.3028, "num_input_tokens_seen": 107225600, "step": 112305 }, { "epoch": 9.161432416999755, "grad_norm": 18.297542572021484, "learning_rate": 1.0636080173614504e-06, "loss": 0.3376, "num_input_tokens_seen": 107229584, "step": 112310 }, { "epoch": 9.161840280610164, "grad_norm": 12.911511421203613, "learning_rate": 1.062581121245293e-06, "loss": 0.2703, "num_input_tokens_seen": 107234432, "step": 112315 }, { "epoch": 9.162248144220573, "grad_norm": 0.9065473675727844, "learning_rate": 1.061554710330445e-06, "loss": 0.3712, "num_input_tokens_seen": 107239200, "step": 112320 }, { "epoch": 9.162656007830982, "grad_norm": 4.66623592376709, "learning_rate": 1.0605287846377098e-06, "loss": 0.3302, "num_input_tokens_seen": 107243824, "step": 112325 }, { "epoch": 9.16306387144139, "grad_norm": 27.55023956298828, "learning_rate": 1.0595033441878842e-06, "loss": 0.3054, "num_input_tokens_seen": 107249200, "step": 112330 }, { "epoch": 9.163471735051798, "grad_norm": 22.423681259155273, "learning_rate": 1.0584783890017546e-06, "loss": 0.3641, "num_input_tokens_seen": 107253776, "step": 112335 }, { "epoch": 9.163879598662207, "grad_norm": 32.50271224975586, "learning_rate": 1.0574539191000903e-06, "loss": 0.4454, "num_input_tokens_seen": 107258656, "step": 112340 }, { "epoch": 9.164287462272616, "grad_norm": 23.346771240234375, "learning_rate": 1.0564299345036637e-06, "loss": 0.5171, "num_input_tokens_seen": 107263408, "step": 112345 }, { "epoch": 9.164695325883025, "grad_norm": 8.758362770080566, "learning_rate": 1.0554064352332276e-06, "loss": 0.29, "num_input_tokens_seen": 107268384, "step": 112350 }, { "epoch": 9.165103189493433, "grad_norm": 6.957071781158447, "learning_rate": 1.0543834213095322e-06, "loss": 0.2851, "num_input_tokens_seen": 107272320, "step": 112355 }, { "epoch": 9.165511053103842, "grad_norm": 1.2997044324874878, "learning_rate": 1.0533608927533024e-06, "loss": 0.4071, "num_input_tokens_seen": 107276912, "step": 112360 }, { "epoch": 9.16591891671425, "grad_norm": 2.6528658866882324, "learning_rate": 1.052338849585277e-06, "loss": 0.4044, "num_input_tokens_seen": 107281472, "step": 112365 }, { "epoch": 9.16632678032466, "grad_norm": 28.720958709716797, "learning_rate": 1.051317291826165e-06, "loss": 0.2716, "num_input_tokens_seen": 107286144, "step": 112370 }, { "epoch": 9.166734643935069, "grad_norm": 13.452862739562988, "learning_rate": 1.050296219496677e-06, "loss": 0.2547, "num_input_tokens_seen": 107291232, "step": 112375 }, { "epoch": 9.167142507545476, "grad_norm": 3.583102226257324, "learning_rate": 1.049275632617508e-06, "loss": 0.2047, "num_input_tokens_seen": 107295376, "step": 112380 }, { "epoch": 9.167550371155885, "grad_norm": 19.70651626586914, "learning_rate": 1.0482555312093412e-06, "loss": 0.4905, "num_input_tokens_seen": 107300480, "step": 112385 }, { "epoch": 9.167958234766294, "grad_norm": 30.518451690673828, "learning_rate": 1.0472359152928602e-06, "loss": 0.3417, "num_input_tokens_seen": 107306336, "step": 112390 }, { "epoch": 9.168366098376703, "grad_norm": 41.84482192993164, "learning_rate": 1.046216784888729e-06, "loss": 0.3572, "num_input_tokens_seen": 107311440, "step": 112395 }, { "epoch": 9.168773961987112, "grad_norm": 27.529298782348633, "learning_rate": 1.045198140017603e-06, "loss": 0.3424, "num_input_tokens_seen": 107316784, "step": 112400 }, { "epoch": 9.169181825597521, "grad_norm": 6.9638824462890625, "learning_rate": 1.0441799807001302e-06, "loss": 0.4284, "num_input_tokens_seen": 107320976, "step": 112405 }, { "epoch": 9.169589689207928, "grad_norm": 8.124746322631836, "learning_rate": 1.0431623069569546e-06, "loss": 0.5122, "num_input_tokens_seen": 107325392, "step": 112410 }, { "epoch": 9.169997552818337, "grad_norm": 40.01320266723633, "learning_rate": 1.0421451188086962e-06, "loss": 0.4273, "num_input_tokens_seen": 107330224, "step": 112415 }, { "epoch": 9.170405416428746, "grad_norm": 34.45998764038086, "learning_rate": 1.0411284162759771e-06, "loss": 0.3149, "num_input_tokens_seen": 107334688, "step": 112420 }, { "epoch": 9.170813280039155, "grad_norm": 52.693824768066406, "learning_rate": 1.0401121993794033e-06, "loss": 0.46, "num_input_tokens_seen": 107339936, "step": 112425 }, { "epoch": 9.171221143649564, "grad_norm": 33.53925323486328, "learning_rate": 1.0390964681395748e-06, "loss": 0.2987, "num_input_tokens_seen": 107344784, "step": 112430 }, { "epoch": 9.171629007259972, "grad_norm": 2.3892874717712402, "learning_rate": 1.0380812225770782e-06, "loss": 0.5551, "num_input_tokens_seen": 107349408, "step": 112435 }, { "epoch": 9.17203687087038, "grad_norm": 18.42776107788086, "learning_rate": 1.0370664627124965e-06, "loss": 0.3228, "num_input_tokens_seen": 107354368, "step": 112440 }, { "epoch": 9.17244473448079, "grad_norm": 33.007530212402344, "learning_rate": 1.0360521885663916e-06, "loss": 0.3626, "num_input_tokens_seen": 107359536, "step": 112445 }, { "epoch": 9.172852598091199, "grad_norm": 13.153634071350098, "learning_rate": 1.0350384001593271e-06, "loss": 0.2971, "num_input_tokens_seen": 107363792, "step": 112450 }, { "epoch": 9.173260461701608, "grad_norm": 13.263116836547852, "learning_rate": 1.0340250975118538e-06, "loss": 0.3812, "num_input_tokens_seen": 107368624, "step": 112455 }, { "epoch": 9.173668325312015, "grad_norm": 11.115476608276367, "learning_rate": 1.0330122806445048e-06, "loss": 0.4368, "num_input_tokens_seen": 107373184, "step": 112460 }, { "epoch": 9.174076188922424, "grad_norm": 17.061925888061523, "learning_rate": 1.0319999495778139e-06, "loss": 0.3289, "num_input_tokens_seen": 107378160, "step": 112465 }, { "epoch": 9.174484052532833, "grad_norm": 1.8205938339233398, "learning_rate": 1.030988104332295e-06, "loss": 0.4208, "num_input_tokens_seen": 107383232, "step": 112470 }, { "epoch": 9.174891916143242, "grad_norm": 32.869117736816406, "learning_rate": 1.0299767449284652e-06, "loss": 0.2247, "num_input_tokens_seen": 107388144, "step": 112475 }, { "epoch": 9.17529977975365, "grad_norm": 17.252958297729492, "learning_rate": 1.0289658713868221e-06, "loss": 0.1988, "num_input_tokens_seen": 107393376, "step": 112480 }, { "epoch": 9.17570764336406, "grad_norm": 1.9534976482391357, "learning_rate": 1.0279554837278577e-06, "loss": 0.2583, "num_input_tokens_seen": 107397824, "step": 112485 }, { "epoch": 9.176115506974467, "grad_norm": 13.409521102905273, "learning_rate": 1.0269455819720441e-06, "loss": 0.4879, "num_input_tokens_seen": 107402704, "step": 112490 }, { "epoch": 9.176523370584876, "grad_norm": 0.6091201901435852, "learning_rate": 1.025936166139857e-06, "loss": 0.2692, "num_input_tokens_seen": 107408288, "step": 112495 }, { "epoch": 9.176931234195285, "grad_norm": 1.312477946281433, "learning_rate": 1.0249272362517603e-06, "loss": 0.4188, "num_input_tokens_seen": 107413840, "step": 112500 }, { "epoch": 9.177339097805694, "grad_norm": 11.404397964477539, "learning_rate": 1.023918792328199e-06, "loss": 0.5332, "num_input_tokens_seen": 107419072, "step": 112505 }, { "epoch": 9.177746961416103, "grad_norm": 5.022780418395996, "learning_rate": 1.022910834389615e-06, "loss": 0.3058, "num_input_tokens_seen": 107422944, "step": 112510 }, { "epoch": 9.17815482502651, "grad_norm": 5.887312889099121, "learning_rate": 1.021903362456439e-06, "loss": 0.4308, "num_input_tokens_seen": 107426912, "step": 112515 }, { "epoch": 9.17856268863692, "grad_norm": 2.3134937286376953, "learning_rate": 1.0208963765490963e-06, "loss": 0.4506, "num_input_tokens_seen": 107431888, "step": 112520 }, { "epoch": 9.178970552247328, "grad_norm": 3.298802614212036, "learning_rate": 1.0198898766879933e-06, "loss": 0.4988, "num_input_tokens_seen": 107435616, "step": 112525 }, { "epoch": 9.179378415857737, "grad_norm": 41.01116943359375, "learning_rate": 1.0188838628935327e-06, "loss": 0.4183, "num_input_tokens_seen": 107440208, "step": 112530 }, { "epoch": 9.179786279468146, "grad_norm": 2.21720027923584, "learning_rate": 1.0178783351861066e-06, "loss": 0.1502, "num_input_tokens_seen": 107445504, "step": 112535 }, { "epoch": 9.180194143078555, "grad_norm": 15.398604393005371, "learning_rate": 1.016873293586093e-06, "loss": 0.395, "num_input_tokens_seen": 107451040, "step": 112540 }, { "epoch": 9.180602006688963, "grad_norm": 7.7419209480285645, "learning_rate": 1.0158687381138677e-06, "loss": 0.3111, "num_input_tokens_seen": 107456064, "step": 112545 }, { "epoch": 9.181009870299372, "grad_norm": 23.832698822021484, "learning_rate": 1.0148646687897944e-06, "loss": 0.3564, "num_input_tokens_seen": 107461152, "step": 112550 }, { "epoch": 9.18141773390978, "grad_norm": 26.882015228271484, "learning_rate": 1.0138610856342211e-06, "loss": 0.3438, "num_input_tokens_seen": 107465904, "step": 112555 }, { "epoch": 9.18182559752019, "grad_norm": 17.841707229614258, "learning_rate": 1.012857988667487e-06, "loss": 0.3064, "num_input_tokens_seen": 107470272, "step": 112560 }, { "epoch": 9.182233461130599, "grad_norm": 28.268152236938477, "learning_rate": 1.0118553779099343e-06, "loss": 0.2032, "num_input_tokens_seen": 107475168, "step": 112565 }, { "epoch": 9.182641324741006, "grad_norm": 19.55464744567871, "learning_rate": 1.0108532533818798e-06, "loss": 0.297, "num_input_tokens_seen": 107479488, "step": 112570 }, { "epoch": 9.183049188351415, "grad_norm": 1.8032524585723877, "learning_rate": 1.0098516151036353e-06, "loss": 0.3364, "num_input_tokens_seen": 107484624, "step": 112575 }, { "epoch": 9.183457051961824, "grad_norm": 32.54183578491211, "learning_rate": 1.0088504630955038e-06, "loss": 0.2939, "num_input_tokens_seen": 107489248, "step": 112580 }, { "epoch": 9.183864915572233, "grad_norm": 22.802001953125, "learning_rate": 1.00784979737778e-06, "loss": 0.4633, "num_input_tokens_seen": 107494640, "step": 112585 }, { "epoch": 9.184272779182642, "grad_norm": 29.786266326904297, "learning_rate": 1.0068496179707454e-06, "loss": 0.2684, "num_input_tokens_seen": 107499472, "step": 112590 }, { "epoch": 9.18468064279305, "grad_norm": 34.0415153503418, "learning_rate": 1.0058499248946724e-06, "loss": 0.3946, "num_input_tokens_seen": 107504064, "step": 112595 }, { "epoch": 9.185088506403458, "grad_norm": 53.41810989379883, "learning_rate": 1.004850718169828e-06, "loss": 0.2473, "num_input_tokens_seen": 107509024, "step": 112600 }, { "epoch": 9.185496370013867, "grad_norm": 11.767329216003418, "learning_rate": 1.00385199781646e-06, "loss": 0.4658, "num_input_tokens_seen": 107514448, "step": 112605 }, { "epoch": 9.185904233624276, "grad_norm": 1.2048064470291138, "learning_rate": 1.0028537638548164e-06, "loss": 0.4743, "num_input_tokens_seen": 107518352, "step": 112610 }, { "epoch": 9.186312097234685, "grad_norm": 5.009082794189453, "learning_rate": 1.0018560163051332e-06, "loss": 0.4952, "num_input_tokens_seen": 107523712, "step": 112615 }, { "epoch": 9.186719960845094, "grad_norm": 35.10713577270508, "learning_rate": 1.000858755187628e-06, "loss": 0.3614, "num_input_tokens_seen": 107529072, "step": 112620 }, { "epoch": 9.187127824455501, "grad_norm": 47.8353271484375, "learning_rate": 9.99861980522518e-07, "loss": 0.242, "num_input_tokens_seen": 107534112, "step": 112625 }, { "epoch": 9.18753568806591, "grad_norm": 31.373695373535156, "learning_rate": 9.988656923300088e-07, "loss": 0.3627, "num_input_tokens_seen": 107538752, "step": 112630 }, { "epoch": 9.18794355167632, "grad_norm": 5.4777421951293945, "learning_rate": 9.97869890630293e-07, "loss": 0.3871, "num_input_tokens_seen": 107543792, "step": 112635 }, { "epoch": 9.188351415286728, "grad_norm": 10.58381462097168, "learning_rate": 9.968745754435542e-07, "loss": 0.2764, "num_input_tokens_seen": 107548160, "step": 112640 }, { "epoch": 9.188759278897138, "grad_norm": 3.6301488876342773, "learning_rate": 9.958797467899712e-07, "loss": 0.5174, "num_input_tokens_seen": 107552528, "step": 112645 }, { "epoch": 9.189167142507545, "grad_norm": 26.76774024963379, "learning_rate": 9.948854046896993e-07, "loss": 0.3651, "num_input_tokens_seen": 107557424, "step": 112650 }, { "epoch": 9.189575006117954, "grad_norm": 29.795883178710938, "learning_rate": 9.938915491629063e-07, "loss": 0.4281, "num_input_tokens_seen": 107560816, "step": 112655 }, { "epoch": 9.189982869728363, "grad_norm": 37.24571990966797, "learning_rate": 9.92898180229729e-07, "loss": 0.3963, "num_input_tokens_seen": 107565600, "step": 112660 }, { "epoch": 9.190390733338772, "grad_norm": 9.386505126953125, "learning_rate": 9.919052979103061e-07, "loss": 0.284, "num_input_tokens_seen": 107570432, "step": 112665 }, { "epoch": 9.19079859694918, "grad_norm": 8.493943214416504, "learning_rate": 9.909129022247554e-07, "loss": 0.2102, "num_input_tokens_seen": 107574448, "step": 112670 }, { "epoch": 9.191206460559588, "grad_norm": 3.1426684856414795, "learning_rate": 9.899209931932024e-07, "loss": 0.3785, "num_input_tokens_seen": 107579344, "step": 112675 }, { "epoch": 9.191614324169997, "grad_norm": 25.542858123779297, "learning_rate": 9.889295708357476e-07, "loss": 0.3226, "num_input_tokens_seen": 107584336, "step": 112680 }, { "epoch": 9.192022187780406, "grad_norm": 41.72187042236328, "learning_rate": 9.87938635172489e-07, "loss": 0.3595, "num_input_tokens_seen": 107588880, "step": 112685 }, { "epoch": 9.192430051390815, "grad_norm": 2.518153190612793, "learning_rate": 9.869481862235103e-07, "loss": 0.1976, "num_input_tokens_seen": 107593200, "step": 112690 }, { "epoch": 9.192837915001224, "grad_norm": 12.741004943847656, "learning_rate": 9.859582240088844e-07, "loss": 0.2798, "num_input_tokens_seen": 107598272, "step": 112695 }, { "epoch": 9.193245778611633, "grad_norm": 6.146705150604248, "learning_rate": 9.84968748548687e-07, "loss": 0.3174, "num_input_tokens_seen": 107602064, "step": 112700 }, { "epoch": 9.19365364222204, "grad_norm": 5.875842571258545, "learning_rate": 9.839797598629658e-07, "loss": 0.1734, "num_input_tokens_seen": 107607328, "step": 112705 }, { "epoch": 9.19406150583245, "grad_norm": 6.505321025848389, "learning_rate": 9.829912579717688e-07, "loss": 0.4133, "num_input_tokens_seen": 107610800, "step": 112710 }, { "epoch": 9.194469369442858, "grad_norm": 11.973458290100098, "learning_rate": 9.820032428951353e-07, "loss": 0.2912, "num_input_tokens_seen": 107616192, "step": 112715 }, { "epoch": 9.194877233053267, "grad_norm": 5.919703006744385, "learning_rate": 9.810157146530913e-07, "loss": 0.4006, "num_input_tokens_seen": 107621200, "step": 112720 }, { "epoch": 9.195285096663676, "grad_norm": 1.493011236190796, "learning_rate": 9.80028673265651e-07, "loss": 0.3265, "num_input_tokens_seen": 107626272, "step": 112725 }, { "epoch": 9.195692960274084, "grad_norm": 20.30649185180664, "learning_rate": 9.790421187528208e-07, "loss": 0.394, "num_input_tokens_seen": 107631568, "step": 112730 }, { "epoch": 9.196100823884493, "grad_norm": 6.7144083976745605, "learning_rate": 9.780560511346015e-07, "loss": 0.2087, "num_input_tokens_seen": 107636000, "step": 112735 }, { "epoch": 9.196508687494902, "grad_norm": 9.774356842041016, "learning_rate": 9.770704704309769e-07, "loss": 0.4207, "num_input_tokens_seen": 107641520, "step": 112740 }, { "epoch": 9.19691655110531, "grad_norm": 42.06707763671875, "learning_rate": 9.760853766619283e-07, "loss": 0.4151, "num_input_tokens_seen": 107647216, "step": 112745 }, { "epoch": 9.19732441471572, "grad_norm": 4.991184234619141, "learning_rate": 9.751007698474202e-07, "loss": 0.298, "num_input_tokens_seen": 107652144, "step": 112750 }, { "epoch": 9.197732278326129, "grad_norm": 15.921762466430664, "learning_rate": 9.741166500074117e-07, "loss": 0.3105, "num_input_tokens_seen": 107656976, "step": 112755 }, { "epoch": 9.198140141936536, "grad_norm": 7.862148284912109, "learning_rate": 9.731330171618452e-07, "loss": 0.33, "num_input_tokens_seen": 107662720, "step": 112760 }, { "epoch": 9.198548005546945, "grad_norm": 6.900808811187744, "learning_rate": 9.72149871330666e-07, "loss": 0.5043, "num_input_tokens_seen": 107668000, "step": 112765 }, { "epoch": 9.198955869157354, "grad_norm": 22.84720802307129, "learning_rate": 9.711672125337995e-07, "loss": 0.4016, "num_input_tokens_seen": 107673280, "step": 112770 }, { "epoch": 9.199363732767763, "grad_norm": 3.392359733581543, "learning_rate": 9.701850407911605e-07, "loss": 0.2893, "num_input_tokens_seen": 107677952, "step": 112775 }, { "epoch": 9.199771596378172, "grad_norm": 10.030112266540527, "learning_rate": 9.69203356122661e-07, "loss": 0.3204, "num_input_tokens_seen": 107682928, "step": 112780 }, { "epoch": 9.200179459988579, "grad_norm": 5.069645881652832, "learning_rate": 9.682221585481987e-07, "loss": 0.4453, "num_input_tokens_seen": 107686352, "step": 112785 }, { "epoch": 9.200587323598988, "grad_norm": 40.10953140258789, "learning_rate": 9.672414480876607e-07, "loss": 0.367, "num_input_tokens_seen": 107691328, "step": 112790 }, { "epoch": 9.200995187209397, "grad_norm": 2.0314929485321045, "learning_rate": 9.662612247609281e-07, "loss": 0.2504, "num_input_tokens_seen": 107695936, "step": 112795 }, { "epoch": 9.201403050819806, "grad_norm": 21.596595764160156, "learning_rate": 9.652814885878687e-07, "loss": 0.3266, "num_input_tokens_seen": 107701472, "step": 112800 }, { "epoch": 9.201810914430215, "grad_norm": 17.533382415771484, "learning_rate": 9.643022395883355e-07, "loss": 0.3212, "num_input_tokens_seen": 107706816, "step": 112805 }, { "epoch": 9.202218778040622, "grad_norm": 0.9365869164466858, "learning_rate": 9.633234777821853e-07, "loss": 0.2716, "num_input_tokens_seen": 107712080, "step": 112810 }, { "epoch": 9.202626641651031, "grad_norm": 15.460100173950195, "learning_rate": 9.62345203189255e-07, "loss": 0.4737, "num_input_tokens_seen": 107717024, "step": 112815 }, { "epoch": 9.20303450526144, "grad_norm": 21.832687377929688, "learning_rate": 9.613674158293756e-07, "loss": 0.2878, "num_input_tokens_seen": 107721424, "step": 112820 }, { "epoch": 9.20344236887185, "grad_norm": 1.246964693069458, "learning_rate": 9.603901157223594e-07, "loss": 0.255, "num_input_tokens_seen": 107726576, "step": 112825 }, { "epoch": 9.203850232482258, "grad_norm": 12.30678653717041, "learning_rate": 9.59413302888021e-07, "loss": 0.5013, "num_input_tokens_seen": 107731152, "step": 112830 }, { "epoch": 9.204258096092667, "grad_norm": 8.467286109924316, "learning_rate": 9.584369773461637e-07, "loss": 0.3449, "num_input_tokens_seen": 107735824, "step": 112835 }, { "epoch": 9.204665959703075, "grad_norm": 9.216726303100586, "learning_rate": 9.574611391165694e-07, "loss": 0.5344, "num_input_tokens_seen": 107740736, "step": 112840 }, { "epoch": 9.205073823313484, "grad_norm": 33.692138671875, "learning_rate": 9.564857882190247e-07, "loss": 0.4013, "num_input_tokens_seen": 107745856, "step": 112845 }, { "epoch": 9.205481686923893, "grad_norm": 2.207113027572632, "learning_rate": 9.555109246732947e-07, "loss": 0.2931, "num_input_tokens_seen": 107751072, "step": 112850 }, { "epoch": 9.205889550534302, "grad_norm": 11.468379020690918, "learning_rate": 9.545365484991408e-07, "loss": 0.3649, "num_input_tokens_seen": 107755680, "step": 112855 }, { "epoch": 9.20629741414471, "grad_norm": 2.5379300117492676, "learning_rate": 9.535626597163144e-07, "loss": 0.1721, "num_input_tokens_seen": 107759808, "step": 112860 }, { "epoch": 9.206705277755118, "grad_norm": 7.587278366088867, "learning_rate": 9.525892583445522e-07, "loss": 0.2797, "num_input_tokens_seen": 107764560, "step": 112865 }, { "epoch": 9.207113141365527, "grad_norm": 9.576742172241211, "learning_rate": 9.516163444035886e-07, "loss": 0.4085, "num_input_tokens_seen": 107769712, "step": 112870 }, { "epoch": 9.207521004975936, "grad_norm": 13.662793159484863, "learning_rate": 9.506439179131438e-07, "loss": 0.4399, "num_input_tokens_seen": 107774768, "step": 112875 }, { "epoch": 9.207928868586345, "grad_norm": 20.71360969543457, "learning_rate": 9.496719788929271e-07, "loss": 0.3327, "num_input_tokens_seen": 107779232, "step": 112880 }, { "epoch": 9.208336732196754, "grad_norm": 2.799449920654297, "learning_rate": 9.48700527362642e-07, "loss": 0.3866, "num_input_tokens_seen": 107783728, "step": 112885 }, { "epoch": 9.208744595807161, "grad_norm": 44.28998947143555, "learning_rate": 9.477295633419731e-07, "loss": 0.5395, "num_input_tokens_seen": 107788368, "step": 112890 }, { "epoch": 9.20915245941757, "grad_norm": 18.99886703491211, "learning_rate": 9.467590868506071e-07, "loss": 0.3217, "num_input_tokens_seen": 107792848, "step": 112895 }, { "epoch": 9.20956032302798, "grad_norm": 1.3229994773864746, "learning_rate": 9.457890979082146e-07, "loss": 0.2018, "num_input_tokens_seen": 107797616, "step": 112900 }, { "epoch": 9.209968186638388, "grad_norm": 3.779175281524658, "learning_rate": 9.448195965344547e-07, "loss": 0.375, "num_input_tokens_seen": 107802176, "step": 112905 }, { "epoch": 9.210376050248797, "grad_norm": 3.5171825885772705, "learning_rate": 9.438505827489813e-07, "loss": 0.4165, "num_input_tokens_seen": 107807264, "step": 112910 }, { "epoch": 9.210783913859206, "grad_norm": 11.812294960021973, "learning_rate": 9.428820565714313e-07, "loss": 0.5526, "num_input_tokens_seen": 107812048, "step": 112915 }, { "epoch": 9.211191777469613, "grad_norm": 7.9675068855285645, "learning_rate": 9.41914018021442e-07, "loss": 0.372, "num_input_tokens_seen": 107816608, "step": 112920 }, { "epoch": 9.211599641080022, "grad_norm": 27.375652313232422, "learning_rate": 9.409464671186336e-07, "loss": 0.3492, "num_input_tokens_seen": 107821472, "step": 112925 }, { "epoch": 9.212007504690432, "grad_norm": 0.9850319027900696, "learning_rate": 9.399794038826154e-07, "loss": 0.2748, "num_input_tokens_seen": 107825600, "step": 112930 }, { "epoch": 9.21241536830084, "grad_norm": 13.202239990234375, "learning_rate": 9.39012828332994e-07, "loss": 0.3705, "num_input_tokens_seen": 107829312, "step": 112935 }, { "epoch": 9.21282323191125, "grad_norm": 31.194265365600586, "learning_rate": 9.380467404893511e-07, "loss": 0.2539, "num_input_tokens_seen": 107834304, "step": 112940 }, { "epoch": 9.213231095521657, "grad_norm": 2.994676351547241, "learning_rate": 9.37081140371282e-07, "loss": 0.375, "num_input_tokens_seen": 107838896, "step": 112945 }, { "epoch": 9.213638959132066, "grad_norm": 28.170055389404297, "learning_rate": 9.361160279983544e-07, "loss": 0.4299, "num_input_tokens_seen": 107843424, "step": 112950 }, { "epoch": 9.214046822742475, "grad_norm": 1.5372505187988281, "learning_rate": 9.351514033901277e-07, "loss": 0.3564, "num_input_tokens_seen": 107847344, "step": 112955 }, { "epoch": 9.214454686352884, "grad_norm": 16.791791915893555, "learning_rate": 9.341872665661528e-07, "loss": 0.3765, "num_input_tokens_seen": 107851168, "step": 112960 }, { "epoch": 9.214862549963293, "grad_norm": 37.290035247802734, "learning_rate": 9.332236175459807e-07, "loss": 0.1743, "num_input_tokens_seen": 107855968, "step": 112965 }, { "epoch": 9.215270413573702, "grad_norm": 18.52869987487793, "learning_rate": 9.322604563491377e-07, "loss": 0.411, "num_input_tokens_seen": 107861392, "step": 112970 }, { "epoch": 9.215678277184109, "grad_norm": 42.65638732910156, "learning_rate": 9.312977829951469e-07, "loss": 0.3731, "num_input_tokens_seen": 107865984, "step": 112975 }, { "epoch": 9.216086140794518, "grad_norm": 2.3692591190338135, "learning_rate": 9.30335597503526e-07, "loss": 0.4026, "num_input_tokens_seen": 107870816, "step": 112980 }, { "epoch": 9.216494004404927, "grad_norm": 44.93325424194336, "learning_rate": 9.293738998937734e-07, "loss": 0.2726, "num_input_tokens_seen": 107875856, "step": 112985 }, { "epoch": 9.216901868015336, "grad_norm": 4.387568950653076, "learning_rate": 9.28412690185379e-07, "loss": 0.2417, "num_input_tokens_seen": 107881184, "step": 112990 }, { "epoch": 9.217309731625745, "grad_norm": 18.08961296081543, "learning_rate": 9.274519683978355e-07, "loss": 0.5827, "num_input_tokens_seen": 107884928, "step": 112995 }, { "epoch": 9.217717595236152, "grad_norm": 4.312880039215088, "learning_rate": 9.264917345506108e-07, "loss": 0.5513, "num_input_tokens_seen": 107889088, "step": 113000 }, { "epoch": 9.218125458846561, "grad_norm": 10.873485565185547, "learning_rate": 9.255319886631697e-07, "loss": 0.4378, "num_input_tokens_seen": 107893424, "step": 113005 }, { "epoch": 9.21853332245697, "grad_norm": 8.298274993896484, "learning_rate": 9.245727307549662e-07, "loss": 0.2396, "num_input_tokens_seen": 107898624, "step": 113010 }, { "epoch": 9.21894118606738, "grad_norm": 2.918239116668701, "learning_rate": 9.23613960845443e-07, "loss": 0.2043, "num_input_tokens_seen": 107902672, "step": 113015 }, { "epoch": 9.219349049677788, "grad_norm": 21.751771926879883, "learning_rate": 9.22655678954032e-07, "loss": 0.3968, "num_input_tokens_seen": 107908448, "step": 113020 }, { "epoch": 9.219756913288196, "grad_norm": 1.0778647661209106, "learning_rate": 9.21697885100159e-07, "loss": 0.1513, "num_input_tokens_seen": 107913808, "step": 113025 }, { "epoch": 9.220164776898605, "grad_norm": 4.327885627746582, "learning_rate": 9.207405793032392e-07, "loss": 0.2315, "num_input_tokens_seen": 107919120, "step": 113030 }, { "epoch": 9.220572640509014, "grad_norm": 2.467451572418213, "learning_rate": 9.197837615826765e-07, "loss": 0.182, "num_input_tokens_seen": 107923744, "step": 113035 }, { "epoch": 9.220980504119423, "grad_norm": 7.736416816711426, "learning_rate": 9.188274319578638e-07, "loss": 0.3641, "num_input_tokens_seen": 107927872, "step": 113040 }, { "epoch": 9.221388367729832, "grad_norm": 2.6568405628204346, "learning_rate": 9.178715904481883e-07, "loss": 0.1942, "num_input_tokens_seen": 107933104, "step": 113045 }, { "epoch": 9.22179623134024, "grad_norm": 1.5382834672927856, "learning_rate": 9.169162370730177e-07, "loss": 0.3608, "num_input_tokens_seen": 107937968, "step": 113050 }, { "epoch": 9.222204094950648, "grad_norm": 3.8747572898864746, "learning_rate": 9.159613718517257e-07, "loss": 0.3196, "num_input_tokens_seen": 107943184, "step": 113055 }, { "epoch": 9.222611958561057, "grad_norm": 1.2214208841323853, "learning_rate": 9.150069948036633e-07, "loss": 0.2127, "num_input_tokens_seen": 107948080, "step": 113060 }, { "epoch": 9.223019822171466, "grad_norm": 11.677956581115723, "learning_rate": 9.140531059481732e-07, "loss": 0.2443, "num_input_tokens_seen": 107952720, "step": 113065 }, { "epoch": 9.223427685781875, "grad_norm": 1.6288883686065674, "learning_rate": 9.130997053045903e-07, "loss": 0.2501, "num_input_tokens_seen": 107957440, "step": 113070 }, { "epoch": 9.223835549392284, "grad_norm": 1.431047797203064, "learning_rate": 9.121467928922434e-07, "loss": 0.2553, "num_input_tokens_seen": 107962160, "step": 113075 }, { "epoch": 9.224243413002691, "grad_norm": 15.293379783630371, "learning_rate": 9.111943687304447e-07, "loss": 0.3968, "num_input_tokens_seen": 107967904, "step": 113080 }, { "epoch": 9.2246512766131, "grad_norm": 1.6947873830795288, "learning_rate": 9.102424328385012e-07, "loss": 0.3068, "num_input_tokens_seen": 107973312, "step": 113085 }, { "epoch": 9.22505914022351, "grad_norm": 23.823501586914062, "learning_rate": 9.092909852357084e-07, "loss": 0.2832, "num_input_tokens_seen": 107978032, "step": 113090 }, { "epoch": 9.225467003833918, "grad_norm": 12.16527271270752, "learning_rate": 9.083400259413455e-07, "loss": 0.3174, "num_input_tokens_seen": 107981792, "step": 113095 }, { "epoch": 9.225874867444327, "grad_norm": 10.342117309570312, "learning_rate": 9.073895549746969e-07, "loss": 0.4036, "num_input_tokens_seen": 107986528, "step": 113100 }, { "epoch": 9.226282731054734, "grad_norm": 6.1916913986206055, "learning_rate": 9.064395723550251e-07, "loss": 0.3177, "num_input_tokens_seen": 107990896, "step": 113105 }, { "epoch": 9.226690594665143, "grad_norm": 2.3367936611175537, "learning_rate": 9.054900781015841e-07, "loss": 0.2215, "num_input_tokens_seen": 107995072, "step": 113110 }, { "epoch": 9.227098458275552, "grad_norm": 14.138474464416504, "learning_rate": 9.045410722336195e-07, "loss": 0.4413, "num_input_tokens_seen": 107999472, "step": 113115 }, { "epoch": 9.227506321885961, "grad_norm": 26.243423461914062, "learning_rate": 9.035925547703688e-07, "loss": 0.483, "num_input_tokens_seen": 108004240, "step": 113120 }, { "epoch": 9.22791418549637, "grad_norm": 7.1370978355407715, "learning_rate": 9.026445257310612e-07, "loss": 0.3717, "num_input_tokens_seen": 108009456, "step": 113125 }, { "epoch": 9.22832204910678, "grad_norm": 10.87314224243164, "learning_rate": 9.01696985134906e-07, "loss": 0.3203, "num_input_tokens_seen": 108013200, "step": 113130 }, { "epoch": 9.228729912717187, "grad_norm": 7.478621959686279, "learning_rate": 9.00749933001116e-07, "loss": 0.3606, "num_input_tokens_seen": 108018416, "step": 113135 }, { "epoch": 9.229137776327596, "grad_norm": 2.6600420475006104, "learning_rate": 8.998033693488839e-07, "loss": 0.3789, "num_input_tokens_seen": 108023648, "step": 113140 }, { "epoch": 9.229545639938005, "grad_norm": 5.70174503326416, "learning_rate": 8.988572941973944e-07, "loss": 0.4098, "num_input_tokens_seen": 108028336, "step": 113145 }, { "epoch": 9.229953503548414, "grad_norm": 38.65835189819336, "learning_rate": 8.979117075658295e-07, "loss": 0.3935, "num_input_tokens_seen": 108033360, "step": 113150 }, { "epoch": 9.230361367158823, "grad_norm": 12.001641273498535, "learning_rate": 8.969666094733486e-07, "loss": 0.2261, "num_input_tokens_seen": 108038224, "step": 113155 }, { "epoch": 9.23076923076923, "grad_norm": 10.110522270202637, "learning_rate": 8.960219999391145e-07, "loss": 0.2182, "num_input_tokens_seen": 108043056, "step": 113160 }, { "epoch": 9.231177094379639, "grad_norm": 9.560555458068848, "learning_rate": 8.950778789822728e-07, "loss": 0.3413, "num_input_tokens_seen": 108047056, "step": 113165 }, { "epoch": 9.231584957990048, "grad_norm": 34.72481918334961, "learning_rate": 8.941342466219583e-07, "loss": 0.4068, "num_input_tokens_seen": 108051856, "step": 113170 }, { "epoch": 9.231992821600457, "grad_norm": 2.9349589347839355, "learning_rate": 8.931911028772999e-07, "loss": 0.2247, "num_input_tokens_seen": 108057168, "step": 113175 }, { "epoch": 9.232400685210866, "grad_norm": 1.7448718547821045, "learning_rate": 8.922484477674131e-07, "loss": 0.2391, "num_input_tokens_seen": 108062496, "step": 113180 }, { "epoch": 9.232808548821275, "grad_norm": 18.66197395324707, "learning_rate": 8.913062813114076e-07, "loss": 0.3362, "num_input_tokens_seen": 108067696, "step": 113185 }, { "epoch": 9.233216412431682, "grad_norm": 29.883506774902344, "learning_rate": 8.903646035283792e-07, "loss": 0.2087, "num_input_tokens_seen": 108072864, "step": 113190 }, { "epoch": 9.233624276042091, "grad_norm": 12.017620086669922, "learning_rate": 8.894234144374153e-07, "loss": 0.4675, "num_input_tokens_seen": 108077712, "step": 113195 }, { "epoch": 9.2340321396525, "grad_norm": 3.7708311080932617, "learning_rate": 8.884827140575952e-07, "loss": 0.408, "num_input_tokens_seen": 108083536, "step": 113200 }, { "epoch": 9.23444000326291, "grad_norm": 21.39378547668457, "learning_rate": 8.875425024079815e-07, "loss": 0.4758, "num_input_tokens_seen": 108088160, "step": 113205 }, { "epoch": 9.234847866873318, "grad_norm": 9.608125686645508, "learning_rate": 8.866027795076393e-07, "loss": 0.2208, "num_input_tokens_seen": 108092928, "step": 113210 }, { "epoch": 9.235255730483726, "grad_norm": 74.34451293945312, "learning_rate": 8.85663545375609e-07, "loss": 0.4351, "num_input_tokens_seen": 108097856, "step": 113215 }, { "epoch": 9.235663594094135, "grad_norm": 21.49197006225586, "learning_rate": 8.847248000309338e-07, "loss": 0.1969, "num_input_tokens_seen": 108102720, "step": 113220 }, { "epoch": 9.236071457704544, "grad_norm": 37.296443939208984, "learning_rate": 8.837865434926373e-07, "loss": 0.3718, "num_input_tokens_seen": 108106928, "step": 113225 }, { "epoch": 9.236479321314953, "grad_norm": 17.182716369628906, "learning_rate": 8.828487757797432e-07, "loss": 0.555, "num_input_tokens_seen": 108111328, "step": 113230 }, { "epoch": 9.236887184925362, "grad_norm": 0.7062857747077942, "learning_rate": 8.819114969112558e-07, "loss": 0.2294, "num_input_tokens_seen": 108116336, "step": 113235 }, { "epoch": 9.237295048535769, "grad_norm": 4.534114360809326, "learning_rate": 8.809747069061736e-07, "loss": 0.3746, "num_input_tokens_seen": 108121440, "step": 113240 }, { "epoch": 9.237702912146178, "grad_norm": 40.49944305419922, "learning_rate": 8.800384057834843e-07, "loss": 0.4856, "num_input_tokens_seen": 108126064, "step": 113245 }, { "epoch": 9.238110775756587, "grad_norm": 4.425275802612305, "learning_rate": 8.791025935621644e-07, "loss": 0.2632, "num_input_tokens_seen": 108130432, "step": 113250 }, { "epoch": 9.238518639366996, "grad_norm": 3.3624038696289062, "learning_rate": 8.781672702611904e-07, "loss": 0.5403, "num_input_tokens_seen": 108135552, "step": 113255 }, { "epoch": 9.238926502977405, "grad_norm": 39.854732513427734, "learning_rate": 8.772324358995138e-07, "loss": 0.2942, "num_input_tokens_seen": 108140544, "step": 113260 }, { "epoch": 9.239334366587814, "grad_norm": 21.653491973876953, "learning_rate": 8.762980904960861e-07, "loss": 0.4488, "num_input_tokens_seen": 108145488, "step": 113265 }, { "epoch": 9.239742230198221, "grad_norm": 22.761497497558594, "learning_rate": 8.753642340698476e-07, "loss": 0.4622, "num_input_tokens_seen": 108149792, "step": 113270 }, { "epoch": 9.24015009380863, "grad_norm": 5.988372325897217, "learning_rate": 8.744308666397222e-07, "loss": 0.2602, "num_input_tokens_seen": 108154240, "step": 113275 }, { "epoch": 9.240557957419039, "grad_norm": 31.319683074951172, "learning_rate": 8.734979882246308e-07, "loss": 0.4312, "num_input_tokens_seen": 108159008, "step": 113280 }, { "epoch": 9.240965821029448, "grad_norm": 2.1668765544891357, "learning_rate": 8.725655988434861e-07, "loss": 0.296, "num_input_tokens_seen": 108164672, "step": 113285 }, { "epoch": 9.241373684639857, "grad_norm": 2.789684534072876, "learning_rate": 8.716336985151841e-07, "loss": 0.3869, "num_input_tokens_seen": 108170016, "step": 113290 }, { "epoch": 9.241781548250264, "grad_norm": 16.904403686523438, "learning_rate": 8.707022872586096e-07, "loss": 0.3583, "num_input_tokens_seen": 108174832, "step": 113295 }, { "epoch": 9.242189411860673, "grad_norm": 22.91313362121582, "learning_rate": 8.697713650926504e-07, "loss": 0.2878, "num_input_tokens_seen": 108179648, "step": 113300 }, { "epoch": 9.242597275471082, "grad_norm": 23.907087326049805, "learning_rate": 8.68840932036169e-07, "loss": 0.3848, "num_input_tokens_seen": 108185360, "step": 113305 }, { "epoch": 9.243005139081491, "grad_norm": 25.503711700439453, "learning_rate": 8.67910988108031e-07, "loss": 0.4547, "num_input_tokens_seen": 108189888, "step": 113310 }, { "epoch": 9.2434130026919, "grad_norm": 31.097900390625, "learning_rate": 8.669815333270798e-07, "loss": 0.4494, "num_input_tokens_seen": 108194144, "step": 113315 }, { "epoch": 9.24382086630231, "grad_norm": 1.5509477853775024, "learning_rate": 8.660525677121611e-07, "loss": 0.1292, "num_input_tokens_seen": 108198736, "step": 113320 }, { "epoch": 9.244228729912717, "grad_norm": 3.2535831928253174, "learning_rate": 8.651240912821018e-07, "loss": 0.1401, "num_input_tokens_seen": 108203872, "step": 113325 }, { "epoch": 9.244636593523126, "grad_norm": 31.55080223083496, "learning_rate": 8.641961040557228e-07, "loss": 0.3703, "num_input_tokens_seen": 108209616, "step": 113330 }, { "epoch": 9.245044457133535, "grad_norm": 11.419404983520508, "learning_rate": 8.632686060518286e-07, "loss": 0.2916, "num_input_tokens_seen": 108214352, "step": 113335 }, { "epoch": 9.245452320743944, "grad_norm": 35.392662048339844, "learning_rate": 8.623415972892263e-07, "loss": 0.327, "num_input_tokens_seen": 108218960, "step": 113340 }, { "epoch": 9.245860184354353, "grad_norm": 15.273728370666504, "learning_rate": 8.614150777867008e-07, "loss": 0.2674, "num_input_tokens_seen": 108223984, "step": 113345 }, { "epoch": 9.24626804796476, "grad_norm": 13.000350952148438, "learning_rate": 8.604890475630372e-07, "loss": 0.5195, "num_input_tokens_seen": 108228944, "step": 113350 }, { "epoch": 9.246675911575169, "grad_norm": 14.580330848693848, "learning_rate": 8.595635066370039e-07, "loss": 0.3163, "num_input_tokens_seen": 108233712, "step": 113355 }, { "epoch": 9.247083775185578, "grad_norm": 17.072771072387695, "learning_rate": 8.586384550273552e-07, "loss": 0.2334, "num_input_tokens_seen": 108238256, "step": 113360 }, { "epoch": 9.247491638795987, "grad_norm": 15.964129447937012, "learning_rate": 8.577138927528511e-07, "loss": 0.3892, "num_input_tokens_seen": 108242592, "step": 113365 }, { "epoch": 9.247899502406396, "grad_norm": 9.054287910461426, "learning_rate": 8.567898198322294e-07, "loss": 0.3965, "num_input_tokens_seen": 108247328, "step": 113370 }, { "epoch": 9.248307366016803, "grad_norm": 7.87816047668457, "learning_rate": 8.558662362842168e-07, "loss": 0.6415, "num_input_tokens_seen": 108252176, "step": 113375 }, { "epoch": 9.248715229627212, "grad_norm": 2.4077365398406982, "learning_rate": 8.549431421275317e-07, "loss": 0.36, "num_input_tokens_seen": 108257440, "step": 113380 }, { "epoch": 9.249123093237621, "grad_norm": 48.86750793457031, "learning_rate": 8.540205373808951e-07, "loss": 0.3555, "num_input_tokens_seen": 108262880, "step": 113385 }, { "epoch": 9.24953095684803, "grad_norm": 9.317712783813477, "learning_rate": 8.530984220630006e-07, "loss": 0.4821, "num_input_tokens_seen": 108267472, "step": 113390 }, { "epoch": 9.24993882045844, "grad_norm": 1.1024463176727295, "learning_rate": 8.521767961925415e-07, "loss": 0.3813, "num_input_tokens_seen": 108271552, "step": 113395 }, { "epoch": 9.250346684068848, "grad_norm": 12.495268821716309, "learning_rate": 8.512556597881943e-07, "loss": 0.3134, "num_input_tokens_seen": 108276336, "step": 113400 }, { "epoch": 9.250754547679255, "grad_norm": 0.491678923368454, "learning_rate": 8.503350128686361e-07, "loss": 0.3055, "num_input_tokens_seen": 108281296, "step": 113405 }, { "epoch": 9.251162411289664, "grad_norm": 60.45667266845703, "learning_rate": 8.494148554525267e-07, "loss": 0.549, "num_input_tokens_seen": 108286352, "step": 113410 }, { "epoch": 9.251570274900073, "grad_norm": 17.686317443847656, "learning_rate": 8.484951875585151e-07, "loss": 0.4352, "num_input_tokens_seen": 108291360, "step": 113415 }, { "epoch": 9.251978138510482, "grad_norm": 3.9790961742401123, "learning_rate": 8.475760092052421e-07, "loss": 0.21, "num_input_tokens_seen": 108296032, "step": 113420 }, { "epoch": 9.252386002120891, "grad_norm": 3.2128522396087646, "learning_rate": 8.46657320411337e-07, "loss": 0.3721, "num_input_tokens_seen": 108300624, "step": 113425 }, { "epoch": 9.252793865731299, "grad_norm": 6.935163974761963, "learning_rate": 8.457391211954296e-07, "loss": 0.3786, "num_input_tokens_seen": 108306064, "step": 113430 }, { "epoch": 9.253201729341708, "grad_norm": 3.073258876800537, "learning_rate": 8.448214115761271e-07, "loss": 0.2963, "num_input_tokens_seen": 108310176, "step": 113435 }, { "epoch": 9.253609592952117, "grad_norm": 25.731874465942383, "learning_rate": 8.439041915720286e-07, "loss": 0.3643, "num_input_tokens_seen": 108314320, "step": 113440 }, { "epoch": 9.254017456562526, "grad_norm": 17.846908569335938, "learning_rate": 8.429874612017274e-07, "loss": 0.281, "num_input_tokens_seen": 108318448, "step": 113445 }, { "epoch": 9.254425320172935, "grad_norm": 14.158212661743164, "learning_rate": 8.420712204838033e-07, "loss": 0.3687, "num_input_tokens_seen": 108323712, "step": 113450 }, { "epoch": 9.254833183783344, "grad_norm": 6.78923225402832, "learning_rate": 8.411554694368329e-07, "loss": 0.3571, "num_input_tokens_seen": 108328544, "step": 113455 }, { "epoch": 9.255241047393751, "grad_norm": 34.510440826416016, "learning_rate": 8.402402080793764e-07, "loss": 0.3792, "num_input_tokens_seen": 108333008, "step": 113460 }, { "epoch": 9.25564891100416, "grad_norm": 5.375369071960449, "learning_rate": 8.393254364299829e-07, "loss": 0.2854, "num_input_tokens_seen": 108337472, "step": 113465 }, { "epoch": 9.256056774614569, "grad_norm": 15.509760856628418, "learning_rate": 8.384111545071932e-07, "loss": 0.2433, "num_input_tokens_seen": 108342304, "step": 113470 }, { "epoch": 9.256464638224978, "grad_norm": 2.3648293018341064, "learning_rate": 8.374973623295479e-07, "loss": 0.1733, "num_input_tokens_seen": 108346896, "step": 113475 }, { "epoch": 9.256872501835387, "grad_norm": 8.526975631713867, "learning_rate": 8.365840599155627e-07, "loss": 0.3516, "num_input_tokens_seen": 108351008, "step": 113480 }, { "epoch": 9.257280365445794, "grad_norm": 5.480643272399902, "learning_rate": 8.356712472837508e-07, "loss": 0.3081, "num_input_tokens_seen": 108355872, "step": 113485 }, { "epoch": 9.257688229056203, "grad_norm": 18.720733642578125, "learning_rate": 8.347589244526139e-07, "loss": 0.2945, "num_input_tokens_seen": 108360368, "step": 113490 }, { "epoch": 9.258096092666612, "grad_norm": 13.404139518737793, "learning_rate": 8.338470914406426e-07, "loss": 0.3328, "num_input_tokens_seen": 108365696, "step": 113495 }, { "epoch": 9.258503956277021, "grad_norm": 22.64665412902832, "learning_rate": 8.329357482663252e-07, "loss": 0.1407, "num_input_tokens_seen": 108370064, "step": 113500 }, { "epoch": 9.25891181988743, "grad_norm": 24.14026641845703, "learning_rate": 8.320248949481302e-07, "loss": 0.3098, "num_input_tokens_seen": 108374336, "step": 113505 }, { "epoch": 9.259319683497838, "grad_norm": 1.2765839099884033, "learning_rate": 8.311145315045233e-07, "loss": 0.3302, "num_input_tokens_seen": 108379088, "step": 113510 }, { "epoch": 9.259727547108247, "grad_norm": 30.02634620666504, "learning_rate": 8.302046579539507e-07, "loss": 0.3402, "num_input_tokens_seen": 108384080, "step": 113515 }, { "epoch": 9.260135410718656, "grad_norm": 1.8894591331481934, "learning_rate": 8.292952743148619e-07, "loss": 0.2468, "num_input_tokens_seen": 108388704, "step": 113520 }, { "epoch": 9.260543274329065, "grad_norm": 5.000943660736084, "learning_rate": 8.283863806056863e-07, "loss": 0.2511, "num_input_tokens_seen": 108393504, "step": 113525 }, { "epoch": 9.260951137939474, "grad_norm": 1.1090574264526367, "learning_rate": 8.274779768448482e-07, "loss": 0.2214, "num_input_tokens_seen": 108398592, "step": 113530 }, { "epoch": 9.261359001549883, "grad_norm": 32.884300231933594, "learning_rate": 8.265700630507606e-07, "loss": 0.4366, "num_input_tokens_seen": 108402704, "step": 113535 }, { "epoch": 9.26176686516029, "grad_norm": 16.989242553710938, "learning_rate": 8.256626392418255e-07, "loss": 0.4669, "num_input_tokens_seen": 108407360, "step": 113540 }, { "epoch": 9.262174728770699, "grad_norm": 9.157249450683594, "learning_rate": 8.247557054364363e-07, "loss": 0.2281, "num_input_tokens_seen": 108413280, "step": 113545 }, { "epoch": 9.262582592381108, "grad_norm": 12.456930160522461, "learning_rate": 8.238492616529758e-07, "loss": 0.2818, "num_input_tokens_seen": 108418592, "step": 113550 }, { "epoch": 9.262990455991517, "grad_norm": 23.23804473876953, "learning_rate": 8.229433079098181e-07, "loss": 0.3436, "num_input_tokens_seen": 108422736, "step": 113555 }, { "epoch": 9.263398319601926, "grad_norm": 10.117220878601074, "learning_rate": 8.220378442253235e-07, "loss": 0.3966, "num_input_tokens_seen": 108427088, "step": 113560 }, { "epoch": 9.263806183212333, "grad_norm": 30.465457916259766, "learning_rate": 8.211328706178523e-07, "loss": 0.4085, "num_input_tokens_seen": 108432576, "step": 113565 }, { "epoch": 9.264214046822742, "grad_norm": 16.88703727722168, "learning_rate": 8.202283871057426e-07, "loss": 0.4285, "num_input_tokens_seen": 108437392, "step": 113570 }, { "epoch": 9.264621910433151, "grad_norm": 27.600566864013672, "learning_rate": 8.193243937073269e-07, "loss": 0.3436, "num_input_tokens_seen": 108443296, "step": 113575 }, { "epoch": 9.26502977404356, "grad_norm": 2.305143356323242, "learning_rate": 8.184208904409296e-07, "loss": 0.4631, "num_input_tokens_seen": 108448336, "step": 113580 }, { "epoch": 9.265437637653969, "grad_norm": 32.10293197631836, "learning_rate": 8.175178773248693e-07, "loss": 0.5063, "num_input_tokens_seen": 108453888, "step": 113585 }, { "epoch": 9.265845501264376, "grad_norm": 17.03169059753418, "learning_rate": 8.166153543774452e-07, "loss": 0.353, "num_input_tokens_seen": 108458192, "step": 113590 }, { "epoch": 9.266253364874785, "grad_norm": 5.545176982879639, "learning_rate": 8.15713321616951e-07, "loss": 0.533, "num_input_tokens_seen": 108462976, "step": 113595 }, { "epoch": 9.266661228485194, "grad_norm": 2.1925134658813477, "learning_rate": 8.148117790616694e-07, "loss": 0.4051, "num_input_tokens_seen": 108467632, "step": 113600 }, { "epoch": 9.267069092095603, "grad_norm": 13.746047973632812, "learning_rate": 8.139107267298774e-07, "loss": 0.1932, "num_input_tokens_seen": 108472544, "step": 113605 }, { "epoch": 9.267476955706012, "grad_norm": 2.1203484535217285, "learning_rate": 8.13010164639838e-07, "loss": 0.219, "num_input_tokens_seen": 108478112, "step": 113610 }, { "epoch": 9.267884819316421, "grad_norm": 32.03229904174805, "learning_rate": 8.121100928098035e-07, "loss": 0.2806, "num_input_tokens_seen": 108482528, "step": 113615 }, { "epoch": 9.268292682926829, "grad_norm": 7.339576244354248, "learning_rate": 8.112105112580231e-07, "loss": 0.4091, "num_input_tokens_seen": 108486432, "step": 113620 }, { "epoch": 9.268700546537238, "grad_norm": 17.817296981811523, "learning_rate": 8.103114200027212e-07, "loss": 0.2628, "num_input_tokens_seen": 108491168, "step": 113625 }, { "epoch": 9.269108410147647, "grad_norm": 6.360503673553467, "learning_rate": 8.094128190621331e-07, "loss": 0.4489, "num_input_tokens_seen": 108495504, "step": 113630 }, { "epoch": 9.269516273758056, "grad_norm": 34.310237884521484, "learning_rate": 8.085147084544664e-07, "loss": 0.6058, "num_input_tokens_seen": 108499536, "step": 113635 }, { "epoch": 9.269924137368465, "grad_norm": 6.452673435211182, "learning_rate": 8.076170881979261e-07, "loss": 0.4157, "num_input_tokens_seen": 108503936, "step": 113640 }, { "epoch": 9.270332000978872, "grad_norm": 14.348095893859863, "learning_rate": 8.067199583107116e-07, "loss": 0.1566, "num_input_tokens_seen": 108509424, "step": 113645 }, { "epoch": 9.270739864589281, "grad_norm": 22.497116088867188, "learning_rate": 8.058233188109971e-07, "loss": 0.3894, "num_input_tokens_seen": 108513248, "step": 113650 }, { "epoch": 9.27114772819969, "grad_norm": 3.3024775981903076, "learning_rate": 8.049271697169652e-07, "loss": 0.2906, "num_input_tokens_seen": 108517968, "step": 113655 }, { "epoch": 9.271555591810099, "grad_norm": 3.451853036880493, "learning_rate": 8.040315110467794e-07, "loss": 0.2745, "num_input_tokens_seen": 108522880, "step": 113660 }, { "epoch": 9.271963455420508, "grad_norm": 11.263982772827148, "learning_rate": 8.031363428185946e-07, "loss": 0.2906, "num_input_tokens_seen": 108526880, "step": 113665 }, { "epoch": 9.272371319030917, "grad_norm": 5.630288124084473, "learning_rate": 8.022416650505543e-07, "loss": 0.2469, "num_input_tokens_seen": 108531312, "step": 113670 }, { "epoch": 9.272779182641324, "grad_norm": 7.552093505859375, "learning_rate": 8.013474777607916e-07, "loss": 0.4236, "num_input_tokens_seen": 108536304, "step": 113675 }, { "epoch": 9.273187046251733, "grad_norm": 17.703533172607422, "learning_rate": 8.004537809674334e-07, "loss": 0.3407, "num_input_tokens_seen": 108542000, "step": 113680 }, { "epoch": 9.273594909862142, "grad_norm": 3.0016863346099854, "learning_rate": 7.995605746885932e-07, "loss": 0.2165, "num_input_tokens_seen": 108546416, "step": 113685 }, { "epoch": 9.274002773472551, "grad_norm": 1.8182216882705688, "learning_rate": 7.986678589423758e-07, "loss": 0.2967, "num_input_tokens_seen": 108551344, "step": 113690 }, { "epoch": 9.27441063708296, "grad_norm": 3.558617115020752, "learning_rate": 7.977756337468805e-07, "loss": 0.3765, "num_input_tokens_seen": 108556320, "step": 113695 }, { "epoch": 9.274818500693367, "grad_norm": 1.41838538646698, "learning_rate": 7.968838991201877e-07, "loss": 0.2907, "num_input_tokens_seen": 108561760, "step": 113700 }, { "epoch": 9.275226364303776, "grad_norm": 13.880624771118164, "learning_rate": 7.959926550803742e-07, "loss": 0.1649, "num_input_tokens_seen": 108566528, "step": 113705 }, { "epoch": 9.275634227914185, "grad_norm": 37.42255401611328, "learning_rate": 7.951019016455036e-07, "loss": 0.4447, "num_input_tokens_seen": 108570816, "step": 113710 }, { "epoch": 9.276042091524594, "grad_norm": 13.784102439880371, "learning_rate": 7.942116388336307e-07, "loss": 0.4272, "num_input_tokens_seen": 108576496, "step": 113715 }, { "epoch": 9.276449955135003, "grad_norm": 23.42502212524414, "learning_rate": 7.933218666628023e-07, "loss": 0.3613, "num_input_tokens_seen": 108580864, "step": 113720 }, { "epoch": 9.27685781874541, "grad_norm": 5.323981761932373, "learning_rate": 7.924325851510566e-07, "loss": 0.3227, "num_input_tokens_seen": 108585360, "step": 113725 }, { "epoch": 9.27726568235582, "grad_norm": 5.745125770568848, "learning_rate": 7.915437943164155e-07, "loss": 0.4114, "num_input_tokens_seen": 108589392, "step": 113730 }, { "epoch": 9.277673545966229, "grad_norm": 9.748376846313477, "learning_rate": 7.906554941768895e-07, "loss": 0.4143, "num_input_tokens_seen": 108594048, "step": 113735 }, { "epoch": 9.278081409576638, "grad_norm": 5.362934112548828, "learning_rate": 7.897676847504948e-07, "loss": 0.5174, "num_input_tokens_seen": 108599024, "step": 113740 }, { "epoch": 9.278489273187047, "grad_norm": 2.48882794380188, "learning_rate": 7.888803660552224e-07, "loss": 0.3958, "num_input_tokens_seen": 108603504, "step": 113745 }, { "epoch": 9.278897136797456, "grad_norm": 2.7957632541656494, "learning_rate": 7.879935381090553e-07, "loss": 0.3049, "num_input_tokens_seen": 108609152, "step": 113750 }, { "epoch": 9.279305000407863, "grad_norm": 11.663832664489746, "learning_rate": 7.871072009299707e-07, "loss": 0.3329, "num_input_tokens_seen": 108613968, "step": 113755 }, { "epoch": 9.279712864018272, "grad_norm": 3.9903817176818848, "learning_rate": 7.862213545359321e-07, "loss": 0.4667, "num_input_tokens_seen": 108618528, "step": 113760 }, { "epoch": 9.280120727628681, "grad_norm": 5.362144947052002, "learning_rate": 7.853359989448999e-07, "loss": 0.4379, "num_input_tokens_seen": 108623040, "step": 113765 }, { "epoch": 9.28052859123909, "grad_norm": 22.849580764770508, "learning_rate": 7.844511341748184e-07, "loss": 0.4704, "num_input_tokens_seen": 108628976, "step": 113770 }, { "epoch": 9.280936454849499, "grad_norm": 12.417304039001465, "learning_rate": 7.835667602436231e-07, "loss": 0.3159, "num_input_tokens_seen": 108634176, "step": 113775 }, { "epoch": 9.281344318459906, "grad_norm": 15.447271347045898, "learning_rate": 7.826828771692357e-07, "loss": 0.3739, "num_input_tokens_seen": 108639408, "step": 113780 }, { "epoch": 9.281752182070315, "grad_norm": 9.12403392791748, "learning_rate": 7.817994849695808e-07, "loss": 0.3841, "num_input_tokens_seen": 108644080, "step": 113785 }, { "epoch": 9.282160045680724, "grad_norm": 1.358007788658142, "learning_rate": 7.809165836625553e-07, "loss": 0.1471, "num_input_tokens_seen": 108649200, "step": 113790 }, { "epoch": 9.282567909291133, "grad_norm": 26.494386672973633, "learning_rate": 7.800341732660615e-07, "loss": 0.6843, "num_input_tokens_seen": 108654048, "step": 113795 }, { "epoch": 9.282975772901542, "grad_norm": 8.812605857849121, "learning_rate": 7.791522537979851e-07, "loss": 0.3199, "num_input_tokens_seen": 108658816, "step": 113800 }, { "epoch": 9.28338363651195, "grad_norm": 21.436203002929688, "learning_rate": 7.78270825276195e-07, "loss": 0.4753, "num_input_tokens_seen": 108664352, "step": 113805 }, { "epoch": 9.283791500122359, "grad_norm": 1.0738205909729004, "learning_rate": 7.773898877185659e-07, "loss": 0.5148, "num_input_tokens_seen": 108669328, "step": 113810 }, { "epoch": 9.284199363732768, "grad_norm": 11.769378662109375, "learning_rate": 7.76509441142953e-07, "loss": 0.3202, "num_input_tokens_seen": 108673536, "step": 113815 }, { "epoch": 9.284607227343177, "grad_norm": 3.9980883598327637, "learning_rate": 7.756294855671975e-07, "loss": 0.4971, "num_input_tokens_seen": 108678464, "step": 113820 }, { "epoch": 9.285015090953586, "grad_norm": 3.8183069229125977, "learning_rate": 7.747500210091407e-07, "loss": 0.2812, "num_input_tokens_seen": 108682864, "step": 113825 }, { "epoch": 9.285422954563995, "grad_norm": 5.828769207000732, "learning_rate": 7.738710474866073e-07, "loss": 0.2777, "num_input_tokens_seen": 108688160, "step": 113830 }, { "epoch": 9.285830818174402, "grad_norm": 4.859057426452637, "learning_rate": 7.729925650174136e-07, "loss": 0.3058, "num_input_tokens_seen": 108693152, "step": 113835 }, { "epoch": 9.28623868178481, "grad_norm": 1.9597457647323608, "learning_rate": 7.721145736193647e-07, "loss": 0.3729, "num_input_tokens_seen": 108698384, "step": 113840 }, { "epoch": 9.28664654539522, "grad_norm": 35.65286636352539, "learning_rate": 7.712370733102575e-07, "loss": 0.3813, "num_input_tokens_seen": 108702976, "step": 113845 }, { "epoch": 9.287054409005629, "grad_norm": 2.1702284812927246, "learning_rate": 7.703600641078778e-07, "loss": 0.3138, "num_input_tokens_seen": 108707648, "step": 113850 }, { "epoch": 9.287462272616038, "grad_norm": 3.4346249103546143, "learning_rate": 7.694835460300087e-07, "loss": 0.2868, "num_input_tokens_seen": 108712688, "step": 113855 }, { "epoch": 9.287870136226445, "grad_norm": 13.003155708312988, "learning_rate": 7.686075190944081e-07, "loss": 0.2703, "num_input_tokens_seen": 108717280, "step": 113860 }, { "epoch": 9.288277999836854, "grad_norm": 16.45001792907715, "learning_rate": 7.677319833188396e-07, "loss": 0.2744, "num_input_tokens_seen": 108721216, "step": 113865 }, { "epoch": 9.288685863447263, "grad_norm": 13.324298858642578, "learning_rate": 7.668569387210416e-07, "loss": 0.4434, "num_input_tokens_seen": 108725952, "step": 113870 }, { "epoch": 9.289093727057672, "grad_norm": 9.092816352844238, "learning_rate": 7.659823853187615e-07, "loss": 0.3364, "num_input_tokens_seen": 108731536, "step": 113875 }, { "epoch": 9.289501590668081, "grad_norm": 17.27926254272461, "learning_rate": 7.651083231297207e-07, "loss": 0.4056, "num_input_tokens_seen": 108736352, "step": 113880 }, { "epoch": 9.28990945427849, "grad_norm": 46.78675079345703, "learning_rate": 7.642347521716331e-07, "loss": 0.4362, "num_input_tokens_seen": 108741920, "step": 113885 }, { "epoch": 9.290317317888897, "grad_norm": 3.2113757133483887, "learning_rate": 7.633616724622123e-07, "loss": 0.1858, "num_input_tokens_seen": 108746752, "step": 113890 }, { "epoch": 9.290725181499306, "grad_norm": 23.167421340942383, "learning_rate": 7.624890840191468e-07, "loss": 0.3318, "num_input_tokens_seen": 108751488, "step": 113895 }, { "epoch": 9.291133045109715, "grad_norm": 18.326196670532227, "learning_rate": 7.616169868601309e-07, "loss": 0.3043, "num_input_tokens_seen": 108755680, "step": 113900 }, { "epoch": 9.291540908720124, "grad_norm": 28.966176986694336, "learning_rate": 7.607453810028392e-07, "loss": 0.3317, "num_input_tokens_seen": 108760608, "step": 113905 }, { "epoch": 9.291948772330533, "grad_norm": 31.00217056274414, "learning_rate": 7.598742664649411e-07, "loss": 0.4266, "num_input_tokens_seen": 108765088, "step": 113910 }, { "epoch": 9.29235663594094, "grad_norm": 15.8171968460083, "learning_rate": 7.590036432640863e-07, "loss": 0.3738, "num_input_tokens_seen": 108769776, "step": 113915 }, { "epoch": 9.29276449955135, "grad_norm": 4.071857929229736, "learning_rate": 7.5813351141793e-07, "loss": 0.3629, "num_input_tokens_seen": 108775344, "step": 113920 }, { "epoch": 9.293172363161759, "grad_norm": 6.389991760253906, "learning_rate": 7.572638709441054e-07, "loss": 0.3281, "num_input_tokens_seen": 108780176, "step": 113925 }, { "epoch": 9.293580226772168, "grad_norm": 18.345407485961914, "learning_rate": 7.563947218602429e-07, "loss": 0.2872, "num_input_tokens_seen": 108785568, "step": 113930 }, { "epoch": 9.293988090382577, "grad_norm": 2.893373489379883, "learning_rate": 7.555260641839534e-07, "loss": 0.4354, "num_input_tokens_seen": 108789872, "step": 113935 }, { "epoch": 9.294395953992984, "grad_norm": 1.0232043266296387, "learning_rate": 7.546578979328534e-07, "loss": 0.3857, "num_input_tokens_seen": 108794592, "step": 113940 }, { "epoch": 9.294803817603393, "grad_norm": 6.980587005615234, "learning_rate": 7.537902231245342e-07, "loss": 0.4711, "num_input_tokens_seen": 108799824, "step": 113945 }, { "epoch": 9.295211681213802, "grad_norm": 24.6677303314209, "learning_rate": 7.529230397765846e-07, "loss": 0.3206, "num_input_tokens_seen": 108804736, "step": 113950 }, { "epoch": 9.295619544824211, "grad_norm": 3.1014840602874756, "learning_rate": 7.520563479065795e-07, "loss": 0.2158, "num_input_tokens_seen": 108809408, "step": 113955 }, { "epoch": 9.29602740843462, "grad_norm": 5.020474433898926, "learning_rate": 7.511901475320909e-07, "loss": 0.5923, "num_input_tokens_seen": 108813280, "step": 113960 }, { "epoch": 9.296435272045029, "grad_norm": 1.205081582069397, "learning_rate": 7.503244386706743e-07, "loss": 0.5688, "num_input_tokens_seen": 108818128, "step": 113965 }, { "epoch": 9.296843135655436, "grad_norm": 15.320816040039062, "learning_rate": 7.494592213398766e-07, "loss": 0.405, "num_input_tokens_seen": 108822928, "step": 113970 }, { "epoch": 9.297250999265845, "grad_norm": 5.8309102058410645, "learning_rate": 7.485944955572338e-07, "loss": 0.3706, "num_input_tokens_seen": 108827536, "step": 113975 }, { "epoch": 9.297658862876254, "grad_norm": 38.66964340209961, "learning_rate": 7.477302613402765e-07, "loss": 0.3636, "num_input_tokens_seen": 108832592, "step": 113980 }, { "epoch": 9.298066726486663, "grad_norm": 36.0549201965332, "learning_rate": 7.468665187065211e-07, "loss": 0.4679, "num_input_tokens_seen": 108837968, "step": 113985 }, { "epoch": 9.298474590097072, "grad_norm": 2.1070103645324707, "learning_rate": 7.460032676734757e-07, "loss": 0.2627, "num_input_tokens_seen": 108842848, "step": 113990 }, { "epoch": 9.29888245370748, "grad_norm": 32.974609375, "learning_rate": 7.451405082586405e-07, "loss": 0.3371, "num_input_tokens_seen": 108847744, "step": 113995 }, { "epoch": 9.299290317317888, "grad_norm": 16.494966506958008, "learning_rate": 7.442782404794984e-07, "loss": 0.2192, "num_input_tokens_seen": 108852592, "step": 114000 }, { "epoch": 9.299698180928297, "grad_norm": 40.89670181274414, "learning_rate": 7.434164643535274e-07, "loss": 0.4712, "num_input_tokens_seen": 108856608, "step": 114005 }, { "epoch": 9.300106044538706, "grad_norm": 10.901738166809082, "learning_rate": 7.425551798981995e-07, "loss": 0.3268, "num_input_tokens_seen": 108861824, "step": 114010 }, { "epoch": 9.300513908149115, "grad_norm": 5.699595928192139, "learning_rate": 7.4169438713097e-07, "loss": 0.3961, "num_input_tokens_seen": 108866576, "step": 114015 }, { "epoch": 9.300921771759523, "grad_norm": 1.9913971424102783, "learning_rate": 7.408340860692891e-07, "loss": 0.368, "num_input_tokens_seen": 108872288, "step": 114020 }, { "epoch": 9.301329635369932, "grad_norm": 3.4897520542144775, "learning_rate": 7.3997427673059e-07, "loss": 0.3704, "num_input_tokens_seen": 108876528, "step": 114025 }, { "epoch": 9.30173749898034, "grad_norm": 25.50189971923828, "learning_rate": 7.39114959132306e-07, "loss": 0.3132, "num_input_tokens_seen": 108880480, "step": 114030 }, { "epoch": 9.30214536259075, "grad_norm": 5.906754493713379, "learning_rate": 7.382561332918536e-07, "loss": 0.1811, "num_input_tokens_seen": 108885040, "step": 114035 }, { "epoch": 9.302553226201159, "grad_norm": 6.549421787261963, "learning_rate": 7.373977992266385e-07, "loss": 0.3416, "num_input_tokens_seen": 108889600, "step": 114040 }, { "epoch": 9.302961089811568, "grad_norm": 15.367927551269531, "learning_rate": 7.365399569540604e-07, "loss": 0.3755, "num_input_tokens_seen": 108893520, "step": 114045 }, { "epoch": 9.303368953421975, "grad_norm": 35.51285171508789, "learning_rate": 7.356826064915028e-07, "loss": 0.2327, "num_input_tokens_seen": 108898832, "step": 114050 }, { "epoch": 9.303776817032384, "grad_norm": 8.285064697265625, "learning_rate": 7.348257478563547e-07, "loss": 0.2078, "num_input_tokens_seen": 108903968, "step": 114055 }, { "epoch": 9.304184680642793, "grad_norm": 39.187801361083984, "learning_rate": 7.339693810659742e-07, "loss": 0.5474, "num_input_tokens_seen": 108909744, "step": 114060 }, { "epoch": 9.304592544253202, "grad_norm": 16.930212020874023, "learning_rate": 7.331135061377253e-07, "loss": 0.3324, "num_input_tokens_seen": 108914640, "step": 114065 }, { "epoch": 9.305000407863611, "grad_norm": 4.188180923461914, "learning_rate": 7.322581230889524e-07, "loss": 0.2877, "num_input_tokens_seen": 108917920, "step": 114070 }, { "epoch": 9.305408271474018, "grad_norm": 5.5403056144714355, "learning_rate": 7.314032319369946e-07, "loss": 0.3107, "num_input_tokens_seen": 108922544, "step": 114075 }, { "epoch": 9.305816135084427, "grad_norm": 2.193262815475464, "learning_rate": 7.305488326991822e-07, "loss": 0.253, "num_input_tokens_seen": 108927056, "step": 114080 }, { "epoch": 9.306223998694836, "grad_norm": 3.7604565620422363, "learning_rate": 7.296949253928348e-07, "loss": 0.3676, "num_input_tokens_seen": 108931712, "step": 114085 }, { "epoch": 9.306631862305245, "grad_norm": 33.37671661376953, "learning_rate": 7.288415100352552e-07, "loss": 0.3115, "num_input_tokens_seen": 108936176, "step": 114090 }, { "epoch": 9.307039725915654, "grad_norm": 2.4672210216522217, "learning_rate": 7.279885866437464e-07, "loss": 0.3719, "num_input_tokens_seen": 108940480, "step": 114095 }, { "epoch": 9.307447589526063, "grad_norm": 9.17940616607666, "learning_rate": 7.271361552355943e-07, "loss": 0.5163, "num_input_tokens_seen": 108944736, "step": 114100 }, { "epoch": 9.30785545313647, "grad_norm": 8.907367706298828, "learning_rate": 7.262842158280769e-07, "loss": 0.4414, "num_input_tokens_seen": 108949136, "step": 114105 }, { "epoch": 9.30826331674688, "grad_norm": 16.506664276123047, "learning_rate": 7.254327684384665e-07, "loss": 0.4712, "num_input_tokens_seen": 108954736, "step": 114110 }, { "epoch": 9.308671180357289, "grad_norm": 23.600910186767578, "learning_rate": 7.245818130840159e-07, "loss": 0.4522, "num_input_tokens_seen": 108960000, "step": 114115 }, { "epoch": 9.309079043967698, "grad_norm": 17.0786075592041, "learning_rate": 7.237313497819781e-07, "loss": 0.3749, "num_input_tokens_seen": 108964384, "step": 114120 }, { "epoch": 9.309486907578107, "grad_norm": 15.345799446105957, "learning_rate": 7.228813785495919e-07, "loss": 0.3105, "num_input_tokens_seen": 108969392, "step": 114125 }, { "epoch": 9.309894771188514, "grad_norm": 13.531599998474121, "learning_rate": 7.220318994040826e-07, "loss": 0.4395, "num_input_tokens_seen": 108973344, "step": 114130 }, { "epoch": 9.310302634798923, "grad_norm": 30.76816749572754, "learning_rate": 7.211829123626668e-07, "loss": 0.393, "num_input_tokens_seen": 108978048, "step": 114135 }, { "epoch": 9.310710498409332, "grad_norm": 5.847006320953369, "learning_rate": 7.203344174425614e-07, "loss": 0.4249, "num_input_tokens_seen": 108983184, "step": 114140 }, { "epoch": 9.31111836201974, "grad_norm": 9.423812866210938, "learning_rate": 7.194864146609609e-07, "loss": 0.2516, "num_input_tokens_seen": 108988224, "step": 114145 }, { "epoch": 9.31152622563015, "grad_norm": 28.2183837890625, "learning_rate": 7.186389040350517e-07, "loss": 0.3062, "num_input_tokens_seen": 108993632, "step": 114150 }, { "epoch": 9.311934089240557, "grad_norm": 1.3643392324447632, "learning_rate": 7.177918855820143e-07, "loss": 0.3785, "num_input_tokens_seen": 108998224, "step": 114155 }, { "epoch": 9.312341952850966, "grad_norm": 2.171461582183838, "learning_rate": 7.169453593190157e-07, "loss": 0.2666, "num_input_tokens_seen": 109002816, "step": 114160 }, { "epoch": 9.312749816461375, "grad_norm": 7.416501522064209, "learning_rate": 7.16099325263217e-07, "loss": 0.2865, "num_input_tokens_seen": 109008256, "step": 114165 }, { "epoch": 9.313157680071784, "grad_norm": 1.9526731967926025, "learning_rate": 7.152537834317657e-07, "loss": 0.2446, "num_input_tokens_seen": 109012752, "step": 114170 }, { "epoch": 9.313565543682193, "grad_norm": 6.111004829406738, "learning_rate": 7.144087338418037e-07, "loss": 0.366, "num_input_tokens_seen": 109017776, "step": 114175 }, { "epoch": 9.313973407292602, "grad_norm": 4.602298259735107, "learning_rate": 7.135641765104534e-07, "loss": 0.3611, "num_input_tokens_seen": 109021968, "step": 114180 }, { "epoch": 9.31438127090301, "grad_norm": 3.263225793838501, "learning_rate": 7.127201114548398e-07, "loss": 0.3381, "num_input_tokens_seen": 109026256, "step": 114185 }, { "epoch": 9.314789134513418, "grad_norm": 23.017690658569336, "learning_rate": 7.118765386920689e-07, "loss": 0.3791, "num_input_tokens_seen": 109032128, "step": 114190 }, { "epoch": 9.315196998123827, "grad_norm": 7.219702243804932, "learning_rate": 7.110334582392408e-07, "loss": 0.245, "num_input_tokens_seen": 109036656, "step": 114195 }, { "epoch": 9.315604861734236, "grad_norm": 4.528217315673828, "learning_rate": 7.101908701134419e-07, "loss": 0.4537, "num_input_tokens_seen": 109041824, "step": 114200 }, { "epoch": 9.316012725344645, "grad_norm": 34.069976806640625, "learning_rate": 7.0934877433175e-07, "loss": 0.2613, "num_input_tokens_seen": 109047264, "step": 114205 }, { "epoch": 9.316420588955053, "grad_norm": 5.177048206329346, "learning_rate": 7.085071709112406e-07, "loss": 0.4135, "num_input_tokens_seen": 109051696, "step": 114210 }, { "epoch": 9.316828452565462, "grad_norm": 8.475915908813477, "learning_rate": 7.076660598689694e-07, "loss": 0.3933, "num_input_tokens_seen": 109055600, "step": 114215 }, { "epoch": 9.31723631617587, "grad_norm": 19.010942459106445, "learning_rate": 7.068254412219838e-07, "loss": 0.3788, "num_input_tokens_seen": 109061168, "step": 114220 }, { "epoch": 9.31764417978628, "grad_norm": 3.771026849746704, "learning_rate": 7.05985314987323e-07, "loss": 0.2754, "num_input_tokens_seen": 109066016, "step": 114225 }, { "epoch": 9.318052043396689, "grad_norm": 1.064321756362915, "learning_rate": 7.051456811820179e-07, "loss": 0.2973, "num_input_tokens_seen": 109070704, "step": 114230 }, { "epoch": 9.318459907007096, "grad_norm": 17.623449325561523, "learning_rate": 7.043065398230853e-07, "loss": 0.3545, "num_input_tokens_seen": 109075184, "step": 114235 }, { "epoch": 9.318867770617505, "grad_norm": 2.227879524230957, "learning_rate": 7.034678909275367e-07, "loss": 0.3743, "num_input_tokens_seen": 109079904, "step": 114240 }, { "epoch": 9.319275634227914, "grad_norm": 8.434989929199219, "learning_rate": 7.026297345123695e-07, "loss": 0.2116, "num_input_tokens_seen": 109084704, "step": 114245 }, { "epoch": 9.319683497838323, "grad_norm": 37.35653305053711, "learning_rate": 7.017920705945701e-07, "loss": 0.3295, "num_input_tokens_seen": 109089280, "step": 114250 }, { "epoch": 9.320091361448732, "grad_norm": 4.115536212921143, "learning_rate": 7.009548991911224e-07, "loss": 0.4225, "num_input_tokens_seen": 109094224, "step": 114255 }, { "epoch": 9.320499225059141, "grad_norm": 3.816746711730957, "learning_rate": 7.001182203189932e-07, "loss": 0.2706, "num_input_tokens_seen": 109098512, "step": 114260 }, { "epoch": 9.320907088669548, "grad_norm": 11.79011344909668, "learning_rate": 6.992820339951439e-07, "loss": 0.2685, "num_input_tokens_seen": 109102944, "step": 114265 }, { "epoch": 9.321314952279957, "grad_norm": 25.810169219970703, "learning_rate": 6.984463402365166e-07, "loss": 0.5529, "num_input_tokens_seen": 109107888, "step": 114270 }, { "epoch": 9.321722815890366, "grad_norm": 28.270648956298828, "learning_rate": 6.976111390600615e-07, "loss": 0.353, "num_input_tokens_seen": 109112736, "step": 114275 }, { "epoch": 9.322130679500775, "grad_norm": 24.003686904907227, "learning_rate": 6.967764304826984e-07, "loss": 0.2896, "num_input_tokens_seen": 109117136, "step": 114280 }, { "epoch": 9.322538543111184, "grad_norm": 0.7847376465797424, "learning_rate": 6.959422145213529e-07, "loss": 0.3575, "num_input_tokens_seen": 109122624, "step": 114285 }, { "epoch": 9.322946406721591, "grad_norm": 2.9890694618225098, "learning_rate": 6.951084911929279e-07, "loss": 0.4103, "num_input_tokens_seen": 109127216, "step": 114290 }, { "epoch": 9.323354270332, "grad_norm": 19.89931869506836, "learning_rate": 6.942752605143266e-07, "loss": 0.3196, "num_input_tokens_seen": 109132720, "step": 114295 }, { "epoch": 9.32376213394241, "grad_norm": 7.812379837036133, "learning_rate": 6.93442522502441e-07, "loss": 0.4463, "num_input_tokens_seen": 109137920, "step": 114300 }, { "epoch": 9.324169997552819, "grad_norm": 0.9762662053108215, "learning_rate": 6.926102771741439e-07, "loss": 0.183, "num_input_tokens_seen": 109142144, "step": 114305 }, { "epoch": 9.324577861163228, "grad_norm": 5.406449317932129, "learning_rate": 6.917785245463077e-07, "loss": 0.461, "num_input_tokens_seen": 109146896, "step": 114310 }, { "epoch": 9.324985724773637, "grad_norm": 29.46918296813965, "learning_rate": 6.909472646357911e-07, "loss": 0.3531, "num_input_tokens_seen": 109151904, "step": 114315 }, { "epoch": 9.325393588384044, "grad_norm": 28.595136642456055, "learning_rate": 6.901164974594448e-07, "loss": 0.3017, "num_input_tokens_seen": 109156752, "step": 114320 }, { "epoch": 9.325801451994453, "grad_norm": 1.1639324426651, "learning_rate": 6.892862230341108e-07, "loss": 0.4716, "num_input_tokens_seen": 109161392, "step": 114325 }, { "epoch": 9.326209315604862, "grad_norm": 26.124401092529297, "learning_rate": 6.884564413766115e-07, "loss": 0.2589, "num_input_tokens_seen": 109165392, "step": 114330 }, { "epoch": 9.32661717921527, "grad_norm": 9.208818435668945, "learning_rate": 6.876271525037669e-07, "loss": 0.3575, "num_input_tokens_seen": 109170240, "step": 114335 }, { "epoch": 9.32702504282568, "grad_norm": 22.736522674560547, "learning_rate": 6.867983564323916e-07, "loss": 0.3675, "num_input_tokens_seen": 109175328, "step": 114340 }, { "epoch": 9.327432906436087, "grad_norm": 30.07505226135254, "learning_rate": 6.859700531792829e-07, "loss": 0.4733, "num_input_tokens_seen": 109180128, "step": 114345 }, { "epoch": 9.327840770046496, "grad_norm": 3.348390817642212, "learning_rate": 6.851422427612302e-07, "loss": 0.3585, "num_input_tokens_seen": 109184704, "step": 114350 }, { "epoch": 9.328248633656905, "grad_norm": 1.3349004983901978, "learning_rate": 6.843149251950121e-07, "loss": 0.1623, "num_input_tokens_seen": 109188912, "step": 114355 }, { "epoch": 9.328656497267314, "grad_norm": 7.1666741371154785, "learning_rate": 6.834881004973981e-07, "loss": 0.4157, "num_input_tokens_seen": 109193904, "step": 114360 }, { "epoch": 9.329064360877723, "grad_norm": 11.59496784210205, "learning_rate": 6.826617686851472e-07, "loss": 0.3619, "num_input_tokens_seen": 109199616, "step": 114365 }, { "epoch": 9.329472224488132, "grad_norm": 7.832826137542725, "learning_rate": 6.818359297750071e-07, "loss": 0.3317, "num_input_tokens_seen": 109204288, "step": 114370 }, { "epoch": 9.32988008809854, "grad_norm": 25.001197814941406, "learning_rate": 6.8101058378372e-07, "loss": 0.4583, "num_input_tokens_seen": 109209408, "step": 114375 }, { "epoch": 9.330287951708948, "grad_norm": 1.4140946865081787, "learning_rate": 6.801857307280169e-07, "loss": 0.4271, "num_input_tokens_seen": 109213968, "step": 114380 }, { "epoch": 9.330695815319357, "grad_norm": 0.9029101729393005, "learning_rate": 6.793613706246121e-07, "loss": 0.449, "num_input_tokens_seen": 109219344, "step": 114385 }, { "epoch": 9.331103678929766, "grad_norm": 4.839012622833252, "learning_rate": 6.785375034902202e-07, "loss": 0.4249, "num_input_tokens_seen": 109224320, "step": 114390 }, { "epoch": 9.331511542540175, "grad_norm": 13.914912223815918, "learning_rate": 6.77714129341539e-07, "loss": 0.1404, "num_input_tokens_seen": 109229344, "step": 114395 }, { "epoch": 9.331919406150583, "grad_norm": 18.721010208129883, "learning_rate": 6.768912481952549e-07, "loss": 0.44, "num_input_tokens_seen": 109233920, "step": 114400 }, { "epoch": 9.332327269760992, "grad_norm": 7.749270915985107, "learning_rate": 6.760688600680492e-07, "loss": 0.4207, "num_input_tokens_seen": 109238144, "step": 114405 }, { "epoch": 9.3327351333714, "grad_norm": 5.29694938659668, "learning_rate": 6.752469649765919e-07, "loss": 0.2906, "num_input_tokens_seen": 109242720, "step": 114410 }, { "epoch": 9.33314299698181, "grad_norm": 38.43931198120117, "learning_rate": 6.744255629375446e-07, "loss": 0.3402, "num_input_tokens_seen": 109248064, "step": 114415 }, { "epoch": 9.333550860592219, "grad_norm": 3.2776262760162354, "learning_rate": 6.73604653967555e-07, "loss": 0.3723, "num_input_tokens_seen": 109251760, "step": 114420 }, { "epoch": 9.333958724202626, "grad_norm": 2.686455011367798, "learning_rate": 6.727842380832572e-07, "loss": 0.436, "num_input_tokens_seen": 109256320, "step": 114425 }, { "epoch": 9.334366587813035, "grad_norm": 25.273347854614258, "learning_rate": 6.719643153012906e-07, "loss": 0.308, "num_input_tokens_seen": 109259952, "step": 114430 }, { "epoch": 9.334774451423444, "grad_norm": 21.91310691833496, "learning_rate": 6.711448856382669e-07, "loss": 0.3591, "num_input_tokens_seen": 109264848, "step": 114435 }, { "epoch": 9.335182315033853, "grad_norm": 3.2868144512176514, "learning_rate": 6.703259491108004e-07, "loss": 0.5913, "num_input_tokens_seen": 109270032, "step": 114440 }, { "epoch": 9.335590178644262, "grad_norm": 27.917129516601562, "learning_rate": 6.695075057354894e-07, "loss": 0.2579, "num_input_tokens_seen": 109275392, "step": 114445 }, { "epoch": 9.335998042254671, "grad_norm": 12.85881233215332, "learning_rate": 6.686895555289174e-07, "loss": 0.2827, "num_input_tokens_seen": 109280976, "step": 114450 }, { "epoch": 9.336405905865078, "grad_norm": 3.952925443649292, "learning_rate": 6.678720985076742e-07, "loss": 0.3514, "num_input_tokens_seen": 109285504, "step": 114455 }, { "epoch": 9.336813769475487, "grad_norm": 2.506983757019043, "learning_rate": 6.670551346883242e-07, "loss": 0.4696, "num_input_tokens_seen": 109289776, "step": 114460 }, { "epoch": 9.337221633085896, "grad_norm": 31.99864387512207, "learning_rate": 6.662386640874235e-07, "loss": 0.4493, "num_input_tokens_seen": 109294832, "step": 114465 }, { "epoch": 9.337629496696305, "grad_norm": 2.1941421031951904, "learning_rate": 6.654226867215258e-07, "loss": 0.1857, "num_input_tokens_seen": 109299872, "step": 114470 }, { "epoch": 9.338037360306714, "grad_norm": 22.605127334594727, "learning_rate": 6.646072026071732e-07, "loss": 0.665, "num_input_tokens_seen": 109303952, "step": 114475 }, { "epoch": 9.338445223917121, "grad_norm": 8.090299606323242, "learning_rate": 6.637922117608885e-07, "loss": 0.4432, "num_input_tokens_seen": 109308512, "step": 114480 }, { "epoch": 9.33885308752753, "grad_norm": 0.9349298477172852, "learning_rate": 6.629777141991978e-07, "loss": 0.349, "num_input_tokens_seen": 109312992, "step": 114485 }, { "epoch": 9.33926095113794, "grad_norm": 3.6172521114349365, "learning_rate": 6.621637099386041e-07, "loss": 0.4539, "num_input_tokens_seen": 109317488, "step": 114490 }, { "epoch": 9.339668814748348, "grad_norm": 1.8208842277526855, "learning_rate": 6.613501989956139e-07, "loss": 0.2505, "num_input_tokens_seen": 109322576, "step": 114495 }, { "epoch": 9.340076678358757, "grad_norm": 4.788854598999023, "learning_rate": 6.605371813867084e-07, "loss": 0.4041, "num_input_tokens_seen": 109327136, "step": 114500 }, { "epoch": 9.340484541969165, "grad_norm": 2.4664323329925537, "learning_rate": 6.597246571283744e-07, "loss": 0.4045, "num_input_tokens_seen": 109332336, "step": 114505 }, { "epoch": 9.340892405579574, "grad_norm": 9.583819389343262, "learning_rate": 6.589126262370792e-07, "loss": 0.3973, "num_input_tokens_seen": 109337152, "step": 114510 }, { "epoch": 9.341300269189983, "grad_norm": 1.2094948291778564, "learning_rate": 6.581010887292822e-07, "loss": 0.1462, "num_input_tokens_seen": 109341136, "step": 114515 }, { "epoch": 9.341708132800392, "grad_norm": 2.6213696002960205, "learning_rate": 6.572900446214308e-07, "loss": 0.2733, "num_input_tokens_seen": 109345536, "step": 114520 }, { "epoch": 9.3421159964108, "grad_norm": 2.299612045288086, "learning_rate": 6.564794939299679e-07, "loss": 0.2937, "num_input_tokens_seen": 109350416, "step": 114525 }, { "epoch": 9.34252386002121, "grad_norm": 1.1873947381973267, "learning_rate": 6.556694366713218e-07, "loss": 0.222, "num_input_tokens_seen": 109355296, "step": 114530 }, { "epoch": 9.342931723631617, "grad_norm": 13.764724731445312, "learning_rate": 6.54859872861907e-07, "loss": 0.3066, "num_input_tokens_seen": 109360384, "step": 114535 }, { "epoch": 9.343339587242026, "grad_norm": 21.045873641967773, "learning_rate": 6.540508025181441e-07, "loss": 0.4317, "num_input_tokens_seen": 109364896, "step": 114540 }, { "epoch": 9.343747450852435, "grad_norm": 15.062811851501465, "learning_rate": 6.532422256564225e-07, "loss": 0.2833, "num_input_tokens_seen": 109370304, "step": 114545 }, { "epoch": 9.344155314462844, "grad_norm": 25.31129264831543, "learning_rate": 6.524341422931374e-07, "loss": 0.5874, "num_input_tokens_seen": 109375280, "step": 114550 }, { "epoch": 9.344563178073253, "grad_norm": 10.703121185302734, "learning_rate": 6.516265524446647e-07, "loss": 0.5746, "num_input_tokens_seen": 109380800, "step": 114555 }, { "epoch": 9.34497104168366, "grad_norm": 8.02738094329834, "learning_rate": 6.508194561273745e-07, "loss": 0.2023, "num_input_tokens_seen": 109385968, "step": 114560 }, { "epoch": 9.34537890529407, "grad_norm": 1.5452498197555542, "learning_rate": 6.500128533576288e-07, "loss": 0.3669, "num_input_tokens_seen": 109391328, "step": 114565 }, { "epoch": 9.345786768904478, "grad_norm": 2.2369701862335205, "learning_rate": 6.492067441517785e-07, "loss": 0.3702, "num_input_tokens_seen": 109396656, "step": 114570 }, { "epoch": 9.346194632514887, "grad_norm": 4.2184014320373535, "learning_rate": 6.484011285261577e-07, "loss": 0.4895, "num_input_tokens_seen": 109402352, "step": 114575 }, { "epoch": 9.346602496125296, "grad_norm": 18.632108688354492, "learning_rate": 6.475960064970948e-07, "loss": 0.3502, "num_input_tokens_seen": 109408064, "step": 114580 }, { "epoch": 9.347010359735705, "grad_norm": 18.46470832824707, "learning_rate": 6.46791378080916e-07, "loss": 0.3215, "num_input_tokens_seen": 109412656, "step": 114585 }, { "epoch": 9.347418223346112, "grad_norm": 2.0549428462982178, "learning_rate": 6.459872432939301e-07, "loss": 0.3084, "num_input_tokens_seen": 109416832, "step": 114590 }, { "epoch": 9.347826086956522, "grad_norm": 3.5538530349731445, "learning_rate": 6.451836021524327e-07, "loss": 0.2235, "num_input_tokens_seen": 109421904, "step": 114595 }, { "epoch": 9.34823395056693, "grad_norm": 21.568832397460938, "learning_rate": 6.443804546727133e-07, "loss": 0.3097, "num_input_tokens_seen": 109427040, "step": 114600 }, { "epoch": 9.34864181417734, "grad_norm": 10.68457317352295, "learning_rate": 6.435778008710508e-07, "loss": 0.4454, "num_input_tokens_seen": 109431728, "step": 114605 }, { "epoch": 9.349049677787749, "grad_norm": 2.1541640758514404, "learning_rate": 6.427756407637209e-07, "loss": 0.2908, "num_input_tokens_seen": 109436800, "step": 114610 }, { "epoch": 9.349457541398156, "grad_norm": 17.665163040161133, "learning_rate": 6.419739743669772e-07, "loss": 0.3323, "num_input_tokens_seen": 109441552, "step": 114615 }, { "epoch": 9.349865405008565, "grad_norm": 12.533536911010742, "learning_rate": 6.411728016970709e-07, "loss": 0.1777, "num_input_tokens_seen": 109445504, "step": 114620 }, { "epoch": 9.350273268618974, "grad_norm": 17.947874069213867, "learning_rate": 6.403721227702386e-07, "loss": 0.4619, "num_input_tokens_seen": 109449744, "step": 114625 }, { "epoch": 9.350681132229383, "grad_norm": 27.699186325073242, "learning_rate": 6.395719376027148e-07, "loss": 0.4268, "num_input_tokens_seen": 109454464, "step": 114630 }, { "epoch": 9.351088995839792, "grad_norm": 17.050111770629883, "learning_rate": 6.387722462107171e-07, "loss": 0.2303, "num_input_tokens_seen": 109459744, "step": 114635 }, { "epoch": 9.351496859450199, "grad_norm": 1.948287010192871, "learning_rate": 6.379730486104546e-07, "loss": 0.2819, "num_input_tokens_seen": 109464304, "step": 114640 }, { "epoch": 9.351904723060608, "grad_norm": 16.08224868774414, "learning_rate": 6.371743448181255e-07, "loss": 0.2662, "num_input_tokens_seen": 109469536, "step": 114645 }, { "epoch": 9.352312586671017, "grad_norm": 9.289231300354004, "learning_rate": 6.363761348499197e-07, "loss": 0.3347, "num_input_tokens_seen": 109474016, "step": 114650 }, { "epoch": 9.352720450281426, "grad_norm": 27.541297912597656, "learning_rate": 6.355784187220188e-07, "loss": 0.3873, "num_input_tokens_seen": 109478912, "step": 114655 }, { "epoch": 9.353128313891835, "grad_norm": 3.129458427429199, "learning_rate": 6.347811964505873e-07, "loss": 0.3027, "num_input_tokens_seen": 109484224, "step": 114660 }, { "epoch": 9.353536177502244, "grad_norm": 1.050605297088623, "learning_rate": 6.339844680517903e-07, "loss": 0.46, "num_input_tokens_seen": 109488112, "step": 114665 }, { "epoch": 9.353944041112651, "grad_norm": 0.9865864515304565, "learning_rate": 6.331882335417704e-07, "loss": 0.191, "num_input_tokens_seen": 109492336, "step": 114670 }, { "epoch": 9.35435190472306, "grad_norm": 4.921815872192383, "learning_rate": 6.323924929366731e-07, "loss": 0.2951, "num_input_tokens_seen": 109497456, "step": 114675 }, { "epoch": 9.35475976833347, "grad_norm": 1.4814541339874268, "learning_rate": 6.31597246252627e-07, "loss": 0.3194, "num_input_tokens_seen": 109502816, "step": 114680 }, { "epoch": 9.355167631943878, "grad_norm": 4.06960916519165, "learning_rate": 6.308024935057499e-07, "loss": 0.4163, "num_input_tokens_seen": 109507824, "step": 114685 }, { "epoch": 9.355575495554287, "grad_norm": 4.015876770019531, "learning_rate": 6.300082347121484e-07, "loss": 0.3081, "num_input_tokens_seen": 109511936, "step": 114690 }, { "epoch": 9.355983359164695, "grad_norm": 33.21589660644531, "learning_rate": 6.29214469887926e-07, "loss": 0.3957, "num_input_tokens_seen": 109517856, "step": 114695 }, { "epoch": 9.356391222775104, "grad_norm": 26.052579879760742, "learning_rate": 6.284211990491701e-07, "loss": 0.2753, "num_input_tokens_seen": 109522384, "step": 114700 }, { "epoch": 9.356799086385513, "grad_norm": 20.58365821838379, "learning_rate": 6.276284222119622e-07, "loss": 0.3481, "num_input_tokens_seen": 109526976, "step": 114705 }, { "epoch": 9.357206949995922, "grad_norm": 18.274860382080078, "learning_rate": 6.268361393923671e-07, "loss": 0.348, "num_input_tokens_seen": 109532752, "step": 114710 }, { "epoch": 9.35761481360633, "grad_norm": 13.350903511047363, "learning_rate": 6.260443506064473e-07, "loss": 0.287, "num_input_tokens_seen": 109537888, "step": 114715 }, { "epoch": 9.358022677216738, "grad_norm": 41.20457458496094, "learning_rate": 6.252530558702535e-07, "loss": 0.4113, "num_input_tokens_seen": 109541760, "step": 114720 }, { "epoch": 9.358430540827147, "grad_norm": 3.3738789558410645, "learning_rate": 6.244622551998203e-07, "loss": 0.2433, "num_input_tokens_seen": 109547040, "step": 114725 }, { "epoch": 9.358838404437556, "grad_norm": 34.44791793823242, "learning_rate": 6.23671948611182e-07, "loss": 0.3872, "num_input_tokens_seen": 109552240, "step": 114730 }, { "epoch": 9.359246268047965, "grad_norm": 14.95034408569336, "learning_rate": 6.228821361203508e-07, "loss": 0.3501, "num_input_tokens_seen": 109556752, "step": 114735 }, { "epoch": 9.359654131658374, "grad_norm": 4.293974876403809, "learning_rate": 6.220928177433444e-07, "loss": 0.302, "num_input_tokens_seen": 109560112, "step": 114740 }, { "epoch": 9.360061995268783, "grad_norm": 33.02347183227539, "learning_rate": 6.213039934961557e-07, "loss": 0.3811, "num_input_tokens_seen": 109565344, "step": 114745 }, { "epoch": 9.36046985887919, "grad_norm": 1.3537609577178955, "learning_rate": 6.205156633947773e-07, "loss": 0.4116, "num_input_tokens_seen": 109570336, "step": 114750 }, { "epoch": 9.3608777224896, "grad_norm": 37.0754508972168, "learning_rate": 6.197278274551882e-07, "loss": 0.1828, "num_input_tokens_seen": 109575024, "step": 114755 }, { "epoch": 9.361285586100008, "grad_norm": 11.470978736877441, "learning_rate": 6.189404856933506e-07, "loss": 0.3325, "num_input_tokens_seen": 109579744, "step": 114760 }, { "epoch": 9.361693449710417, "grad_norm": 0.8058910369873047, "learning_rate": 6.181536381252351e-07, "loss": 0.389, "num_input_tokens_seen": 109584592, "step": 114765 }, { "epoch": 9.362101313320826, "grad_norm": 17.32713508605957, "learning_rate": 6.173672847667817e-07, "loss": 0.335, "num_input_tokens_seen": 109588672, "step": 114770 }, { "epoch": 9.362509176931233, "grad_norm": 14.382734298706055, "learning_rate": 6.165814256339359e-07, "loss": 0.4417, "num_input_tokens_seen": 109593504, "step": 114775 }, { "epoch": 9.362917040541642, "grad_norm": 1.8388829231262207, "learning_rate": 6.157960607426211e-07, "loss": 0.3642, "num_input_tokens_seen": 109598016, "step": 114780 }, { "epoch": 9.363324904152051, "grad_norm": 38.68044662475586, "learning_rate": 6.150111901087579e-07, "loss": 0.4123, "num_input_tokens_seen": 109602608, "step": 114785 }, { "epoch": 9.36373276776246, "grad_norm": 15.687740325927734, "learning_rate": 6.142268137482588e-07, "loss": 0.3945, "num_input_tokens_seen": 109608080, "step": 114790 }, { "epoch": 9.36414063137287, "grad_norm": 7.354534149169922, "learning_rate": 6.13442931677019e-07, "loss": 0.2367, "num_input_tokens_seen": 109612672, "step": 114795 }, { "epoch": 9.364548494983278, "grad_norm": 30.551963806152344, "learning_rate": 6.12659543910929e-07, "loss": 0.352, "num_input_tokens_seen": 109618048, "step": 114800 }, { "epoch": 9.364956358593686, "grad_norm": 1.1919409036636353, "learning_rate": 6.118766504658619e-07, "loss": 0.3211, "num_input_tokens_seen": 109621808, "step": 114805 }, { "epoch": 9.365364222204095, "grad_norm": 31.164064407348633, "learning_rate": 6.110942513576967e-07, "loss": 0.1768, "num_input_tokens_seen": 109627072, "step": 114810 }, { "epoch": 9.365772085814504, "grad_norm": 3.711850881576538, "learning_rate": 6.103123466022875e-07, "loss": 0.3073, "num_input_tokens_seen": 109632144, "step": 114815 }, { "epoch": 9.366179949424913, "grad_norm": 5.221778869628906, "learning_rate": 6.095309362154828e-07, "loss": 0.4444, "num_input_tokens_seen": 109637344, "step": 114820 }, { "epoch": 9.366587813035322, "grad_norm": 5.882569789886475, "learning_rate": 6.087500202131197e-07, "loss": 0.2012, "num_input_tokens_seen": 109641648, "step": 114825 }, { "epoch": 9.366995676645729, "grad_norm": 3.4290013313293457, "learning_rate": 6.079695986110328e-07, "loss": 0.349, "num_input_tokens_seen": 109645840, "step": 114830 }, { "epoch": 9.367403540256138, "grad_norm": 7.053248882293701, "learning_rate": 6.071896714250347e-07, "loss": 0.3626, "num_input_tokens_seen": 109651040, "step": 114835 }, { "epoch": 9.367811403866547, "grad_norm": 1.8494551181793213, "learning_rate": 6.064102386709403e-07, "loss": 0.3361, "num_input_tokens_seen": 109656160, "step": 114840 }, { "epoch": 9.368219267476956, "grad_norm": 19.242671966552734, "learning_rate": 6.056313003645397e-07, "loss": 0.3145, "num_input_tokens_seen": 109660560, "step": 114845 }, { "epoch": 9.368627131087365, "grad_norm": 22.199644088745117, "learning_rate": 6.048528565216288e-07, "loss": 0.2932, "num_input_tokens_seen": 109665712, "step": 114850 }, { "epoch": 9.369034994697772, "grad_norm": 37.08228302001953, "learning_rate": 6.040749071579866e-07, "loss": 0.474, "num_input_tokens_seen": 109670096, "step": 114855 }, { "epoch": 9.369442858308181, "grad_norm": 1.7421292066574097, "learning_rate": 6.032974522893781e-07, "loss": 0.3379, "num_input_tokens_seen": 109674800, "step": 114860 }, { "epoch": 9.36985072191859, "grad_norm": 6.687957763671875, "learning_rate": 6.025204919315658e-07, "loss": 0.3653, "num_input_tokens_seen": 109679632, "step": 114865 }, { "epoch": 9.370258585529, "grad_norm": 6.322334289550781, "learning_rate": 6.017440261002899e-07, "loss": 0.3576, "num_input_tokens_seen": 109684720, "step": 114870 }, { "epoch": 9.370666449139408, "grad_norm": 3.6926660537719727, "learning_rate": 6.00968054811299e-07, "loss": 0.2668, "num_input_tokens_seen": 109688864, "step": 114875 }, { "epoch": 9.371074312749817, "grad_norm": 7.587616920471191, "learning_rate": 6.001925780803191e-07, "loss": 0.312, "num_input_tokens_seen": 109693200, "step": 114880 }, { "epoch": 9.371482176360225, "grad_norm": 41.68819808959961, "learning_rate": 5.994175959230658e-07, "loss": 0.4382, "num_input_tokens_seen": 109697360, "step": 114885 }, { "epoch": 9.371890039970634, "grad_norm": 2.4275715351104736, "learning_rate": 5.986431083552485e-07, "loss": 0.3242, "num_input_tokens_seen": 109701840, "step": 114890 }, { "epoch": 9.372297903581043, "grad_norm": 7.330752372741699, "learning_rate": 5.978691153925687e-07, "loss": 0.4296, "num_input_tokens_seen": 109706736, "step": 114895 }, { "epoch": 9.372705767191452, "grad_norm": 2.143451690673828, "learning_rate": 5.970956170507136e-07, "loss": 0.2467, "num_input_tokens_seen": 109711520, "step": 114900 }, { "epoch": 9.37311363080186, "grad_norm": 6.521383285522461, "learning_rate": 5.963226133453598e-07, "loss": 0.3714, "num_input_tokens_seen": 109716800, "step": 114905 }, { "epoch": 9.373521494412268, "grad_norm": 1.3451709747314453, "learning_rate": 5.955501042921751e-07, "loss": 0.3224, "num_input_tokens_seen": 109721712, "step": 114910 }, { "epoch": 9.373929358022677, "grad_norm": 3.2110989093780518, "learning_rate": 5.947780899068223e-07, "loss": 0.271, "num_input_tokens_seen": 109728240, "step": 114915 }, { "epoch": 9.374337221633086, "grad_norm": 0.8853681683540344, "learning_rate": 5.940065702049469e-07, "loss": 0.3746, "num_input_tokens_seen": 109733792, "step": 114920 }, { "epoch": 9.374745085243495, "grad_norm": 3.6217381954193115, "learning_rate": 5.932355452021865e-07, "loss": 0.3588, "num_input_tokens_seen": 109738800, "step": 114925 }, { "epoch": 9.375152948853904, "grad_norm": 5.508933067321777, "learning_rate": 5.924650149141731e-07, "loss": 0.3051, "num_input_tokens_seen": 109743600, "step": 114930 }, { "epoch": 9.375560812464311, "grad_norm": 3.441068172454834, "learning_rate": 5.916949793565163e-07, "loss": 0.294, "num_input_tokens_seen": 109748784, "step": 114935 }, { "epoch": 9.37596867607472, "grad_norm": 6.61161470413208, "learning_rate": 5.909254385448371e-07, "loss": 0.1915, "num_input_tokens_seen": 109753920, "step": 114940 }, { "epoch": 9.376376539685129, "grad_norm": 5.4733171463012695, "learning_rate": 5.901563924947229e-07, "loss": 0.4391, "num_input_tokens_seen": 109758240, "step": 114945 }, { "epoch": 9.376784403295538, "grad_norm": 4.377261161804199, "learning_rate": 5.893878412217696e-07, "loss": 0.3489, "num_input_tokens_seen": 109763392, "step": 114950 }, { "epoch": 9.377192266905947, "grad_norm": 1.8468842506408691, "learning_rate": 5.886197847415509e-07, "loss": 0.4144, "num_input_tokens_seen": 109767616, "step": 114955 }, { "epoch": 9.377600130516356, "grad_norm": 2.126786470413208, "learning_rate": 5.878522230696321e-07, "loss": 0.2987, "num_input_tokens_seen": 109772528, "step": 114960 }, { "epoch": 9.378007994126763, "grad_norm": 9.763699531555176, "learning_rate": 5.870851562215812e-07, "loss": 0.3826, "num_input_tokens_seen": 109777632, "step": 114965 }, { "epoch": 9.378415857737172, "grad_norm": 11.557100296020508, "learning_rate": 5.863185842129388e-07, "loss": 0.1883, "num_input_tokens_seen": 109781952, "step": 114970 }, { "epoch": 9.378823721347581, "grad_norm": 21.759384155273438, "learning_rate": 5.855525070592422e-07, "loss": 0.323, "num_input_tokens_seen": 109786352, "step": 114975 }, { "epoch": 9.37923158495799, "grad_norm": 1.4662361145019531, "learning_rate": 5.847869247760235e-07, "loss": 0.3745, "num_input_tokens_seen": 109791424, "step": 114980 }, { "epoch": 9.3796394485684, "grad_norm": 9.606708526611328, "learning_rate": 5.840218373787982e-07, "loss": 0.3829, "num_input_tokens_seen": 109796464, "step": 114985 }, { "epoch": 9.380047312178807, "grad_norm": 1.8903186321258545, "learning_rate": 5.832572448830787e-07, "loss": 0.2397, "num_input_tokens_seen": 109801840, "step": 114990 }, { "epoch": 9.380455175789216, "grad_norm": 18.255956649780273, "learning_rate": 5.824931473043583e-07, "loss": 0.2491, "num_input_tokens_seen": 109807040, "step": 114995 }, { "epoch": 9.380863039399625, "grad_norm": 1.5745224952697754, "learning_rate": 5.817295446581245e-07, "loss": 0.222, "num_input_tokens_seen": 109811504, "step": 115000 }, { "epoch": 9.381270903010034, "grad_norm": 1.5959631204605103, "learning_rate": 5.809664369598566e-07, "loss": 0.2193, "num_input_tokens_seen": 109816096, "step": 115005 }, { "epoch": 9.381678766620443, "grad_norm": 3.616489887237549, "learning_rate": 5.80203824225023e-07, "loss": 0.461, "num_input_tokens_seen": 109819792, "step": 115010 }, { "epoch": 9.382086630230852, "grad_norm": 4.541239261627197, "learning_rate": 5.794417064690832e-07, "loss": 0.3016, "num_input_tokens_seen": 109824960, "step": 115015 }, { "epoch": 9.382494493841259, "grad_norm": 3.1098694801330566, "learning_rate": 5.786800837074834e-07, "loss": 0.5015, "num_input_tokens_seen": 109829328, "step": 115020 }, { "epoch": 9.382902357451668, "grad_norm": 13.50236701965332, "learning_rate": 5.779189559556559e-07, "loss": 0.392, "num_input_tokens_seen": 109834032, "step": 115025 }, { "epoch": 9.383310221062077, "grad_norm": 1.7095155715942383, "learning_rate": 5.771583232290379e-07, "loss": 0.3977, "num_input_tokens_seen": 109838528, "step": 115030 }, { "epoch": 9.383718084672486, "grad_norm": 3.995096445083618, "learning_rate": 5.763981855430422e-07, "loss": 0.2535, "num_input_tokens_seen": 109844624, "step": 115035 }, { "epoch": 9.384125948282895, "grad_norm": 1.399689793586731, "learning_rate": 5.75638542913079e-07, "loss": 0.2102, "num_input_tokens_seen": 109848512, "step": 115040 }, { "epoch": 9.384533811893302, "grad_norm": 3.6232218742370605, "learning_rate": 5.748793953545412e-07, "loss": 0.4581, "num_input_tokens_seen": 109853104, "step": 115045 }, { "epoch": 9.384941675503711, "grad_norm": 2.001451015472412, "learning_rate": 5.741207428828221e-07, "loss": 0.3366, "num_input_tokens_seen": 109857936, "step": 115050 }, { "epoch": 9.38534953911412, "grad_norm": 5.495865345001221, "learning_rate": 5.73362585513293e-07, "loss": 0.568, "num_input_tokens_seen": 109862080, "step": 115055 }, { "epoch": 9.38575740272453, "grad_norm": 1.0725797414779663, "learning_rate": 5.726049232613273e-07, "loss": 0.299, "num_input_tokens_seen": 109866576, "step": 115060 }, { "epoch": 9.386165266334938, "grad_norm": 3.19047212600708, "learning_rate": 5.718477561422797e-07, "loss": 0.4074, "num_input_tokens_seen": 109871344, "step": 115065 }, { "epoch": 9.386573129945345, "grad_norm": 26.244646072387695, "learning_rate": 5.710910841714962e-07, "loss": 0.3293, "num_input_tokens_seen": 109875664, "step": 115070 }, { "epoch": 9.386980993555754, "grad_norm": 21.178813934326172, "learning_rate": 5.703349073643172e-07, "loss": 0.4106, "num_input_tokens_seen": 109880464, "step": 115075 }, { "epoch": 9.387388857166163, "grad_norm": 1.3252257108688354, "learning_rate": 5.695792257360666e-07, "loss": 0.1745, "num_input_tokens_seen": 109884592, "step": 115080 }, { "epoch": 9.387796720776572, "grad_norm": 31.445568084716797, "learning_rate": 5.688240393020683e-07, "loss": 0.4271, "num_input_tokens_seen": 109888976, "step": 115085 }, { "epoch": 9.388204584386981, "grad_norm": 3.3584628105163574, "learning_rate": 5.680693480776184e-07, "loss": 0.2933, "num_input_tokens_seen": 109894000, "step": 115090 }, { "epoch": 9.38861244799739, "grad_norm": 2.9881293773651123, "learning_rate": 5.673151520780267e-07, "loss": 0.3057, "num_input_tokens_seen": 109897904, "step": 115095 }, { "epoch": 9.389020311607798, "grad_norm": 21.246292114257812, "learning_rate": 5.665614513185729e-07, "loss": 0.3284, "num_input_tokens_seen": 109902480, "step": 115100 }, { "epoch": 9.389428175218207, "grad_norm": 31.119300842285156, "learning_rate": 5.658082458145392e-07, "loss": 0.3365, "num_input_tokens_seen": 109906912, "step": 115105 }, { "epoch": 9.389836038828616, "grad_norm": 35.032657623291016, "learning_rate": 5.650555355811882e-07, "loss": 0.2627, "num_input_tokens_seen": 109910832, "step": 115110 }, { "epoch": 9.390243902439025, "grad_norm": 5.752345561981201, "learning_rate": 5.643033206337744e-07, "loss": 0.533, "num_input_tokens_seen": 109915552, "step": 115115 }, { "epoch": 9.390651766049434, "grad_norm": 1.4716517925262451, "learning_rate": 5.635516009875524e-07, "loss": 0.284, "num_input_tokens_seen": 109920304, "step": 115120 }, { "epoch": 9.391059629659841, "grad_norm": 5.639292240142822, "learning_rate": 5.628003766577545e-07, "loss": 0.4262, "num_input_tokens_seen": 109925360, "step": 115125 }, { "epoch": 9.39146749327025, "grad_norm": 1.2867764234542847, "learning_rate": 5.620496476596127e-07, "loss": 0.3473, "num_input_tokens_seen": 109930816, "step": 115130 }, { "epoch": 9.391875356880659, "grad_norm": 13.237587928771973, "learning_rate": 5.612994140083344e-07, "loss": 0.452, "num_input_tokens_seen": 109936144, "step": 115135 }, { "epoch": 9.392283220491068, "grad_norm": 1.068703055381775, "learning_rate": 5.605496757191353e-07, "loss": 0.3218, "num_input_tokens_seen": 109940160, "step": 115140 }, { "epoch": 9.392691084101477, "grad_norm": 18.333465576171875, "learning_rate": 5.598004328072087e-07, "loss": 0.3988, "num_input_tokens_seen": 109945264, "step": 115145 }, { "epoch": 9.393098947711884, "grad_norm": 19.901939392089844, "learning_rate": 5.590516852877453e-07, "loss": 0.4719, "num_input_tokens_seen": 109949696, "step": 115150 }, { "epoch": 9.393506811322293, "grad_norm": 1.8227349519729614, "learning_rate": 5.583034331759162e-07, "loss": 0.2585, "num_input_tokens_seen": 109954336, "step": 115155 }, { "epoch": 9.393914674932702, "grad_norm": 8.045572280883789, "learning_rate": 5.575556764868873e-07, "loss": 0.1874, "num_input_tokens_seen": 109959408, "step": 115160 }, { "epoch": 9.394322538543111, "grad_norm": 36.55229949951172, "learning_rate": 5.568084152358211e-07, "loss": 0.2679, "num_input_tokens_seen": 109963520, "step": 115165 }, { "epoch": 9.39473040215352, "grad_norm": 9.271535873413086, "learning_rate": 5.56061649437864e-07, "loss": 0.4478, "num_input_tokens_seen": 109967584, "step": 115170 }, { "epoch": 9.39513826576393, "grad_norm": 29.133596420288086, "learning_rate": 5.553153791081483e-07, "loss": 0.3213, "num_input_tokens_seen": 109972192, "step": 115175 }, { "epoch": 9.395546129374337, "grad_norm": 32.814064025878906, "learning_rate": 5.545696042618037e-07, "loss": 0.1755, "num_input_tokens_seen": 109976368, "step": 115180 }, { "epoch": 9.395953992984746, "grad_norm": 31.740629196166992, "learning_rate": 5.538243249139457e-07, "loss": 0.398, "num_input_tokens_seen": 109982048, "step": 115185 }, { "epoch": 9.396361856595155, "grad_norm": 25.024320602416992, "learning_rate": 5.530795410796791e-07, "loss": 0.3114, "num_input_tokens_seen": 109987472, "step": 115190 }, { "epoch": 9.396769720205564, "grad_norm": 10.045071601867676, "learning_rate": 5.523352527741027e-07, "loss": 0.3322, "num_input_tokens_seen": 109992560, "step": 115195 }, { "epoch": 9.397177583815973, "grad_norm": 16.03093910217285, "learning_rate": 5.515914600123045e-07, "loss": 0.4003, "num_input_tokens_seen": 109997296, "step": 115200 }, { "epoch": 9.39758544742638, "grad_norm": 20.32979393005371, "learning_rate": 5.508481628093587e-07, "loss": 0.3515, "num_input_tokens_seen": 110002448, "step": 115205 }, { "epoch": 9.397993311036789, "grad_norm": 10.567548751831055, "learning_rate": 5.50105361180328e-07, "loss": 0.3251, "num_input_tokens_seen": 110007088, "step": 115210 }, { "epoch": 9.398401174647198, "grad_norm": 12.919776916503906, "learning_rate": 5.493630551402757e-07, "loss": 0.4363, "num_input_tokens_seen": 110012240, "step": 115215 }, { "epoch": 9.398809038257607, "grad_norm": 3.8485770225524902, "learning_rate": 5.486212447042449e-07, "loss": 0.348, "num_input_tokens_seen": 110017136, "step": 115220 }, { "epoch": 9.399216901868016, "grad_norm": 5.846561431884766, "learning_rate": 5.478799298872655e-07, "loss": 0.364, "num_input_tokens_seen": 110022784, "step": 115225 }, { "epoch": 9.399624765478425, "grad_norm": 0.9828373789787292, "learning_rate": 5.471391107043755e-07, "loss": 0.3928, "num_input_tokens_seen": 110027104, "step": 115230 }, { "epoch": 9.400032629088832, "grad_norm": 10.236700057983398, "learning_rate": 5.463987871705823e-07, "loss": 0.3556, "num_input_tokens_seen": 110031376, "step": 115235 }, { "epoch": 9.400440492699241, "grad_norm": 3.249070167541504, "learning_rate": 5.45658959300896e-07, "loss": 0.1922, "num_input_tokens_seen": 110036496, "step": 115240 }, { "epoch": 9.40084835630965, "grad_norm": 35.14900588989258, "learning_rate": 5.449196271103102e-07, "loss": 0.2267, "num_input_tokens_seen": 110041712, "step": 115245 }, { "epoch": 9.401256219920059, "grad_norm": 11.049660682678223, "learning_rate": 5.44180790613813e-07, "loss": 0.3915, "num_input_tokens_seen": 110046560, "step": 115250 }, { "epoch": 9.401664083530468, "grad_norm": 1.8613064289093018, "learning_rate": 5.434424498263785e-07, "loss": 0.3787, "num_input_tokens_seen": 110051072, "step": 115255 }, { "epoch": 9.402071947140875, "grad_norm": 40.17887878417969, "learning_rate": 5.427046047629753e-07, "loss": 0.3313, "num_input_tokens_seen": 110056096, "step": 115260 }, { "epoch": 9.402479810751284, "grad_norm": 25.379270553588867, "learning_rate": 5.419672554385552e-07, "loss": 0.1912, "num_input_tokens_seen": 110060416, "step": 115265 }, { "epoch": 9.402887674361693, "grad_norm": 3.793607473373413, "learning_rate": 5.412304018680647e-07, "loss": 0.3793, "num_input_tokens_seen": 110065200, "step": 115270 }, { "epoch": 9.403295537972102, "grad_norm": 2.996396780014038, "learning_rate": 5.404940440664447e-07, "loss": 0.2561, "num_input_tokens_seen": 110070272, "step": 115275 }, { "epoch": 9.403703401582511, "grad_norm": 3.3840911388397217, "learning_rate": 5.397581820486136e-07, "loss": 0.3014, "num_input_tokens_seen": 110075152, "step": 115280 }, { "epoch": 9.40411126519292, "grad_norm": 33.85533905029297, "learning_rate": 5.39022815829493e-07, "loss": 0.4102, "num_input_tokens_seen": 110079568, "step": 115285 }, { "epoch": 9.404519128803328, "grad_norm": 25.42278480529785, "learning_rate": 5.38287945423982e-07, "loss": 0.342, "num_input_tokens_seen": 110084976, "step": 115290 }, { "epoch": 9.404926992413737, "grad_norm": 24.610355377197266, "learning_rate": 5.375535708469825e-07, "loss": 0.4575, "num_input_tokens_seen": 110089216, "step": 115295 }, { "epoch": 9.405334856024146, "grad_norm": 36.02292251586914, "learning_rate": 5.368196921133772e-07, "loss": 0.311, "num_input_tokens_seen": 110094592, "step": 115300 }, { "epoch": 9.405742719634555, "grad_norm": 2.030477523803711, "learning_rate": 5.360863092380431e-07, "loss": 0.2819, "num_input_tokens_seen": 110099216, "step": 115305 }, { "epoch": 9.406150583244964, "grad_norm": 2.894319534301758, "learning_rate": 5.353534222358431e-07, "loss": 0.4806, "num_input_tokens_seen": 110104768, "step": 115310 }, { "epoch": 9.406558446855371, "grad_norm": 2.699842691421509, "learning_rate": 5.346210311216321e-07, "loss": 0.3127, "num_input_tokens_seen": 110109968, "step": 115315 }, { "epoch": 9.40696631046578, "grad_norm": 1.5699663162231445, "learning_rate": 5.338891359102594e-07, "loss": 0.3853, "num_input_tokens_seen": 110114816, "step": 115320 }, { "epoch": 9.407374174076189, "grad_norm": 2.2591919898986816, "learning_rate": 5.331577366165547e-07, "loss": 0.4442, "num_input_tokens_seen": 110119504, "step": 115325 }, { "epoch": 9.407782037686598, "grad_norm": 10.185811042785645, "learning_rate": 5.324268332553506e-07, "loss": 0.3555, "num_input_tokens_seen": 110124832, "step": 115330 }, { "epoch": 9.408189901297007, "grad_norm": 7.680922985076904, "learning_rate": 5.316964258414547e-07, "loss": 0.4626, "num_input_tokens_seen": 110130256, "step": 115335 }, { "epoch": 9.408597764907414, "grad_norm": 26.661224365234375, "learning_rate": 5.309665143896747e-07, "loss": 0.328, "num_input_tokens_seen": 110135328, "step": 115340 }, { "epoch": 9.409005628517823, "grad_norm": 1.0923291444778442, "learning_rate": 5.30237098914807e-07, "loss": 0.2398, "num_input_tokens_seen": 110140112, "step": 115345 }, { "epoch": 9.409413492128232, "grad_norm": 8.741786003112793, "learning_rate": 5.295081794316342e-07, "loss": 0.2707, "num_input_tokens_seen": 110145952, "step": 115350 }, { "epoch": 9.409821355738641, "grad_norm": 35.551570892333984, "learning_rate": 5.287797559549335e-07, "loss": 0.2619, "num_input_tokens_seen": 110150784, "step": 115355 }, { "epoch": 9.41022921934905, "grad_norm": 19.11046028137207, "learning_rate": 5.280518284994678e-07, "loss": 0.6015, "num_input_tokens_seen": 110155984, "step": 115360 }, { "epoch": 9.410637082959457, "grad_norm": 15.787821769714355, "learning_rate": 5.273243970799924e-07, "loss": 0.2388, "num_input_tokens_seen": 110160832, "step": 115365 }, { "epoch": 9.411044946569866, "grad_norm": 1.8970146179199219, "learning_rate": 5.265974617112534e-07, "loss": 0.3675, "num_input_tokens_seen": 110165552, "step": 115370 }, { "epoch": 9.411452810180275, "grad_norm": 2.1102306842803955, "learning_rate": 5.258710224079866e-07, "loss": 0.3866, "num_input_tokens_seen": 110170016, "step": 115375 }, { "epoch": 9.411860673790684, "grad_norm": 20.91977882385254, "learning_rate": 5.251450791849105e-07, "loss": 0.4847, "num_input_tokens_seen": 110174592, "step": 115380 }, { "epoch": 9.412268537401093, "grad_norm": 4.2636399269104, "learning_rate": 5.244196320567441e-07, "loss": 0.2008, "num_input_tokens_seen": 110179152, "step": 115385 }, { "epoch": 9.412676401011502, "grad_norm": 5.3563032150268555, "learning_rate": 5.236946810381949e-07, "loss": 0.1458, "num_input_tokens_seen": 110183776, "step": 115390 }, { "epoch": 9.41308426462191, "grad_norm": 5.089714050292969, "learning_rate": 5.229702261439512e-07, "loss": 0.4083, "num_input_tokens_seen": 110188976, "step": 115395 }, { "epoch": 9.413492128232319, "grad_norm": 3.2162671089172363, "learning_rate": 5.222462673886985e-07, "loss": 0.3053, "num_input_tokens_seen": 110193888, "step": 115400 }, { "epoch": 9.413899991842728, "grad_norm": 36.83977508544922, "learning_rate": 5.215228047871168e-07, "loss": 0.7262, "num_input_tokens_seen": 110198272, "step": 115405 }, { "epoch": 9.414307855453137, "grad_norm": 3.3619744777679443, "learning_rate": 5.207998383538637e-07, "loss": 0.3596, "num_input_tokens_seen": 110202992, "step": 115410 }, { "epoch": 9.414715719063546, "grad_norm": 29.301103591918945, "learning_rate": 5.20077368103597e-07, "loss": 0.387, "num_input_tokens_seen": 110206880, "step": 115415 }, { "epoch": 9.415123582673953, "grad_norm": 4.317761421203613, "learning_rate": 5.193553940509605e-07, "loss": 0.2944, "num_input_tokens_seen": 110211040, "step": 115420 }, { "epoch": 9.415531446284362, "grad_norm": 2.6148886680603027, "learning_rate": 5.186339162105841e-07, "loss": 0.3644, "num_input_tokens_seen": 110215232, "step": 115425 }, { "epoch": 9.415939309894771, "grad_norm": 30.65152359008789, "learning_rate": 5.179129345970978e-07, "loss": 0.3446, "num_input_tokens_seen": 110219856, "step": 115430 }, { "epoch": 9.41634717350518, "grad_norm": 1.925673007965088, "learning_rate": 5.171924492251151e-07, "loss": 0.2793, "num_input_tokens_seen": 110224304, "step": 115435 }, { "epoch": 9.416755037115589, "grad_norm": 2.563239574432373, "learning_rate": 5.16472460109238e-07, "loss": 0.1317, "num_input_tokens_seen": 110228768, "step": 115440 }, { "epoch": 9.417162900725998, "grad_norm": 3.9579906463623047, "learning_rate": 5.157529672640576e-07, "loss": 0.2466, "num_input_tokens_seen": 110233024, "step": 115445 }, { "epoch": 9.417570764336405, "grad_norm": 21.73552703857422, "learning_rate": 5.150339707041623e-07, "loss": 0.442, "num_input_tokens_seen": 110238512, "step": 115450 }, { "epoch": 9.417978627946814, "grad_norm": 38.8228645324707, "learning_rate": 5.143154704441238e-07, "loss": 0.3414, "num_input_tokens_seen": 110243504, "step": 115455 }, { "epoch": 9.418386491557223, "grad_norm": 5.0383195877075195, "learning_rate": 5.135974664985055e-07, "loss": 0.193, "num_input_tokens_seen": 110248832, "step": 115460 }, { "epoch": 9.418794355167632, "grad_norm": 43.59135437011719, "learning_rate": 5.128799588818622e-07, "loss": 0.4497, "num_input_tokens_seen": 110254432, "step": 115465 }, { "epoch": 9.419202218778041, "grad_norm": 5.855052947998047, "learning_rate": 5.12162947608738e-07, "loss": 0.4858, "num_input_tokens_seen": 110259680, "step": 115470 }, { "epoch": 9.419610082388449, "grad_norm": 0.9431612491607666, "learning_rate": 5.114464326936657e-07, "loss": 0.2356, "num_input_tokens_seen": 110263296, "step": 115475 }, { "epoch": 9.420017945998858, "grad_norm": 20.27948760986328, "learning_rate": 5.10730414151167e-07, "loss": 0.2682, "num_input_tokens_seen": 110267808, "step": 115480 }, { "epoch": 9.420425809609267, "grad_norm": 11.392064094543457, "learning_rate": 5.100148919957582e-07, "loss": 0.3463, "num_input_tokens_seen": 110273104, "step": 115485 }, { "epoch": 9.420833673219676, "grad_norm": 2.9183459281921387, "learning_rate": 5.092998662419357e-07, "loss": 0.3744, "num_input_tokens_seen": 110277584, "step": 115490 }, { "epoch": 9.421241536830085, "grad_norm": 9.700549125671387, "learning_rate": 5.085853369042049e-07, "loss": 0.3009, "num_input_tokens_seen": 110281680, "step": 115495 }, { "epoch": 9.421649400440494, "grad_norm": 17.312137603759766, "learning_rate": 5.078713039970401e-07, "loss": 0.4118, "num_input_tokens_seen": 110286880, "step": 115500 }, { "epoch": 9.4220572640509, "grad_norm": 30.165245056152344, "learning_rate": 5.071577675349159e-07, "loss": 0.3894, "num_input_tokens_seen": 110291296, "step": 115505 }, { "epoch": 9.42246512766131, "grad_norm": 36.92335891723633, "learning_rate": 5.064447275322959e-07, "loss": 0.4347, "num_input_tokens_seen": 110295216, "step": 115510 }, { "epoch": 9.422872991271719, "grad_norm": 13.706134796142578, "learning_rate": 5.057321840036322e-07, "loss": 0.4601, "num_input_tokens_seen": 110299344, "step": 115515 }, { "epoch": 9.423280854882128, "grad_norm": 2.0439388751983643, "learning_rate": 5.050201369633717e-07, "loss": 0.3399, "num_input_tokens_seen": 110304288, "step": 115520 }, { "epoch": 9.423688718492537, "grad_norm": 2.4984376430511475, "learning_rate": 5.043085864259417e-07, "loss": 0.2555, "num_input_tokens_seen": 110309136, "step": 115525 }, { "epoch": 9.424096582102944, "grad_norm": 2.928335189819336, "learning_rate": 5.035975324057695e-07, "loss": 0.1963, "num_input_tokens_seen": 110314704, "step": 115530 }, { "epoch": 9.424504445713353, "grad_norm": 26.033447265625, "learning_rate": 5.02886974917266e-07, "loss": 0.3694, "num_input_tokens_seen": 110318784, "step": 115535 }, { "epoch": 9.424912309323762, "grad_norm": 31.350435256958008, "learning_rate": 5.021769139748333e-07, "loss": 0.2923, "num_input_tokens_seen": 110323248, "step": 115540 }, { "epoch": 9.425320172934171, "grad_norm": 3.863203525543213, "learning_rate": 5.014673495928657e-07, "loss": 0.3023, "num_input_tokens_seen": 110328384, "step": 115545 }, { "epoch": 9.42572803654458, "grad_norm": 5.762508392333984, "learning_rate": 5.007582817857459e-07, "loss": 0.5129, "num_input_tokens_seen": 110333712, "step": 115550 }, { "epoch": 9.426135900154987, "grad_norm": 29.645618438720703, "learning_rate": 5.000497105678431e-07, "loss": 0.4899, "num_input_tokens_seen": 110339136, "step": 115555 }, { "epoch": 9.426543763765396, "grad_norm": 1.401211142539978, "learning_rate": 4.993416359535236e-07, "loss": 0.4026, "num_input_tokens_seen": 110343424, "step": 115560 }, { "epoch": 9.426951627375805, "grad_norm": 8.753414154052734, "learning_rate": 4.986340579571369e-07, "loss": 0.4118, "num_input_tokens_seen": 110348864, "step": 115565 }, { "epoch": 9.427359490986214, "grad_norm": 7.985153675079346, "learning_rate": 4.979269765930272e-07, "loss": 0.2598, "num_input_tokens_seen": 110354144, "step": 115570 }, { "epoch": 9.427767354596623, "grad_norm": 2.2722859382629395, "learning_rate": 4.972203918755275e-07, "loss": 0.145, "num_input_tokens_seen": 110358960, "step": 115575 }, { "epoch": 9.428175218207032, "grad_norm": 23.876136779785156, "learning_rate": 4.96514303818954e-07, "loss": 0.3941, "num_input_tokens_seen": 110363232, "step": 115580 }, { "epoch": 9.42858308181744, "grad_norm": 2.077460289001465, "learning_rate": 4.958087124376287e-07, "loss": 0.3731, "num_input_tokens_seen": 110368128, "step": 115585 }, { "epoch": 9.428990945427849, "grad_norm": 30.084299087524414, "learning_rate": 4.951036177458457e-07, "loss": 0.3073, "num_input_tokens_seen": 110372592, "step": 115590 }, { "epoch": 9.429398809038258, "grad_norm": 7.3431925773620605, "learning_rate": 4.943990197579018e-07, "loss": 0.3434, "num_input_tokens_seen": 110376816, "step": 115595 }, { "epoch": 9.429806672648667, "grad_norm": 17.650672912597656, "learning_rate": 4.936949184880746e-07, "loss": 0.3514, "num_input_tokens_seen": 110381856, "step": 115600 }, { "epoch": 9.430214536259076, "grad_norm": 18.201745986938477, "learning_rate": 4.929913139506387e-07, "loss": 0.3675, "num_input_tokens_seen": 110386848, "step": 115605 }, { "epoch": 9.430622399869483, "grad_norm": 30.29414939880371, "learning_rate": 4.922882061598549e-07, "loss": 0.3825, "num_input_tokens_seen": 110391392, "step": 115610 }, { "epoch": 9.431030263479892, "grad_norm": 26.15867042541504, "learning_rate": 4.915855951299786e-07, "loss": 0.3342, "num_input_tokens_seen": 110395920, "step": 115615 }, { "epoch": 9.431438127090301, "grad_norm": 1.3557924032211304, "learning_rate": 4.908834808752427e-07, "loss": 0.5091, "num_input_tokens_seen": 110401120, "step": 115620 }, { "epoch": 9.43184599070071, "grad_norm": 30.270912170410156, "learning_rate": 4.90181863409886e-07, "loss": 0.2334, "num_input_tokens_seen": 110404528, "step": 115625 }, { "epoch": 9.432253854311119, "grad_norm": 7.007632255554199, "learning_rate": 4.894807427481274e-07, "loss": 0.5134, "num_input_tokens_seen": 110408992, "step": 115630 }, { "epoch": 9.432661717921526, "grad_norm": 15.145092010498047, "learning_rate": 4.887801189041807e-07, "loss": 0.2176, "num_input_tokens_seen": 110413936, "step": 115635 }, { "epoch": 9.433069581531935, "grad_norm": 1.0271490812301636, "learning_rate": 4.880799918922457e-07, "loss": 0.2708, "num_input_tokens_seen": 110418400, "step": 115640 }, { "epoch": 9.433477445142344, "grad_norm": 1.7994520664215088, "learning_rate": 4.873803617265111e-07, "loss": 0.3695, "num_input_tokens_seen": 110423088, "step": 115645 }, { "epoch": 9.433885308752753, "grad_norm": 1.6959850788116455, "learning_rate": 4.866812284211625e-07, "loss": 0.3221, "num_input_tokens_seen": 110428144, "step": 115650 }, { "epoch": 9.434293172363162, "grad_norm": 26.000080108642578, "learning_rate": 4.859825919903693e-07, "loss": 0.2489, "num_input_tokens_seen": 110433616, "step": 115655 }, { "epoch": 9.434701035973571, "grad_norm": 1.2503360509872437, "learning_rate": 4.852844524482925e-07, "loss": 0.2521, "num_input_tokens_seen": 110439504, "step": 115660 }, { "epoch": 9.435108899583978, "grad_norm": 13.37653636932373, "learning_rate": 4.845868098090816e-07, "loss": 0.4169, "num_input_tokens_seen": 110444688, "step": 115665 }, { "epoch": 9.435516763194387, "grad_norm": 34.98134231567383, "learning_rate": 4.838896640868784e-07, "loss": 0.2953, "num_input_tokens_seen": 110448544, "step": 115670 }, { "epoch": 9.435924626804796, "grad_norm": 1.5195950269699097, "learning_rate": 4.831930152958131e-07, "loss": 0.2969, "num_input_tokens_seen": 110453296, "step": 115675 }, { "epoch": 9.436332490415205, "grad_norm": 2.716517448425293, "learning_rate": 4.824968634500105e-07, "loss": 0.4985, "num_input_tokens_seen": 110458976, "step": 115680 }, { "epoch": 9.436740354025615, "grad_norm": 4.3593950271606445, "learning_rate": 4.818012085635787e-07, "loss": 0.4635, "num_input_tokens_seen": 110464448, "step": 115685 }, { "epoch": 9.437148217636022, "grad_norm": 10.219513893127441, "learning_rate": 4.811060506506149e-07, "loss": 0.4465, "num_input_tokens_seen": 110469680, "step": 115690 }, { "epoch": 9.43755608124643, "grad_norm": 2.3747384548187256, "learning_rate": 4.804113897252133e-07, "loss": 0.4423, "num_input_tokens_seen": 110474816, "step": 115695 }, { "epoch": 9.43796394485684, "grad_norm": 1.712388515472412, "learning_rate": 4.79717225801457e-07, "loss": 0.3785, "num_input_tokens_seen": 110479616, "step": 115700 }, { "epoch": 9.438371808467249, "grad_norm": 15.941919326782227, "learning_rate": 4.7902355889341e-07, "loss": 0.2654, "num_input_tokens_seen": 110484288, "step": 115705 }, { "epoch": 9.438779672077658, "grad_norm": 6.200233459472656, "learning_rate": 4.783303890151387e-07, "loss": 0.6279, "num_input_tokens_seen": 110489296, "step": 115710 }, { "epoch": 9.439187535688067, "grad_norm": 14.516621589660645, "learning_rate": 4.776377161806872e-07, "loss": 0.2525, "num_input_tokens_seen": 110493280, "step": 115715 }, { "epoch": 9.439595399298474, "grad_norm": 1.1688416004180908, "learning_rate": 4.769455404041001e-07, "loss": 0.2501, "num_input_tokens_seen": 110499024, "step": 115720 }, { "epoch": 9.440003262908883, "grad_norm": 0.8933382630348206, "learning_rate": 4.762538616994078e-07, "loss": 0.3601, "num_input_tokens_seen": 110504224, "step": 115725 }, { "epoch": 9.440411126519292, "grad_norm": 2.3526086807250977, "learning_rate": 4.7556268008062953e-07, "loss": 0.3272, "num_input_tokens_seen": 110508688, "step": 115730 }, { "epoch": 9.440818990129701, "grad_norm": 34.90641403198242, "learning_rate": 4.7487199556177364e-07, "loss": 0.5849, "num_input_tokens_seen": 110513632, "step": 115735 }, { "epoch": 9.44122685374011, "grad_norm": 0.9284119009971619, "learning_rate": 4.7418180815684e-07, "loss": 0.1965, "num_input_tokens_seen": 110518080, "step": 115740 }, { "epoch": 9.441634717350517, "grad_norm": 12.137328147888184, "learning_rate": 4.7349211787982296e-07, "loss": 0.4394, "num_input_tokens_seen": 110522656, "step": 115745 }, { "epoch": 9.442042580960926, "grad_norm": 12.686734199523926, "learning_rate": 4.7280292474469466e-07, "loss": 0.2481, "num_input_tokens_seen": 110527808, "step": 115750 }, { "epoch": 9.442450444571335, "grad_norm": 22.913820266723633, "learning_rate": 4.721142287654301e-07, "loss": 0.2556, "num_input_tokens_seen": 110532416, "step": 115755 }, { "epoch": 9.442858308181744, "grad_norm": 1.9193408489227295, "learning_rate": 4.714260299559875e-07, "loss": 0.4085, "num_input_tokens_seen": 110536544, "step": 115760 }, { "epoch": 9.443266171792153, "grad_norm": 21.027694702148438, "learning_rate": 4.707383283303168e-07, "loss": 0.2814, "num_input_tokens_seen": 110541136, "step": 115765 }, { "epoch": 9.44367403540256, "grad_norm": 1.6362104415893555, "learning_rate": 4.7005112390235696e-07, "loss": 0.2924, "num_input_tokens_seen": 110545840, "step": 115770 }, { "epoch": 9.44408189901297, "grad_norm": 6.061681747436523, "learning_rate": 4.6936441668603834e-07, "loss": 0.2593, "num_input_tokens_seen": 110550160, "step": 115775 }, { "epoch": 9.444489762623379, "grad_norm": 12.093918800354004, "learning_rate": 4.6867820669527497e-07, "loss": 0.2441, "num_input_tokens_seen": 110553408, "step": 115780 }, { "epoch": 9.444897626233788, "grad_norm": 2.8386826515197754, "learning_rate": 4.679924939439861e-07, "loss": 0.5198, "num_input_tokens_seen": 110557904, "step": 115785 }, { "epoch": 9.445305489844197, "grad_norm": 5.705394268035889, "learning_rate": 4.6730727844606083e-07, "loss": 0.3304, "num_input_tokens_seen": 110563280, "step": 115790 }, { "epoch": 9.445713353454606, "grad_norm": 5.094232082366943, "learning_rate": 4.666225602153962e-07, "loss": 0.2222, "num_input_tokens_seen": 110568432, "step": 115795 }, { "epoch": 9.446121217065013, "grad_norm": 44.57951354980469, "learning_rate": 4.659383392658645e-07, "loss": 0.4567, "num_input_tokens_seen": 110573408, "step": 115800 }, { "epoch": 9.446529080675422, "grad_norm": 9.695047378540039, "learning_rate": 4.65254615611338e-07, "loss": 0.3996, "num_input_tokens_seen": 110578192, "step": 115805 }, { "epoch": 9.44693694428583, "grad_norm": 6.351623058319092, "learning_rate": 4.64571389265675e-07, "loss": 0.5028, "num_input_tokens_seen": 110581888, "step": 115810 }, { "epoch": 9.44734480789624, "grad_norm": 2.7237584590911865, "learning_rate": 4.6388866024272557e-07, "loss": 0.3249, "num_input_tokens_seen": 110586512, "step": 115815 }, { "epoch": 9.447752671506649, "grad_norm": 38.064369201660156, "learning_rate": 4.632064285563259e-07, "loss": 0.4715, "num_input_tokens_seen": 110590832, "step": 115820 }, { "epoch": 9.448160535117056, "grad_norm": 17.26551055908203, "learning_rate": 4.625246942203065e-07, "loss": 0.2949, "num_input_tokens_seen": 110595376, "step": 115825 }, { "epoch": 9.448568398727465, "grad_norm": 9.230245590209961, "learning_rate": 4.618434572484842e-07, "loss": 0.401, "num_input_tokens_seen": 110600064, "step": 115830 }, { "epoch": 9.448976262337874, "grad_norm": 5.769868850708008, "learning_rate": 4.6116271765466736e-07, "loss": 0.2158, "num_input_tokens_seen": 110604496, "step": 115835 }, { "epoch": 9.449384125948283, "grad_norm": 48.01203536987305, "learning_rate": 4.60482475452656e-07, "loss": 0.3794, "num_input_tokens_seen": 110609072, "step": 115840 }, { "epoch": 9.449791989558692, "grad_norm": 3.0750865936279297, "learning_rate": 4.5980273065623637e-07, "loss": 0.2214, "num_input_tokens_seen": 110614336, "step": 115845 }, { "epoch": 9.4501998531691, "grad_norm": 8.39863395690918, "learning_rate": 4.591234832791891e-07, "loss": 0.38, "num_input_tokens_seen": 110618768, "step": 115850 }, { "epoch": 9.450607716779508, "grad_norm": 5.878868103027344, "learning_rate": 4.5844473333528093e-07, "loss": 0.3433, "num_input_tokens_seen": 110623792, "step": 115855 }, { "epoch": 9.451015580389917, "grad_norm": 31.665699005126953, "learning_rate": 4.5776648083827033e-07, "loss": 0.5137, "num_input_tokens_seen": 110629552, "step": 115860 }, { "epoch": 9.451423444000326, "grad_norm": 15.109070777893066, "learning_rate": 4.5708872580190187e-07, "loss": 0.3278, "num_input_tokens_seen": 110634896, "step": 115865 }, { "epoch": 9.451831307610735, "grad_norm": 2.256525993347168, "learning_rate": 4.564114682399173e-07, "loss": 0.2469, "num_input_tokens_seen": 110638832, "step": 115870 }, { "epoch": 9.452239171221144, "grad_norm": 7.496624946594238, "learning_rate": 4.5573470816604455e-07, "loss": 0.2051, "num_input_tokens_seen": 110643376, "step": 115875 }, { "epoch": 9.452647034831552, "grad_norm": 2.179227828979492, "learning_rate": 4.550584455939977e-07, "loss": 0.2765, "num_input_tokens_seen": 110647808, "step": 115880 }, { "epoch": 9.45305489844196, "grad_norm": 59.73170471191406, "learning_rate": 4.5438268053748513e-07, "loss": 0.3098, "num_input_tokens_seen": 110652816, "step": 115885 }, { "epoch": 9.45346276205237, "grad_norm": 10.512084007263184, "learning_rate": 4.537074130102098e-07, "loss": 0.4123, "num_input_tokens_seen": 110657392, "step": 115890 }, { "epoch": 9.453870625662779, "grad_norm": 19.90419578552246, "learning_rate": 4.530326430258525e-07, "loss": 0.3522, "num_input_tokens_seen": 110661632, "step": 115895 }, { "epoch": 9.454278489273188, "grad_norm": 5.018227577209473, "learning_rate": 4.5235837059809385e-07, "loss": 0.4632, "num_input_tokens_seen": 110665920, "step": 115900 }, { "epoch": 9.454686352883595, "grad_norm": 0.776885449886322, "learning_rate": 4.5168459574059793e-07, "loss": 0.3276, "num_input_tokens_seen": 110671200, "step": 115905 }, { "epoch": 9.455094216494004, "grad_norm": 16.734567642211914, "learning_rate": 4.5101131846702615e-07, "loss": 0.3857, "num_input_tokens_seen": 110675328, "step": 115910 }, { "epoch": 9.455502080104413, "grad_norm": 30.71830940246582, "learning_rate": 4.503385387910203e-07, "loss": 0.3284, "num_input_tokens_seen": 110680288, "step": 115915 }, { "epoch": 9.455909943714822, "grad_norm": 6.145834445953369, "learning_rate": 4.49666256726225e-07, "loss": 0.2129, "num_input_tokens_seen": 110686016, "step": 115920 }, { "epoch": 9.456317807325231, "grad_norm": 4.5330400466918945, "learning_rate": 4.489944722862599e-07, "loss": 0.3948, "num_input_tokens_seen": 110690416, "step": 115925 }, { "epoch": 9.45672567093564, "grad_norm": 11.637042045593262, "learning_rate": 4.483231854847447e-07, "loss": 0.3827, "num_input_tokens_seen": 110695328, "step": 115930 }, { "epoch": 9.457133534546047, "grad_norm": 9.427130699157715, "learning_rate": 4.476523963352852e-07, "loss": 0.2924, "num_input_tokens_seen": 110700576, "step": 115935 }, { "epoch": 9.457541398156456, "grad_norm": 13.436269760131836, "learning_rate": 4.4698210485147887e-07, "loss": 0.1494, "num_input_tokens_seen": 110704704, "step": 115940 }, { "epoch": 9.457949261766865, "grad_norm": 2.477632999420166, "learning_rate": 4.463123110469147e-07, "loss": 0.3681, "num_input_tokens_seen": 110708608, "step": 115945 }, { "epoch": 9.458357125377274, "grad_norm": 16.528696060180664, "learning_rate": 4.4564301493516534e-07, "loss": 0.2934, "num_input_tokens_seen": 110713280, "step": 115950 }, { "epoch": 9.458764988987683, "grad_norm": 21.38909149169922, "learning_rate": 4.4497421652979485e-07, "loss": 0.5059, "num_input_tokens_seen": 110717264, "step": 115955 }, { "epoch": 9.45917285259809, "grad_norm": 0.9254776835441589, "learning_rate": 4.443059158443646e-07, "loss": 0.2474, "num_input_tokens_seen": 110721632, "step": 115960 }, { "epoch": 9.4595807162085, "grad_norm": 0.5402647256851196, "learning_rate": 4.4363811289242216e-07, "loss": 0.3828, "num_input_tokens_seen": 110727296, "step": 115965 }, { "epoch": 9.459988579818909, "grad_norm": 1.9726417064666748, "learning_rate": 4.4297080768749833e-07, "loss": 0.2749, "num_input_tokens_seen": 110732208, "step": 115970 }, { "epoch": 9.460396443429318, "grad_norm": 1.1639797687530518, "learning_rate": 4.4230400024312115e-07, "loss": 0.3648, "num_input_tokens_seen": 110737344, "step": 115975 }, { "epoch": 9.460804307039727, "grad_norm": 38.44068145751953, "learning_rate": 4.416376905728076e-07, "loss": 0.3716, "num_input_tokens_seen": 110742848, "step": 115980 }, { "epoch": 9.461212170650134, "grad_norm": 7.371562957763672, "learning_rate": 4.4097187869006074e-07, "loss": 0.4605, "num_input_tokens_seen": 110748688, "step": 115985 }, { "epoch": 9.461620034260543, "grad_norm": 1.6908128261566162, "learning_rate": 4.403065646083809e-07, "loss": 0.515, "num_input_tokens_seen": 110753120, "step": 115990 }, { "epoch": 9.462027897870952, "grad_norm": 1.516833782196045, "learning_rate": 4.3964174834124906e-07, "loss": 0.3674, "num_input_tokens_seen": 110757504, "step": 115995 }, { "epoch": 9.46243576148136, "grad_norm": 6.491610050201416, "learning_rate": 4.389774299021432e-07, "loss": 0.4354, "num_input_tokens_seen": 110762352, "step": 116000 }, { "epoch": 9.46284362509177, "grad_norm": 13.05616569519043, "learning_rate": 4.383136093045276e-07, "loss": 0.3472, "num_input_tokens_seen": 110766976, "step": 116005 }, { "epoch": 9.463251488702179, "grad_norm": 10.817337036132812, "learning_rate": 4.376502865618581e-07, "loss": 0.3681, "num_input_tokens_seen": 110771408, "step": 116010 }, { "epoch": 9.463659352312586, "grad_norm": 9.884949684143066, "learning_rate": 4.3698746168758243e-07, "loss": 0.1544, "num_input_tokens_seen": 110776224, "step": 116015 }, { "epoch": 9.464067215922995, "grad_norm": 5.703020095825195, "learning_rate": 4.3632513469513137e-07, "loss": 0.3367, "num_input_tokens_seen": 110780688, "step": 116020 }, { "epoch": 9.464475079533404, "grad_norm": 18.705827713012695, "learning_rate": 4.356633055979331e-07, "loss": 0.3804, "num_input_tokens_seen": 110784912, "step": 116025 }, { "epoch": 9.464882943143813, "grad_norm": 3.567383289337158, "learning_rate": 4.350019744094019e-07, "loss": 0.1912, "num_input_tokens_seen": 110789344, "step": 116030 }, { "epoch": 9.465290806754222, "grad_norm": 7.1670050621032715, "learning_rate": 4.34341141142941e-07, "loss": 0.4127, "num_input_tokens_seen": 110794192, "step": 116035 }, { "epoch": 9.46569867036463, "grad_norm": 3.720034122467041, "learning_rate": 4.336808058119479e-07, "loss": 0.4134, "num_input_tokens_seen": 110798032, "step": 116040 }, { "epoch": 9.466106533975038, "grad_norm": 4.696505069732666, "learning_rate": 4.3302096842980364e-07, "loss": 0.2717, "num_input_tokens_seen": 110803104, "step": 116045 }, { "epoch": 9.466514397585447, "grad_norm": 19.655624389648438, "learning_rate": 4.3236162900988643e-07, "loss": 0.3845, "num_input_tokens_seen": 110808496, "step": 116050 }, { "epoch": 9.466922261195856, "grad_norm": 16.471622467041016, "learning_rate": 4.317027875655605e-07, "loss": 0.4344, "num_input_tokens_seen": 110812944, "step": 116055 }, { "epoch": 9.467330124806265, "grad_norm": 1.4320124387741089, "learning_rate": 4.310444441101791e-07, "loss": 0.2556, "num_input_tokens_seen": 110818368, "step": 116060 }, { "epoch": 9.467737988416673, "grad_norm": 10.976720809936523, "learning_rate": 4.303865986570871e-07, "loss": 0.4425, "num_input_tokens_seen": 110823424, "step": 116065 }, { "epoch": 9.468145852027082, "grad_norm": 1.1709716320037842, "learning_rate": 4.297292512196155e-07, "loss": 0.1906, "num_input_tokens_seen": 110827920, "step": 116070 }, { "epoch": 9.46855371563749, "grad_norm": 3.9654135704040527, "learning_rate": 4.290724018110953e-07, "loss": 0.2663, "num_input_tokens_seen": 110833152, "step": 116075 }, { "epoch": 9.4689615792479, "grad_norm": 21.653854370117188, "learning_rate": 4.284160504448353e-07, "loss": 0.4343, "num_input_tokens_seen": 110838064, "step": 116080 }, { "epoch": 9.469369442858309, "grad_norm": 1.7189069986343384, "learning_rate": 4.2776019713414163e-07, "loss": 0.2231, "num_input_tokens_seen": 110842992, "step": 116085 }, { "epoch": 9.469777306468718, "grad_norm": 9.163899421691895, "learning_rate": 4.2710484189230346e-07, "loss": 0.3415, "num_input_tokens_seen": 110847552, "step": 116090 }, { "epoch": 9.470185170079125, "grad_norm": 17.69520378112793, "learning_rate": 4.264499847326131e-07, "loss": 0.3948, "num_input_tokens_seen": 110852336, "step": 116095 }, { "epoch": 9.470593033689534, "grad_norm": 7.071730613708496, "learning_rate": 4.2579562566833766e-07, "loss": 0.3573, "num_input_tokens_seen": 110857424, "step": 116100 }, { "epoch": 9.471000897299943, "grad_norm": 1.426164150238037, "learning_rate": 4.2514176471274424e-07, "loss": 0.2488, "num_input_tokens_seen": 110861776, "step": 116105 }, { "epoch": 9.471408760910352, "grad_norm": 10.859003067016602, "learning_rate": 4.2448840187908344e-07, "loss": 0.3361, "num_input_tokens_seen": 110866624, "step": 116110 }, { "epoch": 9.471816624520761, "grad_norm": 40.32784652709961, "learning_rate": 4.2383553718059746e-07, "loss": 0.3029, "num_input_tokens_seen": 110870944, "step": 116115 }, { "epoch": 9.472224488131168, "grad_norm": 7.852794170379639, "learning_rate": 4.231831706305256e-07, "loss": 0.1786, "num_input_tokens_seen": 110875600, "step": 116120 }, { "epoch": 9.472632351741577, "grad_norm": 3.5123648643493652, "learning_rate": 4.225313022420879e-07, "loss": 0.2838, "num_input_tokens_seen": 110880800, "step": 116125 }, { "epoch": 9.473040215351986, "grad_norm": 17.47815704345703, "learning_rate": 4.218799320284961e-07, "loss": 0.4626, "num_input_tokens_seen": 110885760, "step": 116130 }, { "epoch": 9.473448078962395, "grad_norm": 2.3218302726745605, "learning_rate": 4.212290600029534e-07, "loss": 0.2826, "num_input_tokens_seen": 110891056, "step": 116135 }, { "epoch": 9.473855942572804, "grad_norm": 20.196680068969727, "learning_rate": 4.2057868617865484e-07, "loss": 0.198, "num_input_tokens_seen": 110896528, "step": 116140 }, { "epoch": 9.474263806183213, "grad_norm": 4.966709613800049, "learning_rate": 4.199288105687815e-07, "loss": 0.2543, "num_input_tokens_seen": 110901696, "step": 116145 }, { "epoch": 9.47467166979362, "grad_norm": 4.673581600189209, "learning_rate": 4.192794331865063e-07, "loss": 0.4012, "num_input_tokens_seen": 110906224, "step": 116150 }, { "epoch": 9.47507953340403, "grad_norm": 13.653801918029785, "learning_rate": 4.186305540449909e-07, "loss": 0.4744, "num_input_tokens_seen": 110910720, "step": 116155 }, { "epoch": 9.475487397014438, "grad_norm": 13.248236656188965, "learning_rate": 4.179821731573913e-07, "loss": 0.4137, "num_input_tokens_seen": 110914848, "step": 116160 }, { "epoch": 9.475895260624847, "grad_norm": 17.206388473510742, "learning_rate": 4.1733429053684714e-07, "loss": 0.241, "num_input_tokens_seen": 110920192, "step": 116165 }, { "epoch": 9.476303124235256, "grad_norm": 36.667869567871094, "learning_rate": 4.166869061964895e-07, "loss": 0.3581, "num_input_tokens_seen": 110925456, "step": 116170 }, { "epoch": 9.476710987845664, "grad_norm": 12.831374168395996, "learning_rate": 4.160400201494441e-07, "loss": 0.3369, "num_input_tokens_seen": 110930208, "step": 116175 }, { "epoch": 9.477118851456073, "grad_norm": 20.81499481201172, "learning_rate": 4.15393632408817e-07, "loss": 0.3393, "num_input_tokens_seen": 110935536, "step": 116180 }, { "epoch": 9.477526715066482, "grad_norm": 6.944016933441162, "learning_rate": 4.147477429877172e-07, "loss": 0.4549, "num_input_tokens_seen": 110939472, "step": 116185 }, { "epoch": 9.47793457867689, "grad_norm": 12.372458457946777, "learning_rate": 4.141023518992343e-07, "loss": 0.2585, "num_input_tokens_seen": 110944560, "step": 116190 }, { "epoch": 9.4783424422873, "grad_norm": 2.7134456634521484, "learning_rate": 4.134574591564494e-07, "loss": 0.2827, "num_input_tokens_seen": 110949600, "step": 116195 }, { "epoch": 9.478750305897707, "grad_norm": 20.892484664916992, "learning_rate": 4.1281306477243264e-07, "loss": 0.2924, "num_input_tokens_seen": 110955200, "step": 116200 }, { "epoch": 9.479158169508116, "grad_norm": 28.54571533203125, "learning_rate": 4.1216916876024856e-07, "loss": 0.4117, "num_input_tokens_seen": 110960608, "step": 116205 }, { "epoch": 9.479566033118525, "grad_norm": 2.2112650871276855, "learning_rate": 4.115257711329479e-07, "loss": 0.3213, "num_input_tokens_seen": 110965056, "step": 116210 }, { "epoch": 9.479973896728934, "grad_norm": 13.505776405334473, "learning_rate": 4.108828719035701e-07, "loss": 0.4637, "num_input_tokens_seen": 110969536, "step": 116215 }, { "epoch": 9.480381760339343, "grad_norm": 29.860124588012695, "learning_rate": 4.1024047108514927e-07, "loss": 0.4303, "num_input_tokens_seen": 110974672, "step": 116220 }, { "epoch": 9.480789623949752, "grad_norm": 3.1706056594848633, "learning_rate": 4.0959856869070556e-07, "loss": 0.3673, "num_input_tokens_seen": 110979952, "step": 116225 }, { "epoch": 9.48119748756016, "grad_norm": 15.473710060119629, "learning_rate": 4.0895716473324795e-07, "loss": 0.2815, "num_input_tokens_seen": 110983824, "step": 116230 }, { "epoch": 9.481605351170568, "grad_norm": 14.085193634033203, "learning_rate": 4.0831625922578e-07, "loss": 0.3909, "num_input_tokens_seen": 110988624, "step": 116235 }, { "epoch": 9.482013214780977, "grad_norm": 28.089435577392578, "learning_rate": 4.07675852181294e-07, "loss": 0.248, "num_input_tokens_seen": 110993232, "step": 116240 }, { "epoch": 9.482421078391386, "grad_norm": 4.708751678466797, "learning_rate": 4.0703594361276573e-07, "loss": 0.3421, "num_input_tokens_seen": 110998112, "step": 116245 }, { "epoch": 9.482828942001795, "grad_norm": 16.196516036987305, "learning_rate": 4.063965335331682e-07, "loss": 0.3676, "num_input_tokens_seen": 111002752, "step": 116250 }, { "epoch": 9.483236805612203, "grad_norm": 2.160914182662964, "learning_rate": 4.05757621955466e-07, "loss": 0.351, "num_input_tokens_seen": 111007792, "step": 116255 }, { "epoch": 9.483644669222612, "grad_norm": 5.315179347991943, "learning_rate": 4.051192088926042e-07, "loss": 0.4147, "num_input_tokens_seen": 111012752, "step": 116260 }, { "epoch": 9.48405253283302, "grad_norm": 3.4037766456604004, "learning_rate": 4.044812943575255e-07, "loss": 0.4585, "num_input_tokens_seen": 111017104, "step": 116265 }, { "epoch": 9.48446039644343, "grad_norm": 15.658526420593262, "learning_rate": 4.0384387836315816e-07, "loss": 0.4018, "num_input_tokens_seen": 111022704, "step": 116270 }, { "epoch": 9.484868260053839, "grad_norm": 43.07913589477539, "learning_rate": 4.032069609224254e-07, "loss": 0.5582, "num_input_tokens_seen": 111027072, "step": 116275 }, { "epoch": 9.485276123664246, "grad_norm": 0.9372084736824036, "learning_rate": 4.025705420482362e-07, "loss": 0.3817, "num_input_tokens_seen": 111032160, "step": 116280 }, { "epoch": 9.485683987274655, "grad_norm": 1.07852041721344, "learning_rate": 4.019346217534886e-07, "loss": 0.3446, "num_input_tokens_seen": 111037024, "step": 116285 }, { "epoch": 9.486091850885064, "grad_norm": 22.536598205566406, "learning_rate": 4.012992000510751e-07, "loss": 0.3909, "num_input_tokens_seen": 111041264, "step": 116290 }, { "epoch": 9.486499714495473, "grad_norm": 1.765608549118042, "learning_rate": 4.0066427695387153e-07, "loss": 0.4401, "num_input_tokens_seen": 111045248, "step": 116295 }, { "epoch": 9.486907578105882, "grad_norm": 8.84526252746582, "learning_rate": 4.0002985247475366e-07, "loss": 0.3693, "num_input_tokens_seen": 111049456, "step": 116300 }, { "epoch": 9.48731544171629, "grad_norm": 3.230408191680908, "learning_rate": 3.993959266265751e-07, "loss": 0.2377, "num_input_tokens_seen": 111054176, "step": 116305 }, { "epoch": 9.487723305326698, "grad_norm": 2.9881439208984375, "learning_rate": 3.9876249942218946e-07, "loss": 0.3043, "num_input_tokens_seen": 111059216, "step": 116310 }, { "epoch": 9.488131168937107, "grad_norm": 4.363719463348389, "learning_rate": 3.981295708744337e-07, "loss": 0.3827, "num_input_tokens_seen": 111064272, "step": 116315 }, { "epoch": 9.488539032547516, "grad_norm": 25.574934005737305, "learning_rate": 3.9749714099613643e-07, "loss": 0.4618, "num_input_tokens_seen": 111068816, "step": 116320 }, { "epoch": 9.488946896157925, "grad_norm": 28.254756927490234, "learning_rate": 3.96865209800118e-07, "loss": 0.2622, "num_input_tokens_seen": 111073792, "step": 116325 }, { "epoch": 9.489354759768334, "grad_norm": 36.851375579833984, "learning_rate": 3.962337772991903e-07, "loss": 0.3865, "num_input_tokens_seen": 111079296, "step": 116330 }, { "epoch": 9.489762623378741, "grad_norm": 5.304149150848389, "learning_rate": 3.9560284350614317e-07, "loss": 0.4894, "num_input_tokens_seen": 111084096, "step": 116335 }, { "epoch": 9.49017048698915, "grad_norm": 2.0660786628723145, "learning_rate": 3.949724084337747e-07, "loss": 0.2918, "num_input_tokens_seen": 111089216, "step": 116340 }, { "epoch": 9.49057835059956, "grad_norm": 2.2828381061553955, "learning_rate": 3.9434247209486074e-07, "loss": 0.2824, "num_input_tokens_seen": 111094256, "step": 116345 }, { "epoch": 9.490986214209968, "grad_norm": 67.58649444580078, "learning_rate": 3.937130345021717e-07, "loss": 0.4227, "num_input_tokens_seen": 111098672, "step": 116350 }, { "epoch": 9.491394077820377, "grad_norm": 15.387125015258789, "learning_rate": 3.930840956684584e-07, "loss": 0.4146, "num_input_tokens_seen": 111103568, "step": 116355 }, { "epoch": 9.491801941430786, "grad_norm": 46.011451721191406, "learning_rate": 3.9245565560647736e-07, "loss": 0.442, "num_input_tokens_seen": 111108048, "step": 116360 }, { "epoch": 9.492209805041194, "grad_norm": 5.978000164031982, "learning_rate": 3.918277143289628e-07, "loss": 0.3203, "num_input_tokens_seen": 111113408, "step": 116365 }, { "epoch": 9.492617668651603, "grad_norm": 10.732394218444824, "learning_rate": 3.9120027184864627e-07, "loss": 0.2042, "num_input_tokens_seen": 111118192, "step": 116370 }, { "epoch": 9.493025532262012, "grad_norm": 9.51949405670166, "learning_rate": 3.9057332817823975e-07, "loss": 0.2034, "num_input_tokens_seen": 111122384, "step": 116375 }, { "epoch": 9.49343339587242, "grad_norm": 21.39914321899414, "learning_rate": 3.899468833304554e-07, "loss": 0.2766, "num_input_tokens_seen": 111127056, "step": 116380 }, { "epoch": 9.49384125948283, "grad_norm": 4.772549152374268, "learning_rate": 3.8932093731799126e-07, "loss": 0.3972, "num_input_tokens_seen": 111132048, "step": 116385 }, { "epoch": 9.494249123093237, "grad_norm": 6.199352741241455, "learning_rate": 3.8869549015353444e-07, "loss": 0.3194, "num_input_tokens_seen": 111136496, "step": 116390 }, { "epoch": 9.494656986703646, "grad_norm": 9.497383117675781, "learning_rate": 3.880705418497638e-07, "loss": 0.2582, "num_input_tokens_seen": 111141664, "step": 116395 }, { "epoch": 9.495064850314055, "grad_norm": 13.033835411071777, "learning_rate": 3.874460924193413e-07, "loss": 0.2481, "num_input_tokens_seen": 111146368, "step": 116400 }, { "epoch": 9.495472713924464, "grad_norm": 12.474300384521484, "learning_rate": 3.8682214187492903e-07, "loss": 0.3803, "num_input_tokens_seen": 111151520, "step": 116405 }, { "epoch": 9.495880577534873, "grad_norm": 1.0435384511947632, "learning_rate": 3.8619869022917256e-07, "loss": 0.1905, "num_input_tokens_seen": 111156384, "step": 116410 }, { "epoch": 9.496288441145282, "grad_norm": 0.7814717292785645, "learning_rate": 3.8557573749471164e-07, "loss": 0.2602, "num_input_tokens_seen": 111161008, "step": 116415 }, { "epoch": 9.49669630475569, "grad_norm": 18.30799674987793, "learning_rate": 3.849532836841696e-07, "loss": 0.3863, "num_input_tokens_seen": 111165456, "step": 116420 }, { "epoch": 9.497104168366098, "grad_norm": 47.50973892211914, "learning_rate": 3.8433132881016684e-07, "loss": 0.4424, "num_input_tokens_seen": 111170432, "step": 116425 }, { "epoch": 9.497512031976507, "grad_norm": 15.639080047607422, "learning_rate": 3.8370987288530724e-07, "loss": 0.3303, "num_input_tokens_seen": 111175184, "step": 116430 }, { "epoch": 9.497919895586916, "grad_norm": 12.150996208190918, "learning_rate": 3.8308891592218897e-07, "loss": 0.3128, "num_input_tokens_seen": 111180064, "step": 116435 }, { "epoch": 9.498327759197325, "grad_norm": 1.0049538612365723, "learning_rate": 3.8246845793339645e-07, "loss": 0.2877, "num_input_tokens_seen": 111184544, "step": 116440 }, { "epoch": 9.498735622807732, "grad_norm": 3.749361515045166, "learning_rate": 3.8184849893150855e-07, "loss": 0.3366, "num_input_tokens_seen": 111188864, "step": 116445 }, { "epoch": 9.499143486418141, "grad_norm": 2.3965096473693848, "learning_rate": 3.812290389290901e-07, "loss": 0.327, "num_input_tokens_seen": 111193504, "step": 116450 }, { "epoch": 9.49955135002855, "grad_norm": 2.124617576599121, "learning_rate": 3.8061007793869783e-07, "loss": 0.402, "num_input_tokens_seen": 111198256, "step": 116455 }, { "epoch": 9.49995921363896, "grad_norm": 21.81849479675293, "learning_rate": 3.7999161597288e-07, "loss": 0.2991, "num_input_tokens_seen": 111202480, "step": 116460 }, { "epoch": 9.500367077249368, "grad_norm": 35.76846694946289, "learning_rate": 3.7937365304416827e-07, "loss": 0.3757, "num_input_tokens_seen": 111207920, "step": 116465 }, { "epoch": 9.500774940859776, "grad_norm": 6.842708587646484, "learning_rate": 3.7875618916509146e-07, "loss": 0.4091, "num_input_tokens_seen": 111213008, "step": 116470 }, { "epoch": 9.500774940859776, "eval_loss": 0.3412568271160126, "eval_runtime": 570.8944, "eval_samples_per_second": 4.773, "eval_steps_per_second": 2.387, "num_input_tokens_seen": 111213008, "step": 116470 }, { "epoch": 9.501182804470185, "grad_norm": 7.092164039611816, "learning_rate": 3.7813922434816465e-07, "loss": 0.3923, "num_input_tokens_seen": 111218112, "step": 116475 }, { "epoch": 9.501590668080594, "grad_norm": 4.391597270965576, "learning_rate": 3.7752275860589157e-07, "loss": 0.3495, "num_input_tokens_seen": 111222576, "step": 116480 }, { "epoch": 9.501998531691003, "grad_norm": 15.782483100891113, "learning_rate": 3.769067919507707e-07, "loss": 0.3299, "num_input_tokens_seen": 111227440, "step": 116485 }, { "epoch": 9.502406395301412, "grad_norm": 8.440590858459473, "learning_rate": 3.762913243952865e-07, "loss": 0.5157, "num_input_tokens_seen": 111232224, "step": 116490 }, { "epoch": 9.502814258911819, "grad_norm": 56.399383544921875, "learning_rate": 3.756763559519122e-07, "loss": 0.2591, "num_input_tokens_seen": 111237168, "step": 116495 }, { "epoch": 9.503222122522228, "grad_norm": 25.45534896850586, "learning_rate": 3.7506188663311847e-07, "loss": 0.2419, "num_input_tokens_seen": 111241632, "step": 116500 }, { "epoch": 9.503629986132637, "grad_norm": 2.5003175735473633, "learning_rate": 3.7444791645135366e-07, "loss": 0.2335, "num_input_tokens_seen": 111245280, "step": 116505 }, { "epoch": 9.504037849743046, "grad_norm": 5.336855888366699, "learning_rate": 3.7383444541906345e-07, "loss": 0.3103, "num_input_tokens_seen": 111249968, "step": 116510 }, { "epoch": 9.504445713353455, "grad_norm": 2.52763295173645, "learning_rate": 3.7322147354868777e-07, "loss": 0.3273, "num_input_tokens_seen": 111255312, "step": 116515 }, { "epoch": 9.504853576963864, "grad_norm": 3.373634099960327, "learning_rate": 3.7260900085265007e-07, "loss": 0.3982, "num_input_tokens_seen": 111260784, "step": 116520 }, { "epoch": 9.505261440574271, "grad_norm": 4.8534698486328125, "learning_rate": 3.7199702734335985e-07, "loss": 0.3083, "num_input_tokens_seen": 111265568, "step": 116525 }, { "epoch": 9.50566930418468, "grad_norm": 25.80337905883789, "learning_rate": 3.713855530332266e-07, "loss": 0.3925, "num_input_tokens_seen": 111269744, "step": 116530 }, { "epoch": 9.50607716779509, "grad_norm": 10.013846397399902, "learning_rate": 3.7077457793464057e-07, "loss": 0.428, "num_input_tokens_seen": 111275024, "step": 116535 }, { "epoch": 9.506485031405498, "grad_norm": 18.485271453857422, "learning_rate": 3.7016410205998886e-07, "loss": 0.3696, "num_input_tokens_seen": 111279712, "step": 116540 }, { "epoch": 9.506892895015907, "grad_norm": 10.932453155517578, "learning_rate": 3.695541254216478e-07, "loss": 0.2534, "num_input_tokens_seen": 111284496, "step": 116545 }, { "epoch": 9.507300758626315, "grad_norm": 6.689908027648926, "learning_rate": 3.689446480319769e-07, "loss": 0.455, "num_input_tokens_seen": 111289680, "step": 116550 }, { "epoch": 9.507708622236724, "grad_norm": 5.9997382164001465, "learning_rate": 3.6833566990333024e-07, "loss": 0.4766, "num_input_tokens_seen": 111293968, "step": 116555 }, { "epoch": 9.508116485847133, "grad_norm": 1.784796118736267, "learning_rate": 3.677271910480562e-07, "loss": 0.3235, "num_input_tokens_seen": 111298064, "step": 116560 }, { "epoch": 9.508524349457542, "grad_norm": 11.099157333374023, "learning_rate": 3.6711921147848104e-07, "loss": 0.5198, "num_input_tokens_seen": 111302752, "step": 116565 }, { "epoch": 9.50893221306795, "grad_norm": 4.750421524047852, "learning_rate": 3.665117312069366e-07, "loss": 0.3017, "num_input_tokens_seen": 111308288, "step": 116570 }, { "epoch": 9.50934007667836, "grad_norm": 35.2308464050293, "learning_rate": 3.659047502457269e-07, "loss": 0.4441, "num_input_tokens_seen": 111312816, "step": 116575 }, { "epoch": 9.509747940288767, "grad_norm": 1.5335865020751953, "learning_rate": 3.652982686071643e-07, "loss": 0.4186, "num_input_tokens_seen": 111318192, "step": 116580 }, { "epoch": 9.510155803899176, "grad_norm": 14.279157638549805, "learning_rate": 3.6469228630353627e-07, "loss": 0.2898, "num_input_tokens_seen": 111323568, "step": 116585 }, { "epoch": 9.510563667509585, "grad_norm": 13.088391304016113, "learning_rate": 3.640868033471273e-07, "loss": 0.3034, "num_input_tokens_seen": 111328464, "step": 116590 }, { "epoch": 9.510971531119994, "grad_norm": 45.30780029296875, "learning_rate": 3.63481819750211e-07, "loss": 0.4792, "num_input_tokens_seen": 111334032, "step": 116595 }, { "epoch": 9.511379394730403, "grad_norm": 6.890456199645996, "learning_rate": 3.62877335525047e-07, "loss": 0.4022, "num_input_tokens_seen": 111339056, "step": 116600 }, { "epoch": 9.51178725834081, "grad_norm": 20.023271560668945, "learning_rate": 3.622733506838949e-07, "loss": 0.3667, "num_input_tokens_seen": 111343968, "step": 116605 }, { "epoch": 9.512195121951219, "grad_norm": 2.578160285949707, "learning_rate": 3.6166986523898936e-07, "loss": 0.3555, "num_input_tokens_seen": 111348624, "step": 116610 }, { "epoch": 9.512602985561628, "grad_norm": 40.24373245239258, "learning_rate": 3.610668792025679e-07, "loss": 0.4695, "num_input_tokens_seen": 111354208, "step": 116615 }, { "epoch": 9.513010849172037, "grad_norm": 21.515058517456055, "learning_rate": 3.6046439258685114e-07, "loss": 0.5096, "num_input_tokens_seen": 111360208, "step": 116620 }, { "epoch": 9.513418712782446, "grad_norm": 18.72354507446289, "learning_rate": 3.5986240540404894e-07, "loss": 0.4177, "num_input_tokens_seen": 111365856, "step": 116625 }, { "epoch": 9.513826576392855, "grad_norm": 16.774280548095703, "learning_rate": 3.5926091766636807e-07, "loss": 0.4075, "num_input_tokens_seen": 111370944, "step": 116630 }, { "epoch": 9.514234440003262, "grad_norm": 9.377452850341797, "learning_rate": 3.586599293859988e-07, "loss": 0.2855, "num_input_tokens_seen": 111376000, "step": 116635 }, { "epoch": 9.514642303613671, "grad_norm": 1.0306106805801392, "learning_rate": 3.5805944057512033e-07, "loss": 0.3124, "num_input_tokens_seen": 111380272, "step": 116640 }, { "epoch": 9.51505016722408, "grad_norm": 15.848241806030273, "learning_rate": 3.574594512459034e-07, "loss": 0.6068, "num_input_tokens_seen": 111385472, "step": 116645 }, { "epoch": 9.51545803083449, "grad_norm": 6.03299617767334, "learning_rate": 3.5685996141051616e-07, "loss": 0.2914, "num_input_tokens_seen": 111390096, "step": 116650 }, { "epoch": 9.515865894444898, "grad_norm": 7.522875785827637, "learning_rate": 3.562609710811043e-07, "loss": 0.5412, "num_input_tokens_seen": 111394528, "step": 116655 }, { "epoch": 9.516273758055306, "grad_norm": 13.747296333312988, "learning_rate": 3.5566248026981106e-07, "loss": 0.4072, "num_input_tokens_seen": 111399264, "step": 116660 }, { "epoch": 9.516681621665715, "grad_norm": 22.538854598999023, "learning_rate": 3.550644889887683e-07, "loss": 0.3673, "num_input_tokens_seen": 111403808, "step": 116665 }, { "epoch": 9.517089485276124, "grad_norm": 28.730045318603516, "learning_rate": 3.544669972500914e-07, "loss": 0.2596, "num_input_tokens_seen": 111408784, "step": 116670 }, { "epoch": 9.517497348886533, "grad_norm": 6.571080207824707, "learning_rate": 3.5387000506590107e-07, "loss": 0.2866, "num_input_tokens_seen": 111413792, "step": 116675 }, { "epoch": 9.517905212496942, "grad_norm": 7.210084438323975, "learning_rate": 3.532735124482933e-07, "loss": 0.2761, "num_input_tokens_seen": 111418352, "step": 116680 }, { "epoch": 9.518313076107349, "grad_norm": 5.9499616622924805, "learning_rate": 3.526775194093557e-07, "loss": 0.3325, "num_input_tokens_seen": 111423008, "step": 116685 }, { "epoch": 9.518720939717758, "grad_norm": 15.453218460083008, "learning_rate": 3.520820259611729e-07, "loss": 0.4745, "num_input_tokens_seen": 111426912, "step": 116690 }, { "epoch": 9.519128803328167, "grad_norm": 0.7324364185333252, "learning_rate": 3.5148703211581313e-07, "loss": 0.3971, "num_input_tokens_seen": 111431472, "step": 116695 }, { "epoch": 9.519536666938576, "grad_norm": 14.973761558532715, "learning_rate": 3.508925378853362e-07, "loss": 0.4724, "num_input_tokens_seen": 111436224, "step": 116700 }, { "epoch": 9.519944530548985, "grad_norm": 19.505290985107422, "learning_rate": 3.5029854328179623e-07, "loss": 0.4821, "num_input_tokens_seen": 111441104, "step": 116705 }, { "epoch": 9.520352394159392, "grad_norm": 5.444526195526123, "learning_rate": 3.4970504831722814e-07, "loss": 0.2055, "num_input_tokens_seen": 111445648, "step": 116710 }, { "epoch": 9.520760257769801, "grad_norm": 12.926633834838867, "learning_rate": 3.4911205300366677e-07, "loss": 0.3144, "num_input_tokens_seen": 111451056, "step": 116715 }, { "epoch": 9.52116812138021, "grad_norm": 9.037222862243652, "learning_rate": 3.485195573531275e-07, "loss": 0.3313, "num_input_tokens_seen": 111455920, "step": 116720 }, { "epoch": 9.52157598499062, "grad_norm": 10.320701599121094, "learning_rate": 3.479275613776228e-07, "loss": 0.3934, "num_input_tokens_seen": 111460256, "step": 116725 }, { "epoch": 9.521983848601028, "grad_norm": 1.4402251243591309, "learning_rate": 3.473360650891516e-07, "loss": 0.146, "num_input_tokens_seen": 111465232, "step": 116730 }, { "epoch": 9.522391712211437, "grad_norm": 1.3435786962509155, "learning_rate": 3.467450684996987e-07, "loss": 0.2564, "num_input_tokens_seen": 111470096, "step": 116735 }, { "epoch": 9.522799575821844, "grad_norm": 1.4793509244918823, "learning_rate": 3.4615457162125163e-07, "loss": 0.2365, "num_input_tokens_seen": 111474960, "step": 116740 }, { "epoch": 9.523207439432253, "grad_norm": 7.524028778076172, "learning_rate": 3.4556457446577595e-07, "loss": 0.3647, "num_input_tokens_seen": 111479952, "step": 116745 }, { "epoch": 9.523615303042662, "grad_norm": 53.71116256713867, "learning_rate": 3.4497507704522594e-07, "loss": 0.4169, "num_input_tokens_seen": 111484400, "step": 116750 }, { "epoch": 9.524023166653071, "grad_norm": 78.15371704101562, "learning_rate": 3.4438607937155587e-07, "loss": 0.4736, "num_input_tokens_seen": 111488672, "step": 116755 }, { "epoch": 9.52443103026348, "grad_norm": 42.6385383605957, "learning_rate": 3.4379758145670625e-07, "loss": 0.4558, "num_input_tokens_seen": 111492896, "step": 116760 }, { "epoch": 9.524838893873888, "grad_norm": 7.317156791687012, "learning_rate": 3.432095833126009e-07, "loss": 0.2099, "num_input_tokens_seen": 111498160, "step": 116765 }, { "epoch": 9.525246757484297, "grad_norm": 8.343524932861328, "learning_rate": 3.42622084951158e-07, "loss": 0.2907, "num_input_tokens_seen": 111502848, "step": 116770 }, { "epoch": 9.525654621094706, "grad_norm": 2.7584211826324463, "learning_rate": 3.4203508638429024e-07, "loss": 0.3053, "num_input_tokens_seen": 111508144, "step": 116775 }, { "epoch": 9.526062484705115, "grad_norm": 27.921098709106445, "learning_rate": 3.41448587623891e-07, "loss": 0.3199, "num_input_tokens_seen": 111512880, "step": 116780 }, { "epoch": 9.526470348315524, "grad_norm": 42.70395278930664, "learning_rate": 3.408625886818534e-07, "loss": 0.2023, "num_input_tokens_seen": 111518288, "step": 116785 }, { "epoch": 9.526878211925933, "grad_norm": 5.17982292175293, "learning_rate": 3.402770895700513e-07, "loss": 0.3335, "num_input_tokens_seen": 111523744, "step": 116790 }, { "epoch": 9.52728607553634, "grad_norm": 24.47589683532715, "learning_rate": 3.396920903003559e-07, "loss": 0.4027, "num_input_tokens_seen": 111527920, "step": 116795 }, { "epoch": 9.527693939146749, "grad_norm": 14.254733085632324, "learning_rate": 3.3910759088462143e-07, "loss": 0.2983, "num_input_tokens_seen": 111532256, "step": 116800 }, { "epoch": 9.528101802757158, "grad_norm": 13.49553108215332, "learning_rate": 3.385235913346968e-07, "loss": 0.3414, "num_input_tokens_seen": 111537232, "step": 116805 }, { "epoch": 9.528509666367567, "grad_norm": 13.62407112121582, "learning_rate": 3.379400916624198e-07, "loss": 0.3201, "num_input_tokens_seen": 111541744, "step": 116810 }, { "epoch": 9.528917529977976, "grad_norm": 12.214893341064453, "learning_rate": 3.373570918796198e-07, "loss": 0.342, "num_input_tokens_seen": 111546336, "step": 116815 }, { "epoch": 9.529325393588383, "grad_norm": 1.0495190620422363, "learning_rate": 3.367745919981097e-07, "loss": 0.299, "num_input_tokens_seen": 111550608, "step": 116820 }, { "epoch": 9.529733257198792, "grad_norm": 16.943510055541992, "learning_rate": 3.361925920296965e-07, "loss": 0.3109, "num_input_tokens_seen": 111555024, "step": 116825 }, { "epoch": 9.530141120809201, "grad_norm": 3.493394136428833, "learning_rate": 3.356110919861821e-07, "loss": 0.4341, "num_input_tokens_seen": 111560512, "step": 116830 }, { "epoch": 9.53054898441961, "grad_norm": 20.38507843017578, "learning_rate": 3.350300918793514e-07, "loss": 0.4421, "num_input_tokens_seen": 111565424, "step": 116835 }, { "epoch": 9.53095684803002, "grad_norm": 3.3082709312438965, "learning_rate": 3.3444959172097833e-07, "loss": 0.2814, "num_input_tokens_seen": 111569904, "step": 116840 }, { "epoch": 9.531364711640428, "grad_norm": 27.984703063964844, "learning_rate": 3.3386959152283126e-07, "loss": 0.3288, "num_input_tokens_seen": 111574784, "step": 116845 }, { "epoch": 9.531772575250836, "grad_norm": 36.244422912597656, "learning_rate": 3.332900912966674e-07, "loss": 0.3157, "num_input_tokens_seen": 111579696, "step": 116850 }, { "epoch": 9.532180438861245, "grad_norm": 9.616392135620117, "learning_rate": 3.327110910542303e-07, "loss": 0.3915, "num_input_tokens_seen": 111584224, "step": 116855 }, { "epoch": 9.532588302471654, "grad_norm": 3.6854546070098877, "learning_rate": 3.3213259080725757e-07, "loss": 0.4015, "num_input_tokens_seen": 111589776, "step": 116860 }, { "epoch": 9.532996166082063, "grad_norm": 20.384737014770508, "learning_rate": 3.315545905674761e-07, "loss": 0.3594, "num_input_tokens_seen": 111593840, "step": 116865 }, { "epoch": 9.533404029692472, "grad_norm": 37.22783660888672, "learning_rate": 3.309770903465986e-07, "loss": 0.4594, "num_input_tokens_seen": 111598656, "step": 116870 }, { "epoch": 9.533811893302879, "grad_norm": 25.236709594726562, "learning_rate": 3.3040009015633534e-07, "loss": 0.3826, "num_input_tokens_seen": 111602928, "step": 116875 }, { "epoch": 9.534219756913288, "grad_norm": 1.718158483505249, "learning_rate": 3.298235900083796e-07, "loss": 0.2519, "num_input_tokens_seen": 111608160, "step": 116880 }, { "epoch": 9.534627620523697, "grad_norm": 2.856264114379883, "learning_rate": 3.292475899144165e-07, "loss": 0.3403, "num_input_tokens_seen": 111612592, "step": 116885 }, { "epoch": 9.535035484134106, "grad_norm": 6.011477470397949, "learning_rate": 3.286720898861201e-07, "loss": 0.3727, "num_input_tokens_seen": 111617168, "step": 116890 }, { "epoch": 9.535443347744515, "grad_norm": 5.181137561798096, "learning_rate": 3.28097089935156e-07, "loss": 0.3097, "num_input_tokens_seen": 111622992, "step": 116895 }, { "epoch": 9.535851211354922, "grad_norm": 2.340451717376709, "learning_rate": 3.275225900731843e-07, "loss": 0.3373, "num_input_tokens_seen": 111627776, "step": 116900 }, { "epoch": 9.536259074965331, "grad_norm": 15.768847465515137, "learning_rate": 3.26948590311843e-07, "loss": 0.4769, "num_input_tokens_seen": 111632192, "step": 116905 }, { "epoch": 9.53666693857574, "grad_norm": 3.0944864749908447, "learning_rate": 3.263750906627672e-07, "loss": 0.3447, "num_input_tokens_seen": 111636896, "step": 116910 }, { "epoch": 9.53707480218615, "grad_norm": 13.44897747039795, "learning_rate": 3.258020911375892e-07, "loss": 0.2971, "num_input_tokens_seen": 111641760, "step": 116915 }, { "epoch": 9.537482665796558, "grad_norm": 0.8824487328529358, "learning_rate": 3.252295917479137e-07, "loss": 0.2578, "num_input_tokens_seen": 111646256, "step": 116920 }, { "epoch": 9.537890529406967, "grad_norm": 0.8090168833732605, "learning_rate": 3.2465759250535356e-07, "loss": 0.5332, "num_input_tokens_seen": 111651776, "step": 116925 }, { "epoch": 9.538298393017374, "grad_norm": 1.9302153587341309, "learning_rate": 3.240860934214968e-07, "loss": 0.2697, "num_input_tokens_seen": 111657328, "step": 116930 }, { "epoch": 9.538706256627783, "grad_norm": 51.375186920166016, "learning_rate": 3.235150945079257e-07, "loss": 0.255, "num_input_tokens_seen": 111661904, "step": 116935 }, { "epoch": 9.539114120238192, "grad_norm": 20.406063079833984, "learning_rate": 3.2294459577622284e-07, "loss": 0.246, "num_input_tokens_seen": 111666528, "step": 116940 }, { "epoch": 9.539521983848601, "grad_norm": 8.396345138549805, "learning_rate": 3.2237459723794547e-07, "loss": 0.2052, "num_input_tokens_seen": 111670704, "step": 116945 }, { "epoch": 9.53992984745901, "grad_norm": 23.12881851196289, "learning_rate": 3.2180509890465115e-07, "loss": 0.4474, "num_input_tokens_seen": 111675424, "step": 116950 }, { "epoch": 9.540337711069418, "grad_norm": 17.190792083740234, "learning_rate": 3.21236100787875e-07, "loss": 0.3712, "num_input_tokens_seen": 111679888, "step": 116955 }, { "epoch": 9.540745574679827, "grad_norm": 4.216430187225342, "learning_rate": 3.2066760289916063e-07, "loss": 0.1844, "num_input_tokens_seen": 111684784, "step": 116960 }, { "epoch": 9.541153438290236, "grad_norm": 4.065374374389648, "learning_rate": 3.2009960525002937e-07, "loss": 0.6827, "num_input_tokens_seen": 111688960, "step": 116965 }, { "epoch": 9.541561301900645, "grad_norm": 3.8313584327697754, "learning_rate": 3.195321078519886e-07, "loss": 0.2241, "num_input_tokens_seen": 111694256, "step": 116970 }, { "epoch": 9.541969165511054, "grad_norm": 21.95708465576172, "learning_rate": 3.189651107165459e-07, "loss": 0.2456, "num_input_tokens_seen": 111699120, "step": 116975 }, { "epoch": 9.542377029121461, "grad_norm": 3.480255126953125, "learning_rate": 3.18398613855192e-07, "loss": 0.3201, "num_input_tokens_seen": 111703200, "step": 116980 }, { "epoch": 9.54278489273187, "grad_norm": 40.636497497558594, "learning_rate": 3.1783261727941213e-07, "loss": 0.399, "num_input_tokens_seen": 111707808, "step": 116985 }, { "epoch": 9.543192756342279, "grad_norm": 5.069387912750244, "learning_rate": 3.17267121000675e-07, "loss": 0.437, "num_input_tokens_seen": 111711984, "step": 116990 }, { "epoch": 9.543600619952688, "grad_norm": 0.8726741075515747, "learning_rate": 3.167021250304464e-07, "loss": 0.5171, "num_input_tokens_seen": 111715952, "step": 116995 }, { "epoch": 9.544008483563097, "grad_norm": 8.48891544342041, "learning_rate": 3.1613762938017544e-07, "loss": 0.2025, "num_input_tokens_seen": 111721296, "step": 117000 }, { "epoch": 9.544416347173506, "grad_norm": 14.185662269592285, "learning_rate": 3.155736340613086e-07, "loss": 0.1941, "num_input_tokens_seen": 111726688, "step": 117005 }, { "epoch": 9.544824210783913, "grad_norm": 1.3964539766311646, "learning_rate": 3.1501013908527275e-07, "loss": 0.3246, "num_input_tokens_seen": 111732176, "step": 117010 }, { "epoch": 9.545232074394322, "grad_norm": 1.7890692949295044, "learning_rate": 3.144471444634922e-07, "loss": 0.309, "num_input_tokens_seen": 111737136, "step": 117015 }, { "epoch": 9.545639938004731, "grad_norm": 4.278000354766846, "learning_rate": 3.1388465020738e-07, "loss": 0.4213, "num_input_tokens_seen": 111742608, "step": 117020 }, { "epoch": 9.54604780161514, "grad_norm": 44.8065185546875, "learning_rate": 3.1332265632833533e-07, "loss": 0.384, "num_input_tokens_seen": 111747440, "step": 117025 }, { "epoch": 9.54645566522555, "grad_norm": 2.50862979888916, "learning_rate": 3.1276116283774913e-07, "loss": 0.3016, "num_input_tokens_seen": 111752336, "step": 117030 }, { "epoch": 9.546863528835956, "grad_norm": 49.648006439208984, "learning_rate": 3.122001697470067e-07, "loss": 0.4209, "num_input_tokens_seen": 111757392, "step": 117035 }, { "epoch": 9.547271392446365, "grad_norm": 6.818438529968262, "learning_rate": 3.11639677067474e-07, "loss": 0.5862, "num_input_tokens_seen": 111762512, "step": 117040 }, { "epoch": 9.547679256056774, "grad_norm": 0.8190313577651978, "learning_rate": 3.110796848105141e-07, "loss": 0.2027, "num_input_tokens_seen": 111766416, "step": 117045 }, { "epoch": 9.548087119667183, "grad_norm": 5.188661575317383, "learning_rate": 3.10520192987479e-07, "loss": 0.3564, "num_input_tokens_seen": 111770608, "step": 117050 }, { "epoch": 9.548494983277592, "grad_norm": 3.5198283195495605, "learning_rate": 3.0996120160970974e-07, "loss": 0.2653, "num_input_tokens_seen": 111775184, "step": 117055 }, { "epoch": 9.548902846888002, "grad_norm": 25.35212516784668, "learning_rate": 3.094027106885333e-07, "loss": 0.3619, "num_input_tokens_seen": 111780176, "step": 117060 }, { "epoch": 9.549310710498409, "grad_norm": 30.4596004486084, "learning_rate": 3.08844720235274e-07, "loss": 0.2496, "num_input_tokens_seen": 111784304, "step": 117065 }, { "epoch": 9.549718574108818, "grad_norm": 15.432849884033203, "learning_rate": 3.082872302612394e-07, "loss": 0.2461, "num_input_tokens_seen": 111788864, "step": 117070 }, { "epoch": 9.550126437719227, "grad_norm": 1.6300698518753052, "learning_rate": 3.077302407777316e-07, "loss": 0.1826, "num_input_tokens_seen": 111793680, "step": 117075 }, { "epoch": 9.550534301329636, "grad_norm": 11.567642211914062, "learning_rate": 3.071737517960388e-07, "loss": 0.4992, "num_input_tokens_seen": 111799248, "step": 117080 }, { "epoch": 9.550942164940045, "grad_norm": 2.749835968017578, "learning_rate": 3.066177633274409e-07, "loss": 0.2229, "num_input_tokens_seen": 111804608, "step": 117085 }, { "epoch": 9.551350028550452, "grad_norm": 2.834770917892456, "learning_rate": 3.060622753832065e-07, "loss": 0.4378, "num_input_tokens_seen": 111809344, "step": 117090 }, { "epoch": 9.551757892160861, "grad_norm": 46.96528625488281, "learning_rate": 3.0550728797459907e-07, "loss": 0.3282, "num_input_tokens_seen": 111814112, "step": 117095 }, { "epoch": 9.55216575577127, "grad_norm": 15.060888290405273, "learning_rate": 3.0495280111286495e-07, "loss": 0.2641, "num_input_tokens_seen": 111818176, "step": 117100 }, { "epoch": 9.552573619381679, "grad_norm": 16.500701904296875, "learning_rate": 3.043988148092425e-07, "loss": 0.3772, "num_input_tokens_seen": 111822576, "step": 117105 }, { "epoch": 9.552981482992088, "grad_norm": 1.4311405420303345, "learning_rate": 3.0384532907496433e-07, "loss": 0.3521, "num_input_tokens_seen": 111827504, "step": 117110 }, { "epoch": 9.553389346602497, "grad_norm": 1.0351163148880005, "learning_rate": 3.0329234392124927e-07, "loss": 0.4821, "num_input_tokens_seen": 111833296, "step": 117115 }, { "epoch": 9.553797210212904, "grad_norm": 49.79587173461914, "learning_rate": 3.027398593592995e-07, "loss": 0.3196, "num_input_tokens_seen": 111837632, "step": 117120 }, { "epoch": 9.554205073823313, "grad_norm": 8.343101501464844, "learning_rate": 3.0218787540032265e-07, "loss": 0.2569, "num_input_tokens_seen": 111841840, "step": 117125 }, { "epoch": 9.554612937433722, "grad_norm": 1.614417314529419, "learning_rate": 3.0163639205550155e-07, "loss": 0.3336, "num_input_tokens_seen": 111847328, "step": 117130 }, { "epoch": 9.555020801044131, "grad_norm": 14.041328430175781, "learning_rate": 3.0108540933601323e-07, "loss": 0.2901, "num_input_tokens_seen": 111852720, "step": 117135 }, { "epoch": 9.55542866465454, "grad_norm": 29.204118728637695, "learning_rate": 3.005349272530295e-07, "loss": 0.3324, "num_input_tokens_seen": 111857328, "step": 117140 }, { "epoch": 9.555836528264948, "grad_norm": 22.739944458007812, "learning_rate": 2.999849458177079e-07, "loss": 0.4315, "num_input_tokens_seen": 111862288, "step": 117145 }, { "epoch": 9.556244391875357, "grad_norm": 1.4779959917068481, "learning_rate": 2.9943546504119513e-07, "loss": 0.2593, "num_input_tokens_seen": 111866944, "step": 117150 }, { "epoch": 9.556652255485766, "grad_norm": 6.849167823791504, "learning_rate": 2.9888648493462955e-07, "loss": 0.2378, "num_input_tokens_seen": 111871632, "step": 117155 }, { "epoch": 9.557060119096175, "grad_norm": 33.49882888793945, "learning_rate": 2.9833800550914115e-07, "loss": 0.4284, "num_input_tokens_seen": 111876208, "step": 117160 }, { "epoch": 9.557467982706584, "grad_norm": 1.9949872493743896, "learning_rate": 2.977900267758432e-07, "loss": 0.2325, "num_input_tokens_seen": 111880400, "step": 117165 }, { "epoch": 9.55787584631699, "grad_norm": 2.9169580936431885, "learning_rate": 2.9724254874584354e-07, "loss": 0.2391, "num_input_tokens_seen": 111885840, "step": 117170 }, { "epoch": 9.5582837099274, "grad_norm": 3.514366865158081, "learning_rate": 2.966955714302444e-07, "loss": 0.3648, "num_input_tokens_seen": 111890304, "step": 117175 }, { "epoch": 9.558691573537809, "grad_norm": 26.324613571166992, "learning_rate": 2.9614909484012297e-07, "loss": 0.3944, "num_input_tokens_seen": 111895664, "step": 117180 }, { "epoch": 9.559099437148218, "grad_norm": 34.461631774902344, "learning_rate": 2.9560311898656766e-07, "loss": 0.2846, "num_input_tokens_seen": 111900560, "step": 117185 }, { "epoch": 9.559507300758627, "grad_norm": 1.335108757019043, "learning_rate": 2.950576438806363e-07, "loss": 0.2674, "num_input_tokens_seen": 111904544, "step": 117190 }, { "epoch": 9.559915164369034, "grad_norm": 27.324342727661133, "learning_rate": 2.9451266953339216e-07, "loss": 0.2687, "num_input_tokens_seen": 111909248, "step": 117195 }, { "epoch": 9.560323027979443, "grad_norm": 29.027816772460938, "learning_rate": 2.9396819595587376e-07, "loss": 0.2928, "num_input_tokens_seen": 111914256, "step": 117200 }, { "epoch": 9.560730891589852, "grad_norm": 16.497997283935547, "learning_rate": 2.93424223159125e-07, "loss": 0.4061, "num_input_tokens_seen": 111919584, "step": 117205 }, { "epoch": 9.561138755200261, "grad_norm": 3.8188204765319824, "learning_rate": 2.9288075115416755e-07, "loss": 0.4801, "num_input_tokens_seen": 111923840, "step": 117210 }, { "epoch": 9.56154661881067, "grad_norm": 10.67021369934082, "learning_rate": 2.923377799520205e-07, "loss": 0.411, "num_input_tokens_seen": 111928464, "step": 117215 }, { "epoch": 9.56195448242108, "grad_norm": 2.4200174808502197, "learning_rate": 2.9179530956368606e-07, "loss": 0.3181, "num_input_tokens_seen": 111933344, "step": 117220 }, { "epoch": 9.562362346031486, "grad_norm": 6.467616558074951, "learning_rate": 2.91253340000161e-07, "loss": 0.5126, "num_input_tokens_seen": 111938944, "step": 117225 }, { "epoch": 9.562770209641895, "grad_norm": 2.000020980834961, "learning_rate": 2.9071187127243105e-07, "loss": 0.3587, "num_input_tokens_seen": 111943312, "step": 117230 }, { "epoch": 9.563178073252304, "grad_norm": 34.99582290649414, "learning_rate": 2.9017090339147343e-07, "loss": 0.2244, "num_input_tokens_seen": 111947888, "step": 117235 }, { "epoch": 9.563585936862713, "grad_norm": 11.067061424255371, "learning_rate": 2.8963043636825173e-07, "loss": 0.4093, "num_input_tokens_seen": 111953296, "step": 117240 }, { "epoch": 9.563993800473122, "grad_norm": 1.153777003288269, "learning_rate": 2.8909047021372093e-07, "loss": 0.239, "num_input_tokens_seen": 111958112, "step": 117245 }, { "epoch": 9.56440166408353, "grad_norm": 34.667362213134766, "learning_rate": 2.8855100493882513e-07, "loss": 0.4176, "num_input_tokens_seen": 111962704, "step": 117250 }, { "epoch": 9.564809527693939, "grad_norm": 2.9481117725372314, "learning_rate": 2.8801204055450005e-07, "loss": 0.3016, "num_input_tokens_seen": 111966992, "step": 117255 }, { "epoch": 9.565217391304348, "grad_norm": 1.6892809867858887, "learning_rate": 2.8747357707167024e-07, "loss": 0.4222, "num_input_tokens_seen": 111971904, "step": 117260 }, { "epoch": 9.565625254914757, "grad_norm": 11.865288734436035, "learning_rate": 2.869356145012492e-07, "loss": 0.3812, "num_input_tokens_seen": 111975952, "step": 117265 }, { "epoch": 9.566033118525166, "grad_norm": 29.135700225830078, "learning_rate": 2.8639815285414484e-07, "loss": 0.4654, "num_input_tokens_seen": 111980528, "step": 117270 }, { "epoch": 9.566440982135575, "grad_norm": 11.114185333251953, "learning_rate": 2.8586119214124574e-07, "loss": 0.3585, "num_input_tokens_seen": 111985136, "step": 117275 }, { "epoch": 9.566848845745982, "grad_norm": 19.588430404663086, "learning_rate": 2.853247323734404e-07, "loss": 0.3285, "num_input_tokens_seen": 111989552, "step": 117280 }, { "epoch": 9.567256709356391, "grad_norm": 20.792102813720703, "learning_rate": 2.847887735616006e-07, "loss": 0.2573, "num_input_tokens_seen": 111994608, "step": 117285 }, { "epoch": 9.5676645729668, "grad_norm": 1.2723898887634277, "learning_rate": 2.842533157165872e-07, "loss": 0.2983, "num_input_tokens_seen": 111998592, "step": 117290 }, { "epoch": 9.568072436577209, "grad_norm": 1.2534734010696411, "learning_rate": 2.8371835884925813e-07, "loss": 0.2875, "num_input_tokens_seen": 112004096, "step": 117295 }, { "epoch": 9.568480300187618, "grad_norm": 25.009267807006836, "learning_rate": 2.831839029704575e-07, "loss": 0.4659, "num_input_tokens_seen": 112009152, "step": 117300 }, { "epoch": 9.568888163798025, "grad_norm": 2.8721344470977783, "learning_rate": 2.826499480910155e-07, "loss": 0.4211, "num_input_tokens_seen": 112013312, "step": 117305 }, { "epoch": 9.569296027408434, "grad_norm": 1.7224915027618408, "learning_rate": 2.821164942217541e-07, "loss": 0.2451, "num_input_tokens_seen": 112018688, "step": 117310 }, { "epoch": 9.569703891018843, "grad_norm": 3.0435006618499756, "learning_rate": 2.8158354137348964e-07, "loss": 0.3427, "num_input_tokens_seen": 112024112, "step": 117315 }, { "epoch": 9.570111754629252, "grad_norm": 27.900836944580078, "learning_rate": 2.810510895570245e-07, "loss": 0.4179, "num_input_tokens_seen": 112028880, "step": 117320 }, { "epoch": 9.570519618239661, "grad_norm": 3.471867799758911, "learning_rate": 2.8051913878315017e-07, "loss": 0.2606, "num_input_tokens_seen": 112033632, "step": 117325 }, { "epoch": 9.57092748185007, "grad_norm": 20.339168548583984, "learning_rate": 2.799876890626468e-07, "loss": 0.4282, "num_input_tokens_seen": 112038208, "step": 117330 }, { "epoch": 9.571335345460477, "grad_norm": 1.7934561967849731, "learning_rate": 2.794567404062892e-07, "loss": 0.3542, "num_input_tokens_seen": 112042480, "step": 117335 }, { "epoch": 9.571743209070886, "grad_norm": 4.346853256225586, "learning_rate": 2.7892629282483816e-07, "loss": 0.2916, "num_input_tokens_seen": 112047392, "step": 117340 }, { "epoch": 9.572151072681296, "grad_norm": 12.8724365234375, "learning_rate": 2.78396346329049e-07, "loss": 0.301, "num_input_tokens_seen": 112052048, "step": 117345 }, { "epoch": 9.572558936291705, "grad_norm": 11.95837688446045, "learning_rate": 2.778669009296603e-07, "loss": 0.2937, "num_input_tokens_seen": 112057376, "step": 117350 }, { "epoch": 9.572966799902114, "grad_norm": 22.825132369995117, "learning_rate": 2.7733795663740237e-07, "loss": 0.1589, "num_input_tokens_seen": 112062800, "step": 117355 }, { "epoch": 9.57337466351252, "grad_norm": 12.103039741516113, "learning_rate": 2.7680951346300275e-07, "loss": 0.2539, "num_input_tokens_seen": 112067824, "step": 117360 }, { "epoch": 9.57378252712293, "grad_norm": 20.620559692382812, "learning_rate": 2.762815714171668e-07, "loss": 0.441, "num_input_tokens_seen": 112072544, "step": 117365 }, { "epoch": 9.574190390733339, "grad_norm": 24.48227882385254, "learning_rate": 2.757541305105971e-07, "loss": 0.5412, "num_input_tokens_seen": 112077824, "step": 117370 }, { "epoch": 9.574598254343748, "grad_norm": 31.501306533813477, "learning_rate": 2.7522719075398493e-07, "loss": 0.2705, "num_input_tokens_seen": 112082256, "step": 117375 }, { "epoch": 9.575006117954157, "grad_norm": 4.049208641052246, "learning_rate": 2.747007521580136e-07, "loss": 0.2263, "num_input_tokens_seen": 112087024, "step": 117380 }, { "epoch": 9.575413981564564, "grad_norm": 9.183040618896484, "learning_rate": 2.741748147333495e-07, "loss": 0.28, "num_input_tokens_seen": 112091520, "step": 117385 }, { "epoch": 9.575821845174973, "grad_norm": 1.2871170043945312, "learning_rate": 2.736493784906535e-07, "loss": 0.3807, "num_input_tokens_seen": 112096240, "step": 117390 }, { "epoch": 9.576229708785382, "grad_norm": 0.7681952118873596, "learning_rate": 2.731244434405811e-07, "loss": 0.5092, "num_input_tokens_seen": 112100576, "step": 117395 }, { "epoch": 9.576637572395791, "grad_norm": 4.70112943649292, "learning_rate": 2.7260000959376806e-07, "loss": 0.4805, "num_input_tokens_seen": 112105360, "step": 117400 }, { "epoch": 9.5770454360062, "grad_norm": 24.37054443359375, "learning_rate": 2.720760769608449e-07, "loss": 0.3181, "num_input_tokens_seen": 112110496, "step": 117405 }, { "epoch": 9.577453299616607, "grad_norm": 1.0496309995651245, "learning_rate": 2.7155264555243365e-07, "loss": 0.4024, "num_input_tokens_seen": 112114992, "step": 117410 }, { "epoch": 9.577861163227016, "grad_norm": 0.7918839454650879, "learning_rate": 2.7102971537914247e-07, "loss": 0.2742, "num_input_tokens_seen": 112119776, "step": 117415 }, { "epoch": 9.578269026837425, "grad_norm": 25.068201065063477, "learning_rate": 2.705072864515712e-07, "loss": 0.3511, "num_input_tokens_seen": 112124320, "step": 117420 }, { "epoch": 9.578676890447834, "grad_norm": 3.102689743041992, "learning_rate": 2.6998535878030586e-07, "loss": 0.3093, "num_input_tokens_seen": 112128480, "step": 117425 }, { "epoch": 9.579084754058243, "grad_norm": 1.3912485837936401, "learning_rate": 2.694639323759324e-07, "loss": 0.2323, "num_input_tokens_seen": 112133360, "step": 117430 }, { "epoch": 9.579492617668652, "grad_norm": 2.1272213459014893, "learning_rate": 2.6894300724901453e-07, "loss": 0.2805, "num_input_tokens_seen": 112138672, "step": 117435 }, { "epoch": 9.57990048127906, "grad_norm": 41.23583984375, "learning_rate": 2.684225834101134e-07, "loss": 0.2474, "num_input_tokens_seen": 112143552, "step": 117440 }, { "epoch": 9.580308344889469, "grad_norm": 17.783578872680664, "learning_rate": 2.67902660869776e-07, "loss": 0.4428, "num_input_tokens_seen": 112147488, "step": 117445 }, { "epoch": 9.580716208499878, "grad_norm": 2.6757378578186035, "learning_rate": 2.6738323963854394e-07, "loss": 0.4494, "num_input_tokens_seen": 112152464, "step": 117450 }, { "epoch": 9.581124072110287, "grad_norm": 40.376529693603516, "learning_rate": 2.668643197269449e-07, "loss": 0.5243, "num_input_tokens_seen": 112157440, "step": 117455 }, { "epoch": 9.581531935720696, "grad_norm": 22.86661720275879, "learning_rate": 2.663459011454955e-07, "loss": 0.4084, "num_input_tokens_seen": 112161968, "step": 117460 }, { "epoch": 9.581939799331103, "grad_norm": 3.647521495819092, "learning_rate": 2.6582798390470396e-07, "loss": 0.3686, "num_input_tokens_seen": 112165952, "step": 117465 }, { "epoch": 9.582347662941512, "grad_norm": 27.71834373474121, "learning_rate": 2.653105680150703e-07, "loss": 0.2636, "num_input_tokens_seen": 112170992, "step": 117470 }, { "epoch": 9.58275552655192, "grad_norm": 11.66189193725586, "learning_rate": 2.647936534870804e-07, "loss": 0.3391, "num_input_tokens_seen": 112176112, "step": 117475 }, { "epoch": 9.58316339016233, "grad_norm": 1.7198646068572998, "learning_rate": 2.6427724033121215e-07, "loss": 0.3023, "num_input_tokens_seen": 112180912, "step": 117480 }, { "epoch": 9.583571253772739, "grad_norm": 0.9613930583000183, "learning_rate": 2.637613285579349e-07, "loss": 0.3045, "num_input_tokens_seen": 112185584, "step": 117485 }, { "epoch": 9.583979117383148, "grad_norm": 26.255504608154297, "learning_rate": 2.6324591817770137e-07, "loss": 0.4633, "num_input_tokens_seen": 112189936, "step": 117490 }, { "epoch": 9.584386980993555, "grad_norm": 6.74113130569458, "learning_rate": 2.6273100920096437e-07, "loss": 0.3081, "num_input_tokens_seen": 112193840, "step": 117495 }, { "epoch": 9.584794844603964, "grad_norm": 13.720830917358398, "learning_rate": 2.622166016381572e-07, "loss": 0.421, "num_input_tokens_seen": 112199440, "step": 117500 }, { "epoch": 9.585202708214373, "grad_norm": 3.6354150772094727, "learning_rate": 2.617026954997076e-07, "loss": 0.3525, "num_input_tokens_seen": 112203984, "step": 117505 }, { "epoch": 9.585610571824782, "grad_norm": 3.9672935009002686, "learning_rate": 2.611892907960323e-07, "loss": 0.1974, "num_input_tokens_seen": 112208864, "step": 117510 }, { "epoch": 9.586018435435191, "grad_norm": 6.9623823165893555, "learning_rate": 2.6067638753753955e-07, "loss": 0.3658, "num_input_tokens_seen": 112213856, "step": 117515 }, { "epoch": 9.586426299045598, "grad_norm": 19.195642471313477, "learning_rate": 2.6016398573462386e-07, "loss": 0.3393, "num_input_tokens_seen": 112218752, "step": 117520 }, { "epoch": 9.586834162656007, "grad_norm": 7.008508682250977, "learning_rate": 2.596520853976714e-07, "loss": 0.3215, "num_input_tokens_seen": 112223968, "step": 117525 }, { "epoch": 9.587242026266416, "grad_norm": 29.125288009643555, "learning_rate": 2.5914068653705714e-07, "loss": 0.276, "num_input_tokens_seen": 112229056, "step": 117530 }, { "epoch": 9.587649889876825, "grad_norm": 9.28061294555664, "learning_rate": 2.586297891631506e-07, "loss": 0.3809, "num_input_tokens_seen": 112234352, "step": 117535 }, { "epoch": 9.588057753487234, "grad_norm": 22.604568481445312, "learning_rate": 2.5811939328630185e-07, "loss": 0.3045, "num_input_tokens_seen": 112239312, "step": 117540 }, { "epoch": 9.588465617097643, "grad_norm": 3.1570169925689697, "learning_rate": 2.576094989168637e-07, "loss": 0.3272, "num_input_tokens_seen": 112244752, "step": 117545 }, { "epoch": 9.58887348070805, "grad_norm": 1.6418648958206177, "learning_rate": 2.57100106065164e-07, "loss": 0.3187, "num_input_tokens_seen": 112249648, "step": 117550 }, { "epoch": 9.58928134431846, "grad_norm": 5.858870029449463, "learning_rate": 2.565912147415306e-07, "loss": 0.3886, "num_input_tokens_seen": 112254848, "step": 117555 }, { "epoch": 9.589689207928869, "grad_norm": 1.920831561088562, "learning_rate": 2.560828249562802e-07, "loss": 0.4678, "num_input_tokens_seen": 112260048, "step": 117560 }, { "epoch": 9.590097071539278, "grad_norm": 26.19496726989746, "learning_rate": 2.5557493671971856e-07, "loss": 0.2972, "num_input_tokens_seen": 112265168, "step": 117565 }, { "epoch": 9.590504935149687, "grad_norm": 1.9165022373199463, "learning_rate": 2.5506755004213457e-07, "loss": 0.1848, "num_input_tokens_seen": 112270000, "step": 117570 }, { "epoch": 9.590912798760094, "grad_norm": 41.01541519165039, "learning_rate": 2.5456066493382005e-07, "loss": 0.4224, "num_input_tokens_seen": 112275568, "step": 117575 }, { "epoch": 9.591320662370503, "grad_norm": 52.202064514160156, "learning_rate": 2.540542814050445e-07, "loss": 0.4542, "num_input_tokens_seen": 112280784, "step": 117580 }, { "epoch": 9.591728525980912, "grad_norm": 1.6964212656021118, "learning_rate": 2.53548399466072e-07, "loss": 0.2745, "num_input_tokens_seen": 112285952, "step": 117585 }, { "epoch": 9.592136389591321, "grad_norm": 4.872762680053711, "learning_rate": 2.53043019127161e-07, "loss": 0.5234, "num_input_tokens_seen": 112290432, "step": 117590 }, { "epoch": 9.59254425320173, "grad_norm": 8.9513578414917, "learning_rate": 2.525381403985505e-07, "loss": 0.3329, "num_input_tokens_seen": 112294912, "step": 117595 }, { "epoch": 9.592952116812137, "grad_norm": 2.9730284214019775, "learning_rate": 2.5203376329047677e-07, "loss": 0.4559, "num_input_tokens_seen": 112299728, "step": 117600 }, { "epoch": 9.593359980422546, "grad_norm": 12.108768463134766, "learning_rate": 2.515298878131622e-07, "loss": 0.2263, "num_input_tokens_seen": 112304256, "step": 117605 }, { "epoch": 9.593767844032955, "grad_norm": 2.9068851470947266, "learning_rate": 2.510265139768209e-07, "loss": 0.2242, "num_input_tokens_seen": 112309456, "step": 117610 }, { "epoch": 9.594175707643364, "grad_norm": 0.6432240009307861, "learning_rate": 2.5052364179165565e-07, "loss": 0.3227, "num_input_tokens_seen": 112314736, "step": 117615 }, { "epoch": 9.594583571253773, "grad_norm": 9.800531387329102, "learning_rate": 2.5002127126785846e-07, "loss": 0.5299, "num_input_tokens_seen": 112318736, "step": 117620 }, { "epoch": 9.59499143486418, "grad_norm": 17.7071475982666, "learning_rate": 2.495194024156128e-07, "loss": 0.3413, "num_input_tokens_seen": 112322944, "step": 117625 }, { "epoch": 9.59539929847459, "grad_norm": 27.28481101989746, "learning_rate": 2.490180352450938e-07, "loss": 0.4228, "num_input_tokens_seen": 112328080, "step": 117630 }, { "epoch": 9.595807162084999, "grad_norm": 12.189199447631836, "learning_rate": 2.4851716976646286e-07, "loss": 0.3669, "num_input_tokens_seen": 112332768, "step": 117635 }, { "epoch": 9.596215025695408, "grad_norm": 4.974836826324463, "learning_rate": 2.4801680598986745e-07, "loss": 0.2951, "num_input_tokens_seen": 112337424, "step": 117640 }, { "epoch": 9.596622889305817, "grad_norm": 3.38826584815979, "learning_rate": 2.4751694392545487e-07, "loss": 0.3643, "num_input_tokens_seen": 112342448, "step": 117645 }, { "epoch": 9.597030752916226, "grad_norm": 29.320735931396484, "learning_rate": 2.470175835833588e-07, "loss": 0.4639, "num_input_tokens_seen": 112347280, "step": 117650 }, { "epoch": 9.597438616526633, "grad_norm": 0.41642120480537415, "learning_rate": 2.4651872497369613e-07, "loss": 0.2837, "num_input_tokens_seen": 112352112, "step": 117655 }, { "epoch": 9.597846480137042, "grad_norm": 3.5351948738098145, "learning_rate": 2.460203681065809e-07, "loss": 0.357, "num_input_tokens_seen": 112356208, "step": 117660 }, { "epoch": 9.59825434374745, "grad_norm": 3.127971649169922, "learning_rate": 2.455225129921135e-07, "loss": 0.3807, "num_input_tokens_seen": 112361504, "step": 117665 }, { "epoch": 9.59866220735786, "grad_norm": 32.64763259887695, "learning_rate": 2.4502515964038574e-07, "loss": 0.2955, "num_input_tokens_seen": 112366032, "step": 117670 }, { "epoch": 9.599070070968269, "grad_norm": 2.4934189319610596, "learning_rate": 2.445283080614813e-07, "loss": 0.3714, "num_input_tokens_seen": 112371152, "step": 117675 }, { "epoch": 9.599477934578676, "grad_norm": 4.388485908508301, "learning_rate": 2.440319582654671e-07, "loss": 0.3863, "num_input_tokens_seen": 112375904, "step": 117680 }, { "epoch": 9.599885798189085, "grad_norm": 22.370283126831055, "learning_rate": 2.435361102624045e-07, "loss": 0.5235, "num_input_tokens_seen": 112380432, "step": 117685 }, { "epoch": 9.600293661799494, "grad_norm": 1.578768253326416, "learning_rate": 2.4304076406234666e-07, "loss": 0.3142, "num_input_tokens_seen": 112383888, "step": 117690 }, { "epoch": 9.600701525409903, "grad_norm": 12.827970504760742, "learning_rate": 2.425459196753327e-07, "loss": 0.2907, "num_input_tokens_seen": 112388832, "step": 117695 }, { "epoch": 9.601109389020312, "grad_norm": 2.2529513835906982, "learning_rate": 2.420515771113907e-07, "loss": 0.2887, "num_input_tokens_seen": 112392832, "step": 117700 }, { "epoch": 9.601517252630721, "grad_norm": 53.07674789428711, "learning_rate": 2.41557736380546e-07, "loss": 0.3892, "num_input_tokens_seen": 112398192, "step": 117705 }, { "epoch": 9.601925116241128, "grad_norm": 2.2019641399383545, "learning_rate": 2.4106439749280174e-07, "loss": 0.2553, "num_input_tokens_seen": 112402976, "step": 117710 }, { "epoch": 9.602332979851537, "grad_norm": 4.0693206787109375, "learning_rate": 2.405715604581665e-07, "loss": 0.3156, "num_input_tokens_seen": 112408512, "step": 117715 }, { "epoch": 9.602740843461946, "grad_norm": 6.384553909301758, "learning_rate": 2.400792252866213e-07, "loss": 0.223, "num_input_tokens_seen": 112412912, "step": 117720 }, { "epoch": 9.603148707072355, "grad_norm": 1.9877738952636719, "learning_rate": 2.395873919881497e-07, "loss": 0.2892, "num_input_tokens_seen": 112417920, "step": 117725 }, { "epoch": 9.603556570682764, "grad_norm": 27.051916122436523, "learning_rate": 2.390960605727216e-07, "loss": 0.5025, "num_input_tokens_seen": 112422544, "step": 117730 }, { "epoch": 9.603964434293172, "grad_norm": 2.4897801876068115, "learning_rate": 2.3860523105029285e-07, "loss": 0.2036, "num_input_tokens_seen": 112426688, "step": 117735 }, { "epoch": 9.60437229790358, "grad_norm": 2.9830989837646484, "learning_rate": 2.3811490343081388e-07, "loss": 0.3709, "num_input_tokens_seen": 112431584, "step": 117740 }, { "epoch": 9.60478016151399, "grad_norm": 14.948629379272461, "learning_rate": 2.376250777242267e-07, "loss": 0.3427, "num_input_tokens_seen": 112436080, "step": 117745 }, { "epoch": 9.605188025124399, "grad_norm": 11.173463821411133, "learning_rate": 2.3713575394045673e-07, "loss": 0.2663, "num_input_tokens_seen": 112441472, "step": 117750 }, { "epoch": 9.605595888734808, "grad_norm": 41.683406829833984, "learning_rate": 2.3664693208941823e-07, "loss": 0.3673, "num_input_tokens_seen": 112446464, "step": 117755 }, { "epoch": 9.606003752345217, "grad_norm": 3.459458827972412, "learning_rate": 2.3615861218102832e-07, "loss": 0.1814, "num_input_tokens_seen": 112451696, "step": 117760 }, { "epoch": 9.606411615955624, "grad_norm": 3.1191651821136475, "learning_rate": 2.35670794225179e-07, "loss": 0.2747, "num_input_tokens_seen": 112455584, "step": 117765 }, { "epoch": 9.606819479566033, "grad_norm": 55.919456481933594, "learning_rate": 2.3518347823176246e-07, "loss": 0.2517, "num_input_tokens_seen": 112459792, "step": 117770 }, { "epoch": 9.607227343176442, "grad_norm": 19.104564666748047, "learning_rate": 2.346966642106513e-07, "loss": 0.4712, "num_input_tokens_seen": 112465104, "step": 117775 }, { "epoch": 9.607635206786851, "grad_norm": 25.327383041381836, "learning_rate": 2.3421035217171262e-07, "loss": 0.3925, "num_input_tokens_seen": 112470192, "step": 117780 }, { "epoch": 9.60804307039726, "grad_norm": 1.0368237495422363, "learning_rate": 2.3372454212481077e-07, "loss": 0.2014, "num_input_tokens_seen": 112474688, "step": 117785 }, { "epoch": 9.608450934007667, "grad_norm": 18.49274253845215, "learning_rate": 2.332392340797851e-07, "loss": 0.4773, "num_input_tokens_seen": 112478976, "step": 117790 }, { "epoch": 9.608858797618076, "grad_norm": 1.923824667930603, "learning_rate": 2.3275442804648052e-07, "loss": 0.3398, "num_input_tokens_seen": 112484336, "step": 117795 }, { "epoch": 9.609266661228485, "grad_norm": 36.837215423583984, "learning_rate": 2.3227012403471692e-07, "loss": 0.3795, "num_input_tokens_seen": 112489392, "step": 117800 }, { "epoch": 9.609674524838894, "grad_norm": 2.232664108276367, "learning_rate": 2.317863220543115e-07, "loss": 0.4729, "num_input_tokens_seen": 112493472, "step": 117805 }, { "epoch": 9.610082388449303, "grad_norm": 11.561185836791992, "learning_rate": 2.3130302211507582e-07, "loss": 0.3871, "num_input_tokens_seen": 112497936, "step": 117810 }, { "epoch": 9.61049025205971, "grad_norm": 2.743403911590576, "learning_rate": 2.3082022422680206e-07, "loss": 0.359, "num_input_tokens_seen": 112502704, "step": 117815 }, { "epoch": 9.61089811567012, "grad_norm": 9.395280838012695, "learning_rate": 2.303379283992768e-07, "loss": 0.3842, "num_input_tokens_seen": 112507248, "step": 117820 }, { "epoch": 9.611305979280528, "grad_norm": 2.505249500274658, "learning_rate": 2.2985613464227562e-07, "loss": 0.246, "num_input_tokens_seen": 112512672, "step": 117825 }, { "epoch": 9.611713842890937, "grad_norm": 12.419844627380371, "learning_rate": 2.2937484296556566e-07, "loss": 0.2999, "num_input_tokens_seen": 112516048, "step": 117830 }, { "epoch": 9.612121706501346, "grad_norm": 10.064899444580078, "learning_rate": 2.2889405337890303e-07, "loss": 0.4508, "num_input_tokens_seen": 112521520, "step": 117835 }, { "epoch": 9.612529570111755, "grad_norm": 10.868916511535645, "learning_rate": 2.2841376589203268e-07, "loss": 0.3674, "num_input_tokens_seen": 112526976, "step": 117840 }, { "epoch": 9.612937433722163, "grad_norm": 16.84426498413086, "learning_rate": 2.2793398051468574e-07, "loss": 0.2656, "num_input_tokens_seen": 112532192, "step": 117845 }, { "epoch": 9.613345297332572, "grad_norm": 4.496588706970215, "learning_rate": 2.2745469725659329e-07, "loss": 0.4804, "num_input_tokens_seen": 112536976, "step": 117850 }, { "epoch": 9.61375316094298, "grad_norm": 6.993987560272217, "learning_rate": 2.2697591612746982e-07, "loss": 0.4248, "num_input_tokens_seen": 112542368, "step": 117855 }, { "epoch": 9.61416102455339, "grad_norm": 2.9540114402770996, "learning_rate": 2.264976371370159e-07, "loss": 0.3508, "num_input_tokens_seen": 112547696, "step": 117860 }, { "epoch": 9.614568888163799, "grad_norm": 36.50883865356445, "learning_rate": 2.260198602949265e-07, "loss": 0.3318, "num_input_tokens_seen": 112553408, "step": 117865 }, { "epoch": 9.614976751774206, "grad_norm": 4.4635329246521, "learning_rate": 2.2554258561088837e-07, "loss": 0.1916, "num_input_tokens_seen": 112558224, "step": 117870 }, { "epoch": 9.615384615384615, "grad_norm": 1.992803931236267, "learning_rate": 2.250658130945743e-07, "loss": 0.2714, "num_input_tokens_seen": 112563712, "step": 117875 }, { "epoch": 9.615792478995024, "grad_norm": 3.908698797225952, "learning_rate": 2.2458954275564882e-07, "loss": 0.3784, "num_input_tokens_seen": 112568592, "step": 117880 }, { "epoch": 9.616200342605433, "grad_norm": 2.852013111114502, "learning_rate": 2.2411377460376526e-07, "loss": 0.3206, "num_input_tokens_seen": 112573200, "step": 117885 }, { "epoch": 9.616608206215842, "grad_norm": 3.28265380859375, "learning_rate": 2.2363850864856873e-07, "loss": 0.3039, "num_input_tokens_seen": 112577408, "step": 117890 }, { "epoch": 9.61701606982625, "grad_norm": 7.5742692947387695, "learning_rate": 2.231637448996904e-07, "loss": 0.4271, "num_input_tokens_seen": 112582896, "step": 117895 }, { "epoch": 9.617423933436658, "grad_norm": 20.511465072631836, "learning_rate": 2.226894833667559e-07, "loss": 0.3022, "num_input_tokens_seen": 112587520, "step": 117900 }, { "epoch": 9.617831797047067, "grad_norm": 37.23877716064453, "learning_rate": 2.2221572405937697e-07, "loss": 0.4339, "num_input_tokens_seen": 112592032, "step": 117905 }, { "epoch": 9.618239660657476, "grad_norm": 17.53106117248535, "learning_rate": 2.217424669871543e-07, "loss": 0.4128, "num_input_tokens_seen": 112596432, "step": 117910 }, { "epoch": 9.618647524267885, "grad_norm": 1.3565119504928589, "learning_rate": 2.2126971215968573e-07, "loss": 0.2677, "num_input_tokens_seen": 112601376, "step": 117915 }, { "epoch": 9.619055387878294, "grad_norm": 1.200614333152771, "learning_rate": 2.2079745958654973e-07, "loss": 0.1994, "num_input_tokens_seen": 112607008, "step": 117920 }, { "epoch": 9.619463251488702, "grad_norm": 1.8702313899993896, "learning_rate": 2.203257092773192e-07, "loss": 0.1271, "num_input_tokens_seen": 112612480, "step": 117925 }, { "epoch": 9.61987111509911, "grad_norm": 4.3018083572387695, "learning_rate": 2.1985446124155872e-07, "loss": 0.3554, "num_input_tokens_seen": 112617552, "step": 117930 }, { "epoch": 9.62027897870952, "grad_norm": 36.721763610839844, "learning_rate": 2.1938371548881898e-07, "loss": 0.3268, "num_input_tokens_seen": 112622576, "step": 117935 }, { "epoch": 9.620686842319929, "grad_norm": 2.397433042526245, "learning_rate": 2.1891347202863955e-07, "loss": 0.1862, "num_input_tokens_seen": 112626256, "step": 117940 }, { "epoch": 9.621094705930338, "grad_norm": 17.468441009521484, "learning_rate": 2.1844373087055446e-07, "loss": 0.3276, "num_input_tokens_seen": 112631072, "step": 117945 }, { "epoch": 9.621502569540745, "grad_norm": 5.250377178192139, "learning_rate": 2.179744920240867e-07, "loss": 0.304, "num_input_tokens_seen": 112635568, "step": 117950 }, { "epoch": 9.621910433151154, "grad_norm": 7.93645715713501, "learning_rate": 2.1750575549874253e-07, "loss": 0.3873, "num_input_tokens_seen": 112639984, "step": 117955 }, { "epoch": 9.622318296761563, "grad_norm": 17.726720809936523, "learning_rate": 2.1703752130402543e-07, "loss": 0.3977, "num_input_tokens_seen": 112645008, "step": 117960 }, { "epoch": 9.622726160371972, "grad_norm": 2.5000083446502686, "learning_rate": 2.1656978944943061e-07, "loss": 0.1956, "num_input_tokens_seen": 112649568, "step": 117965 }, { "epoch": 9.62313402398238, "grad_norm": 4.316915988922119, "learning_rate": 2.1610255994443107e-07, "loss": 0.3186, "num_input_tokens_seen": 112655168, "step": 117970 }, { "epoch": 9.62354188759279, "grad_norm": 5.376336574554443, "learning_rate": 2.156358327985053e-07, "loss": 0.3098, "num_input_tokens_seen": 112660560, "step": 117975 }, { "epoch": 9.623949751203197, "grad_norm": 7.3921613693237305, "learning_rate": 2.1516960802110409e-07, "loss": 0.238, "num_input_tokens_seen": 112665008, "step": 117980 }, { "epoch": 9.624357614813606, "grad_norm": 1.7755507230758667, "learning_rate": 2.1470388562168652e-07, "loss": 0.3476, "num_input_tokens_seen": 112669952, "step": 117985 }, { "epoch": 9.624765478424015, "grad_norm": 33.08872985839844, "learning_rate": 2.142386656096923e-07, "loss": 0.3514, "num_input_tokens_seen": 112674976, "step": 117990 }, { "epoch": 9.625173342034424, "grad_norm": 9.222858428955078, "learning_rate": 2.137739479945444e-07, "loss": 0.3261, "num_input_tokens_seen": 112678704, "step": 117995 }, { "epoch": 9.625581205644833, "grad_norm": 0.6726688742637634, "learning_rate": 2.1330973278566592e-07, "loss": 0.3965, "num_input_tokens_seen": 112683648, "step": 118000 }, { "epoch": 9.62598906925524, "grad_norm": 0.44012734293937683, "learning_rate": 2.1284601999246868e-07, "loss": 0.2058, "num_input_tokens_seen": 112688096, "step": 118005 }, { "epoch": 9.62639693286565, "grad_norm": 17.09645652770996, "learning_rate": 2.1238280962434798e-07, "loss": 0.2564, "num_input_tokens_seen": 112692400, "step": 118010 }, { "epoch": 9.626804796476058, "grad_norm": 1.8899235725402832, "learning_rate": 2.1192010169069632e-07, "loss": 0.2966, "num_input_tokens_seen": 112697424, "step": 118015 }, { "epoch": 9.627212660086467, "grad_norm": 1.033941626548767, "learning_rate": 2.1145789620088951e-07, "loss": 0.2267, "num_input_tokens_seen": 112702464, "step": 118020 }, { "epoch": 9.627620523696876, "grad_norm": 68.44564819335938, "learning_rate": 2.1099619316430063e-07, "loss": 0.4158, "num_input_tokens_seen": 112707376, "step": 118025 }, { "epoch": 9.628028387307285, "grad_norm": 1.6659373044967651, "learning_rate": 2.105349925902833e-07, "loss": 0.2464, "num_input_tokens_seen": 112712464, "step": 118030 }, { "epoch": 9.628436250917693, "grad_norm": 20.808549880981445, "learning_rate": 2.1007429448818838e-07, "loss": 0.1996, "num_input_tokens_seen": 112717536, "step": 118035 }, { "epoch": 9.628844114528102, "grad_norm": 2.4519004821777344, "learning_rate": 2.0961409886735562e-07, "loss": 0.2958, "num_input_tokens_seen": 112721616, "step": 118040 }, { "epoch": 9.62925197813851, "grad_norm": 5.199356555938721, "learning_rate": 2.091544057371081e-07, "loss": 0.2987, "num_input_tokens_seen": 112726400, "step": 118045 }, { "epoch": 9.62965984174892, "grad_norm": 2.165574789047241, "learning_rate": 2.0869521510676616e-07, "loss": 0.4628, "num_input_tokens_seen": 112731680, "step": 118050 }, { "epoch": 9.630067705359329, "grad_norm": 1.9871537685394287, "learning_rate": 2.08236526985639e-07, "loss": 0.2633, "num_input_tokens_seen": 112735520, "step": 118055 }, { "epoch": 9.630475568969736, "grad_norm": 1.8688870668411255, "learning_rate": 2.0777834138302477e-07, "loss": 0.2851, "num_input_tokens_seen": 112740240, "step": 118060 }, { "epoch": 9.630883432580145, "grad_norm": 30.224750518798828, "learning_rate": 2.073206583082077e-07, "loss": 0.2384, "num_input_tokens_seen": 112745328, "step": 118065 }, { "epoch": 9.631291296190554, "grad_norm": 4.042758464813232, "learning_rate": 2.0686347777046366e-07, "loss": 0.3963, "num_input_tokens_seen": 112750368, "step": 118070 }, { "epoch": 9.631699159800963, "grad_norm": 4.361959934234619, "learning_rate": 2.0640679977906307e-07, "loss": 0.284, "num_input_tokens_seen": 112754432, "step": 118075 }, { "epoch": 9.632107023411372, "grad_norm": 2.0621979236602783, "learning_rate": 2.059506243432624e-07, "loss": 0.3388, "num_input_tokens_seen": 112759296, "step": 118080 }, { "epoch": 9.63251488702178, "grad_norm": 10.297625541687012, "learning_rate": 2.0549495147230423e-07, "loss": 0.3415, "num_input_tokens_seen": 112764800, "step": 118085 }, { "epoch": 9.632922750632188, "grad_norm": 40.42017364501953, "learning_rate": 2.0503978117542843e-07, "loss": 0.5029, "num_input_tokens_seen": 112769040, "step": 118090 }, { "epoch": 9.633330614242597, "grad_norm": 1.69223153591156, "learning_rate": 2.0458511346186093e-07, "loss": 0.3639, "num_input_tokens_seen": 112774496, "step": 118095 }, { "epoch": 9.633738477853006, "grad_norm": 7.385974884033203, "learning_rate": 2.0413094834081936e-07, "loss": 0.2638, "num_input_tokens_seen": 112779360, "step": 118100 }, { "epoch": 9.634146341463415, "grad_norm": 1.5104135274887085, "learning_rate": 2.036772858215047e-07, "loss": 0.3082, "num_input_tokens_seen": 112783360, "step": 118105 }, { "epoch": 9.634554205073822, "grad_norm": 2.7262113094329834, "learning_rate": 2.0322412591311234e-07, "loss": 0.2825, "num_input_tokens_seen": 112787872, "step": 118110 }, { "epoch": 9.634962068684231, "grad_norm": 4.519903182983398, "learning_rate": 2.027714686248322e-07, "loss": 0.2548, "num_input_tokens_seen": 112791632, "step": 118115 }, { "epoch": 9.63536993229464, "grad_norm": 3.498077392578125, "learning_rate": 2.0231931396584025e-07, "loss": 0.4691, "num_input_tokens_seen": 112795936, "step": 118120 }, { "epoch": 9.63577779590505, "grad_norm": 10.191393852233887, "learning_rate": 2.0186766194529582e-07, "loss": 0.2983, "num_input_tokens_seen": 112801136, "step": 118125 }, { "epoch": 9.636185659515458, "grad_norm": 17.850677490234375, "learning_rate": 2.0141651257235828e-07, "loss": 0.4635, "num_input_tokens_seen": 112805360, "step": 118130 }, { "epoch": 9.636593523125867, "grad_norm": 3.03316068649292, "learning_rate": 2.009658658561675e-07, "loss": 0.4553, "num_input_tokens_seen": 112809472, "step": 118135 }, { "epoch": 9.637001386736275, "grad_norm": 25.797536849975586, "learning_rate": 2.0051572180586065e-07, "loss": 0.5818, "num_input_tokens_seen": 112814464, "step": 118140 }, { "epoch": 9.637409250346684, "grad_norm": 1.8024810552597046, "learning_rate": 2.0006608043056653e-07, "loss": 0.2485, "num_input_tokens_seen": 112819056, "step": 118145 }, { "epoch": 9.637817113957093, "grad_norm": 0.5110035538673401, "learning_rate": 1.9961694173939171e-07, "loss": 0.2638, "num_input_tokens_seen": 112824272, "step": 118150 }, { "epoch": 9.638224977567502, "grad_norm": 12.999256134033203, "learning_rate": 1.9916830574144285e-07, "loss": 0.2199, "num_input_tokens_seen": 112828432, "step": 118155 }, { "epoch": 9.63863284117791, "grad_norm": 17.44509506225586, "learning_rate": 1.9872017244581542e-07, "loss": 0.4184, "num_input_tokens_seen": 112832752, "step": 118160 }, { "epoch": 9.639040704788318, "grad_norm": 28.39256477355957, "learning_rate": 1.9827254186159106e-07, "loss": 0.3267, "num_input_tokens_seen": 112836160, "step": 118165 }, { "epoch": 9.639448568398727, "grad_norm": 2.8305201530456543, "learning_rate": 1.9782541399784304e-07, "loss": 0.4251, "num_input_tokens_seen": 112840992, "step": 118170 }, { "epoch": 9.639856432009136, "grad_norm": 2.1308529376983643, "learning_rate": 1.9737878886363358e-07, "loss": 0.3151, "num_input_tokens_seen": 112846416, "step": 118175 }, { "epoch": 9.640264295619545, "grad_norm": 32.16022872924805, "learning_rate": 1.9693266646801933e-07, "loss": 0.3983, "num_input_tokens_seen": 112850832, "step": 118180 }, { "epoch": 9.640672159229954, "grad_norm": 3.195606231689453, "learning_rate": 1.964870468200375e-07, "loss": 0.3921, "num_input_tokens_seen": 112855680, "step": 118185 }, { "epoch": 9.641080022840363, "grad_norm": 17.402902603149414, "learning_rate": 1.960419299287253e-07, "loss": 0.3952, "num_input_tokens_seen": 112860240, "step": 118190 }, { "epoch": 9.64148788645077, "grad_norm": 0.4783252477645874, "learning_rate": 1.9559731580310604e-07, "loss": 0.2554, "num_input_tokens_seen": 112865520, "step": 118195 }, { "epoch": 9.64189575006118, "grad_norm": 0.9192572236061096, "learning_rate": 1.9515320445218642e-07, "loss": 0.1806, "num_input_tokens_seen": 112869568, "step": 118200 }, { "epoch": 9.642303613671588, "grad_norm": 6.378063678741455, "learning_rate": 1.947095958849704e-07, "loss": 0.297, "num_input_tokens_seen": 112874496, "step": 118205 }, { "epoch": 9.642711477281997, "grad_norm": 2.4945297241210938, "learning_rate": 1.9426649011045072e-07, "loss": 0.4773, "num_input_tokens_seen": 112878816, "step": 118210 }, { "epoch": 9.643119340892406, "grad_norm": 2.9942870140075684, "learning_rate": 1.9382388713760912e-07, "loss": 0.3565, "num_input_tokens_seen": 112883856, "step": 118215 }, { "epoch": 9.643527204502814, "grad_norm": 2.500023365020752, "learning_rate": 1.933817869754162e-07, "loss": 0.2876, "num_input_tokens_seen": 112889488, "step": 118220 }, { "epoch": 9.643935068113223, "grad_norm": 8.078774452209473, "learning_rate": 1.9294018963283423e-07, "loss": 0.254, "num_input_tokens_seen": 112894224, "step": 118225 }, { "epoch": 9.644342931723632, "grad_norm": 17.97173500061035, "learning_rate": 1.9249909511881435e-07, "loss": 0.35, "num_input_tokens_seen": 112898832, "step": 118230 }, { "epoch": 9.64475079533404, "grad_norm": 1.8854271173477173, "learning_rate": 1.9205850344229393e-07, "loss": 0.3829, "num_input_tokens_seen": 112904096, "step": 118235 }, { "epoch": 9.64515865894445, "grad_norm": 21.0610408782959, "learning_rate": 1.916184146122102e-07, "loss": 0.3454, "num_input_tokens_seen": 112909072, "step": 118240 }, { "epoch": 9.645566522554859, "grad_norm": 2.460897445678711, "learning_rate": 1.911788286374755e-07, "loss": 0.1704, "num_input_tokens_seen": 112913344, "step": 118245 }, { "epoch": 9.645974386165266, "grad_norm": 1.6715548038482666, "learning_rate": 1.9073974552700492e-07, "loss": 0.2472, "num_input_tokens_seen": 112918096, "step": 118250 }, { "epoch": 9.646382249775675, "grad_norm": 17.297239303588867, "learning_rate": 1.9030116528969688e-07, "loss": 0.3581, "num_input_tokens_seen": 112922240, "step": 118255 }, { "epoch": 9.646790113386084, "grad_norm": 6.334214210510254, "learning_rate": 1.898630879344443e-07, "loss": 0.4196, "num_input_tokens_seen": 112927632, "step": 118260 }, { "epoch": 9.647197976996493, "grad_norm": 1.5401843786239624, "learning_rate": 1.8942551347012337e-07, "loss": 0.3437, "num_input_tokens_seen": 112933072, "step": 118265 }, { "epoch": 9.647605840606902, "grad_norm": 0.6818069219589233, "learning_rate": 1.8898844190560206e-07, "loss": 0.3615, "num_input_tokens_seen": 112937728, "step": 118270 }, { "epoch": 9.648013704217309, "grad_norm": 4.082150459289551, "learning_rate": 1.8855187324974544e-07, "loss": 0.3421, "num_input_tokens_seen": 112941808, "step": 118275 }, { "epoch": 9.648421567827718, "grad_norm": 26.041519165039062, "learning_rate": 1.8811580751139924e-07, "loss": 0.4703, "num_input_tokens_seen": 112946432, "step": 118280 }, { "epoch": 9.648829431438127, "grad_norm": 14.33192253112793, "learning_rate": 1.8768024469940083e-07, "loss": 0.2909, "num_input_tokens_seen": 112950720, "step": 118285 }, { "epoch": 9.649237295048536, "grad_norm": 7.414327621459961, "learning_rate": 1.8724518482258203e-07, "loss": 0.2308, "num_input_tokens_seen": 112955744, "step": 118290 }, { "epoch": 9.649645158658945, "grad_norm": 11.718382835388184, "learning_rate": 1.8681062788975802e-07, "loss": 0.2694, "num_input_tokens_seen": 112960416, "step": 118295 }, { "epoch": 9.650053022269352, "grad_norm": 11.102587699890137, "learning_rate": 1.8637657390974117e-07, "loss": 0.3611, "num_input_tokens_seen": 112964800, "step": 118300 }, { "epoch": 9.650460885879761, "grad_norm": 10.460494995117188, "learning_rate": 1.8594302289132447e-07, "loss": 0.1836, "num_input_tokens_seen": 112969568, "step": 118305 }, { "epoch": 9.65086874949017, "grad_norm": 16.29047966003418, "learning_rate": 1.855099748433009e-07, "loss": 0.5475, "num_input_tokens_seen": 112974912, "step": 118310 }, { "epoch": 9.65127661310058, "grad_norm": 6.896359920501709, "learning_rate": 1.8507742977444398e-07, "loss": 0.4883, "num_input_tokens_seen": 112979232, "step": 118315 }, { "epoch": 9.651684476710988, "grad_norm": 2.011446714401245, "learning_rate": 1.8464538769352448e-07, "loss": 0.2552, "num_input_tokens_seen": 112984144, "step": 118320 }, { "epoch": 9.652092340321396, "grad_norm": 9.833885192871094, "learning_rate": 1.8421384860929924e-07, "loss": 0.2739, "num_input_tokens_seen": 112988192, "step": 118325 }, { "epoch": 9.652500203931805, "grad_norm": 34.73677444458008, "learning_rate": 1.8378281253051132e-07, "loss": 0.455, "num_input_tokens_seen": 112993136, "step": 118330 }, { "epoch": 9.652908067542214, "grad_norm": 1.254065752029419, "learning_rate": 1.8335227946590094e-07, "loss": 0.241, "num_input_tokens_seen": 112998592, "step": 118335 }, { "epoch": 9.653315931152623, "grad_norm": 1.9461300373077393, "learning_rate": 1.8292224942419723e-07, "loss": 0.3027, "num_input_tokens_seen": 113003744, "step": 118340 }, { "epoch": 9.653723794763032, "grad_norm": 0.7036702632904053, "learning_rate": 1.8249272241411265e-07, "loss": 0.2355, "num_input_tokens_seen": 113009040, "step": 118345 }, { "epoch": 9.65413165837344, "grad_norm": 23.795326232910156, "learning_rate": 1.820636984443569e-07, "loss": 0.3544, "num_input_tokens_seen": 113014624, "step": 118350 }, { "epoch": 9.654539521983848, "grad_norm": 6.16182804107666, "learning_rate": 1.8163517752362301e-07, "loss": 0.4906, "num_input_tokens_seen": 113019360, "step": 118355 }, { "epoch": 9.654947385594257, "grad_norm": 3.1668171882629395, "learning_rate": 1.8120715966059576e-07, "loss": 0.3116, "num_input_tokens_seen": 113023664, "step": 118360 }, { "epoch": 9.655355249204666, "grad_norm": 4.787877082824707, "learning_rate": 1.8077964486395705e-07, "loss": 0.2911, "num_input_tokens_seen": 113028176, "step": 118365 }, { "epoch": 9.655763112815075, "grad_norm": 1.4098252058029175, "learning_rate": 1.8035263314236662e-07, "loss": 0.3487, "num_input_tokens_seen": 113033760, "step": 118370 }, { "epoch": 9.656170976425484, "grad_norm": 21.471769332885742, "learning_rate": 1.7992612450448143e-07, "loss": 0.3589, "num_input_tokens_seen": 113038944, "step": 118375 }, { "epoch": 9.656578840035891, "grad_norm": 1.7463691234588623, "learning_rate": 1.7950011895894735e-07, "loss": 0.1929, "num_input_tokens_seen": 113043728, "step": 118380 }, { "epoch": 9.6569867036463, "grad_norm": 1.2441082000732422, "learning_rate": 1.7907461651439916e-07, "loss": 0.2552, "num_input_tokens_seen": 113048608, "step": 118385 }, { "epoch": 9.65739456725671, "grad_norm": 10.392620086669922, "learning_rate": 1.786496171794605e-07, "loss": 0.2476, "num_input_tokens_seen": 113053120, "step": 118390 }, { "epoch": 9.657802430867118, "grad_norm": 20.263263702392578, "learning_rate": 1.7822512096274946e-07, "loss": 0.4276, "num_input_tokens_seen": 113057920, "step": 118395 }, { "epoch": 9.658210294477527, "grad_norm": 2.223771333694458, "learning_rate": 1.7780112787286473e-07, "loss": 0.2463, "num_input_tokens_seen": 113062704, "step": 118400 }, { "epoch": 9.658618158087936, "grad_norm": 2.724144220352173, "learning_rate": 1.77377637918405e-07, "loss": 0.3961, "num_input_tokens_seen": 113067520, "step": 118405 }, { "epoch": 9.659026021698343, "grad_norm": 7.369124412536621, "learning_rate": 1.7695465110795227e-07, "loss": 0.3149, "num_input_tokens_seen": 113073360, "step": 118410 }, { "epoch": 9.659433885308752, "grad_norm": 16.938539505004883, "learning_rate": 1.7653216745008306e-07, "loss": 0.4153, "num_input_tokens_seen": 113078256, "step": 118415 }, { "epoch": 9.659841748919161, "grad_norm": 17.967761993408203, "learning_rate": 1.7611018695335434e-07, "loss": 0.1727, "num_input_tokens_seen": 113082464, "step": 118420 }, { "epoch": 9.66024961252957, "grad_norm": 2.1647324562072754, "learning_rate": 1.7568870962632878e-07, "loss": 0.2111, "num_input_tokens_seen": 113086528, "step": 118425 }, { "epoch": 9.66065747613998, "grad_norm": 1.5829402208328247, "learning_rate": 1.7526773547754116e-07, "loss": 0.2921, "num_input_tokens_seen": 113092016, "step": 118430 }, { "epoch": 9.661065339750387, "grad_norm": 1.4517207145690918, "learning_rate": 1.7484726451552914e-07, "loss": 0.4391, "num_input_tokens_seen": 113097888, "step": 118435 }, { "epoch": 9.661473203360796, "grad_norm": 0.5975748300552368, "learning_rate": 1.7442729674881365e-07, "loss": 0.4791, "num_input_tokens_seen": 113102416, "step": 118440 }, { "epoch": 9.661881066971205, "grad_norm": 1.4410243034362793, "learning_rate": 1.7400783218590733e-07, "loss": 0.323, "num_input_tokens_seen": 113107520, "step": 118445 }, { "epoch": 9.662288930581614, "grad_norm": 4.772919654846191, "learning_rate": 1.735888708353145e-07, "loss": 0.5105, "num_input_tokens_seen": 113112160, "step": 118450 }, { "epoch": 9.662696794192023, "grad_norm": 2.39412784576416, "learning_rate": 1.7317041270552558e-07, "loss": 0.4754, "num_input_tokens_seen": 113116016, "step": 118455 }, { "epoch": 9.663104657802432, "grad_norm": 16.35071563720703, "learning_rate": 1.7275245780502268e-07, "loss": 0.3626, "num_input_tokens_seen": 113120336, "step": 118460 }, { "epoch": 9.663512521412839, "grad_norm": 2.7331550121307373, "learning_rate": 1.7233500614227682e-07, "loss": 0.2445, "num_input_tokens_seen": 113125264, "step": 118465 }, { "epoch": 9.663920385023248, "grad_norm": 3.123897075653076, "learning_rate": 1.7191805772575065e-07, "loss": 0.3176, "num_input_tokens_seen": 113129520, "step": 118470 }, { "epoch": 9.664328248633657, "grad_norm": 16.494884490966797, "learning_rate": 1.7150161256389574e-07, "loss": 0.3621, "num_input_tokens_seen": 113134688, "step": 118475 }, { "epoch": 9.664736112244066, "grad_norm": 31.96410369873047, "learning_rate": 1.7108567066515535e-07, "loss": 0.3848, "num_input_tokens_seen": 113139936, "step": 118480 }, { "epoch": 9.665143975854475, "grad_norm": 6.12869119644165, "learning_rate": 1.7067023203795608e-07, "loss": 0.4305, "num_input_tokens_seen": 113145488, "step": 118485 }, { "epoch": 9.665551839464882, "grad_norm": 38.45309829711914, "learning_rate": 1.7025529669071895e-07, "loss": 0.2762, "num_input_tokens_seen": 113150032, "step": 118490 }, { "epoch": 9.665959703075291, "grad_norm": 6.87220573425293, "learning_rate": 1.6984086463185667e-07, "loss": 0.4484, "num_input_tokens_seen": 113154832, "step": 118495 }, { "epoch": 9.6663675666857, "grad_norm": 4.65762186050415, "learning_rate": 1.6942693586977089e-07, "loss": 0.215, "num_input_tokens_seen": 113159008, "step": 118500 }, { "epoch": 9.66677543029611, "grad_norm": 18.755800247192383, "learning_rate": 1.690135104128493e-07, "loss": 0.2394, "num_input_tokens_seen": 113164480, "step": 118505 }, { "epoch": 9.667183293906518, "grad_norm": 17.995153427124023, "learning_rate": 1.6860058826947133e-07, "loss": 0.4015, "num_input_tokens_seen": 113169824, "step": 118510 }, { "epoch": 9.667591157516926, "grad_norm": 13.688920974731445, "learning_rate": 1.681881694480081e-07, "loss": 0.3532, "num_input_tokens_seen": 113175072, "step": 118515 }, { "epoch": 9.667999021127335, "grad_norm": 9.066963195800781, "learning_rate": 1.677762539568195e-07, "loss": 0.1962, "num_input_tokens_seen": 113180160, "step": 118520 }, { "epoch": 9.668406884737744, "grad_norm": 2.2126667499542236, "learning_rate": 1.673648418042545e-07, "loss": 0.1825, "num_input_tokens_seen": 113184944, "step": 118525 }, { "epoch": 9.668814748348153, "grad_norm": 7.440408706665039, "learning_rate": 1.6695393299865358e-07, "loss": 0.2906, "num_input_tokens_seen": 113189680, "step": 118530 }, { "epoch": 9.669222611958562, "grad_norm": 25.647546768188477, "learning_rate": 1.665435275483407e-07, "loss": 0.3693, "num_input_tokens_seen": 113193872, "step": 118535 }, { "epoch": 9.669630475568969, "grad_norm": 1.5169830322265625, "learning_rate": 1.6613362546164247e-07, "loss": 0.3019, "num_input_tokens_seen": 113198608, "step": 118540 }, { "epoch": 9.670038339179378, "grad_norm": 22.259641647338867, "learning_rate": 1.6572422674686062e-07, "loss": 0.392, "num_input_tokens_seen": 113202944, "step": 118545 }, { "epoch": 9.670446202789787, "grad_norm": 17.922321319580078, "learning_rate": 1.6531533141229684e-07, "loss": 0.2348, "num_input_tokens_seen": 113207680, "step": 118550 }, { "epoch": 9.670854066400196, "grad_norm": 4.608146667480469, "learning_rate": 1.6490693946623616e-07, "loss": 0.2664, "num_input_tokens_seen": 113212880, "step": 118555 }, { "epoch": 9.671261930010605, "grad_norm": 34.17307662963867, "learning_rate": 1.6449905091696083e-07, "loss": 0.3186, "num_input_tokens_seen": 113217200, "step": 118560 }, { "epoch": 9.671669793621014, "grad_norm": 1.1505969762802124, "learning_rate": 1.640916657727365e-07, "loss": 0.3655, "num_input_tokens_seen": 113222288, "step": 118565 }, { "epoch": 9.672077657231421, "grad_norm": 6.632674694061279, "learning_rate": 1.636847840418204e-07, "loss": 0.4702, "num_input_tokens_seen": 113226912, "step": 118570 }, { "epoch": 9.67248552084183, "grad_norm": 2.5044898986816406, "learning_rate": 1.6327840573245877e-07, "loss": 0.2404, "num_input_tokens_seen": 113232176, "step": 118575 }, { "epoch": 9.67289338445224, "grad_norm": 4.3030829429626465, "learning_rate": 1.628725308528922e-07, "loss": 0.4585, "num_input_tokens_seen": 113237120, "step": 118580 }, { "epoch": 9.673301248062648, "grad_norm": 6.840481281280518, "learning_rate": 1.624671594113447e-07, "loss": 0.3105, "num_input_tokens_seen": 113242480, "step": 118585 }, { "epoch": 9.673709111673057, "grad_norm": 18.960100173950195, "learning_rate": 1.6206229141603191e-07, "loss": 0.2288, "num_input_tokens_seen": 113247328, "step": 118590 }, { "epoch": 9.674116975283464, "grad_norm": 1.8225477933883667, "learning_rate": 1.616579268751639e-07, "loss": 0.3009, "num_input_tokens_seen": 113251760, "step": 118595 }, { "epoch": 9.674524838893873, "grad_norm": 3.7598488330841064, "learning_rate": 1.6125406579693414e-07, "loss": 0.4747, "num_input_tokens_seen": 113256416, "step": 118600 }, { "epoch": 9.674932702504282, "grad_norm": 3.6670572757720947, "learning_rate": 1.608507081895333e-07, "loss": 0.2757, "num_input_tokens_seen": 113260720, "step": 118605 }, { "epoch": 9.675340566114691, "grad_norm": 6.904434680938721, "learning_rate": 1.6044785406112982e-07, "loss": 0.3907, "num_input_tokens_seen": 113265296, "step": 118610 }, { "epoch": 9.6757484297251, "grad_norm": 51.40586471557617, "learning_rate": 1.6004550341989776e-07, "loss": 0.3144, "num_input_tokens_seen": 113269744, "step": 118615 }, { "epoch": 9.67615629333551, "grad_norm": 7.436081409454346, "learning_rate": 1.5964365627398336e-07, "loss": 0.2541, "num_input_tokens_seen": 113273776, "step": 118620 }, { "epoch": 9.676564156945917, "grad_norm": 31.480560302734375, "learning_rate": 1.592423126315412e-07, "loss": 0.3673, "num_input_tokens_seen": 113278640, "step": 118625 }, { "epoch": 9.676972020556326, "grad_norm": 16.46253204345703, "learning_rate": 1.5884147250069815e-07, "loss": 0.3252, "num_input_tokens_seen": 113284032, "step": 118630 }, { "epoch": 9.677379884166735, "grad_norm": 10.719826698303223, "learning_rate": 1.5844113588958375e-07, "loss": 0.2958, "num_input_tokens_seen": 113288064, "step": 118635 }, { "epoch": 9.677787747777144, "grad_norm": 50.93056869506836, "learning_rate": 1.5804130280631379e-07, "loss": 0.205, "num_input_tokens_seen": 113293120, "step": 118640 }, { "epoch": 9.678195611387553, "grad_norm": 6.616665363311768, "learning_rate": 1.5764197325898732e-07, "loss": 0.4994, "num_input_tokens_seen": 113297696, "step": 118645 }, { "epoch": 9.67860347499796, "grad_norm": 1.087738037109375, "learning_rate": 1.5724314725570345e-07, "loss": 0.3095, "num_input_tokens_seen": 113302672, "step": 118650 }, { "epoch": 9.679011338608369, "grad_norm": 4.063800811767578, "learning_rate": 1.568448248045473e-07, "loss": 0.3881, "num_input_tokens_seen": 113307600, "step": 118655 }, { "epoch": 9.679419202218778, "grad_norm": 26.620893478393555, "learning_rate": 1.5644700591358752e-07, "loss": 0.2709, "num_input_tokens_seen": 113312608, "step": 118660 }, { "epoch": 9.679827065829187, "grad_norm": 5.008009433746338, "learning_rate": 1.5604969059088702e-07, "loss": 0.173, "num_input_tokens_seen": 113317248, "step": 118665 }, { "epoch": 9.680234929439596, "grad_norm": 29.087642669677734, "learning_rate": 1.5565287884450607e-07, "loss": 0.3332, "num_input_tokens_seen": 113321760, "step": 118670 }, { "epoch": 9.680642793050005, "grad_norm": 1.6226089000701904, "learning_rate": 1.5525657068248544e-07, "loss": 0.2082, "num_input_tokens_seen": 113327056, "step": 118675 }, { "epoch": 9.681050656660412, "grad_norm": 10.084321975708008, "learning_rate": 1.5486076611285483e-07, "loss": 0.2058, "num_input_tokens_seen": 113331040, "step": 118680 }, { "epoch": 9.681458520270821, "grad_norm": 33.607666015625, "learning_rate": 1.5446546514364114e-07, "loss": 0.3189, "num_input_tokens_seen": 113335264, "step": 118685 }, { "epoch": 9.68186638388123, "grad_norm": 10.538055419921875, "learning_rate": 1.5407066778285184e-07, "loss": 0.4567, "num_input_tokens_seen": 113340080, "step": 118690 }, { "epoch": 9.68227424749164, "grad_norm": 14.389968872070312, "learning_rate": 1.5367637403849443e-07, "loss": 0.2603, "num_input_tokens_seen": 113344336, "step": 118695 }, { "epoch": 9.682682111102048, "grad_norm": 0.5529049038887024, "learning_rate": 1.5328258391855975e-07, "loss": 0.4399, "num_input_tokens_seen": 113349600, "step": 118700 }, { "epoch": 9.683089974712455, "grad_norm": 1.0477505922317505, "learning_rate": 1.5288929743102754e-07, "loss": 0.285, "num_input_tokens_seen": 113354832, "step": 118705 }, { "epoch": 9.683497838322864, "grad_norm": 3.9099485874176025, "learning_rate": 1.5249651458387194e-07, "loss": 0.3115, "num_input_tokens_seen": 113359776, "step": 118710 }, { "epoch": 9.683905701933273, "grad_norm": 29.416982650756836, "learning_rate": 1.5210423538505324e-07, "loss": 0.3703, "num_input_tokens_seen": 113364880, "step": 118715 }, { "epoch": 9.684313565543683, "grad_norm": 2.6446034908294678, "learning_rate": 1.5171245984252348e-07, "loss": 0.3611, "num_input_tokens_seen": 113370336, "step": 118720 }, { "epoch": 9.684721429154092, "grad_norm": 1.594305396080017, "learning_rate": 1.5132118796422345e-07, "loss": 0.3271, "num_input_tokens_seen": 113375888, "step": 118725 }, { "epoch": 9.685129292764499, "grad_norm": 12.029365539550781, "learning_rate": 1.5093041975808297e-07, "loss": 0.2147, "num_input_tokens_seen": 113380928, "step": 118730 }, { "epoch": 9.685537156374908, "grad_norm": 5.009341239929199, "learning_rate": 1.5054015523202626e-07, "loss": 0.5246, "num_input_tokens_seen": 113385552, "step": 118735 }, { "epoch": 9.685945019985317, "grad_norm": 22.318557739257812, "learning_rate": 1.5015039439395807e-07, "loss": 0.4007, "num_input_tokens_seen": 113389744, "step": 118740 }, { "epoch": 9.686352883595726, "grad_norm": 12.446802139282227, "learning_rate": 1.4976113725178598e-07, "loss": 0.2376, "num_input_tokens_seen": 113394576, "step": 118745 }, { "epoch": 9.686760747206135, "grad_norm": 23.051929473876953, "learning_rate": 1.4937238381339535e-07, "loss": 0.2604, "num_input_tokens_seen": 113399264, "step": 118750 }, { "epoch": 9.687168610816542, "grad_norm": 35.8438720703125, "learning_rate": 1.4898413408666324e-07, "loss": 0.3392, "num_input_tokens_seen": 113404544, "step": 118755 }, { "epoch": 9.687576474426951, "grad_norm": 14.12531566619873, "learning_rate": 1.4859638807946664e-07, "loss": 0.3662, "num_input_tokens_seen": 113409456, "step": 118760 }, { "epoch": 9.68798433803736, "grad_norm": 17.778593063354492, "learning_rate": 1.4820914579966038e-07, "loss": 0.4515, "num_input_tokens_seen": 113413376, "step": 118765 }, { "epoch": 9.688392201647769, "grad_norm": 9.388675689697266, "learning_rate": 1.4782240725509377e-07, "loss": 0.3331, "num_input_tokens_seen": 113418272, "step": 118770 }, { "epoch": 9.688800065258178, "grad_norm": 1.6397165060043335, "learning_rate": 1.4743617245360776e-07, "loss": 0.4265, "num_input_tokens_seen": 113422048, "step": 118775 }, { "epoch": 9.689207928868587, "grad_norm": 42.77513122558594, "learning_rate": 1.4705044140302936e-07, "loss": 0.2799, "num_input_tokens_seen": 113426160, "step": 118780 }, { "epoch": 9.689615792478994, "grad_norm": 29.239198684692383, "learning_rate": 1.466652141111774e-07, "loss": 0.4945, "num_input_tokens_seen": 113431376, "step": 118785 }, { "epoch": 9.690023656089403, "grad_norm": 38.15976333618164, "learning_rate": 1.4628049058586224e-07, "loss": 0.5251, "num_input_tokens_seen": 113436128, "step": 118790 }, { "epoch": 9.690431519699812, "grad_norm": 2.1192972660064697, "learning_rate": 1.4589627083487768e-07, "loss": 0.2215, "num_input_tokens_seen": 113441296, "step": 118795 }, { "epoch": 9.690839383310221, "grad_norm": 4.449038028717041, "learning_rate": 1.4551255486601745e-07, "loss": 0.3191, "num_input_tokens_seen": 113446832, "step": 118800 }, { "epoch": 9.69124724692063, "grad_norm": 3.035695791244507, "learning_rate": 1.4512934268705314e-07, "loss": 0.3678, "num_input_tokens_seen": 113451312, "step": 118805 }, { "epoch": 9.691655110531038, "grad_norm": 4.720458030700684, "learning_rate": 1.4474663430575908e-07, "loss": 0.3337, "num_input_tokens_seen": 113456768, "step": 118810 }, { "epoch": 9.692062974141447, "grad_norm": 2.1906464099884033, "learning_rate": 1.443644297298874e-07, "loss": 0.3787, "num_input_tokens_seen": 113460800, "step": 118815 }, { "epoch": 9.692470837751856, "grad_norm": 0.9241340756416321, "learning_rate": 1.4398272896718745e-07, "loss": 0.3543, "num_input_tokens_seen": 113466448, "step": 118820 }, { "epoch": 9.692878701362265, "grad_norm": 26.080060958862305, "learning_rate": 1.4360153202539472e-07, "loss": 0.2529, "num_input_tokens_seen": 113472048, "step": 118825 }, { "epoch": 9.693286564972674, "grad_norm": 11.019774436950684, "learning_rate": 1.4322083891223914e-07, "loss": 0.4922, "num_input_tokens_seen": 113476784, "step": 118830 }, { "epoch": 9.693694428583083, "grad_norm": 26.03137969970703, "learning_rate": 1.428406496354312e-07, "loss": 0.4448, "num_input_tokens_seen": 113482000, "step": 118835 }, { "epoch": 9.69410229219349, "grad_norm": 40.46333694458008, "learning_rate": 1.4246096420268418e-07, "loss": 0.3043, "num_input_tokens_seen": 113486672, "step": 118840 }, { "epoch": 9.694510155803899, "grad_norm": 10.243218421936035, "learning_rate": 1.4208178262168914e-07, "loss": 0.3375, "num_input_tokens_seen": 113490768, "step": 118845 }, { "epoch": 9.694918019414308, "grad_norm": 3.4644885063171387, "learning_rate": 1.4170310490013438e-07, "loss": 0.322, "num_input_tokens_seen": 113495296, "step": 118850 }, { "epoch": 9.695325883024717, "grad_norm": 30.697683334350586, "learning_rate": 1.4132493104569432e-07, "loss": 0.3942, "num_input_tokens_seen": 113498496, "step": 118855 }, { "epoch": 9.695733746635126, "grad_norm": 19.649747848510742, "learning_rate": 1.4094726106603505e-07, "loss": 0.2004, "num_input_tokens_seen": 113502816, "step": 118860 }, { "epoch": 9.696141610245533, "grad_norm": 4.258518218994141, "learning_rate": 1.4057009496881158e-07, "loss": 0.347, "num_input_tokens_seen": 113507744, "step": 118865 }, { "epoch": 9.696549473855942, "grad_norm": 1.8790690898895264, "learning_rate": 1.4019343276166774e-07, "loss": 0.3194, "num_input_tokens_seen": 113512848, "step": 118870 }, { "epoch": 9.696957337466351, "grad_norm": 1.7287644147872925, "learning_rate": 1.3981727445223912e-07, "loss": 0.2949, "num_input_tokens_seen": 113517792, "step": 118875 }, { "epoch": 9.69736520107676, "grad_norm": 15.061102867126465, "learning_rate": 1.3944162004815299e-07, "loss": 0.2234, "num_input_tokens_seen": 113521872, "step": 118880 }, { "epoch": 9.69777306468717, "grad_norm": 3.1002511978149414, "learning_rate": 1.3906646955701984e-07, "loss": 0.2767, "num_input_tokens_seen": 113526304, "step": 118885 }, { "epoch": 9.698180928297578, "grad_norm": 2.5127978324890137, "learning_rate": 1.3869182298644755e-07, "loss": 0.2484, "num_input_tokens_seen": 113531024, "step": 118890 }, { "epoch": 9.698588791907985, "grad_norm": 0.8101027607917786, "learning_rate": 1.3831768034402448e-07, "loss": 0.2393, "num_input_tokens_seen": 113535872, "step": 118895 }, { "epoch": 9.698996655518394, "grad_norm": 30.514930725097656, "learning_rate": 1.3794404163733898e-07, "loss": 0.2623, "num_input_tokens_seen": 113540432, "step": 118900 }, { "epoch": 9.699404519128803, "grad_norm": 1.3255836963653564, "learning_rate": 1.3757090687396278e-07, "loss": 0.2537, "num_input_tokens_seen": 113546048, "step": 118905 }, { "epoch": 9.699812382739212, "grad_norm": 1.8907650709152222, "learning_rate": 1.3719827606146206e-07, "loss": 0.3366, "num_input_tokens_seen": 113550992, "step": 118910 }, { "epoch": 9.700220246349621, "grad_norm": 1.7634718418121338, "learning_rate": 1.3682614920738635e-07, "loss": 0.2057, "num_input_tokens_seen": 113554720, "step": 118915 }, { "epoch": 9.700628109960029, "grad_norm": 2.841355085372925, "learning_rate": 1.364545263192768e-07, "loss": 0.4263, "num_input_tokens_seen": 113559408, "step": 118920 }, { "epoch": 9.701035973570438, "grad_norm": 18.583372116088867, "learning_rate": 1.3608340740467185e-07, "loss": 0.3295, "num_input_tokens_seen": 113563968, "step": 118925 }, { "epoch": 9.701443837180847, "grad_norm": 5.919561386108398, "learning_rate": 1.357127924710877e-07, "loss": 0.4794, "num_input_tokens_seen": 113568704, "step": 118930 }, { "epoch": 9.701851700791256, "grad_norm": 8.948342323303223, "learning_rate": 1.3534268152604336e-07, "loss": 0.4037, "num_input_tokens_seen": 113573504, "step": 118935 }, { "epoch": 9.702259564401665, "grad_norm": 19.105321884155273, "learning_rate": 1.3497307457703556e-07, "loss": 0.2319, "num_input_tokens_seen": 113576928, "step": 118940 }, { "epoch": 9.702667428012072, "grad_norm": 3.7595620155334473, "learning_rate": 1.3460397163155836e-07, "loss": 0.4072, "num_input_tokens_seen": 113581536, "step": 118945 }, { "epoch": 9.703075291622481, "grad_norm": 5.444968223571777, "learning_rate": 1.3423537269709185e-07, "loss": 0.4544, "num_input_tokens_seen": 113585888, "step": 118950 }, { "epoch": 9.70348315523289, "grad_norm": 16.816661834716797, "learning_rate": 1.3386727778110786e-07, "loss": 0.3486, "num_input_tokens_seen": 113590080, "step": 118955 }, { "epoch": 9.703891018843299, "grad_norm": 2.3689563274383545, "learning_rate": 1.3349968689106984e-07, "loss": 0.469, "num_input_tokens_seen": 113594368, "step": 118960 }, { "epoch": 9.704298882453708, "grad_norm": 4.468533992767334, "learning_rate": 1.331326000344274e-07, "loss": 0.3602, "num_input_tokens_seen": 113598320, "step": 118965 }, { "epoch": 9.704706746064117, "grad_norm": 1.13511323928833, "learning_rate": 1.3276601721861904e-07, "loss": 0.3074, "num_input_tokens_seen": 113603264, "step": 118970 }, { "epoch": 9.705114609674524, "grad_norm": 11.898822784423828, "learning_rate": 1.323999384510749e-07, "loss": 0.303, "num_input_tokens_seen": 113608384, "step": 118975 }, { "epoch": 9.705522473284933, "grad_norm": 4.231419563293457, "learning_rate": 1.3203436373921963e-07, "loss": 0.2839, "num_input_tokens_seen": 113612912, "step": 118980 }, { "epoch": 9.705930336895342, "grad_norm": 2.150834321975708, "learning_rate": 1.316692930904584e-07, "loss": 0.3448, "num_input_tokens_seen": 113617872, "step": 118985 }, { "epoch": 9.706338200505751, "grad_norm": 1.9575034379959106, "learning_rate": 1.3130472651219639e-07, "loss": 0.0992, "num_input_tokens_seen": 113623680, "step": 118990 }, { "epoch": 9.70674606411616, "grad_norm": 52.541500091552734, "learning_rate": 1.3094066401181937e-07, "loss": 0.4745, "num_input_tokens_seen": 113628304, "step": 118995 }, { "epoch": 9.707153927726567, "grad_norm": 1.7898705005645752, "learning_rate": 1.3057710559670478e-07, "loss": 0.29, "num_input_tokens_seen": 113631776, "step": 119000 }, { "epoch": 9.707561791336976, "grad_norm": 12.52351188659668, "learning_rate": 1.3021405127422726e-07, "loss": 0.3863, "num_input_tokens_seen": 113636400, "step": 119005 }, { "epoch": 9.707969654947386, "grad_norm": 12.845893859863281, "learning_rate": 1.2985150105174483e-07, "loss": 0.2366, "num_input_tokens_seen": 113641376, "step": 119010 }, { "epoch": 9.708377518557795, "grad_norm": 3.017878293991089, "learning_rate": 1.2948945493660158e-07, "loss": 0.6287, "num_input_tokens_seen": 113645952, "step": 119015 }, { "epoch": 9.708785382168204, "grad_norm": 8.737531661987305, "learning_rate": 1.291279129361389e-07, "loss": 0.2553, "num_input_tokens_seen": 113651136, "step": 119020 }, { "epoch": 9.70919324577861, "grad_norm": 2.2102081775665283, "learning_rate": 1.2876687505768702e-07, "loss": 0.2666, "num_input_tokens_seen": 113655632, "step": 119025 }, { "epoch": 9.70960110938902, "grad_norm": 1.6110764741897583, "learning_rate": 1.2840634130855956e-07, "loss": 0.2569, "num_input_tokens_seen": 113660016, "step": 119030 }, { "epoch": 9.710008972999429, "grad_norm": 20.06843376159668, "learning_rate": 1.2804631169607007e-07, "loss": 0.2712, "num_input_tokens_seen": 113664656, "step": 119035 }, { "epoch": 9.710416836609838, "grad_norm": 5.214044094085693, "learning_rate": 1.2768678622751272e-07, "loss": 0.4819, "num_input_tokens_seen": 113670640, "step": 119040 }, { "epoch": 9.710824700220247, "grad_norm": 2.5050272941589355, "learning_rate": 1.273277649101734e-07, "loss": 0.2251, "num_input_tokens_seen": 113675776, "step": 119045 }, { "epoch": 9.711232563830656, "grad_norm": 0.6249024868011475, "learning_rate": 1.2696924775133233e-07, "loss": 0.1759, "num_input_tokens_seen": 113680352, "step": 119050 }, { "epoch": 9.711640427441063, "grad_norm": 8.500557899475098, "learning_rate": 1.2661123475825598e-07, "loss": 0.2029, "num_input_tokens_seen": 113685024, "step": 119055 }, { "epoch": 9.712048291051472, "grad_norm": 8.134756088256836, "learning_rate": 1.2625372593819963e-07, "loss": 0.4618, "num_input_tokens_seen": 113689872, "step": 119060 }, { "epoch": 9.712456154661881, "grad_norm": 1.3344964981079102, "learning_rate": 1.2589672129841024e-07, "loss": 0.4256, "num_input_tokens_seen": 113694432, "step": 119065 }, { "epoch": 9.71286401827229, "grad_norm": 21.133464813232422, "learning_rate": 1.255402208461237e-07, "loss": 0.3157, "num_input_tokens_seen": 113699040, "step": 119070 }, { "epoch": 9.713271881882699, "grad_norm": 20.486968994140625, "learning_rate": 1.2518422458857038e-07, "loss": 0.4105, "num_input_tokens_seen": 113703024, "step": 119075 }, { "epoch": 9.713679745493106, "grad_norm": 12.163434028625488, "learning_rate": 1.2482873253296112e-07, "loss": 0.4412, "num_input_tokens_seen": 113708208, "step": 119080 }, { "epoch": 9.714087609103515, "grad_norm": 2.5028939247131348, "learning_rate": 1.244737446865041e-07, "loss": 0.271, "num_input_tokens_seen": 113712672, "step": 119085 }, { "epoch": 9.714495472713924, "grad_norm": 8.301706314086914, "learning_rate": 1.241192610563907e-07, "loss": 0.2459, "num_input_tokens_seen": 113716752, "step": 119090 }, { "epoch": 9.714903336324333, "grad_norm": 28.870765686035156, "learning_rate": 1.2376528164981248e-07, "loss": 0.2077, "num_input_tokens_seen": 113721808, "step": 119095 }, { "epoch": 9.715311199934742, "grad_norm": 5.050585746765137, "learning_rate": 1.2341180647394146e-07, "loss": 0.3254, "num_input_tokens_seen": 113726592, "step": 119100 }, { "epoch": 9.715719063545151, "grad_norm": 2.3207833766937256, "learning_rate": 1.230588355359441e-07, "loss": 0.4199, "num_input_tokens_seen": 113732048, "step": 119105 }, { "epoch": 9.716126927155559, "grad_norm": 6.572636127471924, "learning_rate": 1.2270636884297026e-07, "loss": 0.2104, "num_input_tokens_seen": 113737056, "step": 119110 }, { "epoch": 9.716534790765968, "grad_norm": 8.222639083862305, "learning_rate": 1.22354406402167e-07, "loss": 0.3396, "num_input_tokens_seen": 113741904, "step": 119115 }, { "epoch": 9.716942654376377, "grad_norm": 2.7903053760528564, "learning_rate": 1.2200294822067027e-07, "loss": 0.4744, "num_input_tokens_seen": 113747200, "step": 119120 }, { "epoch": 9.717350517986786, "grad_norm": 4.282522201538086, "learning_rate": 1.2165199430560214e-07, "loss": 0.4842, "num_input_tokens_seen": 113751920, "step": 119125 }, { "epoch": 9.717758381597195, "grad_norm": 10.560210227966309, "learning_rate": 1.213015446640764e-07, "loss": 0.3623, "num_input_tokens_seen": 113756496, "step": 119130 }, { "epoch": 9.718166245207602, "grad_norm": 9.503057479858398, "learning_rate": 1.2095159930319845e-07, "loss": 0.3497, "num_input_tokens_seen": 113761616, "step": 119135 }, { "epoch": 9.71857410881801, "grad_norm": 1.349495768547058, "learning_rate": 1.2060215823005706e-07, "loss": 0.2165, "num_input_tokens_seen": 113766352, "step": 119140 }, { "epoch": 9.71898197242842, "grad_norm": 22.078664779663086, "learning_rate": 1.20253221451741e-07, "loss": 0.4201, "num_input_tokens_seen": 113770896, "step": 119145 }, { "epoch": 9.719389836038829, "grad_norm": 12.85473918914795, "learning_rate": 1.1990478897531965e-07, "loss": 0.3479, "num_input_tokens_seen": 113775312, "step": 119150 }, { "epoch": 9.719797699649238, "grad_norm": 22.84657859802246, "learning_rate": 1.19556860807854e-07, "loss": 0.3691, "num_input_tokens_seen": 113780544, "step": 119155 }, { "epoch": 9.720205563259647, "grad_norm": 5.185100555419922, "learning_rate": 1.1920943695639952e-07, "loss": 0.3909, "num_input_tokens_seen": 113785936, "step": 119160 }, { "epoch": 9.720613426870054, "grad_norm": 31.689748764038086, "learning_rate": 1.1886251742799781e-07, "loss": 0.3949, "num_input_tokens_seen": 113791008, "step": 119165 }, { "epoch": 9.721021290480463, "grad_norm": 8.26175594329834, "learning_rate": 1.1851610222967935e-07, "loss": 0.2799, "num_input_tokens_seen": 113795792, "step": 119170 }, { "epoch": 9.721429154090872, "grad_norm": 4.135063648223877, "learning_rate": 1.181701913684663e-07, "loss": 0.2294, "num_input_tokens_seen": 113800096, "step": 119175 }, { "epoch": 9.721837017701281, "grad_norm": 14.079840660095215, "learning_rate": 1.1782478485137249e-07, "loss": 0.2784, "num_input_tokens_seen": 113805264, "step": 119180 }, { "epoch": 9.72224488131169, "grad_norm": 1.8341052532196045, "learning_rate": 1.1747988268539512e-07, "loss": 0.2273, "num_input_tokens_seen": 113810480, "step": 119185 }, { "epoch": 9.722652744922097, "grad_norm": 22.126850128173828, "learning_rate": 1.1713548487752857e-07, "loss": 0.2953, "num_input_tokens_seen": 113815712, "step": 119190 }, { "epoch": 9.723060608532506, "grad_norm": 1.2134642601013184, "learning_rate": 1.1679159143475061e-07, "loss": 0.4526, "num_input_tokens_seen": 113820880, "step": 119195 }, { "epoch": 9.723468472142915, "grad_norm": 36.8576774597168, "learning_rate": 1.164482023640362e-07, "loss": 0.3298, "num_input_tokens_seen": 113824944, "step": 119200 }, { "epoch": 9.723876335753324, "grad_norm": 12.554403305053711, "learning_rate": 1.1610531767234089e-07, "loss": 0.5789, "num_input_tokens_seen": 113830352, "step": 119205 }, { "epoch": 9.724284199363733, "grad_norm": 1.9887677431106567, "learning_rate": 1.1576293736661747e-07, "loss": 0.4082, "num_input_tokens_seen": 113834480, "step": 119210 }, { "epoch": 9.72469206297414, "grad_norm": 1.432800531387329, "learning_rate": 1.154210614538076e-07, "loss": 0.2017, "num_input_tokens_seen": 113838480, "step": 119215 }, { "epoch": 9.72509992658455, "grad_norm": 4.727451801300049, "learning_rate": 1.1507968994083351e-07, "loss": 0.3483, "num_input_tokens_seen": 113842160, "step": 119220 }, { "epoch": 9.725507790194959, "grad_norm": 1.2518733739852905, "learning_rate": 1.1473882283462578e-07, "loss": 0.4742, "num_input_tokens_seen": 113846832, "step": 119225 }, { "epoch": 9.725915653805368, "grad_norm": 21.441802978515625, "learning_rate": 1.1439846014208443e-07, "loss": 0.2519, "num_input_tokens_seen": 113851040, "step": 119230 }, { "epoch": 9.726323517415777, "grad_norm": 1.5409153699874878, "learning_rate": 1.140586018701123e-07, "loss": 0.2701, "num_input_tokens_seen": 113855632, "step": 119235 }, { "epoch": 9.726731381026184, "grad_norm": 6.291286468505859, "learning_rate": 1.137192480255983e-07, "loss": 0.5226, "num_input_tokens_seen": 113860720, "step": 119240 }, { "epoch": 9.727139244636593, "grad_norm": 13.554303169250488, "learning_rate": 1.1338039861541749e-07, "loss": 0.2709, "num_input_tokens_seen": 113865328, "step": 119245 }, { "epoch": 9.727547108247002, "grad_norm": 28.020240783691406, "learning_rate": 1.1304205364644494e-07, "loss": 0.4544, "num_input_tokens_seen": 113869424, "step": 119250 }, { "epoch": 9.727954971857411, "grad_norm": 1.3403313159942627, "learning_rate": 1.1270421312553348e-07, "loss": 0.3856, "num_input_tokens_seen": 113874528, "step": 119255 }, { "epoch": 9.72836283546782, "grad_norm": 7.713074207305908, "learning_rate": 1.1236687705953319e-07, "loss": 0.2879, "num_input_tokens_seen": 113879424, "step": 119260 }, { "epoch": 9.728770699078229, "grad_norm": 1.661177635192871, "learning_rate": 1.120300454552775e-07, "loss": 0.3993, "num_input_tokens_seen": 113884384, "step": 119265 }, { "epoch": 9.729178562688636, "grad_norm": 3.843397378921509, "learning_rate": 1.1169371831959985e-07, "loss": 0.3242, "num_input_tokens_seen": 113888800, "step": 119270 }, { "epoch": 9.729586426299045, "grad_norm": 5.628800868988037, "learning_rate": 1.1135789565931421e-07, "loss": 0.3986, "num_input_tokens_seen": 113894192, "step": 119275 }, { "epoch": 9.729994289909454, "grad_norm": 6.893943786621094, "learning_rate": 1.1102257748122901e-07, "loss": 0.2762, "num_input_tokens_seen": 113898352, "step": 119280 }, { "epoch": 9.730402153519863, "grad_norm": 3.108165979385376, "learning_rate": 1.1068776379214163e-07, "loss": 0.3458, "num_input_tokens_seen": 113903936, "step": 119285 }, { "epoch": 9.730810017130272, "grad_norm": 2.3739125728607178, "learning_rate": 1.1035345459883273e-07, "loss": 0.3078, "num_input_tokens_seen": 113908656, "step": 119290 }, { "epoch": 9.73121788074068, "grad_norm": 4.325709342956543, "learning_rate": 1.1001964990808577e-07, "loss": 0.3336, "num_input_tokens_seen": 113912800, "step": 119295 }, { "epoch": 9.731625744351089, "grad_norm": 4.598196506500244, "learning_rate": 1.096863497266648e-07, "loss": 0.3895, "num_input_tokens_seen": 113917760, "step": 119300 }, { "epoch": 9.732033607961498, "grad_norm": 2.9176411628723145, "learning_rate": 1.0935355406132553e-07, "loss": 0.4067, "num_input_tokens_seen": 113922560, "step": 119305 }, { "epoch": 9.732441471571907, "grad_norm": 3.333250045776367, "learning_rate": 1.0902126291880977e-07, "loss": 0.3879, "num_input_tokens_seen": 113926944, "step": 119310 }, { "epoch": 9.732849335182316, "grad_norm": 3.9954376220703125, "learning_rate": 1.0868947630585936e-07, "loss": 0.347, "num_input_tokens_seen": 113930864, "step": 119315 }, { "epoch": 9.733257198792725, "grad_norm": 1.0462212562561035, "learning_rate": 1.0835819422919669e-07, "loss": 0.306, "num_input_tokens_seen": 113936000, "step": 119320 }, { "epoch": 9.733665062403132, "grad_norm": 36.31295394897461, "learning_rate": 1.0802741669553306e-07, "loss": 0.4233, "num_input_tokens_seen": 113940992, "step": 119325 }, { "epoch": 9.73407292601354, "grad_norm": 7.1239495277404785, "learning_rate": 1.0769714371157702e-07, "loss": 0.2056, "num_input_tokens_seen": 113946656, "step": 119330 }, { "epoch": 9.73448078962395, "grad_norm": 17.993492126464844, "learning_rate": 1.0736737528402596e-07, "loss": 0.4492, "num_input_tokens_seen": 113951264, "step": 119335 }, { "epoch": 9.734888653234359, "grad_norm": 2.5863959789276123, "learning_rate": 1.0703811141955788e-07, "loss": 0.3396, "num_input_tokens_seen": 113956224, "step": 119340 }, { "epoch": 9.735296516844768, "grad_norm": 10.940483093261719, "learning_rate": 1.0670935212485078e-07, "loss": 0.4435, "num_input_tokens_seen": 113961264, "step": 119345 }, { "epoch": 9.735704380455175, "grad_norm": 13.777334213256836, "learning_rate": 1.06381097406566e-07, "loss": 0.4036, "num_input_tokens_seen": 113966048, "step": 119350 }, { "epoch": 9.736112244065584, "grad_norm": 16.793054580688477, "learning_rate": 1.0605334727135651e-07, "loss": 0.346, "num_input_tokens_seen": 113970304, "step": 119355 }, { "epoch": 9.736520107675993, "grad_norm": 1.7822985649108887, "learning_rate": 1.0572610172587261e-07, "loss": 0.2379, "num_input_tokens_seen": 113974240, "step": 119360 }, { "epoch": 9.736927971286402, "grad_norm": 8.567342758178711, "learning_rate": 1.0539936077673951e-07, "loss": 0.2498, "num_input_tokens_seen": 113979440, "step": 119365 }, { "epoch": 9.737335834896811, "grad_norm": 4.726101875305176, "learning_rate": 1.0507312443058248e-07, "loss": 0.273, "num_input_tokens_seen": 113984208, "step": 119370 }, { "epoch": 9.73774369850722, "grad_norm": 35.40901565551758, "learning_rate": 1.0474739269401568e-07, "loss": 0.4157, "num_input_tokens_seen": 113989936, "step": 119375 }, { "epoch": 9.738151562117627, "grad_norm": 37.31208801269531, "learning_rate": 1.0442216557363938e-07, "loss": 0.3848, "num_input_tokens_seen": 113995264, "step": 119380 }, { "epoch": 9.738559425728036, "grad_norm": 31.110206604003906, "learning_rate": 1.040974430760483e-07, "loss": 0.3705, "num_input_tokens_seen": 113999584, "step": 119385 }, { "epoch": 9.738967289338445, "grad_norm": 1.72561776638031, "learning_rate": 1.0377322520782329e-07, "loss": 0.1942, "num_input_tokens_seen": 114003840, "step": 119390 }, { "epoch": 9.739375152948854, "grad_norm": 4.016476631164551, "learning_rate": 1.0344951197553687e-07, "loss": 0.4059, "num_input_tokens_seen": 114008368, "step": 119395 }, { "epoch": 9.739783016559263, "grad_norm": 9.309672355651855, "learning_rate": 1.0312630338574769e-07, "loss": 0.2453, "num_input_tokens_seen": 114013168, "step": 119400 }, { "epoch": 9.74019088016967, "grad_norm": 26.383516311645508, "learning_rate": 1.0280359944500884e-07, "loss": 0.1877, "num_input_tokens_seen": 114018880, "step": 119405 }, { "epoch": 9.74059874378008, "grad_norm": 33.20450973510742, "learning_rate": 1.0248140015986507e-07, "loss": 0.2483, "num_input_tokens_seen": 114024704, "step": 119410 }, { "epoch": 9.741006607390489, "grad_norm": 3.56071400642395, "learning_rate": 1.0215970553683896e-07, "loss": 0.3215, "num_input_tokens_seen": 114029856, "step": 119415 }, { "epoch": 9.741414471000898, "grad_norm": 33.45291519165039, "learning_rate": 1.0183851558245861e-07, "loss": 0.3569, "num_input_tokens_seen": 114034768, "step": 119420 }, { "epoch": 9.741822334611307, "grad_norm": 31.252906799316406, "learning_rate": 1.0151783030322992e-07, "loss": 0.4812, "num_input_tokens_seen": 114039024, "step": 119425 }, { "epoch": 9.742230198221714, "grad_norm": 12.870285987854004, "learning_rate": 1.0119764970565881e-07, "loss": 0.1338, "num_input_tokens_seen": 114044016, "step": 119430 }, { "epoch": 9.742638061832123, "grad_norm": 2.0963573455810547, "learning_rate": 1.0087797379622621e-07, "loss": 0.347, "num_input_tokens_seen": 114048544, "step": 119435 }, { "epoch": 9.743045925442532, "grad_norm": 1.9131972789764404, "learning_rate": 1.0055880258142137e-07, "loss": 0.3764, "num_input_tokens_seen": 114053376, "step": 119440 }, { "epoch": 9.743453789052941, "grad_norm": 31.895849227905273, "learning_rate": 1.0024013606770577e-07, "loss": 0.4596, "num_input_tokens_seen": 114057904, "step": 119445 }, { "epoch": 9.74386165266335, "grad_norm": 11.571606636047363, "learning_rate": 9.992197426154371e-08, "loss": 0.3062, "num_input_tokens_seen": 114063120, "step": 119450 }, { "epoch": 9.744269516273757, "grad_norm": 5.328619003295898, "learning_rate": 9.960431716938279e-08, "loss": 0.3361, "num_input_tokens_seen": 114067664, "step": 119455 }, { "epoch": 9.744677379884166, "grad_norm": 1.0397597551345825, "learning_rate": 9.928716479765954e-08, "loss": 0.3999, "num_input_tokens_seen": 114071712, "step": 119460 }, { "epoch": 9.745085243494575, "grad_norm": 6.284655570983887, "learning_rate": 9.897051715280492e-08, "loss": 0.5237, "num_input_tokens_seen": 114076128, "step": 119465 }, { "epoch": 9.745493107104984, "grad_norm": 19.658321380615234, "learning_rate": 9.86543742412388e-08, "loss": 0.3268, "num_input_tokens_seen": 114081360, "step": 119470 }, { "epoch": 9.745900970715393, "grad_norm": 39.77619552612305, "learning_rate": 9.833873606936717e-08, "loss": 0.3466, "num_input_tokens_seen": 114085872, "step": 119475 }, { "epoch": 9.746308834325802, "grad_norm": 6.9801812171936035, "learning_rate": 9.80236026435849e-08, "loss": 0.3275, "num_input_tokens_seen": 114090240, "step": 119480 }, { "epoch": 9.74671669793621, "grad_norm": 2.170062303543091, "learning_rate": 9.770897397028412e-08, "loss": 0.3024, "num_input_tokens_seen": 114095232, "step": 119485 }, { "epoch": 9.747124561546618, "grad_norm": 18.890525817871094, "learning_rate": 9.739485005584303e-08, "loss": 0.3162, "num_input_tokens_seen": 114099600, "step": 119490 }, { "epoch": 9.747532425157027, "grad_norm": 16.025712966918945, "learning_rate": 9.708123090662325e-08, "loss": 0.3721, "num_input_tokens_seen": 114104944, "step": 119495 }, { "epoch": 9.747940288767436, "grad_norm": 35.776206970214844, "learning_rate": 9.676811652898909e-08, "loss": 0.2681, "num_input_tokens_seen": 114108656, "step": 119500 }, { "epoch": 9.748348152377845, "grad_norm": 7.349909782409668, "learning_rate": 9.645550692927996e-08, "loss": 0.2472, "num_input_tokens_seen": 114113920, "step": 119505 }, { "epoch": 9.748756015988253, "grad_norm": 36.60444641113281, "learning_rate": 9.614340211383521e-08, "loss": 0.2772, "num_input_tokens_seen": 114119120, "step": 119510 }, { "epoch": 9.749163879598662, "grad_norm": 2.9794466495513916, "learning_rate": 9.583180208898313e-08, "loss": 0.4305, "num_input_tokens_seen": 114123920, "step": 119515 }, { "epoch": 9.74957174320907, "grad_norm": 42.6304931640625, "learning_rate": 9.552070686103809e-08, "loss": 0.3372, "num_input_tokens_seen": 114128608, "step": 119520 }, { "epoch": 9.74997960681948, "grad_norm": 17.734758377075195, "learning_rate": 9.521011643630617e-08, "loss": 0.2582, "num_input_tokens_seen": 114133136, "step": 119525 }, { "epoch": 9.750387470429889, "grad_norm": 7.863000869750977, "learning_rate": 9.490003082107957e-08, "loss": 0.2496, "num_input_tokens_seen": 114137648, "step": 119530 }, { "epoch": 9.750795334040298, "grad_norm": 12.017704010009766, "learning_rate": 9.459045002165046e-08, "loss": 0.3711, "num_input_tokens_seen": 114143104, "step": 119535 }, { "epoch": 9.751203197650705, "grad_norm": 36.681854248046875, "learning_rate": 9.428137404428605e-08, "loss": 0.3999, "num_input_tokens_seen": 114148080, "step": 119540 }, { "epoch": 9.751611061261114, "grad_norm": 4.501072406768799, "learning_rate": 9.397280289525911e-08, "loss": 0.383, "num_input_tokens_seen": 114153488, "step": 119545 }, { "epoch": 9.752018924871523, "grad_norm": 5.205632209777832, "learning_rate": 9.366473658081742e-08, "loss": 0.4052, "num_input_tokens_seen": 114157952, "step": 119550 }, { "epoch": 9.752426788481932, "grad_norm": 3.265403985977173, "learning_rate": 9.335717510720875e-08, "loss": 0.4849, "num_input_tokens_seen": 114163216, "step": 119555 }, { "epoch": 9.752834652092341, "grad_norm": 1.1031867265701294, "learning_rate": 9.305011848066702e-08, "loss": 0.2601, "num_input_tokens_seen": 114166848, "step": 119560 }, { "epoch": 9.753242515702748, "grad_norm": 12.429756164550781, "learning_rate": 9.274356670741502e-08, "loss": 0.3533, "num_input_tokens_seen": 114171792, "step": 119565 }, { "epoch": 9.753650379313157, "grad_norm": 1.179105281829834, "learning_rate": 9.243751979366721e-08, "loss": 0.3, "num_input_tokens_seen": 114175232, "step": 119570 }, { "epoch": 9.754058242923566, "grad_norm": 2.1466431617736816, "learning_rate": 9.213197774562977e-08, "loss": 0.2004, "num_input_tokens_seen": 114180432, "step": 119575 }, { "epoch": 9.754466106533975, "grad_norm": 46.045753479003906, "learning_rate": 9.182694056948937e-08, "loss": 0.2756, "num_input_tokens_seen": 114185136, "step": 119580 }, { "epoch": 9.754873970144384, "grad_norm": 28.830045700073242, "learning_rate": 9.152240827143555e-08, "loss": 0.3487, "num_input_tokens_seen": 114190336, "step": 119585 }, { "epoch": 9.755281833754793, "grad_norm": 3.911179780960083, "learning_rate": 9.121838085763557e-08, "loss": 0.2979, "num_input_tokens_seen": 114195424, "step": 119590 }, { "epoch": 9.7556896973652, "grad_norm": 4.869597434997559, "learning_rate": 9.091485833425673e-08, "loss": 0.4916, "num_input_tokens_seen": 114199776, "step": 119595 }, { "epoch": 9.75609756097561, "grad_norm": 6.580834865570068, "learning_rate": 9.061184070744965e-08, "loss": 0.3834, "num_input_tokens_seen": 114204752, "step": 119600 }, { "epoch": 9.756505424586019, "grad_norm": 1.6762605905532837, "learning_rate": 9.03093279833539e-08, "loss": 0.4069, "num_input_tokens_seen": 114209328, "step": 119605 }, { "epoch": 9.756913288196428, "grad_norm": 1.6154320240020752, "learning_rate": 9.00073201681062e-08, "loss": 0.1513, "num_input_tokens_seen": 114213296, "step": 119610 }, { "epoch": 9.757321151806837, "grad_norm": 1.8563244342803955, "learning_rate": 8.970581726782112e-08, "loss": 0.2524, "num_input_tokens_seen": 114218000, "step": 119615 }, { "epoch": 9.757729015417244, "grad_norm": 16.55003547668457, "learning_rate": 8.9404819288616e-08, "loss": 0.4039, "num_input_tokens_seen": 114223136, "step": 119620 }, { "epoch": 9.758136879027653, "grad_norm": 12.575695991516113, "learning_rate": 8.910432623659149e-08, "loss": 0.4378, "num_input_tokens_seen": 114227760, "step": 119625 }, { "epoch": 9.758544742638062, "grad_norm": 26.86027717590332, "learning_rate": 8.88043381178344e-08, "loss": 0.2517, "num_input_tokens_seen": 114232512, "step": 119630 }, { "epoch": 9.75895260624847, "grad_norm": 1.0324535369873047, "learning_rate": 8.850485493843152e-08, "loss": 0.4688, "num_input_tokens_seen": 114237072, "step": 119635 }, { "epoch": 9.75936046985888, "grad_norm": 2.406280994415283, "learning_rate": 8.820587670444746e-08, "loss": 0.3125, "num_input_tokens_seen": 114241872, "step": 119640 }, { "epoch": 9.759768333469287, "grad_norm": 3.232635259628296, "learning_rate": 8.790740342194404e-08, "loss": 0.3643, "num_input_tokens_seen": 114246304, "step": 119645 }, { "epoch": 9.760176197079696, "grad_norm": 1.0667493343353271, "learning_rate": 8.760943509697195e-08, "loss": 0.352, "num_input_tokens_seen": 114251088, "step": 119650 }, { "epoch": 9.760584060690105, "grad_norm": 27.992280960083008, "learning_rate": 8.731197173557082e-08, "loss": 0.2973, "num_input_tokens_seen": 114255584, "step": 119655 }, { "epoch": 9.760991924300514, "grad_norm": 1.5922112464904785, "learning_rate": 8.701501334377193e-08, "loss": 0.2399, "num_input_tokens_seen": 114259744, "step": 119660 }, { "epoch": 9.761399787910923, "grad_norm": 1.0762087106704712, "learning_rate": 8.671855992758992e-08, "loss": 0.4003, "num_input_tokens_seen": 114264800, "step": 119665 }, { "epoch": 9.76180765152133, "grad_norm": 1.6291046142578125, "learning_rate": 8.642261149303665e-08, "loss": 0.2345, "num_input_tokens_seen": 114270112, "step": 119670 }, { "epoch": 9.76221551513174, "grad_norm": 6.654835224151611, "learning_rate": 8.612716804611287e-08, "loss": 0.4568, "num_input_tokens_seen": 114275392, "step": 119675 }, { "epoch": 9.762623378742148, "grad_norm": 6.9968647956848145, "learning_rate": 8.583222959280269e-08, "loss": 0.3127, "num_input_tokens_seen": 114280816, "step": 119680 }, { "epoch": 9.763031242352557, "grad_norm": 2.5287249088287354, "learning_rate": 8.553779613908741e-08, "loss": 0.3333, "num_input_tokens_seen": 114285536, "step": 119685 }, { "epoch": 9.763439105962966, "grad_norm": 3.336634397506714, "learning_rate": 8.524386769093452e-08, "loss": 0.3581, "num_input_tokens_seen": 114291360, "step": 119690 }, { "epoch": 9.763846969573375, "grad_norm": 10.8838472366333, "learning_rate": 8.495044425430032e-08, "loss": 0.5194, "num_input_tokens_seen": 114297328, "step": 119695 }, { "epoch": 9.764254833183783, "grad_norm": 24.004058837890625, "learning_rate": 8.465752583513564e-08, "loss": 0.3545, "num_input_tokens_seen": 114302176, "step": 119700 }, { "epoch": 9.764662696794192, "grad_norm": 4.219472408294678, "learning_rate": 8.436511243937462e-08, "loss": 0.2868, "num_input_tokens_seen": 114306816, "step": 119705 }, { "epoch": 9.7650705604046, "grad_norm": 2.1461548805236816, "learning_rate": 8.407320407294306e-08, "loss": 0.294, "num_input_tokens_seen": 114311344, "step": 119710 }, { "epoch": 9.76547842401501, "grad_norm": 5.532228946685791, "learning_rate": 8.3781800741764e-08, "loss": 0.3622, "num_input_tokens_seen": 114316560, "step": 119715 }, { "epoch": 9.765886287625419, "grad_norm": 3.934377908706665, "learning_rate": 8.349090245173829e-08, "loss": 0.3666, "num_input_tokens_seen": 114321856, "step": 119720 }, { "epoch": 9.766294151235826, "grad_norm": 8.820527076721191, "learning_rate": 8.320050920876399e-08, "loss": 0.2381, "num_input_tokens_seen": 114327040, "step": 119725 }, { "epoch": 9.766702014846235, "grad_norm": 22.83069610595703, "learning_rate": 8.291062101872804e-08, "loss": 0.3736, "num_input_tokens_seen": 114331456, "step": 119730 }, { "epoch": 9.767109878456644, "grad_norm": 3.249563217163086, "learning_rate": 8.262123788750631e-08, "loss": 0.2641, "num_input_tokens_seen": 114336816, "step": 119735 }, { "epoch": 9.767517742067053, "grad_norm": 0.3817887008190155, "learning_rate": 8.233235982096632e-08, "loss": 0.2385, "num_input_tokens_seen": 114341808, "step": 119740 }, { "epoch": 9.767925605677462, "grad_norm": 22.808692932128906, "learning_rate": 8.204398682495896e-08, "loss": 0.2893, "num_input_tokens_seen": 114346512, "step": 119745 }, { "epoch": 9.768333469287871, "grad_norm": 2.116384744644165, "learning_rate": 8.17561189053323e-08, "loss": 0.2867, "num_input_tokens_seen": 114351520, "step": 119750 }, { "epoch": 9.768741332898278, "grad_norm": 3.267366409301758, "learning_rate": 8.146875606791782e-08, "loss": 0.2922, "num_input_tokens_seen": 114356480, "step": 119755 }, { "epoch": 9.769149196508687, "grad_norm": 32.153907775878906, "learning_rate": 8.118189831854694e-08, "loss": 0.4156, "num_input_tokens_seen": 114361680, "step": 119760 }, { "epoch": 9.769557060119096, "grad_norm": 25.68404769897461, "learning_rate": 8.089554566302893e-08, "loss": 0.2542, "num_input_tokens_seen": 114366864, "step": 119765 }, { "epoch": 9.769964923729505, "grad_norm": 4.596510887145996, "learning_rate": 8.060969810716745e-08, "loss": 0.1213, "num_input_tokens_seen": 114371728, "step": 119770 }, { "epoch": 9.770372787339914, "grad_norm": 1.877535343170166, "learning_rate": 8.032435565676067e-08, "loss": 0.2981, "num_input_tokens_seen": 114376784, "step": 119775 }, { "epoch": 9.770780650950321, "grad_norm": 19.340452194213867, "learning_rate": 8.003951831759005e-08, "loss": 0.4928, "num_input_tokens_seen": 114381648, "step": 119780 }, { "epoch": 9.77118851456073, "grad_norm": 1.1219242811203003, "learning_rate": 7.975518609542876e-08, "loss": 0.3927, "num_input_tokens_seen": 114385696, "step": 119785 }, { "epoch": 9.77159637817114, "grad_norm": 2.2203235626220703, "learning_rate": 7.947135899604163e-08, "loss": 0.2761, "num_input_tokens_seen": 114390992, "step": 119790 }, { "epoch": 9.772004241781548, "grad_norm": 1.3415554761886597, "learning_rate": 7.918803702517963e-08, "loss": 0.4837, "num_input_tokens_seen": 114395136, "step": 119795 }, { "epoch": 9.772412105391957, "grad_norm": 19.028762817382812, "learning_rate": 7.890522018858537e-08, "loss": 0.3331, "num_input_tokens_seen": 114400704, "step": 119800 }, { "epoch": 9.772819969002366, "grad_norm": 26.977705001831055, "learning_rate": 7.862290849199594e-08, "loss": 0.2374, "num_input_tokens_seen": 114405792, "step": 119805 }, { "epoch": 9.773227832612774, "grad_norm": 3.6744320392608643, "learning_rate": 7.834110194112899e-08, "loss": 0.3925, "num_input_tokens_seen": 114409984, "step": 119810 }, { "epoch": 9.773635696223183, "grad_norm": 32.19020080566406, "learning_rate": 7.805980054169659e-08, "loss": 0.5599, "num_input_tokens_seen": 114414800, "step": 119815 }, { "epoch": 9.774043559833592, "grad_norm": 59.61794662475586, "learning_rate": 7.777900429940255e-08, "loss": 0.3468, "num_input_tokens_seen": 114418496, "step": 119820 }, { "epoch": 9.774451423444, "grad_norm": 10.670833587646484, "learning_rate": 7.74987132199395e-08, "loss": 0.278, "num_input_tokens_seen": 114423536, "step": 119825 }, { "epoch": 9.77485928705441, "grad_norm": 12.807755470275879, "learning_rate": 7.721892730898627e-08, "loss": 0.4556, "num_input_tokens_seen": 114427760, "step": 119830 }, { "epoch": 9.775267150664817, "grad_norm": 13.711455345153809, "learning_rate": 7.693964657221608e-08, "loss": 0.4051, "num_input_tokens_seen": 114432448, "step": 119835 }, { "epoch": 9.775675014275226, "grad_norm": 12.743155479431152, "learning_rate": 7.666087101528829e-08, "loss": 0.2844, "num_input_tokens_seen": 114436944, "step": 119840 }, { "epoch": 9.776082877885635, "grad_norm": 21.473031997680664, "learning_rate": 7.638260064385117e-08, "loss": 0.2839, "num_input_tokens_seen": 114441936, "step": 119845 }, { "epoch": 9.776490741496044, "grad_norm": 6.77434778213501, "learning_rate": 7.610483546355296e-08, "loss": 0.3338, "num_input_tokens_seen": 114446864, "step": 119850 }, { "epoch": 9.776898605106453, "grad_norm": 34.39875411987305, "learning_rate": 7.582757548001695e-08, "loss": 0.3487, "num_input_tokens_seen": 114451792, "step": 119855 }, { "epoch": 9.77730646871686, "grad_norm": 6.0053181648254395, "learning_rate": 7.555082069886365e-08, "loss": 0.3468, "num_input_tokens_seen": 114456224, "step": 119860 }, { "epoch": 9.77771433232727, "grad_norm": 5.674031734466553, "learning_rate": 7.527457112570524e-08, "loss": 0.3916, "num_input_tokens_seen": 114460432, "step": 119865 }, { "epoch": 9.778122195937678, "grad_norm": 20.183151245117188, "learning_rate": 7.499882676614e-08, "loss": 0.3407, "num_input_tokens_seen": 114465488, "step": 119870 }, { "epoch": 9.778530059548087, "grad_norm": 37.37126541137695, "learning_rate": 7.472358762575792e-08, "loss": 0.5903, "num_input_tokens_seen": 114470400, "step": 119875 }, { "epoch": 9.778937923158496, "grad_norm": 21.1668758392334, "learning_rate": 7.444885371013788e-08, "loss": 0.302, "num_input_tokens_seen": 114475888, "step": 119880 }, { "epoch": 9.779345786768904, "grad_norm": 1.2125296592712402, "learning_rate": 7.417462502484484e-08, "loss": 0.3033, "num_input_tokens_seen": 114479744, "step": 119885 }, { "epoch": 9.779753650379313, "grad_norm": 11.709567070007324, "learning_rate": 7.390090157544383e-08, "loss": 0.2328, "num_input_tokens_seen": 114484480, "step": 119890 }, { "epoch": 9.780161513989722, "grad_norm": 6.13678503036499, "learning_rate": 7.362768336748038e-08, "loss": 0.4108, "num_input_tokens_seen": 114489488, "step": 119895 }, { "epoch": 9.78056937760013, "grad_norm": 2.429651975631714, "learning_rate": 7.335497040648898e-08, "loss": 0.2364, "num_input_tokens_seen": 114494352, "step": 119900 }, { "epoch": 9.78097724121054, "grad_norm": 12.580094337463379, "learning_rate": 7.308276269800129e-08, "loss": 0.3525, "num_input_tokens_seen": 114498896, "step": 119905 }, { "epoch": 9.781385104820949, "grad_norm": 1.149305820465088, "learning_rate": 7.281106024753514e-08, "loss": 0.2766, "num_input_tokens_seen": 114504272, "step": 119910 }, { "epoch": 9.781792968431356, "grad_norm": 2.9009640216827393, "learning_rate": 7.253986306059445e-08, "loss": 0.2627, "num_input_tokens_seen": 114509152, "step": 119915 }, { "epoch": 9.782200832041765, "grad_norm": 2.2466797828674316, "learning_rate": 7.226917114268039e-08, "loss": 0.2145, "num_input_tokens_seen": 114513824, "step": 119920 }, { "epoch": 9.782608695652174, "grad_norm": 4.028111934661865, "learning_rate": 7.199898449927745e-08, "loss": 0.4784, "num_input_tokens_seen": 114518992, "step": 119925 }, { "epoch": 9.783016559262583, "grad_norm": 6.197159290313721, "learning_rate": 7.172930313586179e-08, "loss": 0.283, "num_input_tokens_seen": 114523936, "step": 119930 }, { "epoch": 9.783424422872992, "grad_norm": 38.2949104309082, "learning_rate": 7.146012705790128e-08, "loss": 0.4388, "num_input_tokens_seen": 114528800, "step": 119935 }, { "epoch": 9.783832286483399, "grad_norm": 8.810968399047852, "learning_rate": 7.11914562708499e-08, "loss": 0.4105, "num_input_tokens_seen": 114534336, "step": 119940 }, { "epoch": 9.784240150093808, "grad_norm": 3.1144065856933594, "learning_rate": 7.092329078015603e-08, "loss": 0.5458, "num_input_tokens_seen": 114539760, "step": 119945 }, { "epoch": 9.784648013704217, "grad_norm": 7.04528284072876, "learning_rate": 7.065563059125424e-08, "loss": 0.3406, "num_input_tokens_seen": 114544416, "step": 119950 }, { "epoch": 9.785055877314626, "grad_norm": 7.728203773498535, "learning_rate": 7.038847570956797e-08, "loss": 0.2805, "num_input_tokens_seen": 114549056, "step": 119955 }, { "epoch": 9.785463740925035, "grad_norm": 5.398046493530273, "learning_rate": 7.01218261405151e-08, "loss": 0.3187, "num_input_tokens_seen": 114553984, "step": 119960 }, { "epoch": 9.785871604535444, "grad_norm": 5.63886022567749, "learning_rate": 6.985568188949965e-08, "loss": 0.2451, "num_input_tokens_seen": 114558464, "step": 119965 }, { "epoch": 9.786279468145851, "grad_norm": 21.905391693115234, "learning_rate": 6.959004296191729e-08, "loss": 0.2916, "num_input_tokens_seen": 114563616, "step": 119970 }, { "epoch": 9.78668733175626, "grad_norm": 3.251180648803711, "learning_rate": 6.932490936314983e-08, "loss": 0.2852, "num_input_tokens_seen": 114568384, "step": 119975 }, { "epoch": 9.78709519536667, "grad_norm": 46.570289611816406, "learning_rate": 6.906028109857077e-08, "loss": 0.3403, "num_input_tokens_seen": 114573104, "step": 119980 }, { "epoch": 9.787503058977078, "grad_norm": 16.468345642089844, "learning_rate": 6.879615817355078e-08, "loss": 0.4908, "num_input_tokens_seen": 114577840, "step": 119985 }, { "epoch": 9.787910922587487, "grad_norm": 1.859416127204895, "learning_rate": 6.853254059343562e-08, "loss": 0.4362, "num_input_tokens_seen": 114581136, "step": 119990 }, { "epoch": 9.788318786197895, "grad_norm": 1.2965240478515625, "learning_rate": 6.826942836357098e-08, "loss": 0.4692, "num_input_tokens_seen": 114586416, "step": 119995 }, { "epoch": 9.788726649808304, "grad_norm": 10.674607276916504, "learning_rate": 6.800682148929427e-08, "loss": 0.4455, "num_input_tokens_seen": 114590896, "step": 120000 }, { "epoch": 9.789134513418713, "grad_norm": 1.900663137435913, "learning_rate": 6.774471997592346e-08, "loss": 0.3314, "num_input_tokens_seen": 114594992, "step": 120005 }, { "epoch": 9.789542377029122, "grad_norm": 2.231822967529297, "learning_rate": 6.748312382877098e-08, "loss": 0.2529, "num_input_tokens_seen": 114599152, "step": 120010 }, { "epoch": 9.78995024063953, "grad_norm": 18.09117889404297, "learning_rate": 6.722203305314367e-08, "loss": 0.2545, "num_input_tokens_seen": 114603856, "step": 120015 }, { "epoch": 9.79035810424994, "grad_norm": 9.325101852416992, "learning_rate": 6.696144765432899e-08, "loss": 0.45, "num_input_tokens_seen": 114608432, "step": 120020 }, { "epoch": 9.790765967860347, "grad_norm": 15.876701354980469, "learning_rate": 6.670136763761159e-08, "loss": 0.2612, "num_input_tokens_seen": 114612608, "step": 120025 }, { "epoch": 9.791173831470756, "grad_norm": 1.8242261409759521, "learning_rate": 6.644179300826225e-08, "loss": 0.3762, "num_input_tokens_seen": 114617840, "step": 120030 }, { "epoch": 9.791581695081165, "grad_norm": 13.907552719116211, "learning_rate": 6.618272377154622e-08, "loss": 0.2653, "num_input_tokens_seen": 114622224, "step": 120035 }, { "epoch": 9.791989558691574, "grad_norm": 36.32369613647461, "learning_rate": 6.592415993270651e-08, "loss": 0.2207, "num_input_tokens_seen": 114626544, "step": 120040 }, { "epoch": 9.792397422301983, "grad_norm": 2.2797226905822754, "learning_rate": 6.566610149699171e-08, "loss": 0.568, "num_input_tokens_seen": 114631376, "step": 120045 }, { "epoch": 9.79280528591239, "grad_norm": 25.288959503173828, "learning_rate": 6.540854846963096e-08, "loss": 0.3832, "num_input_tokens_seen": 114636576, "step": 120050 }, { "epoch": 9.7932131495228, "grad_norm": 14.486719131469727, "learning_rate": 6.515150085584231e-08, "loss": 0.1697, "num_input_tokens_seen": 114641984, "step": 120055 }, { "epoch": 9.793621013133208, "grad_norm": 5.125096797943115, "learning_rate": 6.48949586608355e-08, "loss": 0.2995, "num_input_tokens_seen": 114647488, "step": 120060 }, { "epoch": 9.794028876743617, "grad_norm": 3.21687650680542, "learning_rate": 6.463892188981468e-08, "loss": 0.3498, "num_input_tokens_seen": 114652640, "step": 120065 }, { "epoch": 9.794436740354026, "grad_norm": 1.833321213722229, "learning_rate": 6.43833905479646e-08, "loss": 0.4727, "num_input_tokens_seen": 114656752, "step": 120070 }, { "epoch": 9.794844603964435, "grad_norm": 17.223848342895508, "learning_rate": 6.412836464047e-08, "loss": 0.2978, "num_input_tokens_seen": 114661072, "step": 120075 }, { "epoch": 9.795252467574842, "grad_norm": 46.52043914794922, "learning_rate": 6.38738441724962e-08, "loss": 0.4952, "num_input_tokens_seen": 114666528, "step": 120080 }, { "epoch": 9.795660331185251, "grad_norm": 24.2415714263916, "learning_rate": 6.361982914920573e-08, "loss": 0.4025, "num_input_tokens_seen": 114672288, "step": 120085 }, { "epoch": 9.79606819479566, "grad_norm": 1.1313064098358154, "learning_rate": 6.336631957574168e-08, "loss": 0.3097, "num_input_tokens_seen": 114676928, "step": 120090 }, { "epoch": 9.79647605840607, "grad_norm": 18.339738845825195, "learning_rate": 6.311331545724997e-08, "loss": 0.398, "num_input_tokens_seen": 114681440, "step": 120095 }, { "epoch": 9.796883922016479, "grad_norm": 27.752214431762695, "learning_rate": 6.286081679885148e-08, "loss": 0.3456, "num_input_tokens_seen": 114686192, "step": 120100 }, { "epoch": 9.797291785626886, "grad_norm": 14.075271606445312, "learning_rate": 6.260882360566988e-08, "loss": 0.4148, "num_input_tokens_seen": 114691280, "step": 120105 }, { "epoch": 9.797699649237295, "grad_norm": 3.487715482711792, "learning_rate": 6.235733588280945e-08, "loss": 0.216, "num_input_tokens_seen": 114696464, "step": 120110 }, { "epoch": 9.798107512847704, "grad_norm": 21.31298065185547, "learning_rate": 6.210635363537165e-08, "loss": 0.3942, "num_input_tokens_seen": 114701152, "step": 120115 }, { "epoch": 9.798515376458113, "grad_norm": 27.716392517089844, "learning_rate": 6.18558768684413e-08, "loss": 0.3627, "num_input_tokens_seen": 114706240, "step": 120120 }, { "epoch": 9.798923240068522, "grad_norm": 15.859220504760742, "learning_rate": 6.160590558709212e-08, "loss": 0.311, "num_input_tokens_seen": 114711152, "step": 120125 }, { "epoch": 9.799331103678929, "grad_norm": 2.4091970920562744, "learning_rate": 6.135643979639782e-08, "loss": 0.4168, "num_input_tokens_seen": 114716688, "step": 120130 }, { "epoch": 9.799738967289338, "grad_norm": 2.5007591247558594, "learning_rate": 6.110747950140994e-08, "loss": 0.1894, "num_input_tokens_seen": 114721920, "step": 120135 }, { "epoch": 9.800146830899747, "grad_norm": 6.932115077972412, "learning_rate": 6.085902470717441e-08, "loss": 0.3453, "num_input_tokens_seen": 114726272, "step": 120140 }, { "epoch": 9.800554694510156, "grad_norm": 28.42730712890625, "learning_rate": 6.061107541873168e-08, "loss": 0.4854, "num_input_tokens_seen": 114730832, "step": 120145 }, { "epoch": 9.800962558120565, "grad_norm": 18.273700714111328, "learning_rate": 6.036363164110548e-08, "loss": 0.3291, "num_input_tokens_seen": 114735296, "step": 120150 }, { "epoch": 9.801370421730972, "grad_norm": 3.2624003887176514, "learning_rate": 6.011669337930848e-08, "loss": 0.3888, "num_input_tokens_seen": 114740368, "step": 120155 }, { "epoch": 9.801778285341381, "grad_norm": 42.701927185058594, "learning_rate": 5.987026063835055e-08, "loss": 0.4176, "num_input_tokens_seen": 114744736, "step": 120160 }, { "epoch": 9.80218614895179, "grad_norm": 6.143246650695801, "learning_rate": 5.962433342322215e-08, "loss": 0.4834, "num_input_tokens_seen": 114750096, "step": 120165 }, { "epoch": 9.8025940125622, "grad_norm": 3.400980234146118, "learning_rate": 5.9378911738913744e-08, "loss": 0.4692, "num_input_tokens_seen": 114754832, "step": 120170 }, { "epoch": 9.803001876172608, "grad_norm": 12.392293930053711, "learning_rate": 5.9133995590393565e-08, "loss": 0.361, "num_input_tokens_seen": 114760480, "step": 120175 }, { "epoch": 9.803409739783017, "grad_norm": 14.595428466796875, "learning_rate": 5.888958498263264e-08, "loss": 0.3382, "num_input_tokens_seen": 114765280, "step": 120180 }, { "epoch": 9.803817603393425, "grad_norm": 10.466523170471191, "learning_rate": 5.864567992057979e-08, "loss": 0.3047, "num_input_tokens_seen": 114769904, "step": 120185 }, { "epoch": 9.804225467003834, "grad_norm": 18.130176544189453, "learning_rate": 5.8402280409181055e-08, "loss": 0.3893, "num_input_tokens_seen": 114773840, "step": 120190 }, { "epoch": 9.804633330614243, "grad_norm": 3.9686083793640137, "learning_rate": 5.8159386453371376e-08, "loss": 0.452, "num_input_tokens_seen": 114778720, "step": 120195 }, { "epoch": 9.805041194224652, "grad_norm": 7.115522861480713, "learning_rate": 5.7916998058071826e-08, "loss": 0.4095, "num_input_tokens_seen": 114783616, "step": 120200 }, { "epoch": 9.80544905783506, "grad_norm": 0.9462494254112244, "learning_rate": 5.767511522819513e-08, "loss": 0.2088, "num_input_tokens_seen": 114788112, "step": 120205 }, { "epoch": 9.805856921445468, "grad_norm": 4.53386926651001, "learning_rate": 5.7433737968645705e-08, "loss": 0.2336, "num_input_tokens_seen": 114792672, "step": 120210 }, { "epoch": 9.806264785055877, "grad_norm": 8.890885353088379, "learning_rate": 5.719286628431686e-08, "loss": 0.3572, "num_input_tokens_seen": 114797136, "step": 120215 }, { "epoch": 9.806672648666286, "grad_norm": 1.5847018957138062, "learning_rate": 5.6952500180085246e-08, "loss": 0.3936, "num_input_tokens_seen": 114802368, "step": 120220 }, { "epoch": 9.807080512276695, "grad_norm": 2.5559468269348145, "learning_rate": 5.671263966083307e-08, "loss": 0.3672, "num_input_tokens_seen": 114806816, "step": 120225 }, { "epoch": 9.807488375887104, "grad_norm": 10.551894187927246, "learning_rate": 5.647328473141478e-08, "loss": 0.3437, "num_input_tokens_seen": 114811376, "step": 120230 }, { "epoch": 9.807896239497513, "grad_norm": 0.6597626805305481, "learning_rate": 5.623443539668205e-08, "loss": 0.3602, "num_input_tokens_seen": 114815744, "step": 120235 }, { "epoch": 9.80830410310792, "grad_norm": 13.786850929260254, "learning_rate": 5.599609166147824e-08, "loss": 0.4264, "num_input_tokens_seen": 114820224, "step": 120240 }, { "epoch": 9.80871196671833, "grad_norm": 40.64881134033203, "learning_rate": 5.5758253530635575e-08, "loss": 0.3546, "num_input_tokens_seen": 114824112, "step": 120245 }, { "epoch": 9.809119830328738, "grad_norm": 2.8020074367523193, "learning_rate": 5.552092100897521e-08, "loss": 0.2847, "num_input_tokens_seen": 114828960, "step": 120250 }, { "epoch": 9.809527693939147, "grad_norm": 31.78194236755371, "learning_rate": 5.5284094101304417e-08, "loss": 0.5314, "num_input_tokens_seen": 114834464, "step": 120255 }, { "epoch": 9.809935557549556, "grad_norm": 1.1237949132919312, "learning_rate": 5.50477728124249e-08, "loss": 0.2735, "num_input_tokens_seen": 114839312, "step": 120260 }, { "epoch": 9.810343421159963, "grad_norm": 18.66818618774414, "learning_rate": 5.481195714713006e-08, "loss": 0.4014, "num_input_tokens_seen": 114843712, "step": 120265 }, { "epoch": 9.810751284770372, "grad_norm": 25.11805534362793, "learning_rate": 5.4576647110193834e-08, "loss": 0.2403, "num_input_tokens_seen": 114848544, "step": 120270 }, { "epoch": 9.811159148380781, "grad_norm": 1.516729474067688, "learning_rate": 5.43418427063902e-08, "loss": 0.3761, "num_input_tokens_seen": 114853360, "step": 120275 }, { "epoch": 9.81156701199119, "grad_norm": 9.078230857849121, "learning_rate": 5.410754394047923e-08, "loss": 0.2869, "num_input_tokens_seen": 114857712, "step": 120280 }, { "epoch": 9.8119748756016, "grad_norm": 9.226594924926758, "learning_rate": 5.387375081720436e-08, "loss": 0.3391, "num_input_tokens_seen": 114862912, "step": 120285 }, { "epoch": 9.812382739212008, "grad_norm": 18.612028121948242, "learning_rate": 5.364046334131179e-08, "loss": 0.3023, "num_input_tokens_seen": 114867616, "step": 120290 }, { "epoch": 9.812790602822416, "grad_norm": 2.6619057655334473, "learning_rate": 5.3407681517522736e-08, "loss": 0.355, "num_input_tokens_seen": 114872480, "step": 120295 }, { "epoch": 9.813198466432825, "grad_norm": 19.229597091674805, "learning_rate": 5.317540535056398e-08, "loss": 0.4013, "num_input_tokens_seen": 114877136, "step": 120300 }, { "epoch": 9.813606330043234, "grad_norm": 3.850503444671631, "learning_rate": 5.294363484513731e-08, "loss": 0.3939, "num_input_tokens_seen": 114882736, "step": 120305 }, { "epoch": 9.814014193653643, "grad_norm": 5.1770405769348145, "learning_rate": 5.271237000594176e-08, "loss": 0.2841, "num_input_tokens_seen": 114888448, "step": 120310 }, { "epoch": 9.814422057264052, "grad_norm": 5.2751264572143555, "learning_rate": 5.248161083766523e-08, "loss": 0.3624, "num_input_tokens_seen": 114893136, "step": 120315 }, { "epoch": 9.814829920874459, "grad_norm": 9.285972595214844, "learning_rate": 5.2251357344987315e-08, "loss": 0.1366, "num_input_tokens_seen": 114897984, "step": 120320 }, { "epoch": 9.815237784484868, "grad_norm": 27.569271087646484, "learning_rate": 5.202160953257373e-08, "loss": 0.4146, "num_input_tokens_seen": 114902848, "step": 120325 }, { "epoch": 9.815645648095277, "grad_norm": 50.259273529052734, "learning_rate": 5.179236740507909e-08, "loss": 0.3862, "num_input_tokens_seen": 114906768, "step": 120330 }, { "epoch": 9.816053511705686, "grad_norm": 6.305515289306641, "learning_rate": 5.156363096715244e-08, "loss": 0.4569, "num_input_tokens_seen": 114910864, "step": 120335 }, { "epoch": 9.816461375316095, "grad_norm": 6.613213062286377, "learning_rate": 5.133540022343175e-08, "loss": 0.4028, "num_input_tokens_seen": 114916528, "step": 120340 }, { "epoch": 9.816869238926502, "grad_norm": 16.7322940826416, "learning_rate": 5.1107675178538315e-08, "loss": 0.4173, "num_input_tokens_seen": 114921680, "step": 120345 }, { "epoch": 9.817277102536911, "grad_norm": 20.2473201751709, "learning_rate": 5.0880455837093445e-08, "loss": 0.446, "num_input_tokens_seen": 114926752, "step": 120350 }, { "epoch": 9.81768496614732, "grad_norm": 50.74196243286133, "learning_rate": 5.0653742203699005e-08, "loss": 0.2818, "num_input_tokens_seen": 114932224, "step": 120355 }, { "epoch": 9.81809282975773, "grad_norm": 3.922312021255493, "learning_rate": 5.0427534282951327e-08, "loss": 0.3498, "num_input_tokens_seen": 114937472, "step": 120360 }, { "epoch": 9.818500693368138, "grad_norm": 18.520923614501953, "learning_rate": 5.020183207943563e-08, "loss": 0.4242, "num_input_tokens_seen": 114942128, "step": 120365 }, { "epoch": 9.818908556978545, "grad_norm": 2.373413562774658, "learning_rate": 4.997663559772881e-08, "loss": 0.3671, "num_input_tokens_seen": 114946816, "step": 120370 }, { "epoch": 9.819316420588954, "grad_norm": 1.7100931406021118, "learning_rate": 4.97519448423911e-08, "loss": 0.3792, "num_input_tokens_seen": 114951712, "step": 120375 }, { "epoch": 9.819724284199363, "grad_norm": 1.6674773693084717, "learning_rate": 4.9527759817979966e-08, "loss": 0.2142, "num_input_tokens_seen": 114957328, "step": 120380 }, { "epoch": 9.820132147809773, "grad_norm": 30.236785888671875, "learning_rate": 4.9304080529041785e-08, "loss": 0.4857, "num_input_tokens_seen": 114962400, "step": 120385 }, { "epoch": 9.820540011420182, "grad_norm": 25.2840576171875, "learning_rate": 4.9080906980106255e-08, "loss": 0.2662, "num_input_tokens_seen": 114966208, "step": 120390 }, { "epoch": 9.82094787503059, "grad_norm": 31.268016815185547, "learning_rate": 4.885823917569754e-08, "loss": 0.3757, "num_input_tokens_seen": 114971840, "step": 120395 }, { "epoch": 9.821355738640998, "grad_norm": 10.184281349182129, "learning_rate": 4.863607712033147e-08, "loss": 0.3907, "num_input_tokens_seen": 114976944, "step": 120400 }, { "epoch": 9.821763602251407, "grad_norm": 2.353592872619629, "learning_rate": 4.841442081850722e-08, "loss": 0.4635, "num_input_tokens_seen": 114982080, "step": 120405 }, { "epoch": 9.822171465861816, "grad_norm": 15.217863082885742, "learning_rate": 4.81932702747212e-08, "loss": 0.4336, "num_input_tokens_seen": 114986416, "step": 120410 }, { "epoch": 9.822579329472225, "grad_norm": 43.30223846435547, "learning_rate": 4.7972625493455936e-08, "loss": 0.2359, "num_input_tokens_seen": 114991040, "step": 120415 }, { "epoch": 9.822987193082634, "grad_norm": 1.6479498147964478, "learning_rate": 4.775248647918284e-08, "loss": 0.3618, "num_input_tokens_seen": 114996192, "step": 120420 }, { "epoch": 9.823395056693041, "grad_norm": 1.6503005027770996, "learning_rate": 4.753285323636225e-08, "loss": 0.3786, "num_input_tokens_seen": 115001136, "step": 120425 }, { "epoch": 9.82380292030345, "grad_norm": 8.331310272216797, "learning_rate": 4.731372576944892e-08, "loss": 0.3305, "num_input_tokens_seen": 115005664, "step": 120430 }, { "epoch": 9.824210783913859, "grad_norm": 4.7991838455200195, "learning_rate": 4.7095104082883755e-08, "loss": 0.4038, "num_input_tokens_seen": 115009904, "step": 120435 }, { "epoch": 9.824618647524268, "grad_norm": 18.017499923706055, "learning_rate": 4.687698818109654e-08, "loss": 0.3712, "num_input_tokens_seen": 115014672, "step": 120440 }, { "epoch": 9.825026511134677, "grad_norm": 13.295994758605957, "learning_rate": 4.665937806850873e-08, "loss": 0.3663, "num_input_tokens_seen": 115019168, "step": 120445 }, { "epoch": 9.825434374745086, "grad_norm": 1.5039458274841309, "learning_rate": 4.644227374953347e-08, "loss": 0.4544, "num_input_tokens_seen": 115022592, "step": 120450 }, { "epoch": 9.825842238355493, "grad_norm": 23.990554809570312, "learning_rate": 4.622567522857002e-08, "loss": 0.3344, "num_input_tokens_seen": 115028128, "step": 120455 }, { "epoch": 9.826250101965902, "grad_norm": 1.0824397802352905, "learning_rate": 4.600958251000931e-08, "loss": 0.205, "num_input_tokens_seen": 115033536, "step": 120460 }, { "epoch": 9.826657965576311, "grad_norm": 1.4308204650878906, "learning_rate": 4.579399559822839e-08, "loss": 0.3616, "num_input_tokens_seen": 115038224, "step": 120465 }, { "epoch": 9.82706582918672, "grad_norm": 8.133378028869629, "learning_rate": 4.557891449760154e-08, "loss": 0.5127, "num_input_tokens_seen": 115042800, "step": 120470 }, { "epoch": 9.82747369279713, "grad_norm": 6.9844584465026855, "learning_rate": 4.5364339212486395e-08, "loss": 0.3769, "num_input_tokens_seen": 115048320, "step": 120475 }, { "epoch": 9.827881556407537, "grad_norm": 2.485372543334961, "learning_rate": 4.5150269747232244e-08, "loss": 0.3457, "num_input_tokens_seen": 115052480, "step": 120480 }, { "epoch": 9.828289420017946, "grad_norm": 25.087749481201172, "learning_rate": 4.493670610617729e-08, "loss": 0.3628, "num_input_tokens_seen": 115057520, "step": 120485 }, { "epoch": 9.828697283628355, "grad_norm": 35.28036117553711, "learning_rate": 4.4723648293651385e-08, "loss": 0.3437, "num_input_tokens_seen": 115061904, "step": 120490 }, { "epoch": 9.829105147238764, "grad_norm": 1.5067722797393799, "learning_rate": 4.451109631397332e-08, "loss": 0.1664, "num_input_tokens_seen": 115067344, "step": 120495 }, { "epoch": 9.829513010849173, "grad_norm": 19.85407066345215, "learning_rate": 4.429905017145075e-08, "loss": 0.4407, "num_input_tokens_seen": 115070864, "step": 120500 }, { "epoch": 9.829920874459582, "grad_norm": 12.952990531921387, "learning_rate": 4.408750987038024e-08, "loss": 0.3699, "num_input_tokens_seen": 115075152, "step": 120505 }, { "epoch": 9.830328738069989, "grad_norm": 7.596090793609619, "learning_rate": 4.38764754150528e-08, "loss": 0.3894, "num_input_tokens_seen": 115079280, "step": 120510 }, { "epoch": 9.830736601680398, "grad_norm": 25.763364791870117, "learning_rate": 4.366594680974556e-08, "loss": 0.4228, "num_input_tokens_seen": 115083472, "step": 120515 }, { "epoch": 9.831144465290807, "grad_norm": 10.364996910095215, "learning_rate": 4.345592405872734e-08, "loss": 0.3063, "num_input_tokens_seen": 115088784, "step": 120520 }, { "epoch": 9.831552328901216, "grad_norm": 2.1139917373657227, "learning_rate": 4.324640716625028e-08, "loss": 0.3219, "num_input_tokens_seen": 115093232, "step": 120525 }, { "epoch": 9.831960192511625, "grad_norm": 1.3357882499694824, "learning_rate": 4.303739613656099e-08, "loss": 0.3543, "num_input_tokens_seen": 115097760, "step": 120530 }, { "epoch": 9.832368056122032, "grad_norm": 5.079653739929199, "learning_rate": 4.28288909739033e-08, "loss": 0.3455, "num_input_tokens_seen": 115101872, "step": 120535 }, { "epoch": 9.832775919732441, "grad_norm": 8.531807899475098, "learning_rate": 4.262089168249605e-08, "loss": 0.4205, "num_input_tokens_seen": 115107248, "step": 120540 }, { "epoch": 9.83318378334285, "grad_norm": 27.873363494873047, "learning_rate": 4.2413398266560877e-08, "loss": 0.2501, "num_input_tokens_seen": 115112176, "step": 120545 }, { "epoch": 9.83359164695326, "grad_norm": 19.385265350341797, "learning_rate": 4.220641073029996e-08, "loss": 0.4212, "num_input_tokens_seen": 115117136, "step": 120550 }, { "epoch": 9.833999510563668, "grad_norm": 1.1122125387191772, "learning_rate": 4.199992907790995e-08, "loss": 0.2301, "num_input_tokens_seen": 115121232, "step": 120555 }, { "epoch": 9.834407374174075, "grad_norm": 4.18986177444458, "learning_rate": 4.1793953313576387e-08, "loss": 0.3697, "num_input_tokens_seen": 115125568, "step": 120560 }, { "epoch": 9.834815237784484, "grad_norm": 56.38526916503906, "learning_rate": 4.1588483441473704e-08, "loss": 0.4941, "num_input_tokens_seen": 115130448, "step": 120565 }, { "epoch": 9.835223101394893, "grad_norm": 1.5596282482147217, "learning_rate": 4.1383519465765244e-08, "loss": 0.1417, "num_input_tokens_seen": 115134896, "step": 120570 }, { "epoch": 9.835630965005302, "grad_norm": 1.9879817962646484, "learning_rate": 4.1179061390608784e-08, "loss": 0.328, "num_input_tokens_seen": 115139840, "step": 120575 }, { "epoch": 9.836038828615711, "grad_norm": 1.0274847745895386, "learning_rate": 4.0975109220148243e-08, "loss": 0.3595, "num_input_tokens_seen": 115144432, "step": 120580 }, { "epoch": 9.836446692226119, "grad_norm": 3.050257682800293, "learning_rate": 4.077166295851365e-08, "loss": 0.2697, "num_input_tokens_seen": 115149104, "step": 120585 }, { "epoch": 9.836854555836528, "grad_norm": 1.8821451663970947, "learning_rate": 4.056872260983502e-08, "loss": 0.1819, "num_input_tokens_seen": 115153568, "step": 120590 }, { "epoch": 9.837262419446937, "grad_norm": 9.571203231811523, "learning_rate": 4.036628817822019e-08, "loss": 0.3598, "num_input_tokens_seen": 115158480, "step": 120595 }, { "epoch": 9.837670283057346, "grad_norm": 17.8131160736084, "learning_rate": 4.016435966777421e-08, "loss": 0.4341, "num_input_tokens_seen": 115164000, "step": 120600 }, { "epoch": 9.838078146667755, "grad_norm": 2.445024013519287, "learning_rate": 3.99629370825938e-08, "loss": 0.3444, "num_input_tokens_seen": 115168432, "step": 120605 }, { "epoch": 9.838486010278164, "grad_norm": 3.4068710803985596, "learning_rate": 3.976202042675625e-08, "loss": 0.24, "num_input_tokens_seen": 115172384, "step": 120610 }, { "epoch": 9.838893873888571, "grad_norm": 17.90643882751465, "learning_rate": 3.956160970433609e-08, "loss": 0.2702, "num_input_tokens_seen": 115176832, "step": 120615 }, { "epoch": 9.83930173749898, "grad_norm": 7.0583086013793945, "learning_rate": 3.936170491939672e-08, "loss": 0.4383, "num_input_tokens_seen": 115181760, "step": 120620 }, { "epoch": 9.839709601109389, "grad_norm": 4.275295734405518, "learning_rate": 3.91623060759877e-08, "loss": 0.3804, "num_input_tokens_seen": 115186272, "step": 120625 }, { "epoch": 9.840117464719798, "grad_norm": 10.961751937866211, "learning_rate": 3.8963413178153e-08, "loss": 0.2317, "num_input_tokens_seen": 115191008, "step": 120630 }, { "epoch": 9.840525328330207, "grad_norm": 6.939073085784912, "learning_rate": 3.876502622992273e-08, "loss": 0.2779, "num_input_tokens_seen": 115196688, "step": 120635 }, { "epoch": 9.840933191940614, "grad_norm": 25.36009979248047, "learning_rate": 3.856714523531868e-08, "loss": 0.5104, "num_input_tokens_seen": 115200480, "step": 120640 }, { "epoch": 9.841341055551023, "grad_norm": 2.042797327041626, "learning_rate": 3.83697701983543e-08, "loss": 0.5739, "num_input_tokens_seen": 115205056, "step": 120645 }, { "epoch": 9.841748919161432, "grad_norm": 5.053013324737549, "learning_rate": 3.8172901123026406e-08, "loss": 0.3853, "num_input_tokens_seen": 115210080, "step": 120650 }, { "epoch": 9.842156782771841, "grad_norm": 31.292577743530273, "learning_rate": 3.797653801332623e-08, "loss": 0.3522, "num_input_tokens_seen": 115214512, "step": 120655 }, { "epoch": 9.84256464638225, "grad_norm": 51.93906784057617, "learning_rate": 3.778068087323394e-08, "loss": 0.3574, "num_input_tokens_seen": 115220176, "step": 120660 }, { "epoch": 9.84297250999266, "grad_norm": 1.7632347345352173, "learning_rate": 3.758532970672135e-08, "loss": 0.2937, "num_input_tokens_seen": 115225552, "step": 120665 }, { "epoch": 9.843380373603067, "grad_norm": 33.4276237487793, "learning_rate": 3.73904845177464e-08, "loss": 0.2315, "num_input_tokens_seen": 115229648, "step": 120670 }, { "epoch": 9.843788237213476, "grad_norm": 12.671587944030762, "learning_rate": 3.71961453102615e-08, "loss": 0.2645, "num_input_tokens_seen": 115234400, "step": 120675 }, { "epoch": 9.844196100823885, "grad_norm": 11.127443313598633, "learning_rate": 3.70023120881996e-08, "loss": 0.2824, "num_input_tokens_seen": 115238816, "step": 120680 }, { "epoch": 9.844603964434294, "grad_norm": 7.649962425231934, "learning_rate": 3.6808984855496445e-08, "loss": 0.4789, "num_input_tokens_seen": 115242320, "step": 120685 }, { "epoch": 9.845011828044703, "grad_norm": 1.2089229822158813, "learning_rate": 3.661616361606557e-08, "loss": 0.2736, "num_input_tokens_seen": 115246432, "step": 120690 }, { "epoch": 9.84541969165511, "grad_norm": 4.011227130889893, "learning_rate": 3.6423848373817736e-08, "loss": 0.5417, "num_input_tokens_seen": 115251808, "step": 120695 }, { "epoch": 9.845827555265519, "grad_norm": 17.733980178833008, "learning_rate": 3.6232039132652605e-08, "loss": 0.3158, "num_input_tokens_seen": 115257152, "step": 120700 }, { "epoch": 9.846235418875928, "grad_norm": 19.884326934814453, "learning_rate": 3.604073589645596e-08, "loss": 0.4037, "num_input_tokens_seen": 115261632, "step": 120705 }, { "epoch": 9.846643282486337, "grad_norm": 21.63292121887207, "learning_rate": 3.584993866910247e-08, "loss": 0.3515, "num_input_tokens_seen": 115266320, "step": 120710 }, { "epoch": 9.847051146096746, "grad_norm": 3.647019147872925, "learning_rate": 3.565964745446682e-08, "loss": 0.3138, "num_input_tokens_seen": 115270224, "step": 120715 }, { "epoch": 9.847459009707155, "grad_norm": 15.73767375946045, "learning_rate": 3.5469862256401474e-08, "loss": 0.435, "num_input_tokens_seen": 115275552, "step": 120720 }, { "epoch": 9.847866873317562, "grad_norm": 7.465544700622559, "learning_rate": 3.528058307875337e-08, "loss": 0.3968, "num_input_tokens_seen": 115280432, "step": 120725 }, { "epoch": 9.848274736927971, "grad_norm": 1.710564136505127, "learning_rate": 3.509180992535832e-08, "loss": 0.3115, "num_input_tokens_seen": 115285376, "step": 120730 }, { "epoch": 9.84868260053838, "grad_norm": 21.138229370117188, "learning_rate": 3.490354280004382e-08, "loss": 0.2217, "num_input_tokens_seen": 115290464, "step": 120735 }, { "epoch": 9.849090464148789, "grad_norm": 58.81050491333008, "learning_rate": 3.4715781706629035e-08, "loss": 0.4298, "num_input_tokens_seen": 115295568, "step": 120740 }, { "epoch": 9.849498327759198, "grad_norm": 6.340307712554932, "learning_rate": 3.4528526648913707e-08, "loss": 0.3209, "num_input_tokens_seen": 115299664, "step": 120745 }, { "epoch": 9.849906191369605, "grad_norm": 1.3820061683654785, "learning_rate": 3.4341777630700345e-08, "loss": 0.164, "num_input_tokens_seen": 115304336, "step": 120750 }, { "epoch": 9.850314054980014, "grad_norm": 6.893636703491211, "learning_rate": 3.4155534655766484e-08, "loss": 0.2894, "num_input_tokens_seen": 115308816, "step": 120755 }, { "epoch": 9.850721918590423, "grad_norm": 0.5004813075065613, "learning_rate": 3.396979772789521e-08, "loss": 0.3268, "num_input_tokens_seen": 115313344, "step": 120760 }, { "epoch": 9.851129782200832, "grad_norm": 39.870933532714844, "learning_rate": 3.378456685084464e-08, "loss": 0.3903, "num_input_tokens_seen": 115317920, "step": 120765 }, { "epoch": 9.851537645811241, "grad_norm": 3.1381947994232178, "learning_rate": 3.359984202837285e-08, "loss": 0.2688, "num_input_tokens_seen": 115322304, "step": 120770 }, { "epoch": 9.851945509421649, "grad_norm": 52.44768142700195, "learning_rate": 3.3415623264224095e-08, "loss": 0.3824, "num_input_tokens_seen": 115326992, "step": 120775 }, { "epoch": 9.852353373032058, "grad_norm": 2.3405489921569824, "learning_rate": 3.3231910562131484e-08, "loss": 0.4797, "num_input_tokens_seen": 115331888, "step": 120780 }, { "epoch": 9.852761236642467, "grad_norm": 31.358333587646484, "learning_rate": 3.3048703925819824e-08, "loss": 0.2401, "num_input_tokens_seen": 115337344, "step": 120785 }, { "epoch": 9.853169100252876, "grad_norm": 17.3231201171875, "learning_rate": 3.286600335900003e-08, "loss": 0.3163, "num_input_tokens_seen": 115341008, "step": 120790 }, { "epoch": 9.853576963863285, "grad_norm": 30.66344451904297, "learning_rate": 3.268380886537747e-08, "loss": 0.3586, "num_input_tokens_seen": 115346448, "step": 120795 }, { "epoch": 9.853984827473692, "grad_norm": 5.664291858673096, "learning_rate": 3.250212044864642e-08, "loss": 0.4754, "num_input_tokens_seen": 115350416, "step": 120800 }, { "epoch": 9.8543926910841, "grad_norm": 2.5822348594665527, "learning_rate": 3.232093811248449e-08, "loss": 0.3324, "num_input_tokens_seen": 115354624, "step": 120805 }, { "epoch": 9.85480055469451, "grad_norm": 27.375810623168945, "learning_rate": 3.214026186057206e-08, "loss": 0.29, "num_input_tokens_seen": 115359424, "step": 120810 }, { "epoch": 9.855208418304919, "grad_norm": 4.237990856170654, "learning_rate": 3.1960091696564554e-08, "loss": 0.1467, "num_input_tokens_seen": 115364128, "step": 120815 }, { "epoch": 9.855616281915328, "grad_norm": 46.59095001220703, "learning_rate": 3.178042762411737e-08, "loss": 0.3977, "num_input_tokens_seen": 115368784, "step": 120820 }, { "epoch": 9.856024145525737, "grad_norm": 42.83207702636719, "learning_rate": 3.160126964687204e-08, "loss": 0.1942, "num_input_tokens_seen": 115373168, "step": 120825 }, { "epoch": 9.856432009136144, "grad_norm": 1.435250163078308, "learning_rate": 3.142261776845623e-08, "loss": 0.6043, "num_input_tokens_seen": 115377392, "step": 120830 }, { "epoch": 9.856839872746553, "grad_norm": 34.74918746948242, "learning_rate": 3.124447199249758e-08, "loss": 0.3233, "num_input_tokens_seen": 115382752, "step": 120835 }, { "epoch": 9.857247736356962, "grad_norm": 2.3520236015319824, "learning_rate": 3.1066832322601546e-08, "loss": 0.1608, "num_input_tokens_seen": 115387088, "step": 120840 }, { "epoch": 9.857655599967371, "grad_norm": 11.840044021606445, "learning_rate": 3.08896987623708e-08, "loss": 0.25, "num_input_tokens_seen": 115391712, "step": 120845 }, { "epoch": 9.85806346357778, "grad_norm": 30.233919143676758, "learning_rate": 3.071307131539691e-08, "loss": 0.3553, "num_input_tokens_seen": 115396784, "step": 120850 }, { "epoch": 9.858471327188187, "grad_norm": 3.2015011310577393, "learning_rate": 3.053694998526036e-08, "loss": 0.3209, "num_input_tokens_seen": 115401632, "step": 120855 }, { "epoch": 9.858879190798596, "grad_norm": 22.022769927978516, "learning_rate": 3.036133477552772e-08, "loss": 0.2698, "num_input_tokens_seen": 115405920, "step": 120860 }, { "epoch": 9.859287054409005, "grad_norm": 41.712249755859375, "learning_rate": 3.018622568976004e-08, "loss": 0.4264, "num_input_tokens_seen": 115410048, "step": 120865 }, { "epoch": 9.859694918019414, "grad_norm": 1.817623496055603, "learning_rate": 3.001162273150726e-08, "loss": 0.2814, "num_input_tokens_seen": 115414640, "step": 120870 }, { "epoch": 9.860102781629823, "grad_norm": 12.511033058166504, "learning_rate": 2.983752590431099e-08, "loss": 0.2485, "num_input_tokens_seen": 115420064, "step": 120875 }, { "epoch": 9.860510645240232, "grad_norm": 4.602372169494629, "learning_rate": 2.9663935211693418e-08, "loss": 0.2359, "num_input_tokens_seen": 115425952, "step": 120880 }, { "epoch": 9.86091850885064, "grad_norm": 0.7540531158447266, "learning_rate": 2.9490850657179493e-08, "loss": 0.1855, "num_input_tokens_seen": 115430976, "step": 120885 }, { "epoch": 9.861326372461049, "grad_norm": 33.64332962036133, "learning_rate": 2.931827224427752e-08, "loss": 0.2084, "num_input_tokens_seen": 115436096, "step": 120890 }, { "epoch": 9.861734236071458, "grad_norm": 5.783391952514648, "learning_rate": 2.9146199976481938e-08, "loss": 0.4528, "num_input_tokens_seen": 115440880, "step": 120895 }, { "epoch": 9.862142099681867, "grad_norm": 3.964272975921631, "learning_rate": 2.8974633857281608e-08, "loss": 0.234, "num_input_tokens_seen": 115446352, "step": 120900 }, { "epoch": 9.862549963292276, "grad_norm": 40.85072708129883, "learning_rate": 2.8803573890154313e-08, "loss": 0.5114, "num_input_tokens_seen": 115451584, "step": 120905 }, { "epoch": 9.862957826902683, "grad_norm": 28.30987548828125, "learning_rate": 2.863302007856672e-08, "loss": 0.3194, "num_input_tokens_seen": 115456336, "step": 120910 }, { "epoch": 9.863365690513092, "grad_norm": 1.3672577142715454, "learning_rate": 2.846297242597995e-08, "loss": 0.2056, "num_input_tokens_seen": 115461456, "step": 120915 }, { "epoch": 9.863773554123501, "grad_norm": 23.730634689331055, "learning_rate": 2.8293430935835695e-08, "loss": 0.4366, "num_input_tokens_seen": 115466512, "step": 120920 }, { "epoch": 9.86418141773391, "grad_norm": 4.723190784454346, "learning_rate": 2.8124395611572873e-08, "loss": 0.2336, "num_input_tokens_seen": 115471488, "step": 120925 }, { "epoch": 9.864589281344319, "grad_norm": 30.33035659790039, "learning_rate": 2.795586645661652e-08, "loss": 0.3982, "num_input_tokens_seen": 115476736, "step": 120930 }, { "epoch": 9.864997144954728, "grad_norm": 3.8344640731811523, "learning_rate": 2.7787843474386123e-08, "loss": 0.4575, "num_input_tokens_seen": 115481280, "step": 120935 }, { "epoch": 9.865405008565135, "grad_norm": 0.3883351981639862, "learning_rate": 2.7620326668281738e-08, "loss": 0.1826, "num_input_tokens_seen": 115486832, "step": 120940 }, { "epoch": 9.865812872175544, "grad_norm": 1.8501344919204712, "learning_rate": 2.7453316041703425e-08, "loss": 0.2011, "num_input_tokens_seen": 115491680, "step": 120945 }, { "epoch": 9.866220735785953, "grad_norm": 1.986887812614441, "learning_rate": 2.728681159803459e-08, "loss": 0.2288, "num_input_tokens_seen": 115496608, "step": 120950 }, { "epoch": 9.866628599396362, "grad_norm": 70.93244934082031, "learning_rate": 2.712081334065031e-08, "loss": 0.4495, "num_input_tokens_seen": 115501744, "step": 120955 }, { "epoch": 9.867036463006771, "grad_norm": 2.32370662689209, "learning_rate": 2.6955321272917333e-08, "loss": 0.3177, "num_input_tokens_seen": 115506016, "step": 120960 }, { "epoch": 9.867444326617179, "grad_norm": 1.381567358970642, "learning_rate": 2.6790335398185763e-08, "loss": 0.2982, "num_input_tokens_seen": 115511040, "step": 120965 }, { "epoch": 9.867852190227588, "grad_norm": 1.669590711593628, "learning_rate": 2.6625855719802917e-08, "loss": 0.4761, "num_input_tokens_seen": 115515856, "step": 120970 }, { "epoch": 9.868260053837997, "grad_norm": 19.74271583557129, "learning_rate": 2.6461882241102242e-08, "loss": 0.3944, "num_input_tokens_seen": 115520640, "step": 120975 }, { "epoch": 9.868667917448406, "grad_norm": 3.3719587326049805, "learning_rate": 2.6298414965408857e-08, "loss": 0.2244, "num_input_tokens_seen": 115525680, "step": 120980 }, { "epoch": 9.869075781058815, "grad_norm": 5.965903282165527, "learning_rate": 2.6135453896034003e-08, "loss": 0.3282, "num_input_tokens_seen": 115529744, "step": 120985 }, { "epoch": 9.869483644669222, "grad_norm": 8.749190330505371, "learning_rate": 2.597299903628059e-08, "loss": 0.4412, "num_input_tokens_seen": 115534544, "step": 120990 }, { "epoch": 9.86989150827963, "grad_norm": 7.353696823120117, "learning_rate": 2.58110503894432e-08, "loss": 0.2526, "num_input_tokens_seen": 115537872, "step": 120995 }, { "epoch": 9.87029937189004, "grad_norm": 0.7387481927871704, "learning_rate": 2.5649607958802556e-08, "loss": 0.3367, "num_input_tokens_seen": 115542768, "step": 121000 }, { "epoch": 9.870707235500449, "grad_norm": 17.76398277282715, "learning_rate": 2.5488671747631032e-08, "loss": 0.3445, "num_input_tokens_seen": 115546528, "step": 121005 }, { "epoch": 9.871115099110858, "grad_norm": 6.589578151702881, "learning_rate": 2.532824175919546e-08, "loss": 0.3274, "num_input_tokens_seen": 115551520, "step": 121010 }, { "epoch": 9.871522962721267, "grad_norm": 9.649255752563477, "learning_rate": 2.5168317996740465e-08, "loss": 0.4661, "num_input_tokens_seen": 115556176, "step": 121015 }, { "epoch": 9.871930826331674, "grad_norm": 1.8604902029037476, "learning_rate": 2.5008900463513452e-08, "loss": 0.2147, "num_input_tokens_seen": 115561200, "step": 121020 }, { "epoch": 9.872338689942083, "grad_norm": 7.2909016609191895, "learning_rate": 2.4849989162742393e-08, "loss": 0.4037, "num_input_tokens_seen": 115565792, "step": 121025 }, { "epoch": 9.872746553552492, "grad_norm": 22.1502742767334, "learning_rate": 2.469158409764971e-08, "loss": 0.4403, "num_input_tokens_seen": 115570608, "step": 121030 }, { "epoch": 9.873154417162901, "grad_norm": 13.335241317749023, "learning_rate": 2.4533685271446728e-08, "loss": 0.3134, "num_input_tokens_seen": 115575792, "step": 121035 }, { "epoch": 9.87356228077331, "grad_norm": 0.7517856955528259, "learning_rate": 2.4376292687333656e-08, "loss": 0.5175, "num_input_tokens_seen": 115580016, "step": 121040 }, { "epoch": 9.873970144383717, "grad_norm": 1.464615821838379, "learning_rate": 2.4219406348499616e-08, "loss": 0.3765, "num_input_tokens_seen": 115583344, "step": 121045 }, { "epoch": 9.874378007994126, "grad_norm": 46.346805572509766, "learning_rate": 2.4063026258122624e-08, "loss": 0.295, "num_input_tokens_seen": 115588336, "step": 121050 }, { "epoch": 9.874785871604535, "grad_norm": 41.154239654541016, "learning_rate": 2.390715241937791e-08, "loss": 0.4303, "num_input_tokens_seen": 115591984, "step": 121055 }, { "epoch": 9.875193735214944, "grad_norm": 3.663578510284424, "learning_rate": 2.3751784835424063e-08, "loss": 0.3212, "num_input_tokens_seen": 115597424, "step": 121060 }, { "epoch": 9.875601598825353, "grad_norm": 17.835309982299805, "learning_rate": 2.3596923509405788e-08, "loss": 0.2747, "num_input_tokens_seen": 115602928, "step": 121065 }, { "epoch": 9.87600946243576, "grad_norm": 70.96162414550781, "learning_rate": 2.344256844446502e-08, "loss": 0.3391, "num_input_tokens_seen": 115608064, "step": 121070 }, { "epoch": 9.87641732604617, "grad_norm": 0.927800178527832, "learning_rate": 2.3288719643729807e-08, "loss": 0.3855, "num_input_tokens_seen": 115612944, "step": 121075 }, { "epoch": 9.876825189656579, "grad_norm": 17.115629196166992, "learning_rate": 2.3135377110319877e-08, "loss": 0.4912, "num_input_tokens_seen": 115617344, "step": 121080 }, { "epoch": 9.877233053266988, "grad_norm": 35.273128509521484, "learning_rate": 2.2982540847343858e-08, "loss": 0.4029, "num_input_tokens_seen": 115622016, "step": 121085 }, { "epoch": 9.877640916877397, "grad_norm": 0.9882810115814209, "learning_rate": 2.2830210857896496e-08, "loss": 0.265, "num_input_tokens_seen": 115627472, "step": 121090 }, { "epoch": 9.878048780487806, "grad_norm": 7.377237796783447, "learning_rate": 2.2678387145066982e-08, "loss": 0.1868, "num_input_tokens_seen": 115632880, "step": 121095 }, { "epoch": 9.878456644098213, "grad_norm": 7.715607643127441, "learning_rate": 2.252706971193341e-08, "loss": 0.3696, "num_input_tokens_seen": 115637536, "step": 121100 }, { "epoch": 9.878864507708622, "grad_norm": 26.428699493408203, "learning_rate": 2.237625856156278e-08, "loss": 0.2664, "num_input_tokens_seen": 115642208, "step": 121105 }, { "epoch": 9.879272371319031, "grad_norm": 0.6551545262336731, "learning_rate": 2.2225953697013746e-08, "loss": 0.3893, "num_input_tokens_seen": 115647136, "step": 121110 }, { "epoch": 9.87968023492944, "grad_norm": 41.733131408691406, "learning_rate": 2.2076155121328323e-08, "loss": 0.3831, "num_input_tokens_seen": 115651216, "step": 121115 }, { "epoch": 9.880088098539849, "grad_norm": 29.625991821289062, "learning_rate": 2.192686283754575e-08, "loss": 0.3718, "num_input_tokens_seen": 115655712, "step": 121120 }, { "epoch": 9.880495962150256, "grad_norm": 14.157331466674805, "learning_rate": 2.177807684869415e-08, "loss": 0.4337, "num_input_tokens_seen": 115660608, "step": 121125 }, { "epoch": 9.880903825760665, "grad_norm": 3.6699137687683105, "learning_rate": 2.1629797157785013e-08, "loss": 0.2759, "num_input_tokens_seen": 115665136, "step": 121130 }, { "epoch": 9.881311689371074, "grad_norm": 37.64492416381836, "learning_rate": 2.1482023767824267e-08, "loss": 0.442, "num_input_tokens_seen": 115669504, "step": 121135 }, { "epoch": 9.881719552981483, "grad_norm": 6.328753471374512, "learning_rate": 2.1334756681812286e-08, "loss": 0.3768, "num_input_tokens_seen": 115673920, "step": 121140 }, { "epoch": 9.882127416591892, "grad_norm": 3.827501058578491, "learning_rate": 2.1187995902730018e-08, "loss": 0.4078, "num_input_tokens_seen": 115678160, "step": 121145 }, { "epoch": 9.882535280202301, "grad_norm": 39.61811828613281, "learning_rate": 2.104174143355009e-08, "loss": 0.3065, "num_input_tokens_seen": 115683280, "step": 121150 }, { "epoch": 9.882943143812708, "grad_norm": 7.699056148529053, "learning_rate": 2.0895993277242342e-08, "loss": 0.4683, "num_input_tokens_seen": 115687072, "step": 121155 }, { "epoch": 9.883351007423117, "grad_norm": 2.347683906555176, "learning_rate": 2.0750751436759974e-08, "loss": 0.4435, "num_input_tokens_seen": 115691360, "step": 121160 }, { "epoch": 9.883758871033526, "grad_norm": 0.7201099991798401, "learning_rate": 2.06060159150423e-08, "loss": 0.4831, "num_input_tokens_seen": 115695008, "step": 121165 }, { "epoch": 9.884166734643935, "grad_norm": 0.8884370923042297, "learning_rate": 2.046178671502863e-08, "loss": 0.4223, "num_input_tokens_seen": 115699456, "step": 121170 }, { "epoch": 9.884574598254344, "grad_norm": 10.053275108337402, "learning_rate": 2.0318063839638856e-08, "loss": 0.194, "num_input_tokens_seen": 115704336, "step": 121175 }, { "epoch": 9.884982461864752, "grad_norm": 19.874788284301758, "learning_rate": 2.0174847291787314e-08, "loss": 0.1505, "num_input_tokens_seen": 115709408, "step": 121180 }, { "epoch": 9.88539032547516, "grad_norm": 3.0080931186676025, "learning_rate": 2.0032137074377233e-08, "loss": 0.3494, "num_input_tokens_seen": 115714048, "step": 121185 }, { "epoch": 9.88579818908557, "grad_norm": 34.39868927001953, "learning_rate": 1.9889933190300748e-08, "loss": 0.3557, "num_input_tokens_seen": 115718544, "step": 121190 }, { "epoch": 9.886206052695979, "grad_norm": 1.5596665143966675, "learning_rate": 1.974823564244166e-08, "loss": 0.2665, "num_input_tokens_seen": 115723072, "step": 121195 }, { "epoch": 9.886613916306388, "grad_norm": 0.8061416149139404, "learning_rate": 1.9607044433669896e-08, "loss": 0.3669, "num_input_tokens_seen": 115727040, "step": 121200 }, { "epoch": 9.887021779916797, "grad_norm": 39.054317474365234, "learning_rate": 1.9466359566849833e-08, "loss": 0.5419, "num_input_tokens_seen": 115732272, "step": 121205 }, { "epoch": 9.887429643527204, "grad_norm": 2.2974750995635986, "learning_rate": 1.932618104483197e-08, "loss": 0.2533, "num_input_tokens_seen": 115737296, "step": 121210 }, { "epoch": 9.887837507137613, "grad_norm": 8.760652542114258, "learning_rate": 1.9186508870455698e-08, "loss": 0.4973, "num_input_tokens_seen": 115741872, "step": 121215 }, { "epoch": 9.888245370748022, "grad_norm": 8.137956619262695, "learning_rate": 1.9047343046554865e-08, "loss": 0.2808, "num_input_tokens_seen": 115746256, "step": 121220 }, { "epoch": 9.888653234358431, "grad_norm": 3.2144877910614014, "learning_rate": 1.8908683575949438e-08, "loss": 0.3766, "num_input_tokens_seen": 115750336, "step": 121225 }, { "epoch": 9.88906109796884, "grad_norm": 2.7442047595977783, "learning_rate": 1.877053046145105e-08, "loss": 0.2935, "num_input_tokens_seen": 115755264, "step": 121230 }, { "epoch": 9.889468961579247, "grad_norm": 21.67302131652832, "learning_rate": 1.8632883705857474e-08, "loss": 0.3261, "num_input_tokens_seen": 115760016, "step": 121235 }, { "epoch": 9.889876825189656, "grad_norm": 1.4420243501663208, "learning_rate": 1.849574331196091e-08, "loss": 0.3661, "num_input_tokens_seen": 115765040, "step": 121240 }, { "epoch": 9.890284688800065, "grad_norm": 24.36284065246582, "learning_rate": 1.83591092825397e-08, "loss": 0.352, "num_input_tokens_seen": 115769552, "step": 121245 }, { "epoch": 9.890692552410474, "grad_norm": 34.54988098144531, "learning_rate": 1.8222981620363845e-08, "loss": 0.2953, "num_input_tokens_seen": 115774576, "step": 121250 }, { "epoch": 9.891100416020883, "grad_norm": 1.6921122074127197, "learning_rate": 1.808736032819225e-08, "loss": 0.3431, "num_input_tokens_seen": 115779520, "step": 121255 }, { "epoch": 9.89150827963129, "grad_norm": 23.152753829956055, "learning_rate": 1.7952245408775493e-08, "loss": 0.3727, "num_input_tokens_seen": 115783744, "step": 121260 }, { "epoch": 9.8919161432417, "grad_norm": 5.562336444854736, "learning_rate": 1.7817636864850273e-08, "loss": 0.2599, "num_input_tokens_seen": 115788688, "step": 121265 }, { "epoch": 9.892324006852109, "grad_norm": 18.016565322875977, "learning_rate": 1.7683534699147743e-08, "loss": 0.36, "num_input_tokens_seen": 115794080, "step": 121270 }, { "epoch": 9.892731870462518, "grad_norm": 6.716217994689941, "learning_rate": 1.754993891438239e-08, "loss": 0.3641, "num_input_tokens_seen": 115798992, "step": 121275 }, { "epoch": 9.893139734072927, "grad_norm": 25.432275772094727, "learning_rate": 1.7416849513265942e-08, "loss": 0.4105, "num_input_tokens_seen": 115803696, "step": 121280 }, { "epoch": 9.893547597683334, "grad_norm": 15.003660202026367, "learning_rate": 1.7284266498490685e-08, "loss": 0.2783, "num_input_tokens_seen": 115808816, "step": 121285 }, { "epoch": 9.893955461293743, "grad_norm": 1.3134589195251465, "learning_rate": 1.715218987275169e-08, "loss": 0.2769, "num_input_tokens_seen": 115813568, "step": 121290 }, { "epoch": 9.894363324904152, "grad_norm": 5.352745056152344, "learning_rate": 1.7020619638719036e-08, "loss": 0.4431, "num_input_tokens_seen": 115818288, "step": 121295 }, { "epoch": 9.89477118851456, "grad_norm": 32.341556549072266, "learning_rate": 1.6889555799062816e-08, "loss": 0.4153, "num_input_tokens_seen": 115823424, "step": 121300 }, { "epoch": 9.89517905212497, "grad_norm": 0.9356725811958313, "learning_rate": 1.6758998356442013e-08, "loss": 0.2362, "num_input_tokens_seen": 115826944, "step": 121305 }, { "epoch": 9.895586915735379, "grad_norm": 36.32198715209961, "learning_rate": 1.6628947313498953e-08, "loss": 0.3514, "num_input_tokens_seen": 115830640, "step": 121310 }, { "epoch": 9.895994779345786, "grad_norm": 1.6587449312210083, "learning_rate": 1.6499402672870423e-08, "loss": 0.3886, "num_input_tokens_seen": 115835408, "step": 121315 }, { "epoch": 9.896402642956195, "grad_norm": 1.8027961254119873, "learning_rate": 1.6370364437182097e-08, "loss": 0.3351, "num_input_tokens_seen": 115840624, "step": 121320 }, { "epoch": 9.896810506566604, "grad_norm": 4.685211181640625, "learning_rate": 1.624183260905132e-08, "loss": 0.3234, "num_input_tokens_seen": 115845520, "step": 121325 }, { "epoch": 9.897218370177013, "grad_norm": 1.650281310081482, "learning_rate": 1.6113807191081576e-08, "loss": 0.2628, "num_input_tokens_seen": 115850640, "step": 121330 }, { "epoch": 9.897626233787422, "grad_norm": 0.9081511497497559, "learning_rate": 1.598628818587078e-08, "loss": 0.3117, "num_input_tokens_seen": 115855424, "step": 121335 }, { "epoch": 9.89803409739783, "grad_norm": 26.415573120117188, "learning_rate": 1.58592755960002e-08, "loss": 0.2589, "num_input_tokens_seen": 115860288, "step": 121340 }, { "epoch": 9.898441961008238, "grad_norm": 24.49315643310547, "learning_rate": 1.5732769424045557e-08, "loss": 0.3373, "num_input_tokens_seen": 115864896, "step": 121345 }, { "epoch": 9.898849824618647, "grad_norm": 7.543863296508789, "learning_rate": 1.5606769672571465e-08, "loss": 0.4594, "num_input_tokens_seen": 115869904, "step": 121350 }, { "epoch": 9.899257688229056, "grad_norm": 2.8785133361816406, "learning_rate": 1.5481276344131434e-08, "loss": 0.3709, "num_input_tokens_seen": 115873616, "step": 121355 }, { "epoch": 9.899665551839465, "grad_norm": 13.851311683654785, "learning_rate": 1.535628944126788e-08, "loss": 0.2498, "num_input_tokens_seen": 115877408, "step": 121360 }, { "epoch": 9.900073415449874, "grad_norm": 10.37722110748291, "learning_rate": 1.5231808966517657e-08, "loss": 0.2655, "num_input_tokens_seen": 115881920, "step": 121365 }, { "epoch": 9.900481279060282, "grad_norm": 20.073854446411133, "learning_rate": 1.5107834922400975e-08, "loss": 0.3545, "num_input_tokens_seen": 115886336, "step": 121370 }, { "epoch": 9.90088914267069, "grad_norm": 2.8106613159179688, "learning_rate": 1.498436731143249e-08, "loss": 0.2493, "num_input_tokens_seen": 115890976, "step": 121375 }, { "epoch": 9.9012970062811, "grad_norm": 1.6154638528823853, "learning_rate": 1.4861406136115751e-08, "loss": 0.5233, "num_input_tokens_seen": 115895648, "step": 121380 }, { "epoch": 9.901704869891509, "grad_norm": 35.31154251098633, "learning_rate": 1.4738951398937662e-08, "loss": 0.3562, "num_input_tokens_seen": 115900272, "step": 121385 }, { "epoch": 9.902112733501918, "grad_norm": 13.832032203674316, "learning_rate": 1.4617003102387894e-08, "loss": 0.2716, "num_input_tokens_seen": 115904848, "step": 121390 }, { "epoch": 9.902520597112325, "grad_norm": 1.418075680732727, "learning_rate": 1.4495561248931145e-08, "loss": 0.2816, "num_input_tokens_seen": 115909360, "step": 121395 }, { "epoch": 9.902928460722734, "grad_norm": 4.588201999664307, "learning_rate": 1.4374625841034883e-08, "loss": 0.2355, "num_input_tokens_seen": 115913600, "step": 121400 }, { "epoch": 9.903336324333143, "grad_norm": 37.81460952758789, "learning_rate": 1.4254196881147152e-08, "loss": 0.5452, "num_input_tokens_seen": 115918272, "step": 121405 }, { "epoch": 9.903744187943552, "grad_norm": 2.48549485206604, "learning_rate": 1.4134274371710444e-08, "loss": 0.2933, "num_input_tokens_seen": 115923104, "step": 121410 }, { "epoch": 9.904152051553961, "grad_norm": 31.553146362304688, "learning_rate": 1.4014858315153368e-08, "loss": 0.3215, "num_input_tokens_seen": 115928784, "step": 121415 }, { "epoch": 9.90455991516437, "grad_norm": 2.1341936588287354, "learning_rate": 1.3895948713898988e-08, "loss": 0.2911, "num_input_tokens_seen": 115934800, "step": 121420 }, { "epoch": 9.904967778774777, "grad_norm": 17.299272537231445, "learning_rate": 1.3777545570356487e-08, "loss": 0.4077, "num_input_tokens_seen": 115939504, "step": 121425 }, { "epoch": 9.905375642385186, "grad_norm": 2.692445755004883, "learning_rate": 1.3659648886923949e-08, "loss": 0.4107, "num_input_tokens_seen": 115944256, "step": 121430 }, { "epoch": 9.905783505995595, "grad_norm": 3.1628804206848145, "learning_rate": 1.3542258665996676e-08, "loss": 0.3414, "num_input_tokens_seen": 115948528, "step": 121435 }, { "epoch": 9.906191369606004, "grad_norm": 1.7336268424987793, "learning_rate": 1.3425374909947775e-08, "loss": 0.2568, "num_input_tokens_seen": 115952992, "step": 121440 }, { "epoch": 9.906599233216413, "grad_norm": 13.31490421295166, "learning_rate": 1.3308997621150343e-08, "loss": 0.3679, "num_input_tokens_seen": 115957792, "step": 121445 }, { "epoch": 9.90700709682682, "grad_norm": 15.99133014678955, "learning_rate": 1.3193126801960832e-08, "loss": 0.4631, "num_input_tokens_seen": 115962400, "step": 121450 }, { "epoch": 9.90741496043723, "grad_norm": 9.710322380065918, "learning_rate": 1.3077762454730136e-08, "loss": 0.2952, "num_input_tokens_seen": 115967056, "step": 121455 }, { "epoch": 9.907822824047638, "grad_norm": 16.662084579467773, "learning_rate": 1.2962904581795276e-08, "loss": 0.3378, "num_input_tokens_seen": 115971744, "step": 121460 }, { "epoch": 9.908230687658047, "grad_norm": 25.808130264282227, "learning_rate": 1.2848553185482171e-08, "loss": 0.394, "num_input_tokens_seen": 115976688, "step": 121465 }, { "epoch": 9.908638551268456, "grad_norm": 25.464799880981445, "learning_rate": 1.273470826811396e-08, "loss": 0.4078, "num_input_tokens_seen": 115980544, "step": 121470 }, { "epoch": 9.909046414878864, "grad_norm": 3.22878098487854, "learning_rate": 1.2621369831997133e-08, "loss": 0.4657, "num_input_tokens_seen": 115984944, "step": 121475 }, { "epoch": 9.909454278489273, "grad_norm": 20.45116424560547, "learning_rate": 1.2508537879424299e-08, "loss": 0.3297, "num_input_tokens_seen": 115990048, "step": 121480 }, { "epoch": 9.909862142099682, "grad_norm": 9.833113670349121, "learning_rate": 1.2396212412685293e-08, "loss": 0.3366, "num_input_tokens_seen": 115994720, "step": 121485 }, { "epoch": 9.91027000571009, "grad_norm": 11.056131362915039, "learning_rate": 1.228439343405885e-08, "loss": 0.4752, "num_input_tokens_seen": 115999776, "step": 121490 }, { "epoch": 9.9106778693205, "grad_norm": 23.62257957458496, "learning_rate": 1.2173080945809823e-08, "loss": 0.3174, "num_input_tokens_seen": 116004704, "step": 121495 }, { "epoch": 9.911085732930907, "grad_norm": 2.0659115314483643, "learning_rate": 1.2062274950194741e-08, "loss": 0.3069, "num_input_tokens_seen": 116009344, "step": 121500 }, { "epoch": 9.911493596541316, "grad_norm": 32.61692428588867, "learning_rate": 1.195197544945903e-08, "loss": 0.3533, "num_input_tokens_seen": 116014176, "step": 121505 }, { "epoch": 9.911901460151725, "grad_norm": 19.061580657958984, "learning_rate": 1.1842182445837014e-08, "loss": 0.4063, "num_input_tokens_seen": 116019024, "step": 121510 }, { "epoch": 9.912309323762134, "grad_norm": 2.5734291076660156, "learning_rate": 1.1732895941557464e-08, "loss": 0.3767, "num_input_tokens_seen": 116024160, "step": 121515 }, { "epoch": 9.912717187372543, "grad_norm": 11.27155876159668, "learning_rate": 1.1624115938835279e-08, "loss": 0.2784, "num_input_tokens_seen": 116029312, "step": 121520 }, { "epoch": 9.913125050982952, "grad_norm": 8.868942260742188, "learning_rate": 1.1515842439871472e-08, "loss": 0.3154, "num_input_tokens_seen": 116033136, "step": 121525 }, { "epoch": 9.91353291459336, "grad_norm": 1.8607748746871948, "learning_rate": 1.1408075446864286e-08, "loss": 0.3268, "num_input_tokens_seen": 116037520, "step": 121530 }, { "epoch": 9.913940778203768, "grad_norm": 36.8183708190918, "learning_rate": 1.1300814961998086e-08, "loss": 0.5123, "num_input_tokens_seen": 116042128, "step": 121535 }, { "epoch": 9.914348641814177, "grad_norm": 29.685415267944336, "learning_rate": 1.1194060987443356e-08, "loss": 0.3658, "num_input_tokens_seen": 116047376, "step": 121540 }, { "epoch": 9.914756505424586, "grad_norm": 1.4471588134765625, "learning_rate": 1.1087813525370582e-08, "loss": 0.366, "num_input_tokens_seen": 116051456, "step": 121545 }, { "epoch": 9.915164369034995, "grad_norm": 10.164528846740723, "learning_rate": 1.0982072577925273e-08, "loss": 0.4724, "num_input_tokens_seen": 116055744, "step": 121550 }, { "epoch": 9.915572232645403, "grad_norm": 9.295066833496094, "learning_rate": 1.0876838147258483e-08, "loss": 0.2393, "num_input_tokens_seen": 116060736, "step": 121555 }, { "epoch": 9.915980096255812, "grad_norm": 3.49092698097229, "learning_rate": 1.0772110235496291e-08, "loss": 0.2139, "num_input_tokens_seen": 116065792, "step": 121560 }, { "epoch": 9.91638795986622, "grad_norm": 10.3955659866333, "learning_rate": 1.0667888844767548e-08, "loss": 0.3731, "num_input_tokens_seen": 116069792, "step": 121565 }, { "epoch": 9.91679582347663, "grad_norm": 40.00802993774414, "learning_rate": 1.0564173977181679e-08, "loss": 0.3238, "num_input_tokens_seen": 116075072, "step": 121570 }, { "epoch": 9.917203687087039, "grad_norm": 2.2528328895568848, "learning_rate": 1.0460965634839781e-08, "loss": 0.2932, "num_input_tokens_seen": 116079200, "step": 121575 }, { "epoch": 9.917611550697448, "grad_norm": 43.937744140625, "learning_rate": 1.03582638198374e-08, "loss": 0.2406, "num_input_tokens_seen": 116084176, "step": 121580 }, { "epoch": 9.918019414307855, "grad_norm": 18.586687088012695, "learning_rate": 1.0256068534253426e-08, "loss": 0.3482, "num_input_tokens_seen": 116089648, "step": 121585 }, { "epoch": 9.918427277918264, "grad_norm": 0.8034942746162415, "learning_rate": 1.015437978015843e-08, "loss": 0.2555, "num_input_tokens_seen": 116094592, "step": 121590 }, { "epoch": 9.918835141528673, "grad_norm": 2.1670901775360107, "learning_rate": 1.0053197559617422e-08, "loss": 0.188, "num_input_tokens_seen": 116099584, "step": 121595 }, { "epoch": 9.919243005139082, "grad_norm": 1.753307580947876, "learning_rate": 9.952521874678767e-09, "loss": 0.2914, "num_input_tokens_seen": 116104608, "step": 121600 }, { "epoch": 9.91965086874949, "grad_norm": 16.939502716064453, "learning_rate": 9.852352727382497e-09, "loss": 0.4335, "num_input_tokens_seen": 116108512, "step": 121605 }, { "epoch": 9.920058732359898, "grad_norm": 2.4391725063323975, "learning_rate": 9.752690119760321e-09, "loss": 0.3156, "num_input_tokens_seen": 116112384, "step": 121610 }, { "epoch": 9.920466595970307, "grad_norm": 2.663451910018921, "learning_rate": 9.653534053832847e-09, "loss": 0.3199, "num_input_tokens_seen": 116117328, "step": 121615 }, { "epoch": 9.920874459580716, "grad_norm": 3.5119009017944336, "learning_rate": 9.554884531609575e-09, "loss": 0.3633, "num_input_tokens_seen": 116122048, "step": 121620 }, { "epoch": 9.921282323191125, "grad_norm": 1.5523746013641357, "learning_rate": 9.45674155508891e-09, "loss": 0.1895, "num_input_tokens_seen": 116127376, "step": 121625 }, { "epoch": 9.921690186801534, "grad_norm": 10.144405364990234, "learning_rate": 9.359105126263701e-09, "loss": 0.3503, "num_input_tokens_seen": 116132432, "step": 121630 }, { "epoch": 9.922098050411943, "grad_norm": 38.701602935791016, "learning_rate": 9.261975247107368e-09, "loss": 0.2814, "num_input_tokens_seen": 116137168, "step": 121635 }, { "epoch": 9.92250591402235, "grad_norm": 5.259092330932617, "learning_rate": 9.16535191959056e-09, "loss": 0.3601, "num_input_tokens_seen": 116142336, "step": 121640 }, { "epoch": 9.92291377763276, "grad_norm": 13.29648208618164, "learning_rate": 9.069235145675592e-09, "loss": 0.3354, "num_input_tokens_seen": 116147328, "step": 121645 }, { "epoch": 9.923321641243168, "grad_norm": 4.18163537979126, "learning_rate": 8.973624927308133e-09, "loss": 0.3864, "num_input_tokens_seen": 116152096, "step": 121650 }, { "epoch": 9.923729504853577, "grad_norm": 2.1920247077941895, "learning_rate": 8.878521266422746e-09, "loss": 0.4116, "num_input_tokens_seen": 116156304, "step": 121655 }, { "epoch": 9.924137368463986, "grad_norm": 8.487857818603516, "learning_rate": 8.783924164953994e-09, "loss": 0.419, "num_input_tokens_seen": 116161840, "step": 121660 }, { "epoch": 9.924545232074394, "grad_norm": 5.61605978012085, "learning_rate": 8.689833624811461e-09, "loss": 0.3394, "num_input_tokens_seen": 116166640, "step": 121665 }, { "epoch": 9.924953095684803, "grad_norm": 1.5895510911941528, "learning_rate": 8.596249647910281e-09, "loss": 0.3464, "num_input_tokens_seen": 116171328, "step": 121670 }, { "epoch": 9.925360959295212, "grad_norm": 30.897640228271484, "learning_rate": 8.503172236143386e-09, "loss": 0.4143, "num_input_tokens_seen": 116176464, "step": 121675 }, { "epoch": 9.92576882290562, "grad_norm": 16.29368019104004, "learning_rate": 8.410601391395378e-09, "loss": 0.3063, "num_input_tokens_seen": 116180704, "step": 121680 }, { "epoch": 9.92617668651603, "grad_norm": 2.069270133972168, "learning_rate": 8.31853711554531e-09, "loss": 0.2499, "num_input_tokens_seen": 116185056, "step": 121685 }, { "epoch": 9.926584550126437, "grad_norm": 4.064880847930908, "learning_rate": 8.226979410461133e-09, "loss": 0.176, "num_input_tokens_seen": 116189408, "step": 121690 }, { "epoch": 9.926992413736846, "grad_norm": 15.873915672302246, "learning_rate": 8.135928277994143e-09, "loss": 0.5671, "num_input_tokens_seen": 116194304, "step": 121695 }, { "epoch": 9.927400277347255, "grad_norm": 1.20461905002594, "learning_rate": 8.045383719992861e-09, "loss": 0.3324, "num_input_tokens_seen": 116198464, "step": 121700 }, { "epoch": 9.927808140957664, "grad_norm": 3.337160587310791, "learning_rate": 7.955345738291931e-09, "loss": 0.3508, "num_input_tokens_seen": 116202576, "step": 121705 }, { "epoch": 9.928216004568073, "grad_norm": 13.421937942504883, "learning_rate": 7.86581433471767e-09, "loss": 0.3097, "num_input_tokens_seen": 116207072, "step": 121710 }, { "epoch": 9.92862386817848, "grad_norm": 11.913987159729004, "learning_rate": 7.776789511082516e-09, "loss": 0.3564, "num_input_tokens_seen": 116211840, "step": 121715 }, { "epoch": 9.92903173178889, "grad_norm": 2.438521146774292, "learning_rate": 7.688271269193358e-09, "loss": 0.4836, "num_input_tokens_seen": 116216624, "step": 121720 }, { "epoch": 9.929439595399298, "grad_norm": 9.98889446258545, "learning_rate": 7.600259610840432e-09, "loss": 0.3946, "num_input_tokens_seen": 116220528, "step": 121725 }, { "epoch": 9.929847459009707, "grad_norm": 4.405464172363281, "learning_rate": 7.512754537811195e-09, "loss": 0.4193, "num_input_tokens_seen": 116225056, "step": 121730 }, { "epoch": 9.930255322620116, "grad_norm": 1.0483230352401733, "learning_rate": 7.425756051879229e-09, "loss": 0.2467, "num_input_tokens_seen": 116230624, "step": 121735 }, { "epoch": 9.930663186230525, "grad_norm": 2.7462236881256104, "learning_rate": 7.339264154807013e-09, "loss": 0.4437, "num_input_tokens_seen": 116235616, "step": 121740 }, { "epoch": 9.931071049840932, "grad_norm": 16.949581146240234, "learning_rate": 7.253278848348699e-09, "loss": 0.2505, "num_input_tokens_seen": 116240240, "step": 121745 }, { "epoch": 9.931478913451341, "grad_norm": 41.986812591552734, "learning_rate": 7.167800134244562e-09, "loss": 0.442, "num_input_tokens_seen": 116245008, "step": 121750 }, { "epoch": 9.93188677706175, "grad_norm": 26.165664672851562, "learning_rate": 7.082828014229326e-09, "loss": 0.3956, "num_input_tokens_seen": 116250128, "step": 121755 }, { "epoch": 9.93229464067216, "grad_norm": 30.940690994262695, "learning_rate": 6.998362490026611e-09, "loss": 0.3501, "num_input_tokens_seen": 116255120, "step": 121760 }, { "epoch": 9.932702504282569, "grad_norm": 2.1982431411743164, "learning_rate": 6.914403563346162e-09, "loss": 0.3463, "num_input_tokens_seen": 116260032, "step": 121765 }, { "epoch": 9.933110367892976, "grad_norm": 44.25352478027344, "learning_rate": 6.830951235889393e-09, "loss": 0.3522, "num_input_tokens_seen": 116265552, "step": 121770 }, { "epoch": 9.933518231503385, "grad_norm": 1.8244308233261108, "learning_rate": 6.748005509349398e-09, "loss": 0.2875, "num_input_tokens_seen": 116269920, "step": 121775 }, { "epoch": 9.933926095113794, "grad_norm": 28.981311798095703, "learning_rate": 6.665566385408162e-09, "loss": 0.3203, "num_input_tokens_seen": 116275280, "step": 121780 }, { "epoch": 9.934333958724203, "grad_norm": 21.654468536376953, "learning_rate": 6.583633865733796e-09, "loss": 0.3535, "num_input_tokens_seen": 116279808, "step": 121785 }, { "epoch": 9.934741822334612, "grad_norm": 28.187740325927734, "learning_rate": 6.502207951991634e-09, "loss": 0.3699, "num_input_tokens_seen": 116283072, "step": 121790 }, { "epoch": 9.93514968594502, "grad_norm": 6.758971214294434, "learning_rate": 6.421288645827584e-09, "loss": 0.4515, "num_input_tokens_seen": 116287616, "step": 121795 }, { "epoch": 9.935557549555428, "grad_norm": 21.8183650970459, "learning_rate": 6.340875948881997e-09, "loss": 0.4096, "num_input_tokens_seen": 116292464, "step": 121800 }, { "epoch": 9.935965413165837, "grad_norm": 8.765693664550781, "learning_rate": 6.260969862789678e-09, "loss": 0.1977, "num_input_tokens_seen": 116297024, "step": 121805 }, { "epoch": 9.936373276776246, "grad_norm": 1.7694423198699951, "learning_rate": 6.181570389166003e-09, "loss": 0.2731, "num_input_tokens_seen": 116301360, "step": 121810 }, { "epoch": 9.936781140386655, "grad_norm": 8.04379940032959, "learning_rate": 6.1026775296207925e-09, "loss": 0.1803, "num_input_tokens_seen": 116305696, "step": 121815 }, { "epoch": 9.937189003997064, "grad_norm": 19.922746658325195, "learning_rate": 6.02429128575277e-09, "loss": 0.3106, "num_input_tokens_seen": 116310224, "step": 121820 }, { "epoch": 9.937596867607471, "grad_norm": 24.26681137084961, "learning_rate": 5.946411659152329e-09, "loss": 0.3211, "num_input_tokens_seen": 116314000, "step": 121825 }, { "epoch": 9.93800473121788, "grad_norm": 9.706419944763184, "learning_rate": 5.869038651398762e-09, "loss": 0.2896, "num_input_tokens_seen": 116318432, "step": 121830 }, { "epoch": 9.93841259482829, "grad_norm": 7.00267219543457, "learning_rate": 5.792172264057483e-09, "loss": 0.3126, "num_input_tokens_seen": 116322432, "step": 121835 }, { "epoch": 9.938820458438698, "grad_norm": 4.893185138702393, "learning_rate": 5.715812498688356e-09, "loss": 0.508, "num_input_tokens_seen": 116327456, "step": 121840 }, { "epoch": 9.939228322049107, "grad_norm": 5.092419624328613, "learning_rate": 5.639959356840141e-09, "loss": 0.2721, "num_input_tokens_seen": 116332128, "step": 121845 }, { "epoch": 9.939636185659516, "grad_norm": 3.2541987895965576, "learning_rate": 5.564612840047723e-09, "loss": 0.3256, "num_input_tokens_seen": 116336608, "step": 121850 }, { "epoch": 9.940044049269924, "grad_norm": 6.10833740234375, "learning_rate": 5.489772949843208e-09, "loss": 0.4042, "num_input_tokens_seen": 116341152, "step": 121855 }, { "epoch": 9.940451912880333, "grad_norm": 5.142118453979492, "learning_rate": 5.415439687736501e-09, "loss": 0.3012, "num_input_tokens_seen": 116345072, "step": 121860 }, { "epoch": 9.940859776490742, "grad_norm": 31.08778953552246, "learning_rate": 5.34161305524028e-09, "loss": 0.3835, "num_input_tokens_seen": 116349808, "step": 121865 }, { "epoch": 9.94126764010115, "grad_norm": 1.1429786682128906, "learning_rate": 5.268293053847795e-09, "loss": 0.3213, "num_input_tokens_seen": 116355376, "step": 121870 }, { "epoch": 9.94167550371156, "grad_norm": 2.024402141571045, "learning_rate": 5.195479685043969e-09, "loss": 0.281, "num_input_tokens_seen": 116359104, "step": 121875 }, { "epoch": 9.942083367321967, "grad_norm": 16.66046905517578, "learning_rate": 5.123172950310951e-09, "loss": 0.254, "num_input_tokens_seen": 116364144, "step": 121880 }, { "epoch": 9.942491230932376, "grad_norm": 17.99907684326172, "learning_rate": 5.051372851105906e-09, "loss": 0.3619, "num_input_tokens_seen": 116369600, "step": 121885 }, { "epoch": 9.942899094542785, "grad_norm": 4.8149919509887695, "learning_rate": 4.980079388891556e-09, "loss": 0.3495, "num_input_tokens_seen": 116374336, "step": 121890 }, { "epoch": 9.943306958153194, "grad_norm": 2.862009048461914, "learning_rate": 4.9092925651111895e-09, "loss": 0.3053, "num_input_tokens_seen": 116379472, "step": 121895 }, { "epoch": 9.943714821763603, "grad_norm": 18.175630569458008, "learning_rate": 4.839012381194219e-09, "loss": 0.3778, "num_input_tokens_seen": 116385008, "step": 121900 }, { "epoch": 9.94412268537401, "grad_norm": 16.981224060058594, "learning_rate": 4.769238838572831e-09, "loss": 0.4239, "num_input_tokens_seen": 116389680, "step": 121905 }, { "epoch": 9.94453054898442, "grad_norm": 3.1768829822540283, "learning_rate": 4.6999719386570105e-09, "loss": 0.2393, "num_input_tokens_seen": 116393936, "step": 121910 }, { "epoch": 9.944938412594828, "grad_norm": 4.095374584197998, "learning_rate": 4.631211682851188e-09, "loss": 0.2327, "num_input_tokens_seen": 116398224, "step": 121915 }, { "epoch": 9.945346276205237, "grad_norm": 24.846960067749023, "learning_rate": 4.562958072548695e-09, "loss": 0.3014, "num_input_tokens_seen": 116403312, "step": 121920 }, { "epoch": 9.945754139815646, "grad_norm": 18.589073181152344, "learning_rate": 4.495211109134534e-09, "loss": 0.4551, "num_input_tokens_seen": 116407968, "step": 121925 }, { "epoch": 9.946162003426053, "grad_norm": 42.7470588684082, "learning_rate": 4.427970793982606e-09, "loss": 0.3553, "num_input_tokens_seen": 116412240, "step": 121930 }, { "epoch": 9.946569867036462, "grad_norm": 16.53272247314453, "learning_rate": 4.361237128452933e-09, "loss": 0.2816, "num_input_tokens_seen": 116417040, "step": 121935 }, { "epoch": 9.946977730646871, "grad_norm": 2.9720704555511475, "learning_rate": 4.295010113899989e-09, "loss": 0.3749, "num_input_tokens_seen": 116422064, "step": 121940 }, { "epoch": 9.94738559425728, "grad_norm": 1.87306809425354, "learning_rate": 4.229289751664367e-09, "loss": 0.3426, "num_input_tokens_seen": 116427344, "step": 121945 }, { "epoch": 9.94779345786769, "grad_norm": 7.080347537994385, "learning_rate": 4.164076043081111e-09, "loss": 0.4091, "num_input_tokens_seen": 116431920, "step": 121950 }, { "epoch": 9.948201321478098, "grad_norm": 13.712135314941406, "learning_rate": 4.0993689894713864e-09, "loss": 0.3598, "num_input_tokens_seen": 116437104, "step": 121955 }, { "epoch": 9.948609185088506, "grad_norm": 1.6094359159469604, "learning_rate": 4.035168592145255e-09, "loss": 0.2737, "num_input_tokens_seen": 116441856, "step": 121960 }, { "epoch": 9.949017048698915, "grad_norm": 34.171573638916016, "learning_rate": 3.971474852404456e-09, "loss": 0.2796, "num_input_tokens_seen": 116447168, "step": 121965 }, { "epoch": 9.949424912309324, "grad_norm": 61.091373443603516, "learning_rate": 3.908287771542396e-09, "loss": 0.461, "num_input_tokens_seen": 116451632, "step": 121970 }, { "epoch": 9.949832775919733, "grad_norm": 3.7375030517578125, "learning_rate": 3.845607350835834e-09, "loss": 0.4458, "num_input_tokens_seen": 116456160, "step": 121975 }, { "epoch": 9.950240639530142, "grad_norm": 10.451553344726562, "learning_rate": 3.783433591558749e-09, "loss": 0.125, "num_input_tokens_seen": 116461648, "step": 121980 }, { "epoch": 9.950648503140549, "grad_norm": 4.007776737213135, "learning_rate": 3.7217664949684706e-09, "loss": 0.594, "num_input_tokens_seen": 116467008, "step": 121985 }, { "epoch": 9.951056366750958, "grad_norm": 1.1578927040100098, "learning_rate": 3.6606060623167737e-09, "loss": 0.2545, "num_input_tokens_seen": 116471808, "step": 121990 }, { "epoch": 9.951464230361367, "grad_norm": 4.011131286621094, "learning_rate": 3.599952294844333e-09, "loss": 0.3522, "num_input_tokens_seen": 116475552, "step": 121995 }, { "epoch": 9.951872093971776, "grad_norm": 38.52475357055664, "learning_rate": 3.5398051937779455e-09, "loss": 0.3287, "num_input_tokens_seen": 116480368, "step": 122000 }, { "epoch": 9.952279957582185, "grad_norm": 5.135792255401611, "learning_rate": 3.4801647603388556e-09, "loss": 0.309, "num_input_tokens_seen": 116485184, "step": 122005 }, { "epoch": 9.952687821192594, "grad_norm": 11.64942741394043, "learning_rate": 3.421030995734431e-09, "loss": 0.4742, "num_input_tokens_seen": 116490400, "step": 122010 }, { "epoch": 9.953095684803001, "grad_norm": 28.74186134338379, "learning_rate": 3.362403901163713e-09, "loss": 0.4058, "num_input_tokens_seen": 116495136, "step": 122015 }, { "epoch": 9.95350354841341, "grad_norm": 2.152757167816162, "learning_rate": 3.3042834778146404e-09, "loss": 0.2985, "num_input_tokens_seen": 116499680, "step": 122020 }, { "epoch": 9.95391141202382, "grad_norm": 1.7977622747421265, "learning_rate": 3.2466697268668245e-09, "loss": 0.4855, "num_input_tokens_seen": 116504848, "step": 122025 }, { "epoch": 9.954319275634228, "grad_norm": 0.7367557883262634, "learning_rate": 3.1895626494887754e-09, "loss": 0.2982, "num_input_tokens_seen": 116509120, "step": 122030 }, { "epoch": 9.954727139244637, "grad_norm": 28.752914428710938, "learning_rate": 3.132962246835125e-09, "loss": 0.3326, "num_input_tokens_seen": 116514096, "step": 122035 }, { "epoch": 9.955135002855044, "grad_norm": 2.5609142780303955, "learning_rate": 3.076868520054954e-09, "loss": 0.4383, "num_input_tokens_seen": 116517520, "step": 122040 }, { "epoch": 9.955542866465453, "grad_norm": 13.642081260681152, "learning_rate": 3.0212814702834656e-09, "loss": 0.2482, "num_input_tokens_seen": 116522256, "step": 122045 }, { "epoch": 9.955950730075863, "grad_norm": 11.163881301879883, "learning_rate": 2.966201098650312e-09, "loss": 0.5427, "num_input_tokens_seen": 116527328, "step": 122050 }, { "epoch": 9.956358593686272, "grad_norm": 26.33934211730957, "learning_rate": 2.9116274062712667e-09, "loss": 0.3608, "num_input_tokens_seen": 116532608, "step": 122055 }, { "epoch": 9.95676645729668, "grad_norm": 9.386056900024414, "learning_rate": 2.8575603942482264e-09, "loss": 0.4274, "num_input_tokens_seen": 116536960, "step": 122060 }, { "epoch": 9.95717432090709, "grad_norm": 34.14866638183594, "learning_rate": 2.8040000636830875e-09, "loss": 0.4456, "num_input_tokens_seen": 116542768, "step": 122065 }, { "epoch": 9.957582184517497, "grad_norm": 5.439007759094238, "learning_rate": 2.7509464156583176e-09, "loss": 0.2622, "num_input_tokens_seen": 116546592, "step": 122070 }, { "epoch": 9.957990048127906, "grad_norm": 2.7091317176818848, "learning_rate": 2.6983994512508327e-09, "loss": 0.3576, "num_input_tokens_seen": 116552080, "step": 122075 }, { "epoch": 9.958397911738315, "grad_norm": 24.99420738220215, "learning_rate": 2.6463591715236714e-09, "loss": 0.2448, "num_input_tokens_seen": 116557184, "step": 122080 }, { "epoch": 9.958805775348724, "grad_norm": 3.9939827919006348, "learning_rate": 2.594825577534321e-09, "loss": 0.2473, "num_input_tokens_seen": 116561808, "step": 122085 }, { "epoch": 9.959213638959133, "grad_norm": 17.94546127319336, "learning_rate": 2.543798670323616e-09, "loss": 0.4699, "num_input_tokens_seen": 116566592, "step": 122090 }, { "epoch": 9.95962150256954, "grad_norm": 3.8106231689453125, "learning_rate": 2.4932784509296148e-09, "loss": 0.4219, "num_input_tokens_seen": 116571856, "step": 122095 }, { "epoch": 9.960029366179949, "grad_norm": 1.20745050907135, "learning_rate": 2.4432649203737224e-09, "loss": 0.2527, "num_input_tokens_seen": 116576752, "step": 122100 }, { "epoch": 9.960437229790358, "grad_norm": 8.371623039245605, "learning_rate": 2.393758079671793e-09, "loss": 0.5363, "num_input_tokens_seen": 116581152, "step": 122105 }, { "epoch": 9.960845093400767, "grad_norm": 2.8423562049865723, "learning_rate": 2.344757929823027e-09, "loss": 0.2893, "num_input_tokens_seen": 116585616, "step": 122110 }, { "epoch": 9.961252957011176, "grad_norm": 2.593545436859131, "learning_rate": 2.2962644718266257e-09, "loss": 0.31, "num_input_tokens_seen": 116590816, "step": 122115 }, { "epoch": 9.961660820621583, "grad_norm": 23.033817291259766, "learning_rate": 2.248277706665136e-09, "loss": 0.4289, "num_input_tokens_seen": 116596176, "step": 122120 }, { "epoch": 9.962068684231992, "grad_norm": 24.372493743896484, "learning_rate": 2.2007976353044525e-09, "loss": 0.3751, "num_input_tokens_seen": 116600592, "step": 122125 }, { "epoch": 9.962476547842401, "grad_norm": 6.104903697967529, "learning_rate": 2.153824258713244e-09, "loss": 0.316, "num_input_tokens_seen": 116604784, "step": 122130 }, { "epoch": 9.96288441145281, "grad_norm": 29.259328842163086, "learning_rate": 2.1073575778435273e-09, "loss": 0.2533, "num_input_tokens_seen": 116608640, "step": 122135 }, { "epoch": 9.96329227506322, "grad_norm": 7.365413665771484, "learning_rate": 2.0613975936334406e-09, "loss": 0.3346, "num_input_tokens_seen": 116612640, "step": 122140 }, { "epoch": 9.963700138673628, "grad_norm": 16.275680541992188, "learning_rate": 2.0159443070155716e-09, "loss": 0.2909, "num_input_tokens_seen": 116618096, "step": 122145 }, { "epoch": 9.964108002284036, "grad_norm": 15.54636287689209, "learning_rate": 1.9709977189141804e-09, "loss": 0.2791, "num_input_tokens_seen": 116623808, "step": 122150 }, { "epoch": 9.964515865894445, "grad_norm": 1.7986483573913574, "learning_rate": 1.926557830236875e-09, "loss": 0.255, "num_input_tokens_seen": 116627312, "step": 122155 }, { "epoch": 9.964923729504854, "grad_norm": 1.0174779891967773, "learning_rate": 1.8826246418884863e-09, "loss": 0.3109, "num_input_tokens_seen": 116631616, "step": 122160 }, { "epoch": 9.965331593115263, "grad_norm": 7.238996982574463, "learning_rate": 1.839198154754418e-09, "loss": 0.1603, "num_input_tokens_seen": 116636032, "step": 122165 }, { "epoch": 9.965739456725672, "grad_norm": 19.93195152282715, "learning_rate": 1.7962783697172969e-09, "loss": 0.441, "num_input_tokens_seen": 116640608, "step": 122170 }, { "epoch": 9.966147320336079, "grad_norm": 10.84640884399414, "learning_rate": 1.7538652876486483e-09, "loss": 0.3809, "num_input_tokens_seen": 116645408, "step": 122175 }, { "epoch": 9.966555183946488, "grad_norm": 1.7286686897277832, "learning_rate": 1.7119589094061195e-09, "loss": 0.4357, "num_input_tokens_seen": 116650288, "step": 122180 }, { "epoch": 9.966963047556897, "grad_norm": 2.521144390106201, "learning_rate": 1.6705592358390309e-09, "loss": 0.4238, "num_input_tokens_seen": 116655136, "step": 122185 }, { "epoch": 9.967370911167306, "grad_norm": 1.4696437120437622, "learning_rate": 1.6296662677883767e-09, "loss": 0.4096, "num_input_tokens_seen": 116659632, "step": 122190 }, { "epoch": 9.967778774777715, "grad_norm": 19.952327728271484, "learning_rate": 1.5892800060840486e-09, "loss": 0.318, "num_input_tokens_seen": 116663904, "step": 122195 }, { "epoch": 9.968186638388122, "grad_norm": 32.85464859008789, "learning_rate": 1.5494004515392847e-09, "loss": 0.3135, "num_input_tokens_seen": 116669136, "step": 122200 }, { "epoch": 9.968594501998531, "grad_norm": 2.1276004314422607, "learning_rate": 1.5100276049673235e-09, "loss": 0.1856, "num_input_tokens_seen": 116673536, "step": 122205 }, { "epoch": 9.96900236560894, "grad_norm": 19.592710494995117, "learning_rate": 1.4711614671619744e-09, "loss": 0.3252, "num_input_tokens_seen": 116678832, "step": 122210 }, { "epoch": 9.96941022921935, "grad_norm": 5.965043544769287, "learning_rate": 1.4328020389170471e-09, "loss": 0.2957, "num_input_tokens_seen": 116683920, "step": 122215 }, { "epoch": 9.969818092829758, "grad_norm": 14.627278327941895, "learning_rate": 1.3949493210041465e-09, "loss": 0.2829, "num_input_tokens_seen": 116688048, "step": 122220 }, { "epoch": 9.970225956440167, "grad_norm": 1.0740330219268799, "learning_rate": 1.3576033141948773e-09, "loss": 0.3308, "num_input_tokens_seen": 116693152, "step": 122225 }, { "epoch": 9.970633820050574, "grad_norm": 19.857547760009766, "learning_rate": 1.3207640192441916e-09, "loss": 0.3508, "num_input_tokens_seen": 116697376, "step": 122230 }, { "epoch": 9.971041683660983, "grad_norm": 1.0596444606781006, "learning_rate": 1.2844314368987143e-09, "loss": 0.2532, "num_input_tokens_seen": 116702768, "step": 122235 }, { "epoch": 9.971449547271392, "grad_norm": 38.394737243652344, "learning_rate": 1.248605567893968e-09, "loss": 0.4642, "num_input_tokens_seen": 116707200, "step": 122240 }, { "epoch": 9.971857410881801, "grad_norm": 11.116769790649414, "learning_rate": 1.2132864129571486e-09, "loss": 0.3559, "num_input_tokens_seen": 116712112, "step": 122245 }, { "epoch": 9.97226527449221, "grad_norm": 2.2097461223602295, "learning_rate": 1.178473972807126e-09, "loss": 0.2583, "num_input_tokens_seen": 116716816, "step": 122250 }, { "epoch": 9.972673138102618, "grad_norm": 12.132261276245117, "learning_rate": 1.1441682481461158e-09, "loss": 0.2775, "num_input_tokens_seen": 116721952, "step": 122255 }, { "epoch": 9.973081001713027, "grad_norm": 1.764165997505188, "learning_rate": 1.1103692396680076e-09, "loss": 0.2717, "num_input_tokens_seen": 116727216, "step": 122260 }, { "epoch": 9.973488865323436, "grad_norm": 29.311452865600586, "learning_rate": 1.0770769480611397e-09, "loss": 0.3983, "num_input_tokens_seen": 116731808, "step": 122265 }, { "epoch": 9.973896728933845, "grad_norm": 1.6043567657470703, "learning_rate": 1.0442913739999727e-09, "loss": 0.3162, "num_input_tokens_seen": 116736752, "step": 122270 }, { "epoch": 9.974304592544254, "grad_norm": 61.316802978515625, "learning_rate": 1.0120125181478646e-09, "loss": 0.2241, "num_input_tokens_seen": 116741872, "step": 122275 }, { "epoch": 9.974712456154663, "grad_norm": 49.398502349853516, "learning_rate": 9.80240381159847e-10, "loss": 0.2557, "num_input_tokens_seen": 116747072, "step": 122280 }, { "epoch": 9.97512031976507, "grad_norm": 22.79129409790039, "learning_rate": 9.489749636798495e-10, "loss": 0.2892, "num_input_tokens_seen": 116751456, "step": 122285 }, { "epoch": 9.975528183375479, "grad_norm": 24.715896606445312, "learning_rate": 9.182162663406991e-10, "loss": 0.3549, "num_input_tokens_seen": 116756096, "step": 122290 }, { "epoch": 9.975936046985888, "grad_norm": 2.0574421882629395, "learning_rate": 8.87964289766896e-10, "loss": 0.3198, "num_input_tokens_seen": 116760992, "step": 122295 }, { "epoch": 9.976343910596297, "grad_norm": 11.33498477935791, "learning_rate": 8.582190345718388e-10, "loss": 0.3417, "num_input_tokens_seen": 116766016, "step": 122300 }, { "epoch": 9.976751774206706, "grad_norm": 0.9580799341201782, "learning_rate": 8.289805013550478e-10, "loss": 0.3964, "num_input_tokens_seen": 116771072, "step": 122305 }, { "epoch": 9.977159637817113, "grad_norm": 4.945048809051514, "learning_rate": 8.002486907160434e-10, "loss": 0.4015, "num_input_tokens_seen": 116776256, "step": 122310 }, { "epoch": 9.977567501427522, "grad_norm": 1.8527319431304932, "learning_rate": 7.720236032293659e-10, "loss": 0.1893, "num_input_tokens_seen": 116781264, "step": 122315 }, { "epoch": 9.977975365037931, "grad_norm": 10.853480339050293, "learning_rate": 7.443052394751071e-10, "loss": 0.2366, "num_input_tokens_seen": 116786432, "step": 122320 }, { "epoch": 9.97838322864834, "grad_norm": 3.4726505279541016, "learning_rate": 7.170936000083784e-10, "loss": 0.4156, "num_input_tokens_seen": 116790816, "step": 122325 }, { "epoch": 9.97879109225875, "grad_norm": 1.9733024835586548, "learning_rate": 6.903886853842911e-10, "loss": 0.1959, "num_input_tokens_seen": 116795872, "step": 122330 }, { "epoch": 9.979198955869158, "grad_norm": 3.2502083778381348, "learning_rate": 6.641904961413037e-10, "loss": 0.2881, "num_input_tokens_seen": 116801424, "step": 122335 }, { "epoch": 9.979606819479566, "grad_norm": 3.7349870204925537, "learning_rate": 6.38499032812323e-10, "loss": 0.4021, "num_input_tokens_seen": 116807216, "step": 122340 }, { "epoch": 9.980014683089975, "grad_norm": 31.12194061279297, "learning_rate": 6.133142959219295e-10, "loss": 0.3282, "num_input_tokens_seen": 116811920, "step": 122345 }, { "epoch": 9.980422546700384, "grad_norm": 14.012203216552734, "learning_rate": 5.886362859724992e-10, "loss": 0.2785, "num_input_tokens_seen": 116815344, "step": 122350 }, { "epoch": 9.980830410310793, "grad_norm": 1.2648305892944336, "learning_rate": 5.64465003471959e-10, "loss": 0.3779, "num_input_tokens_seen": 116820384, "step": 122355 }, { "epoch": 9.981238273921202, "grad_norm": 2.5268025398254395, "learning_rate": 5.408004489032559e-10, "loss": 0.3096, "num_input_tokens_seen": 116824848, "step": 122360 }, { "epoch": 9.981646137531609, "grad_norm": 28.445327758789062, "learning_rate": 5.176426227521125e-10, "loss": 0.4368, "num_input_tokens_seen": 116830064, "step": 122365 }, { "epoch": 9.982054001142018, "grad_norm": 9.667953491210938, "learning_rate": 4.949915254848225e-10, "loss": 0.2022, "num_input_tokens_seen": 116834880, "step": 122370 }, { "epoch": 9.982461864752427, "grad_norm": 4.067269325256348, "learning_rate": 4.728471575621285e-10, "loss": 0.4803, "num_input_tokens_seen": 116839760, "step": 122375 }, { "epoch": 9.982869728362836, "grad_norm": 25.702051162719727, "learning_rate": 4.512095194281196e-10, "loss": 0.259, "num_input_tokens_seen": 116844064, "step": 122380 }, { "epoch": 9.983277591973245, "grad_norm": 10.903882026672363, "learning_rate": 4.300786115296607e-10, "loss": 0.3205, "num_input_tokens_seen": 116848048, "step": 122385 }, { "epoch": 9.983685455583652, "grad_norm": 10.071301460266113, "learning_rate": 4.094544342886364e-10, "loss": 0.4327, "num_input_tokens_seen": 116852464, "step": 122390 }, { "epoch": 9.984093319194061, "grad_norm": 52.45609664916992, "learning_rate": 3.89336988124156e-10, "loss": 0.3522, "num_input_tokens_seen": 116856896, "step": 122395 }, { "epoch": 9.98450118280447, "grad_norm": 7.224765300750732, "learning_rate": 3.6972627344700193e-10, "loss": 0.2922, "num_input_tokens_seen": 116861376, "step": 122400 }, { "epoch": 9.984909046414879, "grad_norm": 54.260536193847656, "learning_rate": 3.506222906513035e-10, "loss": 0.3665, "num_input_tokens_seen": 116865248, "step": 122405 }, { "epoch": 9.985316910025288, "grad_norm": 2.3297958374023438, "learning_rate": 3.320250401256386e-10, "loss": 0.3055, "num_input_tokens_seen": 116870000, "step": 122410 }, { "epoch": 9.985724773635695, "grad_norm": 1.49947190284729, "learning_rate": 3.1393452224470767e-10, "loss": 0.2562, "num_input_tokens_seen": 116875152, "step": 122415 }, { "epoch": 9.986132637246104, "grad_norm": 20.587085723876953, "learning_rate": 2.9635073738043527e-10, "loss": 0.5028, "num_input_tokens_seen": 116880160, "step": 122420 }, { "epoch": 9.986540500856513, "grad_norm": 2.8642072677612305, "learning_rate": 2.7927368588531733e-10, "loss": 0.3487, "num_input_tokens_seen": 116884928, "step": 122425 }, { "epoch": 9.986948364466922, "grad_norm": 7.119813919067383, "learning_rate": 2.6270336810629846e-10, "loss": 0.349, "num_input_tokens_seen": 116890032, "step": 122430 }, { "epoch": 9.987356228077331, "grad_norm": 5.082896709442139, "learning_rate": 2.466397843792212e-10, "loss": 0.2623, "num_input_tokens_seen": 116894752, "step": 122435 }, { "epoch": 9.98776409168774, "grad_norm": 40.29871368408203, "learning_rate": 2.3108293502882573e-10, "loss": 0.2845, "num_input_tokens_seen": 116900176, "step": 122440 }, { "epoch": 9.988171955298148, "grad_norm": 18.23906135559082, "learning_rate": 2.1603282037152562e-10, "loss": 0.3752, "num_input_tokens_seen": 116905776, "step": 122445 }, { "epoch": 9.988579818908557, "grad_norm": 18.590450286865234, "learning_rate": 2.014894407126322e-10, "loss": 0.251, "num_input_tokens_seen": 116910816, "step": 122450 }, { "epoch": 9.988987682518966, "grad_norm": 17.21749496459961, "learning_rate": 1.874527963463546e-10, "loss": 0.464, "num_input_tokens_seen": 116915360, "step": 122455 }, { "epoch": 9.989395546129375, "grad_norm": 0.7979169487953186, "learning_rate": 1.7392288755579966e-10, "loss": 0.3347, "num_input_tokens_seen": 116920336, "step": 122460 }, { "epoch": 9.989803409739784, "grad_norm": 21.82465934753418, "learning_rate": 1.6089971461852315e-10, "loss": 0.3515, "num_input_tokens_seen": 116925744, "step": 122465 }, { "epoch": 9.99021127335019, "grad_norm": 21.56449317932129, "learning_rate": 1.4838327779820304e-10, "loss": 0.4527, "num_input_tokens_seen": 116930256, "step": 122470 }, { "epoch": 9.9906191369606, "grad_norm": 24.042287826538086, "learning_rate": 1.363735773446395e-10, "loss": 0.2785, "num_input_tokens_seen": 116934896, "step": 122475 }, { "epoch": 9.991027000571009, "grad_norm": 2.4867019653320312, "learning_rate": 1.248706135048572e-10, "loss": 0.1709, "num_input_tokens_seen": 116940272, "step": 122480 }, { "epoch": 9.991434864181418, "grad_norm": 29.470069885253906, "learning_rate": 1.1387438650922733e-10, "loss": 0.3784, "num_input_tokens_seen": 116945312, "step": 122485 }, { "epoch": 9.991842727791827, "grad_norm": 25.578462600708008, "learning_rate": 1.0338489658534567e-10, "loss": 0.3207, "num_input_tokens_seen": 116949616, "step": 122490 }, { "epoch": 9.992250591402236, "grad_norm": 11.76368522644043, "learning_rate": 9.340214394137903e-11, "loss": 0.3982, "num_input_tokens_seen": 116954304, "step": 122495 }, { "epoch": 9.992658455012643, "grad_norm": 0.8017356395721436, "learning_rate": 8.392612877994311e-11, "loss": 0.2876, "num_input_tokens_seen": 116959344, "step": 122500 }, { "epoch": 9.993066318623052, "grad_norm": 18.91987419128418, "learning_rate": 7.495685129532693e-11, "loss": 0.3403, "num_input_tokens_seen": 116963840, "step": 122505 }, { "epoch": 9.993474182233461, "grad_norm": 5.883454322814941, "learning_rate": 6.649431166794174e-11, "loss": 0.4418, "num_input_tokens_seen": 116968928, "step": 122510 }, { "epoch": 9.99388204584387, "grad_norm": 4.413821697235107, "learning_rate": 5.853851006709654e-11, "loss": 0.4139, "num_input_tokens_seen": 116974080, "step": 122515 }, { "epoch": 9.99428990945428, "grad_norm": 1.0249688625335693, "learning_rate": 5.1089446659324804e-11, "loss": 0.2991, "num_input_tokens_seen": 116979392, "step": 122520 }, { "epoch": 9.994697773064686, "grad_norm": 20.561946868896484, "learning_rate": 4.4147121594506626e-11, "loss": 0.4664, "num_input_tokens_seen": 116983920, "step": 122525 }, { "epoch": 9.995105636675095, "grad_norm": 31.0506591796875, "learning_rate": 3.771153500864433e-11, "loss": 0.5124, "num_input_tokens_seen": 116988256, "step": 122530 }, { "epoch": 9.995513500285504, "grad_norm": 7.561352729797363, "learning_rate": 3.178268703496468e-11, "loss": 0.1653, "num_input_tokens_seen": 116992912, "step": 122535 }, { "epoch": 9.995921363895913, "grad_norm": 52.91217803955078, "learning_rate": 2.63605777955922e-11, "loss": 0.3418, "num_input_tokens_seen": 116998144, "step": 122540 }, { "epoch": 9.996329227506322, "grad_norm": 18.7901554107666, "learning_rate": 2.1445207398773647e-11, "loss": 0.4247, "num_input_tokens_seen": 117002544, "step": 122545 }, { "epoch": 9.996737091116731, "grad_norm": 1.3452024459838867, "learning_rate": 1.703657594442909e-11, "loss": 0.2755, "num_input_tokens_seen": 117006976, "step": 122550 }, { "epoch": 9.997144954727139, "grad_norm": 12.041435241699219, "learning_rate": 1.3134683521376368e-11, "loss": 0.353, "num_input_tokens_seen": 117011312, "step": 122555 }, { "epoch": 9.997552818337548, "grad_norm": 2.3559350967407227, "learning_rate": 9.739530207331094e-12, "loss": 0.3689, "num_input_tokens_seen": 117015904, "step": 122560 }, { "epoch": 9.997960681947957, "grad_norm": 6.472840785980225, "learning_rate": 6.851116074457764e-12, "loss": 0.3796, "num_input_tokens_seen": 117020640, "step": 122565 }, { "epoch": 9.998368545558366, "grad_norm": 14.26824951171875, "learning_rate": 4.4694411782675305e-12, "loss": 0.394, "num_input_tokens_seen": 117025184, "step": 122570 }, { "epoch": 9.998776409168775, "grad_norm": 2.3773796558380127, "learning_rate": 2.5945055687204288e-12, "loss": 0.2156, "num_input_tokens_seen": 117030400, "step": 122575 }, { "epoch": 9.999184272779182, "grad_norm": 17.160520553588867, "learning_rate": 1.2263092846742652e-12, "loss": 0.5679, "num_input_tokens_seen": 117034992, "step": 122580 }, { "epoch": 9.999592136389591, "grad_norm": 5.792725563049316, "learning_rate": 3.6485235110905735e-13, "loss": 0.4983, "num_input_tokens_seen": 117040480, "step": 122585 }, { "epoch": 10.0, "grad_norm": 6.807618141174316, "learning_rate": 1.0134787453708327e-14, "loss": 0.4528, "num_input_tokens_seen": 117044976, "step": 122590 }, { "epoch": 10.0, "num_input_tokens_seen": 117044976, "step": 122590, "total_flos": 5.270482368696287e+18, "train_loss": 0.34805614852386135, "train_runtime": 121813.0138, "train_samples_per_second": 2.013, "train_steps_per_second": 1.006 } ], "logging_steps": 5, "max_steps": 122590, "num_input_tokens_seen": 117044976, "num_train_epochs": 10, "save_steps": 6130, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.270482368696287e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }