{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 383360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026085141903171953, "grad_norm": 3.252858877182007, "learning_rate": 4.978305856983862e-05, "loss": 1.9218, "num_input_tokens_seen": 283536, "step": 500, "train_runtime": 18.7004, "train_tokens_per_second": 15162.068 }, { "epoch": 0.052170283806343906, "grad_norm": 3.500765085220337, "learning_rate": 4.956568238731219e-05, "loss": 1.7605, "num_input_tokens_seen": 574552, "step": 1000, "train_runtime": 38.0018, "train_tokens_per_second": 15119.085 }, { "epoch": 0.07825542570951587, "grad_norm": 3.4590671062469482, "learning_rate": 4.934830620478575e-05, "loss": 1.6898, "num_input_tokens_seen": 859976, "step": 1500, "train_runtime": 57.2593, "train_tokens_per_second": 15018.978 }, { "epoch": 0.10434056761268781, "grad_norm": 3.67798113822937, "learning_rate": 4.9130930022259324e-05, "loss": 1.6968, "num_input_tokens_seen": 1151232, "step": 2000, "train_runtime": 76.5735, "train_tokens_per_second": 15034.338 }, { "epoch": 0.13042570951585977, "grad_norm": 3.009059190750122, "learning_rate": 4.891355383973289e-05, "loss": 1.6838, "num_input_tokens_seen": 1439432, "step": 2500, "train_runtime": 95.8962, "train_tokens_per_second": 15010.309 }, { "epoch": 0.15651085141903173, "grad_norm": 3.1467044353485107, "learning_rate": 4.869617765720646e-05, "loss": 1.6861, "num_input_tokens_seen": 1727728, "step": 3000, "train_runtime": 114.9793, "train_tokens_per_second": 15026.424 }, { "epoch": 0.18259599332220366, "grad_norm": 2.8238844871520996, "learning_rate": 4.8478801474680025e-05, "loss": 1.6343, "num_input_tokens_seen": 2016488, "step": 3500, "train_runtime": 134.048, "train_tokens_per_second": 15043.024 }, { "epoch": 0.20868113522537562, "grad_norm": 2.7848801612854004, "learning_rate": 4.826142529215359e-05, "loss": 1.6482, "num_input_tokens_seen": 2310136, "step": 4000, "train_runtime": 153.775, "train_tokens_per_second": 15022.828 }, { "epoch": 0.23476627712854758, "grad_norm": 3.402919054031372, "learning_rate": 4.804404910962716e-05, "loss": 1.6326, "num_input_tokens_seen": 2601800, "step": 4500, "train_runtime": 173.1573, "train_tokens_per_second": 15025.64 }, { "epoch": 0.26085141903171954, "grad_norm": 4.777134418487549, "learning_rate": 4.7826672927100726e-05, "loss": 1.6236, "num_input_tokens_seen": 2889448, "step": 5000, "train_runtime": 192.4563, "train_tokens_per_second": 15013.531 }, { "epoch": 0.2869365609348915, "grad_norm": 2.45479416847229, "learning_rate": 4.760929674457429e-05, "loss": 1.5949, "num_input_tokens_seen": 3180128, "step": 5500, "train_runtime": 211.2052, "train_tokens_per_second": 15057.053 }, { "epoch": 0.31302170283806346, "grad_norm": 2.6998794078826904, "learning_rate": 4.7391920562047856e-05, "loss": 1.6117, "num_input_tokens_seen": 3470912, "step": 6000, "train_runtime": 230.9915, "train_tokens_per_second": 15026.144 }, { "epoch": 0.33910684474123537, "grad_norm": 2.838428258895874, "learning_rate": 4.717454437952143e-05, "loss": 1.6056, "num_input_tokens_seen": 3764848, "step": 6500, "train_runtime": 251.0138, "train_tokens_per_second": 14998.572 }, { "epoch": 0.36519198664440733, "grad_norm": 2.8896422386169434, "learning_rate": 4.695716819699499e-05, "loss": 1.6002, "num_input_tokens_seen": 4049200, "step": 7000, "train_runtime": 270.653, "train_tokens_per_second": 14960.855 }, { "epoch": 0.3912771285475793, "grad_norm": 2.878220558166504, "learning_rate": 4.673979201446856e-05, "loss": 1.5839, "num_input_tokens_seen": 4340488, "step": 7500, "train_runtime": 290.1843, "train_tokens_per_second": 14957.693 }, { "epoch": 0.41736227045075125, "grad_norm": 2.7241406440734863, "learning_rate": 4.652241583194213e-05, "loss": 1.5844, "num_input_tokens_seen": 4631904, "step": 8000, "train_runtime": 309.2754, "train_tokens_per_second": 14976.633 }, { "epoch": 0.4434474123539232, "grad_norm": 2.727529287338257, "learning_rate": 4.630503964941569e-05, "loss": 1.5936, "num_input_tokens_seen": 4919576, "step": 8500, "train_runtime": 328.4961, "train_tokens_per_second": 14976.057 }, { "epoch": 0.46953255425709517, "grad_norm": 3.117870330810547, "learning_rate": 4.6087663466889265e-05, "loss": 1.5695, "num_input_tokens_seen": 5211016, "step": 9000, "train_runtime": 348.3435, "train_tokens_per_second": 14959.417 }, { "epoch": 0.49561769616026713, "grad_norm": 2.490983724594116, "learning_rate": 4.587028728436283e-05, "loss": 1.5802, "num_input_tokens_seen": 5507568, "step": 9500, "train_runtime": 368.0383, "train_tokens_per_second": 14964.661 }, { "epoch": 0.5217028380634391, "grad_norm": 2.392632246017456, "learning_rate": 4.56529111018364e-05, "loss": 1.5806, "num_input_tokens_seen": 5798840, "step": 10000, "train_runtime": 387.6945, "train_tokens_per_second": 14957.241 }, { "epoch": 0.547787979966611, "grad_norm": 2.6862573623657227, "learning_rate": 4.5435534919309966e-05, "loss": 1.5801, "num_input_tokens_seen": 6085768, "step": 10500, "train_runtime": 407.4294, "train_tokens_per_second": 14936.988 }, { "epoch": 0.573873121869783, "grad_norm": 3.164522647857666, "learning_rate": 4.521815873678353e-05, "loss": 1.5636, "num_input_tokens_seen": 6371672, "step": 11000, "train_runtime": 426.5237, "train_tokens_per_second": 14938.61 }, { "epoch": 0.5999582637729549, "grad_norm": 2.5483455657958984, "learning_rate": 4.5000782554257095e-05, "loss": 1.5541, "num_input_tokens_seen": 6659744, "step": 11500, "train_runtime": 445.61, "train_tokens_per_second": 14945.23 }, { "epoch": 0.6260434056761269, "grad_norm": 2.6326119899749756, "learning_rate": 4.478340637173066e-05, "loss": 1.5801, "num_input_tokens_seen": 6947616, "step": 12000, "train_runtime": 465.2155, "train_tokens_per_second": 14934.188 }, { "epoch": 0.6521285475792988, "grad_norm": 2.5993449687957764, "learning_rate": 4.456603018920423e-05, "loss": 1.5497, "num_input_tokens_seen": 7236800, "step": 12500, "train_runtime": 484.6648, "train_tokens_per_second": 14931.556 }, { "epoch": 0.6782136894824707, "grad_norm": 2.419832468032837, "learning_rate": 4.4348654006677796e-05, "loss": 1.5692, "num_input_tokens_seen": 7525160, "step": 13000, "train_runtime": 504.4097, "train_tokens_per_second": 14918.745 }, { "epoch": 0.7042988313856428, "grad_norm": 2.346853017807007, "learning_rate": 4.413127782415137e-05, "loss": 1.568, "num_input_tokens_seen": 7815704, "step": 13500, "train_runtime": 523.0681, "train_tokens_per_second": 14942.039 }, { "epoch": 0.7303839732888147, "grad_norm": 2.47847580909729, "learning_rate": 4.391390164162493e-05, "loss": 1.5597, "num_input_tokens_seen": 8107760, "step": 14000, "train_runtime": 542.052, "train_tokens_per_second": 14957.533 }, { "epoch": 0.7564691151919867, "grad_norm": 2.5489418506622314, "learning_rate": 4.36965254590985e-05, "loss": 1.5588, "num_input_tokens_seen": 8400096, "step": 14500, "train_runtime": 562.4429, "train_tokens_per_second": 14935.019 }, { "epoch": 0.7825542570951586, "grad_norm": 3.1929831504821777, "learning_rate": 4.347914927657207e-05, "loss": 1.5409, "num_input_tokens_seen": 8679112, "step": 15000, "train_runtime": 581.6704, "train_tokens_per_second": 14921.014 }, { "epoch": 0.8086393989983306, "grad_norm": 2.6714396476745605, "learning_rate": 4.3261773094045634e-05, "loss": 1.5494, "num_input_tokens_seen": 8969456, "step": 15500, "train_runtime": 600.81, "train_tokens_per_second": 14928.94 }, { "epoch": 0.8347245409015025, "grad_norm": 2.379903554916382, "learning_rate": 4.3044396911519205e-05, "loss": 1.5589, "num_input_tokens_seen": 9261064, "step": 16000, "train_runtime": 619.9911, "train_tokens_per_second": 14937.414 }, { "epoch": 0.8608096828046744, "grad_norm": 2.5801916122436523, "learning_rate": 4.282702072899277e-05, "loss": 1.5594, "num_input_tokens_seen": 9550752, "step": 16500, "train_runtime": 639.7359, "train_tokens_per_second": 14929.21 }, { "epoch": 0.8868948247078464, "grad_norm": 2.8763697147369385, "learning_rate": 4.2609644546466335e-05, "loss": 1.5768, "num_input_tokens_seen": 9839304, "step": 17000, "train_runtime": 659.5206, "train_tokens_per_second": 14918.873 }, { "epoch": 0.9129799666110183, "grad_norm": 3.0146758556365967, "learning_rate": 4.23922683639399e-05, "loss": 1.5499, "num_input_tokens_seen": 10132792, "step": 17500, "train_runtime": 679.4357, "train_tokens_per_second": 14913.541 }, { "epoch": 0.9390651085141903, "grad_norm": 2.629668951034546, "learning_rate": 4.2174892181413464e-05, "loss": 1.5469, "num_input_tokens_seen": 10417368, "step": 18000, "train_runtime": 699.3592, "train_tokens_per_second": 14895.589 }, { "epoch": 0.9651502504173622, "grad_norm": 2.527364492416382, "learning_rate": 4.1957515998887036e-05, "loss": 1.5396, "num_input_tokens_seen": 10711800, "step": 18500, "train_runtime": 719.3221, "train_tokens_per_second": 14891.522 }, { "epoch": 0.9912353923205343, "grad_norm": 2.3071608543395996, "learning_rate": 4.17401398163606e-05, "loss": 1.5194, "num_input_tokens_seen": 10994712, "step": 19000, "train_runtime": 738.6768, "train_tokens_per_second": 14884.334 }, { "epoch": 1.0, "eval_loss": 1.3842333555221558, "eval_runtime": 47.3762, "eval_samples_per_second": 809.161, "eval_steps_per_second": 101.148, "num_input_tokens_seen": 11091734, "step": 19168 }, { "epoch": 1.0173205342237062, "grad_norm": 2.8192083835601807, "learning_rate": 4.152276363383417e-05, "loss": 1.4963, "num_input_tokens_seen": 11281086, "step": 19500, "train_runtime": 806.5637, "train_tokens_per_second": 13986.603 }, { "epoch": 1.0434056761268782, "grad_norm": 3.121436595916748, "learning_rate": 4.130538745130774e-05, "loss": 1.5117, "num_input_tokens_seen": 11574638, "step": 20000, "train_runtime": 825.9512, "train_tokens_per_second": 14013.707 }, { "epoch": 1.06949081803005, "grad_norm": 2.0136849880218506, "learning_rate": 4.108801126878131e-05, "loss": 1.5143, "num_input_tokens_seen": 11864494, "step": 20500, "train_runtime": 845.2133, "train_tokens_per_second": 14037.278 }, { "epoch": 1.095575959933222, "grad_norm": 2.6219029426574707, "learning_rate": 4.087063508625487e-05, "loss": 1.5055, "num_input_tokens_seen": 12158550, "step": 21000, "train_runtime": 864.7079, "train_tokens_per_second": 14060.876 }, { "epoch": 1.121661101836394, "grad_norm": 3.265441656112671, "learning_rate": 4.065325890372844e-05, "loss": 1.4973, "num_input_tokens_seen": 12445726, "step": 21500, "train_runtime": 885.5348, "train_tokens_per_second": 14054.474 }, { "epoch": 1.147746243739566, "grad_norm": 2.6268465518951416, "learning_rate": 4.043588272120201e-05, "loss": 1.5264, "num_input_tokens_seen": 12733878, "step": 22000, "train_runtime": 905.2864, "train_tokens_per_second": 14066.131 }, { "epoch": 1.1738313856427378, "grad_norm": 4.112071990966797, "learning_rate": 4.0218506538675574e-05, "loss": 1.4786, "num_input_tokens_seen": 13017478, "step": 22500, "train_runtime": 924.8642, "train_tokens_per_second": 14075.016 }, { "epoch": 1.1999165275459098, "grad_norm": 3.13775897026062, "learning_rate": 4.000113035614914e-05, "loss": 1.4809, "num_input_tokens_seen": 13308726, "step": 23000, "train_runtime": 944.4145, "train_tokens_per_second": 14092.038 }, { "epoch": 1.2260016694490818, "grad_norm": 2.7305409908294678, "learning_rate": 3.9783754173622704e-05, "loss": 1.5037, "num_input_tokens_seen": 13600462, "step": 23500, "train_runtime": 964.343, "train_tokens_per_second": 14103.346 }, { "epoch": 1.2520868113522536, "grad_norm": 3.8625481128692627, "learning_rate": 3.9566377991096275e-05, "loss": 1.4744, "num_input_tokens_seen": 13886382, "step": 24000, "train_runtime": 983.6134, "train_tokens_per_second": 14117.723 }, { "epoch": 1.2781719532554257, "grad_norm": 3.4027693271636963, "learning_rate": 3.934900180856984e-05, "loss": 1.4796, "num_input_tokens_seen": 14171390, "step": 24500, "train_runtime": 1003.0211, "train_tokens_per_second": 14128.706 }, { "epoch": 1.3042570951585977, "grad_norm": 2.1200718879699707, "learning_rate": 3.9131625626043405e-05, "loss": 1.5107, "num_input_tokens_seen": 14461470, "step": 25000, "train_runtime": 1022.6959, "train_tokens_per_second": 14140.538 }, { "epoch": 1.3303422370617697, "grad_norm": 2.7789530754089355, "learning_rate": 3.8914249443516976e-05, "loss": 1.4596, "num_input_tokens_seen": 14747598, "step": 25500, "train_runtime": 1042.1868, "train_tokens_per_second": 14150.628 }, { "epoch": 1.3564273789649417, "grad_norm": 2.1225244998931885, "learning_rate": 3.869687326099054e-05, "loss": 1.4669, "num_input_tokens_seen": 15036278, "step": 26000, "train_runtime": 1061.7955, "train_tokens_per_second": 14161.181 }, { "epoch": 1.3825125208681135, "grad_norm": 2.9342072010040283, "learning_rate": 3.847949707846411e-05, "loss": 1.4947, "num_input_tokens_seen": 15322110, "step": 26500, "train_runtime": 1081.7408, "train_tokens_per_second": 14164.308 }, { "epoch": 1.4085976627712855, "grad_norm": 2.25174880027771, "learning_rate": 3.826212089593768e-05, "loss": 1.472, "num_input_tokens_seen": 15619830, "step": 27000, "train_runtime": 1101.4139, "train_tokens_per_second": 14181.616 }, { "epoch": 1.4346828046744573, "grad_norm": 2.1327219009399414, "learning_rate": 3.804474471341124e-05, "loss": 1.4745, "num_input_tokens_seen": 15910494, "step": 27500, "train_runtime": 1120.8296, "train_tokens_per_second": 14195.283 }, { "epoch": 1.4607679465776293, "grad_norm": 2.2169244289398193, "learning_rate": 3.782736853088481e-05, "loss": 1.4961, "num_input_tokens_seen": 16202854, "step": 28000, "train_runtime": 1140.1942, "train_tokens_per_second": 14210.609 }, { "epoch": 1.4868530884808013, "grad_norm": 2.7171308994293213, "learning_rate": 3.760999234835837e-05, "loss": 1.4707, "num_input_tokens_seen": 16491582, "step": 28500, "train_runtime": 1160.3313, "train_tokens_per_second": 14212.822 }, { "epoch": 1.5129382303839733, "grad_norm": 2.9756038188934326, "learning_rate": 3.739261616583194e-05, "loss": 1.4584, "num_input_tokens_seen": 16778886, "step": 29000, "train_runtime": 1180.225, "train_tokens_per_second": 14216.684 }, { "epoch": 1.5390233722871454, "grad_norm": 2.1410768032073975, "learning_rate": 3.717523998330551e-05, "loss": 1.4856, "num_input_tokens_seen": 17072582, "step": 29500, "train_runtime": 1199.0906, "train_tokens_per_second": 14237.942 }, { "epoch": 1.5651085141903172, "grad_norm": 2.650392532348633, "learning_rate": 3.695786380077908e-05, "loss": 1.4821, "num_input_tokens_seen": 17362110, "step": 30000, "train_runtime": 1218.8129, "train_tokens_per_second": 14245.098 }, { "epoch": 1.5911936560934892, "grad_norm": 2.675250291824341, "learning_rate": 3.6740487618252644e-05, "loss": 1.4694, "num_input_tokens_seen": 17647902, "step": 30500, "train_runtime": 1238.9908, "train_tokens_per_second": 14243.772 }, { "epoch": 1.617278797996661, "grad_norm": 2.670755386352539, "learning_rate": 3.652311143572621e-05, "loss": 1.5342, "num_input_tokens_seen": 17943398, "step": 31000, "train_runtime": 1259.7818, "train_tokens_per_second": 14243.259 }, { "epoch": 1.643363939899833, "grad_norm": 2.637608051300049, "learning_rate": 3.630573525319978e-05, "loss": 1.4575, "num_input_tokens_seen": 18231966, "step": 31500, "train_runtime": 1279.0356, "train_tokens_per_second": 14254.464 }, { "epoch": 1.669449081803005, "grad_norm": 2.5078988075256348, "learning_rate": 3.6088359070673345e-05, "loss": 1.4518, "num_input_tokens_seen": 18525670, "step": 32000, "train_runtime": 1297.7662, "train_tokens_per_second": 14275.044 }, { "epoch": 1.695534223706177, "grad_norm": 2.266803503036499, "learning_rate": 3.587098288814692e-05, "loss": 1.5014, "num_input_tokens_seen": 18815526, "step": 32500, "train_runtime": 1316.4234, "train_tokens_per_second": 14292.914 }, { "epoch": 1.721619365609349, "grad_norm": 3.0197086334228516, "learning_rate": 3.565360670562048e-05, "loss": 1.4843, "num_input_tokens_seen": 19112486, "step": 33000, "train_runtime": 1335.2332, "train_tokens_per_second": 14313.968 }, { "epoch": 1.7477045075125208, "grad_norm": 2.791066884994507, "learning_rate": 3.5436230523094046e-05, "loss": 1.4878, "num_input_tokens_seen": 19396846, "step": 33500, "train_runtime": 1353.9271, "train_tokens_per_second": 14326.359 }, { "epoch": 1.7737896494156928, "grad_norm": 2.995617628097534, "learning_rate": 3.521885434056761e-05, "loss": 1.4606, "num_input_tokens_seen": 19683174, "step": 34000, "train_runtime": 1372.6447, "train_tokens_per_second": 14339.599 }, { "epoch": 1.7998747913188646, "grad_norm": 2.561185836791992, "learning_rate": 3.5001478158041176e-05, "loss": 1.4802, "num_input_tokens_seen": 19973646, "step": 34500, "train_runtime": 1391.2808, "train_tokens_per_second": 14356.301 }, { "epoch": 1.8259599332220366, "grad_norm": 3.1782171726226807, "learning_rate": 3.478410197551475e-05, "loss": 1.4588, "num_input_tokens_seen": 20264526, "step": 35000, "train_runtime": 1409.9676, "train_tokens_per_second": 14372.334 }, { "epoch": 1.8520450751252087, "grad_norm": 5.561634063720703, "learning_rate": 3.456672579298831e-05, "loss": 1.4609, "num_input_tokens_seen": 20553006, "step": 35500, "train_runtime": 1428.6129, "train_tokens_per_second": 14386.686 }, { "epoch": 1.8781302170283807, "grad_norm": 2.784186363220215, "learning_rate": 3.4349349610461884e-05, "loss": 1.4682, "num_input_tokens_seen": 20844014, "step": 36000, "train_runtime": 1447.2777, "train_tokens_per_second": 14402.221 }, { "epoch": 1.9042153589315527, "grad_norm": 2.59779691696167, "learning_rate": 3.413197342793545e-05, "loss": 1.5035, "num_input_tokens_seen": 21130910, "step": 36500, "train_runtime": 1465.9615, "train_tokens_per_second": 14414.369 }, { "epoch": 1.9303005008347245, "grad_norm": 2.6355996131896973, "learning_rate": 3.391459724540902e-05, "loss": 1.4815, "num_input_tokens_seen": 21419886, "step": 37000, "train_runtime": 1484.6953, "train_tokens_per_second": 14427.126 }, { "epoch": 1.9563856427378965, "grad_norm": 2.1540422439575195, "learning_rate": 3.3697221062882585e-05, "loss": 1.4686, "num_input_tokens_seen": 21706222, "step": 37500, "train_runtime": 1503.3619, "train_tokens_per_second": 14438.454 }, { "epoch": 1.9824707846410683, "grad_norm": 2.1270930767059326, "learning_rate": 3.347984488035615e-05, "loss": 1.4853, "num_input_tokens_seen": 21997414, "step": 38000, "train_runtime": 1522.056, "train_tokens_per_second": 14452.434 }, { "epoch": 2.0, "eval_loss": 1.347296118736267, "eval_runtime": 45.0902, "eval_samples_per_second": 850.185, "eval_steps_per_second": 106.276, "num_input_tokens_seen": 22196446, "step": 38336 }, { "epoch": 2.0085559265442403, "grad_norm": 2.812293767929077, "learning_rate": 3.326246869782972e-05, "loss": 1.4672, "num_input_tokens_seen": 22289118, "step": 38500, "train_runtime": 1586.651, "train_tokens_per_second": 14047.902 }, { "epoch": 2.0346410684474123, "grad_norm": 3.67232346534729, "learning_rate": 3.3045092515303286e-05, "loss": 1.4381, "num_input_tokens_seen": 22577710, "step": 39000, "train_runtime": 1605.3175, "train_tokens_per_second": 14064.327 }, { "epoch": 2.0607262103505843, "grad_norm": 2.2775866985321045, "learning_rate": 3.282771633277685e-05, "loss": 1.4397, "num_input_tokens_seen": 22866142, "step": 39500, "train_runtime": 1623.9658, "train_tokens_per_second": 14080.434 }, { "epoch": 2.0868113522537564, "grad_norm": 3.0156877040863037, "learning_rate": 3.2610340150250415e-05, "loss": 1.4657, "num_input_tokens_seen": 23163734, "step": 40000, "train_runtime": 1642.6646, "train_tokens_per_second": 14101.317 }, { "epoch": 2.1128964941569284, "grad_norm": 3.8104028701782227, "learning_rate": 3.239296396772399e-05, "loss": 1.4687, "num_input_tokens_seen": 23451982, "step": 40500, "train_runtime": 1661.3261, "train_tokens_per_second": 14116.423 }, { "epoch": 2.1389816360601, "grad_norm": 1.780987024307251, "learning_rate": 3.217558778519755e-05, "loss": 1.4432, "num_input_tokens_seen": 23743406, "step": 41000, "train_runtime": 1679.966, "train_tokens_per_second": 14133.266 }, { "epoch": 2.165066777963272, "grad_norm": 2.234935998916626, "learning_rate": 3.1958211602671117e-05, "loss": 1.447, "num_input_tokens_seen": 24037990, "step": 41500, "train_runtime": 1698.6679, "train_tokens_per_second": 14151.082 }, { "epoch": 2.191151919866444, "grad_norm": 2.599027395248413, "learning_rate": 3.174083542014469e-05, "loss": 1.4307, "num_input_tokens_seen": 24333206, "step": 42000, "train_runtime": 1717.3337, "train_tokens_per_second": 14169.177 }, { "epoch": 2.217237061769616, "grad_norm": 3.104538917541504, "learning_rate": 3.152345923761825e-05, "loss": 1.4165, "num_input_tokens_seen": 24623262, "step": 42500, "train_runtime": 1735.9704, "train_tokens_per_second": 14184.149 }, { "epoch": 2.243322203672788, "grad_norm": 2.5183098316192627, "learning_rate": 3.1306083055091824e-05, "loss": 1.4251, "num_input_tokens_seen": 24910790, "step": 43000, "train_runtime": 1754.6301, "train_tokens_per_second": 14197.175 }, { "epoch": 2.26940734557596, "grad_norm": 3.010117530822754, "learning_rate": 3.108870687256539e-05, "loss": 1.4719, "num_input_tokens_seen": 25200606, "step": 43500, "train_runtime": 1773.3028, "train_tokens_per_second": 14211.112 }, { "epoch": 2.295492487479132, "grad_norm": 3.781156063079834, "learning_rate": 3.087133069003896e-05, "loss": 1.44, "num_input_tokens_seen": 25494558, "step": 44000, "train_runtime": 1791.9661, "train_tokens_per_second": 14227.143 }, { "epoch": 2.321577629382304, "grad_norm": 2.3171684741973877, "learning_rate": 3.0653954507512525e-05, "loss": 1.4048, "num_input_tokens_seen": 25783878, "step": 44500, "train_runtime": 1810.6406, "train_tokens_per_second": 14240.196 }, { "epoch": 2.3476627712854756, "grad_norm": 2.785936117172241, "learning_rate": 3.0436578324986087e-05, "loss": 1.4333, "num_input_tokens_seen": 26074006, "step": 45000, "train_runtime": 1829.2827, "train_tokens_per_second": 14253.677 }, { "epoch": 2.3737479131886476, "grad_norm": 3.067204475402832, "learning_rate": 3.021920214245966e-05, "loss": 1.412, "num_input_tokens_seen": 26362862, "step": 45500, "train_runtime": 1847.9255, "train_tokens_per_second": 14266.193 }, { "epoch": 2.3998330550918197, "grad_norm": 3.440131902694702, "learning_rate": 3.0001825959933223e-05, "loss": 1.4343, "num_input_tokens_seen": 26659222, "step": 46000, "train_runtime": 1866.6572, "train_tokens_per_second": 14281.799 }, { "epoch": 2.4259181969949917, "grad_norm": 4.180527210235596, "learning_rate": 2.978444977740679e-05, "loss": 1.4231, "num_input_tokens_seen": 26945814, "step": 46500, "train_runtime": 1885.3282, "train_tokens_per_second": 14292.373 }, { "epoch": 2.4520033388981637, "grad_norm": 4.318091869354248, "learning_rate": 2.9567073594880356e-05, "loss": 1.4234, "num_input_tokens_seen": 27240518, "step": 47000, "train_runtime": 1904.0251, "train_tokens_per_second": 14306.806 }, { "epoch": 2.4780884808013357, "grad_norm": 2.4914376735687256, "learning_rate": 2.9349697412353928e-05, "loss": 1.4466, "num_input_tokens_seen": 27523134, "step": 47500, "train_runtime": 1922.7393, "train_tokens_per_second": 14314.543 }, { "epoch": 2.5041736227045073, "grad_norm": 2.4933414459228516, "learning_rate": 2.9132321229827492e-05, "loss": 1.4219, "num_input_tokens_seen": 27811630, "step": 48000, "train_runtime": 1941.4401, "train_tokens_per_second": 14325.258 }, { "epoch": 2.5302587646076793, "grad_norm": 3.3003621101379395, "learning_rate": 2.8914945047301057e-05, "loss": 1.4167, "num_input_tokens_seen": 28103582, "step": 48500, "train_runtime": 1960.1495, "train_tokens_per_second": 14337.469 }, { "epoch": 2.5563439065108513, "grad_norm": 2.9343557357788086, "learning_rate": 2.8697568864774625e-05, "loss": 1.4343, "num_input_tokens_seen": 28395062, "step": 49000, "train_runtime": 1978.7726, "train_tokens_per_second": 14349.836 }, { "epoch": 2.5824290484140233, "grad_norm": 2.247775077819824, "learning_rate": 2.848019268224819e-05, "loss": 1.44, "num_input_tokens_seen": 28682022, "step": 49500, "train_runtime": 1997.425, "train_tokens_per_second": 14359.499 }, { "epoch": 2.6085141903171953, "grad_norm": 3.329780101776123, "learning_rate": 2.826281649972176e-05, "loss": 1.4366, "num_input_tokens_seen": 28966702, "step": 50000, "train_runtime": 2016.0551, "train_tokens_per_second": 14368.011 }, { "epoch": 2.6345993322203674, "grad_norm": 2.639854907989502, "learning_rate": 2.8045440317195326e-05, "loss": 1.4175, "num_input_tokens_seen": 29256878, "step": 50500, "train_runtime": 2034.718, "train_tokens_per_second": 14378.837 }, { "epoch": 2.6606844741235394, "grad_norm": 4.10645055770874, "learning_rate": 2.7828064134668898e-05, "loss": 1.4229, "num_input_tokens_seen": 29545014, "step": 51000, "train_runtime": 2053.4349, "train_tokens_per_second": 14388.094 }, { "epoch": 2.6867696160267114, "grad_norm": 3.233084201812744, "learning_rate": 2.7610687952142463e-05, "loss": 1.4396, "num_input_tokens_seen": 29832302, "step": 51500, "train_runtime": 2072.1004, "train_tokens_per_second": 14397.132 }, { "epoch": 2.7128547579298834, "grad_norm": 3.0811736583709717, "learning_rate": 2.7393311769616027e-05, "loss": 1.4417, "num_input_tokens_seen": 30124678, "step": 52000, "train_runtime": 2090.765, "train_tokens_per_second": 14408.448 }, { "epoch": 2.738939899833055, "grad_norm": 3.9066579341888428, "learning_rate": 2.7175935587089595e-05, "loss": 1.42, "num_input_tokens_seen": 30411006, "step": 52500, "train_runtime": 2109.4596, "train_tokens_per_second": 14416.492 }, { "epoch": 2.765025041736227, "grad_norm": 3.752941131591797, "learning_rate": 2.695855940456316e-05, "loss": 1.4416, "num_input_tokens_seen": 30697118, "step": 53000, "train_runtime": 2128.1961, "train_tokens_per_second": 14424.008 }, { "epoch": 2.791110183639399, "grad_norm": 2.2906174659729004, "learning_rate": 2.6741183222036732e-05, "loss": 1.434, "num_input_tokens_seen": 30985038, "step": 53500, "train_runtime": 2146.9172, "train_tokens_per_second": 14432.339 }, { "epoch": 2.817195325542571, "grad_norm": 4.612029075622559, "learning_rate": 2.6523807039510297e-05, "loss": 1.4167, "num_input_tokens_seen": 31273350, "step": 54000, "train_runtime": 2165.6016, "train_tokens_per_second": 14440.952 }, { "epoch": 2.843280467445743, "grad_norm": 2.9580113887786865, "learning_rate": 2.6306430856983865e-05, "loss": 1.4059, "num_input_tokens_seen": 31560206, "step": 54500, "train_runtime": 2184.355, "train_tokens_per_second": 14448.295 }, { "epoch": 2.8693656093489146, "grad_norm": 3.1787197589874268, "learning_rate": 2.608905467445743e-05, "loss": 1.4472, "num_input_tokens_seen": 31852006, "step": 55000, "train_runtime": 2203.0469, "train_tokens_per_second": 14458.161 }, { "epoch": 2.8954507512520866, "grad_norm": 2.0112416744232178, "learning_rate": 2.5871678491930994e-05, "loss": 1.4311, "num_input_tokens_seen": 32138366, "step": 55500, "train_runtime": 2221.6719, "train_tokens_per_second": 14465.847 }, { "epoch": 2.9215358931552586, "grad_norm": 1.9806029796600342, "learning_rate": 2.5654302309404566e-05, "loss": 1.4348, "num_input_tokens_seen": 32427294, "step": 56000, "train_runtime": 2240.3821, "train_tokens_per_second": 14474.002 }, { "epoch": 2.9476210350584306, "grad_norm": 1.9818835258483887, "learning_rate": 2.543692612687813e-05, "loss": 1.4442, "num_input_tokens_seen": 32714750, "step": 56500, "train_runtime": 2259.0685, "train_tokens_per_second": 14481.522 }, { "epoch": 2.9737061769616027, "grad_norm": 2.794255256652832, "learning_rate": 2.52195499443517e-05, "loss": 1.4452, "num_input_tokens_seen": 33004950, "step": 57000, "train_runtime": 2277.7337, "train_tokens_per_second": 14490.258 }, { "epoch": 2.9997913188647747, "grad_norm": 3.825054407119751, "learning_rate": 2.5002173761825263e-05, "loss": 1.4031, "num_input_tokens_seen": 33292886, "step": 57500, "train_runtime": 2296.3777, "train_tokens_per_second": 14498.001 }, { "epoch": 3.0, "eval_loss": 1.3332206010818481, "eval_runtime": 45.0681, "eval_samples_per_second": 850.602, "eval_steps_per_second": 106.328, "num_input_tokens_seen": 33294704, "step": 57504 }, { "epoch": 3.0258764607679467, "grad_norm": 3.42480731010437, "learning_rate": 2.478479757929883e-05, "loss": 1.3848, "num_input_tokens_seen": 33584784, "step": 58000, "train_runtime": 2361.2516, "train_tokens_per_second": 14223.298 }, { "epoch": 3.0519616026711187, "grad_norm": 2.5299935340881348, "learning_rate": 2.45674213967724e-05, "loss": 1.3964, "num_input_tokens_seen": 33871192, "step": 58500, "train_runtime": 2379.8401, "train_tokens_per_second": 14232.55 }, { "epoch": 3.0780467445742903, "grad_norm": 2.3154349327087402, "learning_rate": 2.4350045214245968e-05, "loss": 1.4092, "num_input_tokens_seen": 34162736, "step": 59000, "train_runtime": 2398.5047, "train_tokens_per_second": 14243.348 }, { "epoch": 3.1041318864774623, "grad_norm": 3.183199167251587, "learning_rate": 2.4132669031719536e-05, "loss": 1.4007, "num_input_tokens_seen": 34452880, "step": 59500, "train_runtime": 2417.223, "train_tokens_per_second": 14253.083 }, { "epoch": 3.1302170283806343, "grad_norm": 2.856942892074585, "learning_rate": 2.39152928491931e-05, "loss": 1.407, "num_input_tokens_seen": 34740064, "step": 60000, "train_runtime": 2435.9312, "train_tokens_per_second": 14261.513 }, { "epoch": 3.1563021702838063, "grad_norm": 3.0104143619537354, "learning_rate": 2.3697916666666666e-05, "loss": 1.3869, "num_input_tokens_seen": 35033296, "step": 60500, "train_runtime": 2454.6106, "train_tokens_per_second": 14272.446 }, { "epoch": 3.1823873121869783, "grad_norm": 2.1120755672454834, "learning_rate": 2.3480540484140234e-05, "loss": 1.4128, "num_input_tokens_seen": 35326400, "step": 61000, "train_runtime": 2473.3018, "train_tokens_per_second": 14283.093 }, { "epoch": 3.2084724540901504, "grad_norm": 2.3867533206939697, "learning_rate": 2.3263164301613802e-05, "loss": 1.421, "num_input_tokens_seen": 35610096, "step": 61500, "train_runtime": 2491.98, "train_tokens_per_second": 14289.88 }, { "epoch": 3.2345575959933224, "grad_norm": 2.934441566467285, "learning_rate": 2.304578811908737e-05, "loss": 1.4507, "num_input_tokens_seen": 35899736, "step": 62000, "train_runtime": 2510.6844, "train_tokens_per_second": 14298.785 }, { "epoch": 3.260642737896494, "grad_norm": 1.9727118015289307, "learning_rate": 2.2828411936560938e-05, "loss": 1.4167, "num_input_tokens_seen": 36185200, "step": 62500, "train_runtime": 2529.3663, "train_tokens_per_second": 14306.034 }, { "epoch": 3.286727879799666, "grad_norm": 2.6939632892608643, "learning_rate": 2.2611035754034503e-05, "loss": 1.4152, "num_input_tokens_seen": 36476040, "step": 63000, "train_runtime": 2548.104, "train_tokens_per_second": 14314.973 }, { "epoch": 3.312813021702838, "grad_norm": 2.878223180770874, "learning_rate": 2.2393659571508068e-05, "loss": 1.4027, "num_input_tokens_seen": 36776288, "step": 63500, "train_runtime": 2566.9571, "train_tokens_per_second": 14326.803 }, { "epoch": 3.33889816360601, "grad_norm": 2.485452175140381, "learning_rate": 2.2176283388981636e-05, "loss": 1.3992, "num_input_tokens_seen": 37063960, "step": 64000, "train_runtime": 2585.6586, "train_tokens_per_second": 14334.437 }, { "epoch": 3.364983305509182, "grad_norm": 3.862046241760254, "learning_rate": 2.1958907206455204e-05, "loss": 1.3968, "num_input_tokens_seen": 37353184, "step": 64500, "train_runtime": 2604.3949, "train_tokens_per_second": 14342.366 }, { "epoch": 3.391068447412354, "grad_norm": 2.4618258476257324, "learning_rate": 2.1741531023928772e-05, "loss": 1.4059, "num_input_tokens_seen": 37648648, "step": 65000, "train_runtime": 2623.1097, "train_tokens_per_second": 14352.678 }, { "epoch": 3.417153589315526, "grad_norm": 2.7443792819976807, "learning_rate": 2.152415484140234e-05, "loss": 1.3809, "num_input_tokens_seen": 37936072, "step": 65500, "train_runtime": 2641.8438, "train_tokens_per_second": 14359.695 }, { "epoch": 3.443238731218698, "grad_norm": 2.808088541030884, "learning_rate": 2.1306778658875905e-05, "loss": 1.4118, "num_input_tokens_seen": 38225568, "step": 66000, "train_runtime": 2660.549, "train_tokens_per_second": 14367.549 }, { "epoch": 3.4693238731218696, "grad_norm": 2.7997331619262695, "learning_rate": 2.1089402476349473e-05, "loss": 1.404, "num_input_tokens_seen": 38512144, "step": 66500, "train_runtime": 2679.274, "train_tokens_per_second": 14374.097 }, { "epoch": 3.4954090150250416, "grad_norm": 2.4735493659973145, "learning_rate": 2.0872026293823038e-05, "loss": 1.4271, "num_input_tokens_seen": 38797344, "step": 67000, "train_runtime": 2697.9506, "train_tokens_per_second": 14380.302 }, { "epoch": 3.5214941569282137, "grad_norm": 4.414172172546387, "learning_rate": 2.0654650111296606e-05, "loss": 1.3969, "num_input_tokens_seen": 39085088, "step": 67500, "train_runtime": 2716.6451, "train_tokens_per_second": 14387.263 }, { "epoch": 3.5475792988313857, "grad_norm": 2.165419340133667, "learning_rate": 2.0437273928770174e-05, "loss": 1.4137, "num_input_tokens_seen": 39369904, "step": 68000, "train_runtime": 2735.364, "train_tokens_per_second": 14392.93 }, { "epoch": 3.5736644407345577, "grad_norm": 2.251249074935913, "learning_rate": 2.021989774624374e-05, "loss": 1.4066, "num_input_tokens_seen": 39661008, "step": 68500, "train_runtime": 2754.1198, "train_tokens_per_second": 14400.611 }, { "epoch": 3.5997495826377297, "grad_norm": 2.874959945678711, "learning_rate": 2.0002521563717307e-05, "loss": 1.3949, "num_input_tokens_seen": 39953968, "step": 69000, "train_runtime": 2772.8706, "train_tokens_per_second": 14408.883 }, { "epoch": 3.6258347245409013, "grad_norm": 2.662647008895874, "learning_rate": 1.9785145381190875e-05, "loss": 1.4054, "num_input_tokens_seen": 40240768, "step": 69500, "train_runtime": 2791.6372, "train_tokens_per_second": 14414.756 }, { "epoch": 3.6519198664440733, "grad_norm": 2.5272815227508545, "learning_rate": 1.9567769198664444e-05, "loss": 1.4323, "num_input_tokens_seen": 40533416, "step": 70000, "train_runtime": 2810.3654, "train_tokens_per_second": 14422.827 }, { "epoch": 3.6780050083472453, "grad_norm": 2.721334457397461, "learning_rate": 1.9350393016138008e-05, "loss": 1.3872, "num_input_tokens_seen": 40825024, "step": 70500, "train_runtime": 2829.08, "train_tokens_per_second": 14430.495 }, { "epoch": 3.7040901502504173, "grad_norm": 2.5722897052764893, "learning_rate": 1.9133016833611576e-05, "loss": 1.372, "num_input_tokens_seen": 41113376, "step": 71000, "train_runtime": 2847.8223, "train_tokens_per_second": 14436.777 }, { "epoch": 3.7301752921535893, "grad_norm": 2.262794256210327, "learning_rate": 1.891564065108514e-05, "loss": 1.3728, "num_input_tokens_seen": 41401936, "step": 71500, "train_runtime": 2866.4955, "train_tokens_per_second": 14443.398 }, { "epoch": 3.7562604340567614, "grad_norm": 2.6011643409729004, "learning_rate": 1.869826446855871e-05, "loss": 1.3901, "num_input_tokens_seen": 41689120, "step": 72000, "train_runtime": 2885.163, "train_tokens_per_second": 14449.485 }, { "epoch": 3.7823455759599334, "grad_norm": 2.6435554027557373, "learning_rate": 1.8480888286032277e-05, "loss": 1.4071, "num_input_tokens_seen": 41974720, "step": 72500, "train_runtime": 2903.8827, "train_tokens_per_second": 14454.689 }, { "epoch": 3.8084307178631054, "grad_norm": 2.489372730255127, "learning_rate": 1.8263512103505846e-05, "loss": 1.4023, "num_input_tokens_seen": 42264016, "step": 73000, "train_runtime": 2922.5501, "train_tokens_per_second": 14461.349 }, { "epoch": 3.8345158597662774, "grad_norm": 2.4132964611053467, "learning_rate": 1.8046135920979414e-05, "loss": 1.4153, "num_input_tokens_seen": 42558416, "step": 73500, "train_runtime": 2941.2299, "train_tokens_per_second": 14469.599 }, { "epoch": 3.860601001669449, "grad_norm": 3.1832597255706787, "learning_rate": 1.782875973845298e-05, "loss": 1.4076, "num_input_tokens_seen": 42847504, "step": 74000, "train_runtime": 2959.9571, "train_tokens_per_second": 14475.718 }, { "epoch": 3.886686143572621, "grad_norm": 2.246975898742676, "learning_rate": 1.7611383555926543e-05, "loss": 1.3755, "num_input_tokens_seen": 43137392, "step": 74500, "train_runtime": 2978.6745, "train_tokens_per_second": 14482.077 }, { "epoch": 3.912771285475793, "grad_norm": 3.47536039352417, "learning_rate": 1.739400737340011e-05, "loss": 1.3837, "num_input_tokens_seen": 43421200, "step": 75000, "train_runtime": 2997.3314, "train_tokens_per_second": 14486.62 }, { "epoch": 3.938856427378965, "grad_norm": 2.817647695541382, "learning_rate": 1.717663119087368e-05, "loss": 1.3869, "num_input_tokens_seen": 43714432, "step": 75500, "train_runtime": 3015.9535, "train_tokens_per_second": 14494.399 }, { "epoch": 3.964941569282137, "grad_norm": 2.670565366744995, "learning_rate": 1.6959255008347248e-05, "loss": 1.3875, "num_input_tokens_seen": 44005040, "step": 76000, "train_runtime": 3034.653, "train_tokens_per_second": 14500.847 }, { "epoch": 3.9910267111853086, "grad_norm": 3.01701021194458, "learning_rate": 1.6741878825820816e-05, "loss": 1.3875, "num_input_tokens_seen": 44295304, "step": 76500, "train_runtime": 3053.3496, "train_tokens_per_second": 14507.118 }, { "epoch": 4.0, "eval_loss": 1.3256505727767944, "eval_runtime": 45.046, "eval_samples_per_second": 851.018, "eval_steps_per_second": 106.38, "num_input_tokens_seen": 44395724, "step": 76672 }, { "epoch": 4.017111853088481, "grad_norm": 2.520019292831421, "learning_rate": 1.652450264329438e-05, "loss": 1.3838, "num_input_tokens_seen": 44585564, "step": 77000, "train_runtime": 3118.0069, "train_tokens_per_second": 14299.379 }, { "epoch": 4.043196994991653, "grad_norm": 4.146509170532227, "learning_rate": 1.6307126460767945e-05, "loss": 1.3596, "num_input_tokens_seen": 44870940, "step": 77500, "train_runtime": 3136.5879, "train_tokens_per_second": 14305.654 }, { "epoch": 4.069282136894825, "grad_norm": 2.3407187461853027, "learning_rate": 1.6089750278241514e-05, "loss": 1.3979, "num_input_tokens_seen": 45165140, "step": 78000, "train_runtime": 3155.153, "train_tokens_per_second": 14314.723 }, { "epoch": 4.095367278797997, "grad_norm": 2.992572069168091, "learning_rate": 1.5872374095715082e-05, "loss": 1.4121, "num_input_tokens_seen": 45458076, "step": 78500, "train_runtime": 3173.7885, "train_tokens_per_second": 14322.97 }, { "epoch": 4.121452420701169, "grad_norm": 3.490511655807495, "learning_rate": 1.565499791318865e-05, "loss": 1.37, "num_input_tokens_seen": 45746588, "step": 79000, "train_runtime": 3192.4179, "train_tokens_per_second": 14329.762 }, { "epoch": 4.147537562604341, "grad_norm": 3.6620404720306396, "learning_rate": 1.5437621730662215e-05, "loss": 1.398, "num_input_tokens_seen": 46037020, "step": 79500, "train_runtime": 3212.1684, "train_tokens_per_second": 14332.069 }, { "epoch": 4.173622704507513, "grad_norm": 2.709702253341675, "learning_rate": 1.5220245548135783e-05, "loss": 1.3714, "num_input_tokens_seen": 46327764, "step": 80000, "train_runtime": 3232.3645, "train_tokens_per_second": 14332.469 }, { "epoch": 4.199707846410685, "grad_norm": 3.0171260833740234, "learning_rate": 1.5002869365609348e-05, "loss": 1.3777, "num_input_tokens_seen": 46608924, "step": 80500, "train_runtime": 3252.0642, "train_tokens_per_second": 14332.104 }, { "epoch": 4.225792988313857, "grad_norm": 2.588928461074829, "learning_rate": 1.4785493183082916e-05, "loss": 1.3768, "num_input_tokens_seen": 46898436, "step": 81000, "train_runtime": 3271.9745, "train_tokens_per_second": 14333.375 }, { "epoch": 4.251878130217029, "grad_norm": 2.5653598308563232, "learning_rate": 1.4568117000556484e-05, "loss": 1.3753, "num_input_tokens_seen": 47187548, "step": 81500, "train_runtime": 3291.5411, "train_tokens_per_second": 14336.005 }, { "epoch": 4.2779632721202, "grad_norm": 3.236936330795288, "learning_rate": 1.435074081803005e-05, "loss": 1.3987, "num_input_tokens_seen": 47475276, "step": 82000, "train_runtime": 3311.1953, "train_tokens_per_second": 14337.806 }, { "epoch": 4.304048414023372, "grad_norm": 2.4497241973876953, "learning_rate": 1.4133364635503618e-05, "loss": 1.36, "num_input_tokens_seen": 47768556, "step": 82500, "train_runtime": 3330.633, "train_tokens_per_second": 14342.185 }, { "epoch": 4.330133555926544, "grad_norm": 3.381693124771118, "learning_rate": 1.3915988452977185e-05, "loss": 1.4122, "num_input_tokens_seen": 48056012, "step": 83000, "train_runtime": 3350.0565, "train_tokens_per_second": 14344.836 }, { "epoch": 4.356218697829716, "grad_norm": 2.8100342750549316, "learning_rate": 1.3698612270450753e-05, "loss": 1.3836, "num_input_tokens_seen": 48341348, "step": 83500, "train_runtime": 3369.3072, "train_tokens_per_second": 14347.563 }, { "epoch": 4.382303839732888, "grad_norm": 3.380335569381714, "learning_rate": 1.3481236087924318e-05, "loss": 1.3726, "num_input_tokens_seen": 48631420, "step": 84000, "train_runtime": 3389.3206, "train_tokens_per_second": 14348.427 }, { "epoch": 4.40838898163606, "grad_norm": 2.434285879135132, "learning_rate": 1.3263859905397884e-05, "loss": 1.3937, "num_input_tokens_seen": 48915972, "step": 84500, "train_runtime": 3409.284, "train_tokens_per_second": 14347.873 }, { "epoch": 4.434474123539232, "grad_norm": 2.8802988529205322, "learning_rate": 1.3046483722871452e-05, "loss": 1.3761, "num_input_tokens_seen": 49203916, "step": 85000, "train_runtime": 3428.2362, "train_tokens_per_second": 14352.545 }, { "epoch": 4.460559265442404, "grad_norm": 3.350780963897705, "learning_rate": 1.282910754034502e-05, "loss": 1.3766, "num_input_tokens_seen": 49493860, "step": 85500, "train_runtime": 3447.3803, "train_tokens_per_second": 14356.948 }, { "epoch": 4.486644407345576, "grad_norm": 2.4271440505981445, "learning_rate": 1.2611731357818587e-05, "loss": 1.3672, "num_input_tokens_seen": 49778012, "step": 86000, "train_runtime": 3466.8453, "train_tokens_per_second": 14358.302 }, { "epoch": 4.512729549248748, "grad_norm": 2.5384743213653564, "learning_rate": 1.2394355175292154e-05, "loss": 1.3701, "num_input_tokens_seen": 50065764, "step": 86500, "train_runtime": 3486.5719, "train_tokens_per_second": 14359.596 }, { "epoch": 4.53881469115192, "grad_norm": 3.011307716369629, "learning_rate": 1.2176978992765722e-05, "loss": 1.3884, "num_input_tokens_seen": 50349860, "step": 87000, "train_runtime": 3505.9535, "train_tokens_per_second": 14361.246 }, { "epoch": 4.564899833055092, "grad_norm": 2.5870578289031982, "learning_rate": 1.1959602810239288e-05, "loss": 1.3991, "num_input_tokens_seen": 50643260, "step": 87500, "train_runtime": 3525.1982, "train_tokens_per_second": 14366.074 }, { "epoch": 4.590984974958264, "grad_norm": 3.0917413234710693, "learning_rate": 1.1742226627712856e-05, "loss": 1.3876, "num_input_tokens_seen": 50934732, "step": 88000, "train_runtime": 3544.4536, "train_tokens_per_second": 14370.264 }, { "epoch": 4.617070116861436, "grad_norm": 2.181250810623169, "learning_rate": 1.1524850445186423e-05, "loss": 1.3801, "num_input_tokens_seen": 51225644, "step": 88500, "train_runtime": 3563.7836, "train_tokens_per_second": 14373.949 }, { "epoch": 4.643155258764608, "grad_norm": 3.146324872970581, "learning_rate": 1.130747426265999e-05, "loss": 1.3451, "num_input_tokens_seen": 51515932, "step": 89000, "train_runtime": 3583.4863, "train_tokens_per_second": 14375.925 }, { "epoch": 4.66924040066778, "grad_norm": 2.4125654697418213, "learning_rate": 1.1090098080133557e-05, "loss": 1.3759, "num_input_tokens_seen": 51803372, "step": 89500, "train_runtime": 3602.6645, "train_tokens_per_second": 14379.183 }, { "epoch": 4.695325542570951, "grad_norm": 3.1065971851348877, "learning_rate": 1.0872721897607122e-05, "loss": 1.3846, "num_input_tokens_seen": 52096660, "step": 90000, "train_runtime": 3621.3864, "train_tokens_per_second": 14385.833 }, { "epoch": 4.721410684474123, "grad_norm": 2.9472384452819824, "learning_rate": 1.065534571508069e-05, "loss": 1.3826, "num_input_tokens_seen": 52385124, "step": 90500, "train_runtime": 3640.3069, "train_tokens_per_second": 14390.304 }, { "epoch": 4.747495826377295, "grad_norm": 3.2821028232574463, "learning_rate": 1.0437969532554258e-05, "loss": 1.3913, "num_input_tokens_seen": 52675284, "step": 91000, "train_runtime": 3659.1435, "train_tokens_per_second": 14395.523 }, { "epoch": 4.773580968280467, "grad_norm": 2.897390604019165, "learning_rate": 1.0220593350027825e-05, "loss": 1.3745, "num_input_tokens_seen": 52966012, "step": 91500, "train_runtime": 3677.8728, "train_tokens_per_second": 14401.263 }, { "epoch": 4.799666110183639, "grad_norm": 2.4328722953796387, "learning_rate": 1.0003217167501391e-05, "loss": 1.3675, "num_input_tokens_seen": 53260060, "step": 92000, "train_runtime": 3696.7483, "train_tokens_per_second": 14407.272 }, { "epoch": 4.825751252086811, "grad_norm": 2.3648526668548584, "learning_rate": 9.78584098497496e-06, "loss": 1.348, "num_input_tokens_seen": 53549900, "step": 92500, "train_runtime": 3715.4001, "train_tokens_per_second": 14412.956 }, { "epoch": 4.851836393989983, "grad_norm": 2.3531742095947266, "learning_rate": 9.568464802448526e-06, "loss": 1.3779, "num_input_tokens_seen": 53844180, "step": 93000, "train_runtime": 3734.2446, "train_tokens_per_second": 14419.029 }, { "epoch": 4.877921535893155, "grad_norm": 2.4701406955718994, "learning_rate": 9.351088619922092e-06, "loss": 1.3688, "num_input_tokens_seen": 54132452, "step": 93500, "train_runtime": 3752.9114, "train_tokens_per_second": 14424.122 }, { "epoch": 4.904006677796327, "grad_norm": 3.4860074520111084, "learning_rate": 9.13371243739566e-06, "loss": 1.3786, "num_input_tokens_seen": 54424212, "step": 94000, "train_runtime": 3771.7803, "train_tokens_per_second": 14429.316 }, { "epoch": 4.930091819699499, "grad_norm": 2.331005811691284, "learning_rate": 8.916336254869227e-06, "loss": 1.3582, "num_input_tokens_seen": 54719684, "step": 94500, "train_runtime": 3790.6832, "train_tokens_per_second": 14435.309 }, { "epoch": 4.956176961602671, "grad_norm": 2.379862070083618, "learning_rate": 8.698960072342793e-06, "loss": 1.3838, "num_input_tokens_seen": 55006740, "step": 95000, "train_runtime": 3809.755, "train_tokens_per_second": 14438.393 }, { "epoch": 4.982262103505843, "grad_norm": 3.527317523956299, "learning_rate": 8.481583889816362e-06, "loss": 1.3944, "num_input_tokens_seen": 55294876, "step": 95500, "train_runtime": 3829.1057, "train_tokens_per_second": 14440.676 }, { "epoch": 5.0, "eval_loss": 1.3229724168777466, "eval_runtime": 46.7304, "eval_samples_per_second": 820.343, "eval_steps_per_second": 102.546, "num_input_tokens_seen": 55492754, "step": 95840 }, { "epoch": 5.008347245409015, "grad_norm": 2.8223490715026855, "learning_rate": 8.264207707289928e-06, "loss": 1.3501, "num_input_tokens_seen": 55585722, "step": 96000, "train_runtime": 3896.8789, "train_tokens_per_second": 14264.165 }, { "epoch": 5.034432387312187, "grad_norm": 3.312976360321045, "learning_rate": 8.046831524763496e-06, "loss": 1.364, "num_input_tokens_seen": 55873162, "step": 96500, "train_runtime": 3916.5275, "train_tokens_per_second": 14265.995 }, { "epoch": 5.060517529215359, "grad_norm": 4.365355491638184, "learning_rate": 7.829455342237061e-06, "loss": 1.3657, "num_input_tokens_seen": 56159210, "step": 97000, "train_runtime": 3935.5771, "train_tokens_per_second": 14269.625 }, { "epoch": 5.086602671118531, "grad_norm": 2.77451753616333, "learning_rate": 7.612079159710629e-06, "loss": 1.3722, "num_input_tokens_seen": 56450234, "step": 97500, "train_runtime": 3954.8081, "train_tokens_per_second": 14273.824 }, { "epoch": 5.112687813021703, "grad_norm": 2.028353214263916, "learning_rate": 7.3947029771841964e-06, "loss": 1.3778, "num_input_tokens_seen": 56740002, "step": 98000, "train_runtime": 3973.4854, "train_tokens_per_second": 14279.655 }, { "epoch": 5.138772954924875, "grad_norm": 2.0676374435424805, "learning_rate": 7.177326794657763e-06, "loss": 1.3462, "num_input_tokens_seen": 57027226, "step": 98500, "train_runtime": 3992.3304, "train_tokens_per_second": 14284.195 }, { "epoch": 5.164858096828047, "grad_norm": 2.0867531299591064, "learning_rate": 6.95995061213133e-06, "loss": 1.3739, "num_input_tokens_seen": 57316978, "step": 99000, "train_runtime": 4012.1011, "train_tokens_per_second": 14286.025 }, { "epoch": 5.190943238731219, "grad_norm": 2.3995723724365234, "learning_rate": 6.7425744296048975e-06, "loss": 1.3821, "num_input_tokens_seen": 57607834, "step": 99500, "train_runtime": 4031.6912, "train_tokens_per_second": 14288.752 }, { "epoch": 5.217028380634391, "grad_norm": 3.466399669647217, "learning_rate": 6.525198247078465e-06, "loss": 1.3499, "num_input_tokens_seen": 57896786, "step": 100000, "train_runtime": 4051.2038, "train_tokens_per_second": 14291.255 }, { "epoch": 5.243113522537563, "grad_norm": 2.673947811126709, "learning_rate": 6.307822064552031e-06, "loss": 1.3703, "num_input_tokens_seen": 58184506, "step": 100500, "train_runtime": 4070.3919, "train_tokens_per_second": 14294.571 }, { "epoch": 5.269198664440735, "grad_norm": 2.0675642490386963, "learning_rate": 6.0904458820255986e-06, "loss": 1.3759, "num_input_tokens_seen": 58473186, "step": 101000, "train_runtime": 4090.425, "train_tokens_per_second": 14295.137 }, { "epoch": 5.295283806343907, "grad_norm": 2.8680272102355957, "learning_rate": 5.873069699499165e-06, "loss": 1.3811, "num_input_tokens_seen": 58764498, "step": 101500, "train_runtime": 4109.7435, "train_tokens_per_second": 14298.824 }, { "epoch": 5.321368948247079, "grad_norm": 3.1335153579711914, "learning_rate": 5.655693516972733e-06, "loss": 1.3914, "num_input_tokens_seen": 59053762, "step": 102000, "train_runtime": 4129.2443, "train_tokens_per_second": 14301.348 }, { "epoch": 5.347454090150251, "grad_norm": 4.179940223693848, "learning_rate": 5.4383173344463e-06, "loss": 1.3353, "num_input_tokens_seen": 59346138, "step": 102500, "train_runtime": 4148.9629, "train_tokens_per_second": 14303.849 }, { "epoch": 5.373539232053423, "grad_norm": 2.837871551513672, "learning_rate": 5.220941151919867e-06, "loss": 1.3592, "num_input_tokens_seen": 59634050, "step": 103000, "train_runtime": 4172.193, "train_tokens_per_second": 14293.215 }, { "epoch": 5.399624373956595, "grad_norm": 2.620933771133423, "learning_rate": 5.003564969393433e-06, "loss": 1.3438, "num_input_tokens_seen": 59920002, "step": 103500, "train_runtime": 4191.8017, "train_tokens_per_second": 14294.57 }, { "epoch": 5.425709515859766, "grad_norm": 2.974597454071045, "learning_rate": 4.786188786867001e-06, "loss": 1.3848, "num_input_tokens_seen": 60208490, "step": 104000, "train_runtime": 4210.5451, "train_tokens_per_second": 14299.453 }, { "epoch": 5.451794657762938, "grad_norm": 2.7892649173736572, "learning_rate": 4.568812604340568e-06, "loss": 1.3947, "num_input_tokens_seen": 60497570, "step": 104500, "train_runtime": 4229.6543, "train_tokens_per_second": 14303.195 }, { "epoch": 5.47787979966611, "grad_norm": 2.9217751026153564, "learning_rate": 4.3514364218141344e-06, "loss": 1.3637, "num_input_tokens_seen": 60791682, "step": 105000, "train_runtime": 4249.1377, "train_tokens_per_second": 14306.828 }, { "epoch": 5.503964941569282, "grad_norm": 2.3021788597106934, "learning_rate": 4.134060239287702e-06, "loss": 1.3772, "num_input_tokens_seen": 61081546, "step": 105500, "train_runtime": 4268.0879, "train_tokens_per_second": 14311.22 }, { "epoch": 5.530050083472454, "grad_norm": 2.520854949951172, "learning_rate": 3.916684056761269e-06, "loss": 1.3595, "num_input_tokens_seen": 61376714, "step": 106000, "train_runtime": 4287.1193, "train_tokens_per_second": 14316.54 }, { "epoch": 5.556135225375626, "grad_norm": 2.5124387741088867, "learning_rate": 3.6993078742348355e-06, "loss": 1.3755, "num_input_tokens_seen": 61670282, "step": 106500, "train_runtime": 4306.7613, "train_tokens_per_second": 14319.41 }, { "epoch": 5.582220367278798, "grad_norm": 3.6542813777923584, "learning_rate": 3.4819316917084032e-06, "loss": 1.3299, "num_input_tokens_seen": 61959530, "step": 107000, "train_runtime": 4325.9495, "train_tokens_per_second": 14322.758 }, { "epoch": 5.60830550918197, "grad_norm": 2.480987787246704, "learning_rate": 3.2645555091819697e-06, "loss": 1.3488, "num_input_tokens_seen": 62248610, "step": 107500, "train_runtime": 4344.6789, "train_tokens_per_second": 14327.551 }, { "epoch": 5.634390651085142, "grad_norm": 3.620051383972168, "learning_rate": 3.047179326655537e-06, "loss": 1.3663, "num_input_tokens_seen": 62535434, "step": 108000, "train_runtime": 4363.4204, "train_tokens_per_second": 14331.746 }, { "epoch": 5.660475792988314, "grad_norm": 2.9154930114746094, "learning_rate": 2.8298031441291043e-06, "loss": 1.3719, "num_input_tokens_seen": 62824930, "step": 108500, "train_runtime": 4382.1169, "train_tokens_per_second": 14336.662 }, { "epoch": 5.686560934891486, "grad_norm": 2.5228476524353027, "learning_rate": 2.612426961602671e-06, "loss": 1.3476, "num_input_tokens_seen": 63114954, "step": 109000, "train_runtime": 4400.737, "train_tokens_per_second": 14341.905 }, { "epoch": 5.712646076794658, "grad_norm": 2.6546239852905273, "learning_rate": 2.3950507790762385e-06, "loss": 1.3474, "num_input_tokens_seen": 63403826, "step": 109500, "train_runtime": 4419.5486, "train_tokens_per_second": 14346.222 }, { "epoch": 5.73873121869783, "grad_norm": 3.8582890033721924, "learning_rate": 2.1776745965498054e-06, "loss": 1.3451, "num_input_tokens_seen": 63689762, "step": 110000, "train_runtime": 4438.8474, "train_tokens_per_second": 14348.266 }, { "epoch": 5.764816360601001, "grad_norm": 3.4054343700408936, "learning_rate": 1.9602984140233727e-06, "loss": 1.3811, "num_input_tokens_seen": 63978794, "step": 110500, "train_runtime": 4458.4476, "train_tokens_per_second": 14350.016 }, { "epoch": 5.790901502504173, "grad_norm": 2.907578468322754, "learning_rate": 1.7429222314969393e-06, "loss": 1.3843, "num_input_tokens_seen": 64270234, "step": 111000, "train_runtime": 4478.4361, "train_tokens_per_second": 14351.044 }, { "epoch": 5.816986644407345, "grad_norm": 2.72294020652771, "learning_rate": 1.5255460489705064e-06, "loss": 1.3511, "num_input_tokens_seen": 64557130, "step": 111500, "train_runtime": 4498.1115, "train_tokens_per_second": 14352.052 }, { "epoch": 5.843071786310517, "grad_norm": 2.910423755645752, "learning_rate": 1.3081698664440735e-06, "loss": 1.355, "num_input_tokens_seen": 64847634, "step": 112000, "train_runtime": 4517.4916, "train_tokens_per_second": 14354.788 }, { "epoch": 5.869156928213689, "grad_norm": 2.3920516967773438, "learning_rate": 1.0907936839176406e-06, "loss": 1.3696, "num_input_tokens_seen": 65135722, "step": 112500, "train_runtime": 4536.2006, "train_tokens_per_second": 14359.092 }, { "epoch": 5.895242070116861, "grad_norm": 2.619903087615967, "learning_rate": 8.734175013912075e-07, "loss": 1.3515, "num_input_tokens_seen": 65423234, "step": 113000, "train_runtime": 4554.8848, "train_tokens_per_second": 14363.312 }, { "epoch": 5.921327212020033, "grad_norm": 2.61676025390625, "learning_rate": 6.560413188647746e-07, "loss": 1.3784, "num_input_tokens_seen": 65718338, "step": 113500, "train_runtime": 4573.7838, "train_tokens_per_second": 14368.484 }, { "epoch": 5.947412353923205, "grad_norm": 2.6655712127685547, "learning_rate": 4.3866513633834173e-07, "loss": 1.3672, "num_input_tokens_seen": 66007642, "step": 114000, "train_runtime": 4592.9685, "train_tokens_per_second": 14371.456 }, { "epoch": 5.973497495826377, "grad_norm": 2.606362819671631, "learning_rate": 2.2128895381190875e-07, "loss": 1.3579, "num_input_tokens_seen": 66290722, "step": 114500, "train_runtime": 4612.1986, "train_tokens_per_second": 14372.911 }, { "epoch": 5.999582637729549, "grad_norm": 2.8683297634124756, "learning_rate": 3.912771285475793e-09, "loss": 1.3687, "num_input_tokens_seen": 66581138, "step": 115000, "train_runtime": 4632.4758, "train_tokens_per_second": 14372.69 }, { "epoch": 6.0, "eval_loss": 1.319564938545227, "eval_runtime": 45.0275, "eval_samples_per_second": 851.369, "eval_steps_per_second": 106.424, "num_input_tokens_seen": 66585670, "step": 115008 }, { "epoch": 6.025667779632721, "grad_norm": 3.327254295349121, "learning_rate": 1.9871921953255425e-05, "loss": 1.3775, "num_input_tokens_seen": 66874998, "step": 115500, "train_runtime": 18.7889, "train_tokens_per_second": 3559284.107 }, { "epoch": 6.051752921535893, "grad_norm": 2.0363502502441406, "learning_rate": 1.9741496243739565e-05, "loss": 1.3598, "num_input_tokens_seen": 67165902, "step": 116000, "train_runtime": 38.0722, "train_tokens_per_second": 1764173.697 }, { "epoch": 6.077838063439065, "grad_norm": 3.2186789512634277, "learning_rate": 1.9611070534223708e-05, "loss": 1.3582, "num_input_tokens_seen": 67454310, "step": 116500, "train_runtime": 56.9892, "train_tokens_per_second": 1183632.851 }, { "epoch": 6.103923205342237, "grad_norm": 3.1102960109710693, "learning_rate": 1.9480644824707847e-05, "loss": 1.342, "num_input_tokens_seen": 67741886, "step": 117000, "train_runtime": 76.1489, "train_tokens_per_second": 889597.261 }, { "epoch": 6.130008347245409, "grad_norm": 2.1836190223693848, "learning_rate": 1.9350219115191987e-05, "loss": 1.3578, "num_input_tokens_seen": 68030070, "step": 117500, "train_runtime": 95.3289, "train_tokens_per_second": 713635.053 }, { "epoch": 6.156093489148581, "grad_norm": 2.637117624282837, "learning_rate": 1.921979340567613e-05, "loss": 1.3561, "num_input_tokens_seen": 68313278, "step": 118000, "train_runtime": 114.4954, "train_tokens_per_second": 596646.246 }, { "epoch": 6.182178631051753, "grad_norm": 2.454594612121582, "learning_rate": 1.908936769616027e-05, "loss": 1.3897, "num_input_tokens_seen": 68603790, "step": 118500, "train_runtime": 133.1929, "train_tokens_per_second": 515071.035 }, { "epoch": 6.208263772954925, "grad_norm": 2.6059861183166504, "learning_rate": 1.895894198664441e-05, "loss": 1.3662, "num_input_tokens_seen": 68897534, "step": 119000, "train_runtime": 152.3637, "train_tokens_per_second": 452191.312 }, { "epoch": 6.234348914858097, "grad_norm": 2.963710308074951, "learning_rate": 1.8828516277128548e-05, "loss": 1.3688, "num_input_tokens_seen": 69185822, "step": 119500, "train_runtime": 171.3295, "train_tokens_per_second": 403817.306 }, { "epoch": 6.260434056761269, "grad_norm": 2.3006739616394043, "learning_rate": 1.8698090567612688e-05, "loss": 1.3867, "num_input_tokens_seen": 69477766, "step": 120000, "train_runtime": 189.6964, "train_tokens_per_second": 366257.718 }, { "epoch": 6.286519198664441, "grad_norm": 2.4806406497955322, "learning_rate": 1.8567664858096827e-05, "loss": 1.349, "num_input_tokens_seen": 69770974, "step": 120500, "train_runtime": 208.9904, "train_tokens_per_second": 333847.728 }, { "epoch": 6.312604340567613, "grad_norm": 2.4395639896392822, "learning_rate": 1.843723914858097e-05, "loss": 1.3733, "num_input_tokens_seen": 70062350, "step": 121000, "train_runtime": 228.9771, "train_tokens_per_second": 305979.777 }, { "epoch": 6.338689482470785, "grad_norm": 2.7110908031463623, "learning_rate": 1.830681343906511e-05, "loss": 1.3708, "num_input_tokens_seen": 70351870, "step": 121500, "train_runtime": 248.7026, "train_tokens_per_second": 282875.484 }, { "epoch": 6.364774624373957, "grad_norm": 2.789796829223633, "learning_rate": 1.817638772954925e-05, "loss": 1.3688, "num_input_tokens_seen": 70642750, "step": 122000, "train_runtime": 268.6462, "train_tokens_per_second": 262958.28 }, { "epoch": 6.390859766277129, "grad_norm": 2.9111709594726562, "learning_rate": 1.8045962020033392e-05, "loss": 1.3518, "num_input_tokens_seen": 70931190, "step": 122500, "train_runtime": 288.3677, "train_tokens_per_second": 245974.799 }, { "epoch": 6.416944908180301, "grad_norm": 2.4599456787109375, "learning_rate": 1.791553631051753e-05, "loss": 1.3431, "num_input_tokens_seen": 71224646, "step": 123000, "train_runtime": 307.3647, "train_tokens_per_second": 231726.811 }, { "epoch": 6.443030050083473, "grad_norm": 2.365891456604004, "learning_rate": 1.778511060100167e-05, "loss": 1.3865, "num_input_tokens_seen": 71511326, "step": 123500, "train_runtime": 326.1759, "train_tokens_per_second": 219241.597 }, { "epoch": 6.469115191986645, "grad_norm": 2.6345105171203613, "learning_rate": 1.765468489148581e-05, "loss": 1.3734, "num_input_tokens_seen": 71797622, "step": 124000, "train_runtime": 344.951, "train_tokens_per_second": 208138.626 }, { "epoch": 6.495200333889817, "grad_norm": 3.2426106929779053, "learning_rate": 1.752425918196995e-05, "loss": 1.3628, "num_input_tokens_seen": 72088862, "step": 124500, "train_runtime": 363.8685, "train_tokens_per_second": 198117.913 }, { "epoch": 6.521285475792988, "grad_norm": 2.608137845993042, "learning_rate": 1.739383347245409e-05, "loss": 1.3723, "num_input_tokens_seen": 72378534, "step": 125000, "train_runtime": 383.5577, "train_tokens_per_second": 188703.107 }, { "epoch": 6.54737061769616, "grad_norm": 4.101028919219971, "learning_rate": 1.726340776293823e-05, "loss": 1.3776, "num_input_tokens_seen": 72669942, "step": 125500, "train_runtime": 402.8471, "train_tokens_per_second": 180390.889 }, { "epoch": 6.573455759599332, "grad_norm": 2.356037139892578, "learning_rate": 1.7132982053422372e-05, "loss": 1.376, "num_input_tokens_seen": 72956998, "step": 126000, "train_runtime": 422.9625, "train_tokens_per_second": 172490.455 }, { "epoch": 6.599540901502504, "grad_norm": 2.768091917037964, "learning_rate": 1.7002556343906512e-05, "loss": 1.3849, "num_input_tokens_seen": 73246278, "step": 126500, "train_runtime": 442.5677, "train_tokens_per_second": 165503.005 }, { "epoch": 6.625626043405676, "grad_norm": 2.1557633876800537, "learning_rate": 1.687213063439065e-05, "loss": 1.3692, "num_input_tokens_seen": 73532518, "step": 127000, "train_runtime": 461.2902, "train_tokens_per_second": 159406.192 }, { "epoch": 6.651711185308848, "grad_norm": 2.739330768585205, "learning_rate": 1.6741704924874794e-05, "loss": 1.3853, "num_input_tokens_seen": 73816374, "step": 127500, "train_runtime": 480.2569, "train_tokens_per_second": 153701.835 }, { "epoch": 6.67779632721202, "grad_norm": 2.28963303565979, "learning_rate": 1.6611279215358934e-05, "loss": 1.3539, "num_input_tokens_seen": 74103334, "step": 128000, "train_runtime": 499.0026, "train_tokens_per_second": 148502.901 }, { "epoch": 6.703881469115192, "grad_norm": 3.2728097438812256, "learning_rate": 1.6480853505843073e-05, "loss": 1.3519, "num_input_tokens_seen": 74392214, "step": 128500, "train_runtime": 517.9355, "train_tokens_per_second": 143632.196 }, { "epoch": 6.729966611018364, "grad_norm": 3.280041217803955, "learning_rate": 1.6350427796327213e-05, "loss": 1.3064, "num_input_tokens_seen": 74677654, "step": 129000, "train_runtime": 536.8375, "train_tokens_per_second": 139106.624 }, { "epoch": 6.756051752921536, "grad_norm": 3.9127538204193115, "learning_rate": 1.6220002086811352e-05, "loss": 1.3779, "num_input_tokens_seen": 74968646, "step": 129500, "train_runtime": 555.72, "train_tokens_per_second": 134903.621 }, { "epoch": 6.782136894824708, "grad_norm": 2.7960000038146973, "learning_rate": 1.6089576377295492e-05, "loss": 1.3327, "num_input_tokens_seen": 75257286, "step": 130000, "train_runtime": 574.6797, "train_tokens_per_second": 130955.186 }, { "epoch": 6.80822203672788, "grad_norm": 2.997286796569824, "learning_rate": 1.5959150667779635e-05, "loss": 1.3684, "num_input_tokens_seen": 75546398, "step": 130500, "train_runtime": 593.4532, "train_tokens_per_second": 127299.662 }, { "epoch": 6.834307178631052, "grad_norm": 2.6267356872558594, "learning_rate": 1.5828724958263774e-05, "loss": 1.3416, "num_input_tokens_seen": 75840662, "step": 131000, "train_runtime": 612.3615, "train_tokens_per_second": 123849.503 }, { "epoch": 6.860392320534224, "grad_norm": 2.1126062870025635, "learning_rate": 1.5698299248747914e-05, "loss": 1.3606, "num_input_tokens_seen": 76125694, "step": 131500, "train_runtime": 631.2618, "train_tokens_per_second": 120592.897 }, { "epoch": 6.886477462437396, "grad_norm": 2.9131317138671875, "learning_rate": 1.5567873539232053e-05, "loss": 1.3813, "num_input_tokens_seen": 76417118, "step": 132000, "train_runtime": 650.1892, "train_tokens_per_second": 117530.578 }, { "epoch": 6.912562604340567, "grad_norm": 3.5298712253570557, "learning_rate": 1.5437447829716196e-05, "loss": 1.3617, "num_input_tokens_seen": 76703430, "step": 132500, "train_runtime": 669.1223, "train_tokens_per_second": 114632.907 }, { "epoch": 6.938647746243739, "grad_norm": 2.850775718688965, "learning_rate": 1.5307022120200336e-05, "loss": 1.3672, "num_input_tokens_seen": 76992342, "step": 133000, "train_runtime": 687.9389, "train_tokens_per_second": 111917.419 }, { "epoch": 6.964732888146911, "grad_norm": 3.314821481704712, "learning_rate": 1.5176596410684474e-05, "loss": 1.3715, "num_input_tokens_seen": 77284374, "step": 133500, "train_runtime": 706.8708, "train_tokens_per_second": 109333.091 }, { "epoch": 6.990818030050083, "grad_norm": 3.3693618774414062, "learning_rate": 1.5046170701168617e-05, "loss": 1.3858, "num_input_tokens_seen": 77571966, "step": 134000, "train_runtime": 725.7267, "train_tokens_per_second": 106888.674 }, { "epoch": 7.0, "eval_loss": 1.3148815631866455, "eval_runtime": 45.8848, "eval_samples_per_second": 835.462, "eval_steps_per_second": 104.435, "num_input_tokens_seen": 77673096, "step": 134176 }, { "epoch": 7.016903171953255, "grad_norm": 2.7694716453552246, "learning_rate": 1.4915744991652755e-05, "loss": 1.3419, "num_input_tokens_seen": 77861608, "step": 134500, "train_runtime": 791.8621, "train_tokens_per_second": 98327.231 }, { "epoch": 7.042988313856427, "grad_norm": 2.7334187030792236, "learning_rate": 1.4785319282136894e-05, "loss": 1.3308, "num_input_tokens_seen": 78149784, "step": 135000, "train_runtime": 810.7343, "train_tokens_per_second": 96393.825 }, { "epoch": 7.069073455759599, "grad_norm": 2.9365265369415283, "learning_rate": 1.4654893572621037e-05, "loss": 1.3525, "num_input_tokens_seen": 78438792, "step": 135500, "train_runtime": 829.7324, "train_tokens_per_second": 94535.049 }, { "epoch": 7.095158597662771, "grad_norm": 4.147580146789551, "learning_rate": 1.4524467863105177e-05, "loss": 1.3465, "num_input_tokens_seen": 78732384, "step": 136000, "train_runtime": 848.5615, "train_tokens_per_second": 92783.357 }, { "epoch": 7.121243739565943, "grad_norm": 2.915922164916992, "learning_rate": 1.4394042153589316e-05, "loss": 1.3614, "num_input_tokens_seen": 79016208, "step": 136500, "train_runtime": 867.5653, "train_tokens_per_second": 91078.111 }, { "epoch": 7.147328881469115, "grad_norm": 2.549786329269409, "learning_rate": 1.4263616444073457e-05, "loss": 1.318, "num_input_tokens_seen": 79301784, "step": 137000, "train_runtime": 886.659, "train_tokens_per_second": 89438.871 }, { "epoch": 7.173414023372287, "grad_norm": 2.5047004222869873, "learning_rate": 1.4133190734557597e-05, "loss": 1.368, "num_input_tokens_seen": 79590400, "step": 137500, "train_runtime": 905.5133, "train_tokens_per_second": 87895.338 }, { "epoch": 7.199499165275459, "grad_norm": 3.0781052112579346, "learning_rate": 1.4002765025041736e-05, "loss": 1.3653, "num_input_tokens_seen": 79879504, "step": 138000, "train_runtime": 924.4454, "train_tokens_per_second": 86408.029 }, { "epoch": 7.225584307178631, "grad_norm": 3.6476972103118896, "learning_rate": 1.387233931552588e-05, "loss": 1.3514, "num_input_tokens_seen": 80167640, "step": 138500, "train_runtime": 943.229, "train_tokens_per_second": 84992.766 }, { "epoch": 7.2516694490818026, "grad_norm": 5.114116191864014, "learning_rate": 1.3741913606010017e-05, "loss": 1.3413, "num_input_tokens_seen": 80456216, "step": 139000, "train_runtime": 962.1141, "train_tokens_per_second": 83624.399 }, { "epoch": 7.277754590984975, "grad_norm": 2.5727877616882324, "learning_rate": 1.3611487896494157e-05, "loss": 1.3414, "num_input_tokens_seen": 80747832, "step": 139500, "train_runtime": 980.9921, "train_tokens_per_second": 82312.418 }, { "epoch": 7.303839732888147, "grad_norm": 2.9491872787475586, "learning_rate": 1.3481062186978296e-05, "loss": 1.3412, "num_input_tokens_seen": 81043216, "step": 140000, "train_runtime": 999.9989, "train_tokens_per_second": 81043.309 }, { "epoch": 7.329924874791319, "grad_norm": 2.045164108276367, "learning_rate": 1.3350636477462439e-05, "loss": 1.3729, "num_input_tokens_seen": 81333232, "step": 140500, "train_runtime": 1018.8484, "train_tokens_per_second": 79828.588 }, { "epoch": 7.356010016694491, "grad_norm": 3.922563314437866, "learning_rate": 1.3220210767946579e-05, "loss": 1.3443, "num_input_tokens_seen": 81622416, "step": 141000, "train_runtime": 1037.8422, "train_tokens_per_second": 78646.268 }, { "epoch": 7.382095158597663, "grad_norm": 2.426223039627075, "learning_rate": 1.3089785058430718e-05, "loss": 1.3544, "num_input_tokens_seen": 81911608, "step": 141500, "train_runtime": 1056.8045, "train_tokens_per_second": 77508.763 }, { "epoch": 7.408180300500835, "grad_norm": 2.67075514793396, "learning_rate": 1.295935934891486e-05, "loss": 1.3246, "num_input_tokens_seen": 82202544, "step": 142000, "train_runtime": 1075.733, "train_tokens_per_second": 76415.38 }, { "epoch": 7.434265442404007, "grad_norm": 2.5923829078674316, "learning_rate": 1.2828933639398999e-05, "loss": 1.3388, "num_input_tokens_seen": 82493944, "step": 142500, "train_runtime": 1094.8433, "train_tokens_per_second": 75347.716 }, { "epoch": 7.460350584307179, "grad_norm": 2.602835178375244, "learning_rate": 1.2698507929883138e-05, "loss": 1.3423, "num_input_tokens_seen": 82784656, "step": 143000, "train_runtime": 1113.7237, "train_tokens_per_second": 74331.413 }, { "epoch": 7.486435726210351, "grad_norm": 3.1531965732574463, "learning_rate": 1.256808222036728e-05, "loss": 1.3452, "num_input_tokens_seen": 83068624, "step": 143500, "train_runtime": 1132.6794, "train_tokens_per_second": 73338.162 }, { "epoch": 7.512520868113523, "grad_norm": 2.2403712272644043, "learning_rate": 1.243765651085142e-05, "loss": 1.3618, "num_input_tokens_seen": 83351920, "step": 144000, "train_runtime": 1151.592, "train_tokens_per_second": 72379.733 }, { "epoch": 7.538606010016695, "grad_norm": 3.465223550796509, "learning_rate": 1.2307230801335559e-05, "loss": 1.3632, "num_input_tokens_seen": 83638888, "step": 144500, "train_runtime": 1170.6851, "train_tokens_per_second": 71444.392 }, { "epoch": 7.564691151919867, "grad_norm": 2.3392977714538574, "learning_rate": 1.21768050918197e-05, "loss": 1.3318, "num_input_tokens_seen": 83931992, "step": 145000, "train_runtime": 1189.5906, "train_tokens_per_second": 70555.362 }, { "epoch": 7.590776293823039, "grad_norm": 3.0218007564544678, "learning_rate": 1.2046379382303841e-05, "loss": 1.3636, "num_input_tokens_seen": 84220168, "step": 145500, "train_runtime": 1208.5932, "train_tokens_per_second": 69684.461 }, { "epoch": 7.616861435726211, "grad_norm": 3.329549789428711, "learning_rate": 1.191595367278798e-05, "loss": 1.355, "num_input_tokens_seen": 84509512, "step": 146000, "train_runtime": 1227.4823, "train_tokens_per_second": 68847.845 }, { "epoch": 7.642946577629383, "grad_norm": 6.515806198120117, "learning_rate": 1.178552796327212e-05, "loss": 1.3414, "num_input_tokens_seen": 84808104, "step": 146500, "train_runtime": 1246.4918, "train_tokens_per_second": 68037.434 }, { "epoch": 7.669031719532554, "grad_norm": 3.5463063716888428, "learning_rate": 1.1655102253756262e-05, "loss": 1.3617, "num_input_tokens_seen": 85099704, "step": 147000, "train_runtime": 1265.5528, "train_tokens_per_second": 67243.109 }, { "epoch": 7.695116861435726, "grad_norm": 2.877112627029419, "learning_rate": 1.1524676544240401e-05, "loss": 1.3524, "num_input_tokens_seen": 85387272, "step": 147500, "train_runtime": 1284.3298, "train_tokens_per_second": 66483.913 }, { "epoch": 7.721202003338898, "grad_norm": 2.8873534202575684, "learning_rate": 1.1394250834724542e-05, "loss": 1.3442, "num_input_tokens_seen": 85671272, "step": 148000, "train_runtime": 1303.2108, "train_tokens_per_second": 65738.615 }, { "epoch": 7.74728714524207, "grad_norm": 3.5610382556915283, "learning_rate": 1.126382512520868e-05, "loss": 1.3505, "num_input_tokens_seen": 85959168, "step": 148500, "train_runtime": 1322.1598, "train_tokens_per_second": 65014.207 }, { "epoch": 7.773372287145242, "grad_norm": 2.6103343963623047, "learning_rate": 1.1133399415692821e-05, "loss": 1.3616, "num_input_tokens_seen": 86255128, "step": 149000, "train_runtime": 1341.3928, "train_tokens_per_second": 64302.661 }, { "epoch": 7.799457429048414, "grad_norm": 2.5157065391540527, "learning_rate": 1.1002973706176963e-05, "loss": 1.3422, "num_input_tokens_seen": 86546848, "step": 149500, "train_runtime": 1360.2395, "train_tokens_per_second": 63626.184 }, { "epoch": 7.825542570951586, "grad_norm": 2.315091371536255, "learning_rate": 1.0872547996661102e-05, "loss": 1.3511, "num_input_tokens_seen": 86837440, "step": 150000, "train_runtime": 1379.3034, "train_tokens_per_second": 62957.46 }, { "epoch": 7.851627712854758, "grad_norm": 2.2483925819396973, "learning_rate": 1.0742122287145243e-05, "loss": 1.3355, "num_input_tokens_seen": 87120032, "step": 150500, "train_runtime": 1398.2422, "train_tokens_per_second": 62306.824 }, { "epoch": 7.87771285475793, "grad_norm": 2.340362071990967, "learning_rate": 1.0611696577629383e-05, "loss": 1.3537, "num_input_tokens_seen": 87415824, "step": 151000, "train_runtime": 1417.1731, "train_tokens_per_second": 61683.236 }, { "epoch": 7.903797996661102, "grad_norm": 2.813960552215576, "learning_rate": 1.0481270868113522e-05, "loss": 1.3479, "num_input_tokens_seen": 87701680, "step": 151500, "train_runtime": 1436.1799, "train_tokens_per_second": 61065.945 }, { "epoch": 7.929883138564274, "grad_norm": 2.2960751056671143, "learning_rate": 1.0350845158597664e-05, "loss": 1.3475, "num_input_tokens_seen": 87992448, "step": 152000, "train_runtime": 1455.0801, "train_tokens_per_second": 60472.578 }, { "epoch": 7.955968280467446, "grad_norm": 3.048780918121338, "learning_rate": 1.0220419449081803e-05, "loss": 1.3619, "num_input_tokens_seen": 88281416, "step": 152500, "train_runtime": 1474.0153, "train_tokens_per_second": 59891.791 }, { "epoch": 7.982053422370617, "grad_norm": 2.816805362701416, "learning_rate": 1.0089993739565943e-05, "loss": 1.357, "num_input_tokens_seen": 88572368, "step": 153000, "train_runtime": 1492.8602, "train_tokens_per_second": 59330.65 }, { "epoch": 8.0, "eval_loss": 1.310753345489502, "eval_runtime": 45.8622, "eval_samples_per_second": 835.874, "eval_steps_per_second": 104.487, "num_input_tokens_seen": 88772850, "step": 153344 }, { "epoch": 8.00813856427379, "grad_norm": 2.93835186958313, "learning_rate": 9.959568030050084e-06, "loss": 1.3378, "num_input_tokens_seen": 88861818, "step": 153500, "train_runtime": 1558.6187, "train_tokens_per_second": 57013.187 }, { "epoch": 8.034223706176961, "grad_norm": 3.2679965496063232, "learning_rate": 9.829142320534224e-06, "loss": 1.3403, "num_input_tokens_seen": 89148626, "step": 154000, "train_runtime": 1577.6034, "train_tokens_per_second": 56508.897 }, { "epoch": 8.060308848080133, "grad_norm": 1.7137473821640015, "learning_rate": 9.698716611018365e-06, "loss": 1.3357, "num_input_tokens_seen": 89432242, "step": 154500, "train_runtime": 1596.5679, "train_tokens_per_second": 56015.306 }, { "epoch": 8.086393989983305, "grad_norm": 2.5696284770965576, "learning_rate": 9.568290901502506e-06, "loss": 1.3465, "num_input_tokens_seen": 89721890, "step": 155000, "train_runtime": 1615.4358, "train_tokens_per_second": 55540.364 }, { "epoch": 8.112479131886477, "grad_norm": 3.715364694595337, "learning_rate": 9.437865191986644e-06, "loss": 1.3407, "num_input_tokens_seen": 90009618, "step": 155500, "train_runtime": 1634.2407, "train_tokens_per_second": 55077.332 }, { "epoch": 8.13856427378965, "grad_norm": 2.7199196815490723, "learning_rate": 9.307439482470785e-06, "loss": 1.3444, "num_input_tokens_seen": 90299538, "step": 156000, "train_runtime": 1653.1786, "train_tokens_per_second": 54621.767 }, { "epoch": 8.164649415692821, "grad_norm": 2.546076774597168, "learning_rate": 9.177013772954925e-06, "loss": 1.3201, "num_input_tokens_seen": 90585634, "step": 156500, "train_runtime": 1671.9571, "train_tokens_per_second": 54179.401 }, { "epoch": 8.190734557595993, "grad_norm": 2.7355287075042725, "learning_rate": 9.046588063439066e-06, "loss": 1.3286, "num_input_tokens_seen": 90875986, "step": 157000, "train_runtime": 1690.8444, "train_tokens_per_second": 53745.919 }, { "epoch": 8.216819699499165, "grad_norm": 2.610476016998291, "learning_rate": 8.916162353923205e-06, "loss": 1.3624, "num_input_tokens_seen": 91165682, "step": 157500, "train_runtime": 1709.7962, "train_tokens_per_second": 53319.619 }, { "epoch": 8.242904841402337, "grad_norm": 3.424274444580078, "learning_rate": 8.785736644407345e-06, "loss": 1.3615, "num_input_tokens_seen": 91458162, "step": 158000, "train_runtime": 1728.5683, "train_tokens_per_second": 52909.776 }, { "epoch": 8.26898998330551, "grad_norm": 2.9222910404205322, "learning_rate": 8.655310934891486e-06, "loss": 1.3359, "num_input_tokens_seen": 91748050, "step": 158500, "train_runtime": 1747.5127, "train_tokens_per_second": 52502.078 }, { "epoch": 8.295075125208681, "grad_norm": 3.5217490196228027, "learning_rate": 8.524885225375627e-06, "loss": 1.3414, "num_input_tokens_seen": 92035050, "step": 159000, "train_runtime": 1766.4856, "train_tokens_per_second": 52100.651 }, { "epoch": 8.321160267111853, "grad_norm": 2.656613826751709, "learning_rate": 8.394459515859767e-06, "loss": 1.3436, "num_input_tokens_seen": 92326378, "step": 159500, "train_runtime": 1785.3077, "train_tokens_per_second": 51714.547 }, { "epoch": 8.347245409015025, "grad_norm": 2.8764595985412598, "learning_rate": 8.264033806343906e-06, "loss": 1.316, "num_input_tokens_seen": 92617586, "step": 160000, "train_runtime": 1804.1264, "train_tokens_per_second": 51336.529 }, { "epoch": 8.373330550918197, "grad_norm": 2.635450839996338, "learning_rate": 8.133608096828046e-06, "loss": 1.3745, "num_input_tokens_seen": 92904010, "step": 160500, "train_runtime": 1823.1613, "train_tokens_per_second": 50957.647 }, { "epoch": 8.39941569282137, "grad_norm": 3.4129796028137207, "learning_rate": 8.003182387312187e-06, "loss": 1.3278, "num_input_tokens_seen": 93189170, "step": 161000, "train_runtime": 1842.0126, "train_tokens_per_second": 50590.953 }, { "epoch": 8.425500834724541, "grad_norm": 3.2952401638031006, "learning_rate": 7.872756677796328e-06, "loss": 1.337, "num_input_tokens_seen": 93475210, "step": 161500, "train_runtime": 1861.0469, "train_tokens_per_second": 50227.218 }, { "epoch": 8.451585976627713, "grad_norm": 2.8078572750091553, "learning_rate": 7.742330968280468e-06, "loss": 1.3511, "num_input_tokens_seen": 93764458, "step": 162000, "train_runtime": 1880.0164, "train_tokens_per_second": 49874.278 }, { "epoch": 8.477671118530886, "grad_norm": 3.6334028244018555, "learning_rate": 7.611905258764608e-06, "loss": 1.3214, "num_input_tokens_seen": 94054690, "step": 162500, "train_runtime": 1898.9183, "train_tokens_per_second": 49530.666 }, { "epoch": 8.503756260434058, "grad_norm": 2.255051851272583, "learning_rate": 7.481479549248749e-06, "loss": 1.3181, "num_input_tokens_seen": 94342986, "step": 163000, "train_runtime": 1917.8935, "train_tokens_per_second": 49190.941 }, { "epoch": 8.52984140233723, "grad_norm": 2.2999086380004883, "learning_rate": 7.351053839732888e-06, "loss": 1.3468, "num_input_tokens_seen": 94628458, "step": 163500, "train_runtime": 1936.8254, "train_tokens_per_second": 48857.505 }, { "epoch": 8.5559265442404, "grad_norm": 2.8126626014709473, "learning_rate": 7.220628130217029e-06, "loss": 1.3442, "num_input_tokens_seen": 94916450, "step": 164000, "train_runtime": 1955.7716, "train_tokens_per_second": 48531.459 }, { "epoch": 8.582011686143572, "grad_norm": 3.6833460330963135, "learning_rate": 7.090202420701168e-06, "loss": 1.3097, "num_input_tokens_seen": 95209610, "step": 164500, "train_runtime": 1974.6708, "train_tokens_per_second": 48215.434 }, { "epoch": 8.608096828046744, "grad_norm": 2.2948975563049316, "learning_rate": 6.959776711185309e-06, "loss": 1.3158, "num_input_tokens_seen": 95500162, "step": 165000, "train_runtime": 1993.7313, "train_tokens_per_second": 47900.216 }, { "epoch": 8.634181969949916, "grad_norm": 2.677102565765381, "learning_rate": 6.82935100166945e-06, "loss": 1.3492, "num_input_tokens_seen": 95791218, "step": 165500, "train_runtime": 2012.6562, "train_tokens_per_second": 47594.428 }, { "epoch": 8.660267111853088, "grad_norm": 2.8302109241485596, "learning_rate": 6.698925292153589e-06, "loss": 1.3176, "num_input_tokens_seen": 96078250, "step": 166000, "train_runtime": 2031.6856, "train_tokens_per_second": 47289.919 }, { "epoch": 8.68635225375626, "grad_norm": 2.7552695274353027, "learning_rate": 6.56849958263773e-06, "loss": 1.3259, "num_input_tokens_seen": 96363322, "step": 166500, "train_runtime": 2050.6873, "train_tokens_per_second": 46990.744 }, { "epoch": 8.712437395659432, "grad_norm": 2.76167368888855, "learning_rate": 6.438073873121871e-06, "loss": 1.341, "num_input_tokens_seen": 96655826, "step": 167000, "train_runtime": 2069.5519, "train_tokens_per_second": 46703.746 }, { "epoch": 8.738522537562604, "grad_norm": 2.799135208129883, "learning_rate": 6.3076481636060104e-06, "loss": 1.3516, "num_input_tokens_seen": 96941474, "step": 167500, "train_runtime": 2088.6051, "train_tokens_per_second": 46414.458 }, { "epoch": 8.764607679465776, "grad_norm": 2.185119390487671, "learning_rate": 6.177222454090151e-06, "loss": 1.3495, "num_input_tokens_seen": 97236010, "step": 168000, "train_runtime": 2107.5825, "train_tokens_per_second": 46136.277 }, { "epoch": 8.790692821368948, "grad_norm": 2.787100315093994, "learning_rate": 6.046796744574291e-06, "loss": 1.3059, "num_input_tokens_seen": 97526826, "step": 168500, "train_runtime": 2126.4823, "train_tokens_per_second": 45862.984 }, { "epoch": 8.81677796327212, "grad_norm": 2.6303234100341797, "learning_rate": 5.916371035058431e-06, "loss": 1.3463, "num_input_tokens_seen": 97816378, "step": 169000, "train_runtime": 2145.3741, "train_tokens_per_second": 45594.088 }, { "epoch": 8.842863105175292, "grad_norm": 2.5196168422698975, "learning_rate": 5.785945325542571e-06, "loss": 1.3462, "num_input_tokens_seen": 98111226, "step": 169500, "train_runtime": 2164.4052, "train_tokens_per_second": 45329.417 }, { "epoch": 8.868948247078464, "grad_norm": 3.008777141571045, "learning_rate": 5.6555196160267115e-06, "loss": 1.3463, "num_input_tokens_seen": 98404994, "step": 170000, "train_runtime": 2183.2406, "train_tokens_per_second": 45072.904 }, { "epoch": 8.895033388981636, "grad_norm": 2.664883613586426, "learning_rate": 5.525093906510852e-06, "loss": 1.3505, "num_input_tokens_seen": 98691458, "step": 170500, "train_runtime": 2202.2373, "train_tokens_per_second": 44814.179 }, { "epoch": 8.921118530884808, "grad_norm": 3.8976974487304688, "learning_rate": 5.3946681969949914e-06, "loss": 1.3325, "num_input_tokens_seen": 98980730, "step": 171000, "train_runtime": 2221.2328, "train_tokens_per_second": 44561.169 }, { "epoch": 8.94720367278798, "grad_norm": 2.5917086601257324, "learning_rate": 5.264242487479132e-06, "loss": 1.333, "num_input_tokens_seen": 99265698, "step": 171500, "train_runtime": 2240.1093, "train_tokens_per_second": 44312.882 }, { "epoch": 8.973288814691152, "grad_norm": 3.012345314025879, "learning_rate": 5.133816777963272e-06, "loss": 1.3493, "num_input_tokens_seen": 99562818, "step": 172000, "train_runtime": 2259.2484, "train_tokens_per_second": 44069.001 }, { "epoch": 8.999373956594324, "grad_norm": 1.994488000869751, "learning_rate": 5.0033910684474126e-06, "loss": 1.3704, "num_input_tokens_seen": 99855026, "step": 172500, "train_runtime": 2278.2393, "train_tokens_per_second": 43829.912 }, { "epoch": 9.0, "eval_loss": 1.3092994689941406, "eval_runtime": 45.6876, "eval_samples_per_second": 839.069, "eval_steps_per_second": 104.886, "num_input_tokens_seen": 99861888, "step": 172512 }, { "epoch": 9.025459098497496, "grad_norm": 3.0312609672546387, "learning_rate": 4.872965358931553e-06, "loss": 1.3012, "num_input_tokens_seen": 100137840, "step": 173000, "train_runtime": 2343.9004, "train_tokens_per_second": 42722.738 }, { "epoch": 9.051544240400668, "grad_norm": 2.9846737384796143, "learning_rate": 4.7425396494156925e-06, "loss": 1.3416, "num_input_tokens_seen": 100428752, "step": 173500, "train_runtime": 2362.9108, "train_tokens_per_second": 42502.134 }, { "epoch": 9.07762938230384, "grad_norm": 2.700178623199463, "learning_rate": 4.612113939899834e-06, "loss": 1.3509, "num_input_tokens_seen": 100714360, "step": 174000, "train_runtime": 2381.7919, "train_tokens_per_second": 42285.122 }, { "epoch": 9.103714524207012, "grad_norm": 2.5982463359832764, "learning_rate": 4.481688230383973e-06, "loss": 1.33, "num_input_tokens_seen": 101010096, "step": 174500, "train_runtime": 2400.7332, "train_tokens_per_second": 42074.685 }, { "epoch": 9.129799666110184, "grad_norm": 3.2345430850982666, "learning_rate": 4.351262520868114e-06, "loss": 1.3127, "num_input_tokens_seen": 101301448, "step": 175000, "train_runtime": 2419.7, "train_tokens_per_second": 41865.292 }, { "epoch": 9.155884808013356, "grad_norm": 2.8651511669158936, "learning_rate": 4.220836811352254e-06, "loss": 1.3198, "num_input_tokens_seen": 101583952, "step": 175500, "train_runtime": 2438.5641, "train_tokens_per_second": 41657.282 }, { "epoch": 9.181969949916528, "grad_norm": 2.723923921585083, "learning_rate": 4.090411101836394e-06, "loss": 1.3486, "num_input_tokens_seen": 101879904, "step": 176000, "train_runtime": 2457.6414, "train_tokens_per_second": 41454.341 }, { "epoch": 9.2080550918197, "grad_norm": 2.9765188694000244, "learning_rate": 3.959985392320535e-06, "loss": 1.3247, "num_input_tokens_seen": 102169192, "step": 176500, "train_runtime": 2476.4667, "train_tokens_per_second": 41256.033 }, { "epoch": 9.234140233722872, "grad_norm": 2.14411997795105, "learning_rate": 3.829559682804674e-06, "loss": 1.3542, "num_input_tokens_seen": 102454992, "step": 177000, "train_runtime": 2495.3856, "train_tokens_per_second": 41057.779 }, { "epoch": 9.260225375626044, "grad_norm": 2.7752788066864014, "learning_rate": 3.6991339732888147e-06, "loss": 1.3469, "num_input_tokens_seen": 102739160, "step": 177500, "train_runtime": 2514.175, "train_tokens_per_second": 40863.966 }, { "epoch": 9.286310517529216, "grad_norm": 2.3828213214874268, "learning_rate": 3.5687082637729555e-06, "loss": 1.3267, "num_input_tokens_seen": 103027896, "step": 178000, "train_runtime": 2533.1537, "train_tokens_per_second": 40671.791 }, { "epoch": 9.312395659432386, "grad_norm": 2.554948329925537, "learning_rate": 3.4382825542570955e-06, "loss": 1.3218, "num_input_tokens_seen": 103314672, "step": 178500, "train_runtime": 2552.0027, "train_tokens_per_second": 40483.762 }, { "epoch": 9.338480801335558, "grad_norm": 2.6806468963623047, "learning_rate": 3.3078568447412354e-06, "loss": 1.3384, "num_input_tokens_seen": 103602648, "step": 179000, "train_runtime": 2571.0574, "train_tokens_per_second": 40295.735 }, { "epoch": 9.36456594323873, "grad_norm": 3.95470929145813, "learning_rate": 3.1774311352253754e-06, "loss": 1.3187, "num_input_tokens_seen": 103892480, "step": 179500, "train_runtime": 2590.0953, "train_tokens_per_second": 40111.45 }, { "epoch": 9.390651085141902, "grad_norm": 2.708707332611084, "learning_rate": 3.0470054257095158e-06, "loss": 1.335, "num_input_tokens_seen": 104178104, "step": 180000, "train_runtime": 2608.9847, "train_tokens_per_second": 39930.515 }, { "epoch": 9.416736227045075, "grad_norm": 3.4441354274749756, "learning_rate": 2.916579716193656e-06, "loss": 1.3204, "num_input_tokens_seen": 104470488, "step": 180500, "train_runtime": 2627.985, "train_tokens_per_second": 39753.076 }, { "epoch": 9.442821368948247, "grad_norm": 3.5723414421081543, "learning_rate": 2.7861540066777965e-06, "loss": 1.3457, "num_input_tokens_seen": 104759104, "step": 181000, "train_runtime": 2646.9218, "train_tokens_per_second": 39577.71 }, { "epoch": 9.468906510851419, "grad_norm": 3.956160068511963, "learning_rate": 2.655728297161937e-06, "loss": 1.3717, "num_input_tokens_seen": 105044408, "step": 181500, "train_runtime": 2665.9278, "train_tokens_per_second": 39402.571 }, { "epoch": 9.49499165275459, "grad_norm": 2.565819025039673, "learning_rate": 2.525302587646077e-06, "loss": 1.3413, "num_input_tokens_seen": 105327088, "step": 182000, "train_runtime": 2684.8097, "train_tokens_per_second": 39230.746 }, { "epoch": 9.521076794657763, "grad_norm": 3.5526235103607178, "learning_rate": 2.3948768781302173e-06, "loss": 1.2786, "num_input_tokens_seen": 105615560, "step": 182500, "train_runtime": 2703.8493, "train_tokens_per_second": 39061.185 }, { "epoch": 9.547161936560935, "grad_norm": 2.816168785095215, "learning_rate": 2.264451168614357e-06, "loss": 1.3268, "num_input_tokens_seen": 105904984, "step": 183000, "train_runtime": 2722.6828, "train_tokens_per_second": 38897.29 }, { "epoch": 9.573247078464107, "grad_norm": 3.1430675983428955, "learning_rate": 2.1340254590984976e-06, "loss": 1.3181, "num_input_tokens_seen": 106197728, "step": 183500, "train_runtime": 2741.6799, "train_tokens_per_second": 38734.547 }, { "epoch": 9.599332220367279, "grad_norm": 3.099498748779297, "learning_rate": 2.0035997495826376e-06, "loss": 1.321, "num_input_tokens_seen": 106489536, "step": 184000, "train_runtime": 2760.6349, "train_tokens_per_second": 38574.292 }, { "epoch": 9.62541736227045, "grad_norm": 3.0963542461395264, "learning_rate": 1.8731740400667781e-06, "loss": 1.3177, "num_input_tokens_seen": 106779640, "step": 184500, "train_runtime": 2779.5118, "train_tokens_per_second": 38416.688 }, { "epoch": 9.651502504173623, "grad_norm": 2.6030497550964355, "learning_rate": 1.742748330550918e-06, "loss": 1.3052, "num_input_tokens_seen": 107073888, "step": 185000, "train_runtime": 2798.5054, "train_tokens_per_second": 38261.097 }, { "epoch": 9.677587646076795, "grad_norm": 3.022160768508911, "learning_rate": 1.6123226210350585e-06, "loss": 1.3436, "num_input_tokens_seen": 107364848, "step": 185500, "train_runtime": 2817.3417, "train_tokens_per_second": 38108.565 }, { "epoch": 9.703672787979967, "grad_norm": 2.626763105392456, "learning_rate": 1.4818969115191989e-06, "loss": 1.3352, "num_input_tokens_seen": 107659488, "step": 186000, "train_runtime": 2836.337, "train_tokens_per_second": 37957.227 }, { "epoch": 9.729757929883139, "grad_norm": 3.0171899795532227, "learning_rate": 1.351471202003339e-06, "loss": 1.317, "num_input_tokens_seen": 107949608, "step": 186500, "train_runtime": 2855.247, "train_tokens_per_second": 37807.45 }, { "epoch": 9.75584307178631, "grad_norm": 2.22269868850708, "learning_rate": 1.2210454924874792e-06, "loss": 1.3193, "num_input_tokens_seen": 108245936, "step": 187000, "train_runtime": 2874.2964, "train_tokens_per_second": 37659.977 }, { "epoch": 9.781928213689483, "grad_norm": 2.8673713207244873, "learning_rate": 1.0906197829716196e-06, "loss": 1.3392, "num_input_tokens_seen": 108539552, "step": 187500, "train_runtime": 2893.1693, "train_tokens_per_second": 37515.798 }, { "epoch": 9.808013355592655, "grad_norm": 2.645888566970825, "learning_rate": 9.601940734557598e-07, "loss": 1.3395, "num_input_tokens_seen": 108827736, "step": 188000, "train_runtime": 2912.2618, "train_tokens_per_second": 37368.802 }, { "epoch": 9.834098497495827, "grad_norm": 3.0480117797851562, "learning_rate": 8.297683639398999e-07, "loss": 1.3325, "num_input_tokens_seen": 109119720, "step": 188500, "train_runtime": 2931.2115, "train_tokens_per_second": 37226.832 }, { "epoch": 9.860183639398999, "grad_norm": 3.1074326038360596, "learning_rate": 6.993426544240401e-07, "loss": 1.3365, "num_input_tokens_seen": 109406600, "step": 189000, "train_runtime": 2950.1291, "train_tokens_per_second": 37085.361 }, { "epoch": 9.88626878130217, "grad_norm": 2.7331807613372803, "learning_rate": 5.689169449081803e-07, "loss": 1.346, "num_input_tokens_seen": 109694976, "step": 189500, "train_runtime": 2969.1526, "train_tokens_per_second": 36944.877 }, { "epoch": 9.912353923205343, "grad_norm": 2.5716543197631836, "learning_rate": 4.3849123539232055e-07, "loss": 1.3331, "num_input_tokens_seen": 109985584, "step": 190000, "train_runtime": 2988.0525, "train_tokens_per_second": 36808.451 }, { "epoch": 9.938439065108515, "grad_norm": 2.6166512966156006, "learning_rate": 3.080655258764608e-07, "loss": 1.3292, "num_input_tokens_seen": 110272368, "step": 190500, "train_runtime": 3007.0207, "train_tokens_per_second": 36671.636 }, { "epoch": 9.964524207011687, "grad_norm": 2.8893744945526123, "learning_rate": 1.77639816360601e-07, "loss": 1.3166, "num_input_tokens_seen": 110557664, "step": 191000, "train_runtime": 3025.9068, "train_tokens_per_second": 36537.035 }, { "epoch": 9.990609348914859, "grad_norm": 2.441220998764038, "learning_rate": 4.721410684474124e-08, "loss": 1.3429, "num_input_tokens_seen": 110851304, "step": 191500, "train_runtime": 3044.8849, "train_tokens_per_second": 36405.745 }, { "epoch": 10.0, "eval_loss": 1.3094313144683838, "eval_runtime": 45.924, "eval_samples_per_second": 834.748, "eval_steps_per_second": 104.346, "num_input_tokens_seen": 110955972, "step": 191680 }, { "epoch": 10.01669449081803, "grad_norm": 3.2865073680877686, "learning_rate": 2.495839419866444e-05, "loss": 1.3149, "num_input_tokens_seen": 111137740, "step": 192000, "train_runtime": 12.5405, "train_tokens_per_second": 8862327.118 }, { "epoch": 10.042779632721203, "grad_norm": 1.945192813873291, "learning_rate": 2.4893181343906512e-05, "loss": 1.3105, "num_input_tokens_seen": 111433348, "step": 192500, "train_runtime": 31.7519, "train_tokens_per_second": 3509506.821 }, { "epoch": 10.068864774624373, "grad_norm": 2.5163190364837646, "learning_rate": 2.482796848914858e-05, "loss": 1.3183, "num_input_tokens_seen": 111720916, "step": 193000, "train_runtime": 52.2905, "train_tokens_per_second": 2136543.334 }, { "epoch": 10.094949916527545, "grad_norm": 2.6350646018981934, "learning_rate": 2.4762755634390652e-05, "loss": 1.3066, "num_input_tokens_seen": 112012948, "step": 193500, "train_runtime": 72.2858, "train_tokens_per_second": 1549584.477 }, { "epoch": 10.121035058430717, "grad_norm": 2.0416669845581055, "learning_rate": 2.4697542779632723e-05, "loss": 1.3383, "num_input_tokens_seen": 112299028, "step": 194000, "train_runtime": 92.8402, "train_tokens_per_second": 1209594.915 }, { "epoch": 10.14712020033389, "grad_norm": 2.219244956970215, "learning_rate": 2.463232992487479e-05, "loss": 1.3436, "num_input_tokens_seen": 112590044, "step": 194500, "train_runtime": 113.0832, "train_tokens_per_second": 995639.026 }, { "epoch": 10.173205342237061, "grad_norm": 3.015204429626465, "learning_rate": 2.4567117070116863e-05, "loss": 1.3393, "num_input_tokens_seen": 112885940, "step": 195000, "train_runtime": 133.1713, "train_tokens_per_second": 847674.91 }, { "epoch": 10.199290484140233, "grad_norm": 2.1486213207244873, "learning_rate": 2.4501904215358934e-05, "loss": 1.3463, "num_input_tokens_seen": 113169852, "step": 195500, "train_runtime": 153.2143, "train_tokens_per_second": 738637.549 }, { "epoch": 10.225375626043405, "grad_norm": 2.8701765537261963, "learning_rate": 2.4436691360601002e-05, "loss": 1.3125, "num_input_tokens_seen": 113459500, "step": 196000, "train_runtime": 173.4235, "train_tokens_per_second": 654233.54 }, { "epoch": 10.251460767946577, "grad_norm": 2.4410154819488525, "learning_rate": 2.4371478505843074e-05, "loss": 1.3423, "num_input_tokens_seen": 113754868, "step": 196500, "train_runtime": 193.1046, "train_tokens_per_second": 589084.328 }, { "epoch": 10.27754590984975, "grad_norm": 2.3649730682373047, "learning_rate": 2.4306265651085145e-05, "loss": 1.3583, "num_input_tokens_seen": 114041052, "step": 197000, "train_runtime": 211.6778, "train_tokens_per_second": 538748.365 }, { "epoch": 10.303631051752921, "grad_norm": 2.661882162094116, "learning_rate": 2.4241052796327213e-05, "loss": 1.3226, "num_input_tokens_seen": 114327300, "step": 197500, "train_runtime": 230.1549, "train_tokens_per_second": 496740.752 }, { "epoch": 10.329716193656093, "grad_norm": 3.2307496070861816, "learning_rate": 2.4175839941569285e-05, "loss": 1.3317, "num_input_tokens_seen": 114614836, "step": 198000, "train_runtime": 248.6123, "train_tokens_per_second": 461018.428 }, { "epoch": 10.355801335559265, "grad_norm": 2.0446155071258545, "learning_rate": 2.4110627086811353e-05, "loss": 1.3289, "num_input_tokens_seen": 114898460, "step": 198500, "train_runtime": 267.0794, "train_tokens_per_second": 430203.456 }, { "epoch": 10.381886477462437, "grad_norm": 2.149264335632324, "learning_rate": 2.4045414232053424e-05, "loss": 1.3479, "num_input_tokens_seen": 115190612, "step": 199000, "train_runtime": 285.5801, "train_tokens_per_second": 403356.593 }, { "epoch": 10.40797161936561, "grad_norm": 2.5007822513580322, "learning_rate": 2.3980201377295496e-05, "loss": 1.3398, "num_input_tokens_seen": 115480604, "step": 199500, "train_runtime": 304.0559, "train_tokens_per_second": 379800.589 }, { "epoch": 10.434056761268781, "grad_norm": 2.485358238220215, "learning_rate": 2.3914988522537564e-05, "loss": 1.3471, "num_input_tokens_seen": 115772396, "step": 200000, "train_runtime": 322.5534, "train_tokens_per_second": 358924.703 }, { "epoch": 10.460141903171953, "grad_norm": 3.0661306381225586, "learning_rate": 2.3849775667779635e-05, "loss": 1.3459, "num_input_tokens_seen": 116055028, "step": 200500, "train_runtime": 341.0251, "train_tokens_per_second": 340312.297 }, { "epoch": 10.486227045075125, "grad_norm": 3.0374038219451904, "learning_rate": 2.3784562813021703e-05, "loss": 1.3294, "num_input_tokens_seen": 116342956, "step": 201000, "train_runtime": 359.5564, "train_tokens_per_second": 323573.635 }, { "epoch": 10.512312186978297, "grad_norm": 2.4844298362731934, "learning_rate": 2.371934995826377e-05, "loss": 1.347, "num_input_tokens_seen": 116629444, "step": 201500, "train_runtime": 378.1517, "train_tokens_per_second": 308419.724 }, { "epoch": 10.53839732888147, "grad_norm": 3.5257129669189453, "learning_rate": 2.3654137103505843e-05, "loss": 1.3621, "num_input_tokens_seen": 116918476, "step": 202000, "train_runtime": 396.7415, "train_tokens_per_second": 294696.879 }, { "epoch": 10.564482470784641, "grad_norm": 2.989980936050415, "learning_rate": 2.3588924248747914e-05, "loss": 1.3474, "num_input_tokens_seen": 117203300, "step": 202500, "train_runtime": 415.3093, "train_tokens_per_second": 282207.249 }, { "epoch": 10.590567612687813, "grad_norm": 2.9134278297424316, "learning_rate": 2.3523711393989982e-05, "loss": 1.3293, "num_input_tokens_seen": 117490356, "step": 203000, "train_runtime": 433.8487, "train_tokens_per_second": 270809.506 }, { "epoch": 10.616652754590985, "grad_norm": 3.4408249855041504, "learning_rate": 2.3458498539232054e-05, "loss": 1.3323, "num_input_tokens_seen": 117778116, "step": 203500, "train_runtime": 452.4151, "train_tokens_per_second": 260331.947 }, { "epoch": 10.642737896494157, "grad_norm": 2.5976977348327637, "learning_rate": 2.3393285684474125e-05, "loss": 1.326, "num_input_tokens_seen": 118066028, "step": 204000, "train_runtime": 471.0506, "train_tokens_per_second": 250644.038 }, { "epoch": 10.66882303839733, "grad_norm": 2.8414862155914307, "learning_rate": 2.3328072829716193e-05, "loss": 1.3268, "num_input_tokens_seen": 118349812, "step": 204500, "train_runtime": 489.6127, "train_tokens_per_second": 241721.304 }, { "epoch": 10.694908180300501, "grad_norm": 2.9611923694610596, "learning_rate": 2.3262859974958265e-05, "loss": 1.3183, "num_input_tokens_seen": 118641012, "step": 205000, "train_runtime": 508.2873, "train_tokens_per_second": 233413.297 }, { "epoch": 10.720993322203674, "grad_norm": 3.3537490367889404, "learning_rate": 2.3197647120200336e-05, "loss": 1.3344, "num_input_tokens_seen": 118928020, "step": 205500, "train_runtime": 526.9474, "train_tokens_per_second": 225692.395 }, { "epoch": 10.747078464106846, "grad_norm": 2.557131290435791, "learning_rate": 2.3132434265442404e-05, "loss": 1.341, "num_input_tokens_seen": 119221628, "step": 206000, "train_runtime": 545.6837, "train_tokens_per_second": 218481.209 }, { "epoch": 10.773163606010016, "grad_norm": 3.0086355209350586, "learning_rate": 2.3067221410684476e-05, "loss": 1.3298, "num_input_tokens_seen": 119513436, "step": 206500, "train_runtime": 564.4783, "train_tokens_per_second": 211723.717 }, { "epoch": 10.79924874791319, "grad_norm": 3.600940227508545, "learning_rate": 2.3002008555926547e-05, "loss": 1.3572, "num_input_tokens_seen": 119801196, "step": 207000, "train_runtime": 583.2037, "train_tokens_per_second": 205419.144 }, { "epoch": 10.82533388981636, "grad_norm": 2.5225415229797363, "learning_rate": 2.2936795701168615e-05, "loss": 1.3173, "num_input_tokens_seen": 120090740, "step": 207500, "train_runtime": 601.9046, "train_tokens_per_second": 199517.884 }, { "epoch": 10.851419031719532, "grad_norm": 2.092555046081543, "learning_rate": 2.2871582846410687e-05, "loss": 1.3557, "num_input_tokens_seen": 120377796, "step": 208000, "train_runtime": 620.6064, "train_tokens_per_second": 193968.023 }, { "epoch": 10.877504173622704, "grad_norm": 2.5600435733795166, "learning_rate": 2.2806369991652758e-05, "loss": 1.3432, "num_input_tokens_seen": 120669548, "step": 208500, "train_runtime": 639.2536, "train_tokens_per_second": 188766.325 }, { "epoch": 10.903589315525876, "grad_norm": 2.583836793899536, "learning_rate": 2.2741157136894826e-05, "loss": 1.3376, "num_input_tokens_seen": 120961348, "step": 209000, "train_runtime": 657.9319, "train_tokens_per_second": 183850.86 }, { "epoch": 10.929674457429048, "grad_norm": 3.099386692047119, "learning_rate": 2.2675944282136898e-05, "loss": 1.3296, "num_input_tokens_seen": 121257580, "step": 209500, "train_runtime": 676.7145, "train_tokens_per_second": 179185.735 }, { "epoch": 10.95575959933222, "grad_norm": 3.329822063446045, "learning_rate": 2.2610731427378966e-05, "loss": 1.3424, "num_input_tokens_seen": 121550684, "step": 210000, "train_runtime": 695.457, "train_tokens_per_second": 174778.155 }, { "epoch": 10.981844741235392, "grad_norm": 2.160890817642212, "learning_rate": 2.2545518572621034e-05, "loss": 1.3391, "num_input_tokens_seen": 121840244, "step": 210500, "train_runtime": 714.1173, "train_tokens_per_second": 170616.572 }, { "epoch": 11.0, "eval_loss": 1.3036798238754272, "eval_runtime": 45.5874, "eval_samples_per_second": 840.912, "eval_steps_per_second": 105.117, "num_input_tokens_seen": 122042976, "step": 210848 }, { "epoch": 11.007929883138564, "grad_norm": 2.8093433380126953, "learning_rate": 2.2480305717863105e-05, "loss": 1.3252, "num_input_tokens_seen": 122133808, "step": 211000, "train_runtime": 779.5622, "train_tokens_per_second": 156669.744 }, { "epoch": 11.034015025041736, "grad_norm": 2.5687525272369385, "learning_rate": 2.2415092863105177e-05, "loss": 1.3285, "num_input_tokens_seen": 122424408, "step": 211500, "train_runtime": 798.366, "train_tokens_per_second": 153343.713 }, { "epoch": 11.060100166944908, "grad_norm": 2.920220136642456, "learning_rate": 2.2349880008347245e-05, "loss": 1.2892, "num_input_tokens_seen": 122706872, "step": 212000, "train_runtime": 817.1043, "train_tokens_per_second": 150172.829 }, { "epoch": 11.08618530884808, "grad_norm": 2.7014081478118896, "learning_rate": 2.2284667153589316e-05, "loss": 1.3207, "num_input_tokens_seen": 122993992, "step": 212500, "train_runtime": 835.8914, "train_tokens_per_second": 147141.106 }, { "epoch": 11.112270450751252, "grad_norm": 2.6697499752044678, "learning_rate": 2.2219454298831388e-05, "loss": 1.3299, "num_input_tokens_seen": 123284616, "step": 213000, "train_runtime": 854.6172, "train_tokens_per_second": 144257.114 }, { "epoch": 11.138355592654424, "grad_norm": 3.0389206409454346, "learning_rate": 2.2154241444073456e-05, "loss": 1.3267, "num_input_tokens_seen": 123574760, "step": 213500, "train_runtime": 873.3482, "train_tokens_per_second": 141495.405 }, { "epoch": 11.164440734557596, "grad_norm": 2.5090649127960205, "learning_rate": 2.2089028589315527e-05, "loss": 1.3173, "num_input_tokens_seen": 123863512, "step": 214000, "train_runtime": 892.119, "train_tokens_per_second": 138841.92 }, { "epoch": 11.190525876460768, "grad_norm": 2.458717107772827, "learning_rate": 2.2023815734557595e-05, "loss": 1.3488, "num_input_tokens_seen": 124153280, "step": 214500, "train_runtime": 910.8704, "train_tokens_per_second": 136301.807 }, { "epoch": 11.21661101836394, "grad_norm": 2.2780613899230957, "learning_rate": 2.1958602879799667e-05, "loss": 1.3227, "num_input_tokens_seen": 124441304, "step": 215000, "train_runtime": 929.5347, "train_tokens_per_second": 133874.841 }, { "epoch": 11.242696160267112, "grad_norm": 2.2592554092407227, "learning_rate": 2.189339002504174e-05, "loss": 1.3417, "num_input_tokens_seen": 124732192, "step": 215500, "train_runtime": 948.4081, "train_tokens_per_second": 131517.428 }, { "epoch": 11.268781302170284, "grad_norm": 1.9092062711715698, "learning_rate": 2.1828177170283806e-05, "loss": 1.3168, "num_input_tokens_seen": 125026840, "step": 216000, "train_runtime": 967.1853, "train_tokens_per_second": 129268.756 }, { "epoch": 11.294866444073456, "grad_norm": 2.6668968200683594, "learning_rate": 2.1762964315525878e-05, "loss": 1.3158, "num_input_tokens_seen": 125322792, "step": 216500, "train_runtime": 985.9404, "train_tokens_per_second": 127109.902 }, { "epoch": 11.320951585976628, "grad_norm": 2.6406455039978027, "learning_rate": 2.169775146076795e-05, "loss": 1.3155, "num_input_tokens_seen": 125610912, "step": 217000, "train_runtime": 1004.7846, "train_tokens_per_second": 125012.78 }, { "epoch": 11.3470367278798, "grad_norm": 3.033663272857666, "learning_rate": 2.1632538606010017e-05, "loss": 1.3048, "num_input_tokens_seen": 125899904, "step": 217500, "train_runtime": 1023.5588, "train_tokens_per_second": 123002.125 }, { "epoch": 11.373121869782972, "grad_norm": 2.4079842567443848, "learning_rate": 2.156732575125209e-05, "loss": 1.3217, "num_input_tokens_seen": 126190608, "step": 218000, "train_runtime": 1042.2822, "train_tokens_per_second": 121071.437 }, { "epoch": 11.399207011686144, "grad_norm": 2.4821534156799316, "learning_rate": 2.150211289649416e-05, "loss": 1.3127, "num_input_tokens_seen": 126477736, "step": 218500, "train_runtime": 1060.9849, "train_tokens_per_second": 119207.852 }, { "epoch": 11.425292153589316, "grad_norm": 3.1184568405151367, "learning_rate": 2.143690004173623e-05, "loss": 1.3191, "num_input_tokens_seen": 126768304, "step": 219000, "train_runtime": 1079.744, "train_tokens_per_second": 117405.884 }, { "epoch": 11.451377295492488, "grad_norm": 2.4726860523223877, "learning_rate": 2.1371687186978297e-05, "loss": 1.3, "num_input_tokens_seen": 127057344, "step": 219500, "train_runtime": 1098.4724, "train_tokens_per_second": 115667.311 }, { "epoch": 11.47746243739566, "grad_norm": 2.8745577335357666, "learning_rate": 2.1306474332220368e-05, "loss": 1.3066, "num_input_tokens_seen": 127342264, "step": 220000, "train_runtime": 1117.2372, "train_tokens_per_second": 113979.609 }, { "epoch": 11.503547579298832, "grad_norm": 2.5106630325317383, "learning_rate": 2.1241261477462436e-05, "loss": 1.3081, "num_input_tokens_seen": 127636384, "step": 220500, "train_runtime": 1136.0017, "train_tokens_per_second": 112355.806 }, { "epoch": 11.529632721202002, "grad_norm": 2.9184515476226807, "learning_rate": 2.1176048622704508e-05, "loss": 1.3162, "num_input_tokens_seen": 127929168, "step": 221000, "train_runtime": 1154.8123, "train_tokens_per_second": 110779.183 }, { "epoch": 11.555717863105176, "grad_norm": 2.631758689880371, "learning_rate": 2.111083576794658e-05, "loss": 1.3154, "num_input_tokens_seen": 128214768, "step": 221500, "train_runtime": 1173.5738, "train_tokens_per_second": 109251.562 }, { "epoch": 11.581803005008346, "grad_norm": 3.0632224082946777, "learning_rate": 2.1045622913188647e-05, "loss": 1.3265, "num_input_tokens_seen": 128502040, "step": 222000, "train_runtime": 1192.3765, "train_tokens_per_second": 107769.681 }, { "epoch": 11.607888146911518, "grad_norm": 3.1149165630340576, "learning_rate": 2.098041005843072e-05, "loss": 1.321, "num_input_tokens_seen": 128788576, "step": 222500, "train_runtime": 1211.1873, "train_tokens_per_second": 106332.503 }, { "epoch": 11.63397328881469, "grad_norm": 3.4126601219177246, "learning_rate": 2.091519720367279e-05, "loss": 1.3089, "num_input_tokens_seen": 129075456, "step": 223000, "train_runtime": 1229.9696, "train_tokens_per_second": 104941.986 }, { "epoch": 11.660058430717863, "grad_norm": 2.5633208751678467, "learning_rate": 2.0849984348914858e-05, "loss": 1.3354, "num_input_tokens_seen": 129363864, "step": 223500, "train_runtime": 1248.7371, "train_tokens_per_second": 103595.756 }, { "epoch": 11.686143572621035, "grad_norm": 2.816091775894165, "learning_rate": 2.078477149415693e-05, "loss": 1.3338, "num_input_tokens_seen": 129649336, "step": 224000, "train_runtime": 1267.5029, "train_tokens_per_second": 102287.208 }, { "epoch": 11.712228714524207, "grad_norm": 3.5613439083099365, "learning_rate": 2.0719558639399e-05, "loss": 1.3199, "num_input_tokens_seen": 129942320, "step": 224500, "train_runtime": 1286.259, "train_tokens_per_second": 101023.451 }, { "epoch": 11.738313856427379, "grad_norm": 2.822772741317749, "learning_rate": 2.065434578464107e-05, "loss": 1.3044, "num_input_tokens_seen": 130232704, "step": 225000, "train_runtime": 1305.0245, "train_tokens_per_second": 99793.304 }, { "epoch": 11.76439899833055, "grad_norm": 2.610865592956543, "learning_rate": 2.058913292988314e-05, "loss": 1.3334, "num_input_tokens_seen": 130524424, "step": 225500, "train_runtime": 1323.7569, "train_tokens_per_second": 98601.505 }, { "epoch": 11.790484140233723, "grad_norm": 2.68410325050354, "learning_rate": 2.0523920075125212e-05, "loss": 1.3042, "num_input_tokens_seen": 130811008, "step": 226000, "train_runtime": 1342.504, "train_tokens_per_second": 97438.079 }, { "epoch": 11.816569282136895, "grad_norm": 2.4882125854492188, "learning_rate": 2.045870722036728e-05, "loss": 1.365, "num_input_tokens_seen": 131095640, "step": 226500, "train_runtime": 1361.2815, "train_tokens_per_second": 96303.109 }, { "epoch": 11.842654424040067, "grad_norm": 2.4496724605560303, "learning_rate": 2.039349436560935e-05, "loss": 1.3053, "num_input_tokens_seen": 131380824, "step": 227000, "train_runtime": 1380.0428, "train_tokens_per_second": 95200.546 }, { "epoch": 11.868739565943239, "grad_norm": 2.1208622455596924, "learning_rate": 2.032828151085142e-05, "loss": 1.3387, "num_input_tokens_seen": 131669800, "step": 227500, "train_runtime": 1398.7962, "train_tokens_per_second": 94130.797 }, { "epoch": 11.89482470784641, "grad_norm": 2.5656790733337402, "learning_rate": 2.026306865609349e-05, "loss": 1.3109, "num_input_tokens_seen": 131956504, "step": 228000, "train_runtime": 1417.5824, "train_tokens_per_second": 93085.598 }, { "epoch": 11.920909849749583, "grad_norm": 2.894057035446167, "learning_rate": 2.019785580133556e-05, "loss": 1.3385, "num_input_tokens_seen": 132249552, "step": 228500, "train_runtime": 1436.3884, "train_tokens_per_second": 92070.886 }, { "epoch": 11.946994991652755, "grad_norm": 4.0213446617126465, "learning_rate": 2.013264294657763e-05, "loss": 1.3252, "num_input_tokens_seen": 132541072, "step": 229000, "train_runtime": 1455.1937, "train_tokens_per_second": 91081.394 }, { "epoch": 11.973080133555927, "grad_norm": 2.279191255569458, "learning_rate": 2.00674300918197e-05, "loss": 1.3362, "num_input_tokens_seen": 132831104, "step": 229500, "train_runtime": 1473.9285, "train_tokens_per_second": 90120.453 }, { "epoch": 11.999165275459099, "grad_norm": 2.1568970680236816, "learning_rate": 2.000221723706177e-05, "loss": 1.293, "num_input_tokens_seen": 133123320, "step": 230000, "train_runtime": 1492.6888, "train_tokens_per_second": 89183.575 }, { "epoch": 12.0, "eval_loss": 1.303634762763977, "eval_runtime": 45.533, "eval_samples_per_second": 841.917, "eval_steps_per_second": 105.242, "num_input_tokens_seen": 133131832, "step": 230016 }, { "epoch": 12.02525041736227, "grad_norm": 2.564668655395508, "learning_rate": 1.9937004382303838e-05, "loss": 1.2803, "num_input_tokens_seen": 133411856, "step": 230500, "train_runtime": 1558.1778, "train_tokens_per_second": 85620.431 }, { "epoch": 12.051335559265443, "grad_norm": 1.8836562633514404, "learning_rate": 1.987179152754591e-05, "loss": 1.3323, "num_input_tokens_seen": 133703544, "step": 231000, "train_runtime": 1576.9538, "train_tokens_per_second": 84785.959 }, { "epoch": 12.077420701168615, "grad_norm": 3.665679693222046, "learning_rate": 1.980657867278798e-05, "loss": 1.3101, "num_input_tokens_seen": 133990048, "step": 231500, "train_runtime": 1595.7021, "train_tokens_per_second": 83969.336 }, { "epoch": 12.103505843071787, "grad_norm": 2.481233596801758, "learning_rate": 1.974136581803005e-05, "loss": 1.3122, "num_input_tokens_seen": 134279720, "step": 232000, "train_runtime": 1614.4866, "train_tokens_per_second": 83171.778 }, { "epoch": 12.129590984974959, "grad_norm": 2.0712811946868896, "learning_rate": 1.967615296327212e-05, "loss": 1.3191, "num_input_tokens_seen": 134570152, "step": 232500, "train_runtime": 1633.2616, "train_tokens_per_second": 82393.51 }, { "epoch": 12.15567612687813, "grad_norm": 2.377253293991089, "learning_rate": 1.9610940108514192e-05, "loss": 1.303, "num_input_tokens_seen": 134859336, "step": 233000, "train_runtime": 1652.0277, "train_tokens_per_second": 81632.612 }, { "epoch": 12.181761268781303, "grad_norm": 2.749286651611328, "learning_rate": 1.954572725375626e-05, "loss": 1.3219, "num_input_tokens_seen": 135151088, "step": 233500, "train_runtime": 1670.9562, "train_tokens_per_second": 80882.482 }, { "epoch": 12.207846410684475, "grad_norm": 1.9715009927749634, "learning_rate": 1.948051439899833e-05, "loss": 1.3164, "num_input_tokens_seen": 135441304, "step": 234000, "train_runtime": 1689.8315, "train_tokens_per_second": 80150.776 }, { "epoch": 12.233931552587647, "grad_norm": 2.8835082054138184, "learning_rate": 1.9415301544240403e-05, "loss": 1.3164, "num_input_tokens_seen": 135728888, "step": 234500, "train_runtime": 1708.6414, "train_tokens_per_second": 79436.731 }, { "epoch": 12.260016694490819, "grad_norm": 2.7887117862701416, "learning_rate": 1.935008868948247e-05, "loss": 1.3003, "num_input_tokens_seen": 136016392, "step": 235000, "train_runtime": 1727.3834, "train_tokens_per_second": 78741.287 }, { "epoch": 12.28610183639399, "grad_norm": 2.219428777694702, "learning_rate": 1.9284875834724543e-05, "loss": 1.2853, "num_input_tokens_seen": 136304528, "step": 235500, "train_runtime": 1746.1346, "train_tokens_per_second": 78060.723 }, { "epoch": 12.312186978297161, "grad_norm": 2.7682409286499023, "learning_rate": 1.9219662979966614e-05, "loss": 1.3175, "num_input_tokens_seen": 136593504, "step": 236000, "train_runtime": 1764.8652, "train_tokens_per_second": 77395.999 }, { "epoch": 12.338272120200333, "grad_norm": 4.289463520050049, "learning_rate": 1.9154450125208682e-05, "loss": 1.2741, "num_input_tokens_seen": 136885144, "step": 236500, "train_runtime": 1783.6052, "train_tokens_per_second": 76746.323 }, { "epoch": 12.364357262103505, "grad_norm": 3.1798133850097656, "learning_rate": 1.9089237270450754e-05, "loss": 1.2896, "num_input_tokens_seen": 137168736, "step": 237000, "train_runtime": 1802.3604, "train_tokens_per_second": 76105.055 }, { "epoch": 12.390442404006677, "grad_norm": 3.9631903171539307, "learning_rate": 1.9024024415692822e-05, "loss": 1.3425, "num_input_tokens_seen": 137463960, "step": 237500, "train_runtime": 1821.2214, "train_tokens_per_second": 75478.997 }, { "epoch": 12.41652754590985, "grad_norm": 3.6029210090637207, "learning_rate": 1.8958811560934893e-05, "loss": 1.3134, "num_input_tokens_seen": 137751968, "step": 238000, "train_runtime": 1839.9397, "train_tokens_per_second": 74867.655 }, { "epoch": 12.442612687813021, "grad_norm": 2.178394317626953, "learning_rate": 1.889359870617696e-05, "loss": 1.2797, "num_input_tokens_seen": 138044520, "step": 238500, "train_runtime": 1858.64, "train_tokens_per_second": 74271.788 }, { "epoch": 12.468697829716193, "grad_norm": 2.5995266437530518, "learning_rate": 1.8828385851419033e-05, "loss": 1.3029, "num_input_tokens_seen": 138334136, "step": 239000, "train_runtime": 1877.3231, "train_tokens_per_second": 73686.909 }, { "epoch": 12.494782971619365, "grad_norm": 2.1378602981567383, "learning_rate": 1.87631729966611e-05, "loss": 1.3092, "num_input_tokens_seen": 138621760, "step": 239500, "train_runtime": 1895.9609, "train_tokens_per_second": 73114.252 }, { "epoch": 12.520868113522537, "grad_norm": 2.3101305961608887, "learning_rate": 1.8697960141903172e-05, "loss": 1.3457, "num_input_tokens_seen": 138914632, "step": 240000, "train_runtime": 1914.6876, "train_tokens_per_second": 72552.113 }, { "epoch": 12.54695325542571, "grad_norm": 2.8269946575164795, "learning_rate": 1.8632747287145244e-05, "loss": 1.3055, "num_input_tokens_seen": 139199064, "step": 240500, "train_runtime": 1933.403, "train_tokens_per_second": 71996.923 }, { "epoch": 12.573038397328881, "grad_norm": 4.536306858062744, "learning_rate": 1.8567534432387312e-05, "loss": 1.3104, "num_input_tokens_seen": 139488888, "step": 241000, "train_runtime": 1952.205, "train_tokens_per_second": 71451.969 }, { "epoch": 12.599123539232053, "grad_norm": 2.898843765258789, "learning_rate": 1.8502321577629383e-05, "loss": 1.2751, "num_input_tokens_seen": 139777560, "step": 241500, "train_runtime": 1970.9694, "train_tokens_per_second": 70918.18 }, { "epoch": 12.625208681135225, "grad_norm": 2.233572006225586, "learning_rate": 1.8437108722871455e-05, "loss": 1.2931, "num_input_tokens_seen": 140065240, "step": 242000, "train_runtime": 1989.7056, "train_tokens_per_second": 70394.956 }, { "epoch": 12.651293823038397, "grad_norm": 4.327518939971924, "learning_rate": 1.8371895868113523e-05, "loss": 1.2964, "num_input_tokens_seen": 140353912, "step": 242500, "train_runtime": 2008.5433, "train_tokens_per_second": 69878.458 }, { "epoch": 12.67737896494157, "grad_norm": 2.5169992446899414, "learning_rate": 1.8306683013355594e-05, "loss": 1.3056, "num_input_tokens_seen": 140643424, "step": 243000, "train_runtime": 2027.3392, "train_tokens_per_second": 69373.405 }, { "epoch": 12.703464106844741, "grad_norm": 2.1607372760772705, "learning_rate": 1.8241470158597666e-05, "loss": 1.2978, "num_input_tokens_seen": 140936888, "step": 243500, "train_runtime": 2046.1461, "train_tokens_per_second": 68879.19 }, { "epoch": 12.729549248747913, "grad_norm": 3.104569673538208, "learning_rate": 1.8176257303839734e-05, "loss": 1.3203, "num_input_tokens_seen": 141227736, "step": 244000, "train_runtime": 2064.9328, "train_tokens_per_second": 68393.38 }, { "epoch": 12.755634390651085, "grad_norm": 2.6793630123138428, "learning_rate": 1.8111044449081805e-05, "loss": 1.2928, "num_input_tokens_seen": 141513976, "step": 244500, "train_runtime": 2083.7183, "train_tokens_per_second": 67914.159 }, { "epoch": 12.781719532554257, "grad_norm": 2.779440402984619, "learning_rate": 1.8045831594323873e-05, "loss": 1.2963, "num_input_tokens_seen": 141801952, "step": 245000, "train_runtime": 2102.5169, "train_tokens_per_second": 67443.905 }, { "epoch": 12.80780467445743, "grad_norm": 2.685547351837158, "learning_rate": 1.7980618739565945e-05, "loss": 1.3113, "num_input_tokens_seen": 142087288, "step": 245500, "train_runtime": 2121.2928, "train_tokens_per_second": 66981.458 }, { "epoch": 12.833889816360601, "grad_norm": 3.5041792392730713, "learning_rate": 1.7915405884808016e-05, "loss": 1.3062, "num_input_tokens_seen": 142379312, "step": 246000, "train_runtime": 2140.0752, "train_tokens_per_second": 66530.051 }, { "epoch": 12.859974958263773, "grad_norm": 3.0701446533203125, "learning_rate": 1.7850193030050084e-05, "loss": 1.3036, "num_input_tokens_seen": 142666568, "step": 246500, "train_runtime": 2158.8062, "train_tokens_per_second": 66085.862 }, { "epoch": 12.886060100166945, "grad_norm": 1.8722320795059204, "learning_rate": 1.7784980175292152e-05, "loss": 1.3004, "num_input_tokens_seen": 142954624, "step": 247000, "train_runtime": 2177.6361, "train_tokens_per_second": 65646.701 }, { "epoch": 12.912145242070117, "grad_norm": 3.499333381652832, "learning_rate": 1.7719767320534224e-05, "loss": 1.3213, "num_input_tokens_seen": 143246680, "step": 247500, "train_runtime": 2196.4459, "train_tokens_per_second": 65217.486 }, { "epoch": 12.93823038397329, "grad_norm": 4.5629353523254395, "learning_rate": 1.7654554465776292e-05, "loss": 1.3231, "num_input_tokens_seen": 143537736, "step": 248000, "train_runtime": 2215.2759, "train_tokens_per_second": 64794.52 }, { "epoch": 12.964315525876462, "grad_norm": 3.0510342121124268, "learning_rate": 1.7589341611018363e-05, "loss": 1.2966, "num_input_tokens_seen": 143823008, "step": 248500, "train_runtime": 2233.9986, "train_tokens_per_second": 64379.186 }, { "epoch": 12.990400667779634, "grad_norm": 3.152311325073242, "learning_rate": 1.7524128756260435e-05, "loss": 1.2741, "num_input_tokens_seen": 144116976, "step": 249000, "train_runtime": 2252.7592, "train_tokens_per_second": 63973.537 }, { "epoch": 13.0, "eval_loss": 1.3037497997283936, "eval_runtime": 45.363, "eval_samples_per_second": 845.072, "eval_steps_per_second": 105.637, "num_input_tokens_seen": 144224222, "step": 249184 }, { "epoch": 13.016485809682806, "grad_norm": 2.950641632080078, "learning_rate": 1.7458915901502503e-05, "loss": 1.2892, "num_input_tokens_seen": 144404846, "step": 249500, "train_runtime": 2317.872, "train_tokens_per_second": 62300.612 }, { "epoch": 13.042570951585976, "grad_norm": 3.1258602142333984, "learning_rate": 1.7393703046744574e-05, "loss": 1.279, "num_input_tokens_seen": 144697406, "step": 250000, "train_runtime": 2336.6661, "train_tokens_per_second": 61924.725 }, { "epoch": 13.068656093489148, "grad_norm": 2.8600733280181885, "learning_rate": 1.7328490191986646e-05, "loss": 1.2856, "num_input_tokens_seen": 144992526, "step": 250500, "train_runtime": 2355.4549, "train_tokens_per_second": 61556.062 }, { "epoch": 13.09474123539232, "grad_norm": 2.740837335586548, "learning_rate": 1.7263277337228714e-05, "loss": 1.2793, "num_input_tokens_seen": 145286206, "step": 251000, "train_runtime": 2374.3019, "train_tokens_per_second": 61191.125 }, { "epoch": 13.120826377295492, "grad_norm": 2.514106035232544, "learning_rate": 1.7198064482470785e-05, "loss": 1.2966, "num_input_tokens_seen": 145576638, "step": 251500, "train_runtime": 2393.1024, "train_tokens_per_second": 60831.763 }, { "epoch": 13.146911519198664, "grad_norm": 2.3407087326049805, "learning_rate": 1.7132851627712857e-05, "loss": 1.288, "num_input_tokens_seen": 145861950, "step": 252000, "train_runtime": 2411.8629, "train_tokens_per_second": 60476.884 }, { "epoch": 13.172996661101836, "grad_norm": 2.940520763397217, "learning_rate": 1.7067638772954925e-05, "loss": 1.2828, "num_input_tokens_seen": 146153318, "step": 252500, "train_runtime": 2430.6861, "train_tokens_per_second": 60128.423 }, { "epoch": 13.199081803005008, "grad_norm": 2.352440595626831, "learning_rate": 1.7002425918196996e-05, "loss": 1.3483, "num_input_tokens_seen": 146442846, "step": 253000, "train_runtime": 2449.4406, "train_tokens_per_second": 59786.24 }, { "epoch": 13.22516694490818, "grad_norm": 3.5476200580596924, "learning_rate": 1.6937213063439068e-05, "loss": 1.286, "num_input_tokens_seen": 146729830, "step": 253500, "train_runtime": 2468.227, "train_tokens_per_second": 59447.462 }, { "epoch": 13.251252086811352, "grad_norm": 3.1068811416625977, "learning_rate": 1.6872000208681136e-05, "loss": 1.2873, "num_input_tokens_seen": 147026030, "step": 254000, "train_runtime": 2486.9722, "train_tokens_per_second": 59118.484 }, { "epoch": 13.277337228714524, "grad_norm": 3.000011920928955, "learning_rate": 1.6806787353923207e-05, "loss": 1.2832, "num_input_tokens_seen": 147309830, "step": 254500, "train_runtime": 2505.7198, "train_tokens_per_second": 58789.428 }, { "epoch": 13.303422370617696, "grad_norm": 3.2478373050689697, "learning_rate": 1.674157449916528e-05, "loss": 1.3025, "num_input_tokens_seen": 147604054, "step": 255000, "train_runtime": 2524.5534, "train_tokens_per_second": 58467.393 }, { "epoch": 13.329507512520868, "grad_norm": 2.5078775882720947, "learning_rate": 1.6676361644407347e-05, "loss": 1.2669, "num_input_tokens_seen": 147894782, "step": 255500, "train_runtime": 2543.3336, "train_tokens_per_second": 58149.974 }, { "epoch": 13.35559265442404, "grad_norm": 2.6515934467315674, "learning_rate": 1.6611148789649415e-05, "loss": 1.2827, "num_input_tokens_seen": 148189078, "step": 256000, "train_runtime": 2562.0637, "train_tokens_per_second": 57839.731 }, { "epoch": 13.381677796327212, "grad_norm": 3.669487237930298, "learning_rate": 1.6545935934891486e-05, "loss": 1.3063, "num_input_tokens_seen": 148477710, "step": 256500, "train_runtime": 2580.8969, "train_tokens_per_second": 57529.5 }, { "epoch": 13.407762938230384, "grad_norm": 2.5362067222595215, "learning_rate": 1.6480723080133555e-05, "loss": 1.311, "num_input_tokens_seen": 148771438, "step": 257000, "train_runtime": 2599.7745, "train_tokens_per_second": 57224.747 }, { "epoch": 13.433848080133556, "grad_norm": 1.743450403213501, "learning_rate": 1.6415510225375626e-05, "loss": 1.2843, "num_input_tokens_seen": 149060526, "step": 257500, "train_runtime": 2618.5247, "train_tokens_per_second": 56925.386 }, { "epoch": 13.459933222036728, "grad_norm": 2.875257968902588, "learning_rate": 1.6350297370617697e-05, "loss": 1.2692, "num_input_tokens_seen": 149346974, "step": 258000, "train_runtime": 2637.3123, "train_tokens_per_second": 56628.474 }, { "epoch": 13.4860183639399, "grad_norm": 3.3050479888916016, "learning_rate": 1.6285084515859766e-05, "loss": 1.2869, "num_input_tokens_seen": 149633070, "step": 258500, "train_runtime": 2656.0943, "train_tokens_per_second": 56335.751 }, { "epoch": 13.512103505843072, "grad_norm": 2.2370221614837646, "learning_rate": 1.6219871661101837e-05, "loss": 1.3004, "num_input_tokens_seen": 149926758, "step": 259000, "train_runtime": 2674.8246, "train_tokens_per_second": 56051.06 }, { "epoch": 13.538188647746244, "grad_norm": 4.20009183883667, "learning_rate": 1.615465880634391e-05, "loss": 1.2629, "num_input_tokens_seen": 150212054, "step": 259500, "train_runtime": 2693.5708, "train_tokens_per_second": 55766.885 }, { "epoch": 13.564273789649416, "grad_norm": 2.247492551803589, "learning_rate": 1.6089445951585977e-05, "loss": 1.3251, "num_input_tokens_seen": 150502366, "step": 260000, "train_runtime": 2712.3292, "train_tokens_per_second": 55488.237 }, { "epoch": 13.590358931552588, "grad_norm": 2.1950037479400635, "learning_rate": 1.6024233096828048e-05, "loss": 1.2798, "num_input_tokens_seen": 150787110, "step": 260500, "train_runtime": 2731.1083, "train_tokens_per_second": 55210.959 }, { "epoch": 13.61644407345576, "grad_norm": 2.5948126316070557, "learning_rate": 1.5959020242070116e-05, "loss": 1.2685, "num_input_tokens_seen": 151072982, "step": 261000, "train_runtime": 2749.822, "train_tokens_per_second": 54939.185 }, { "epoch": 13.642529215358932, "grad_norm": 3.1042332649230957, "learning_rate": 1.5893807387312188e-05, "loss": 1.2917, "num_input_tokens_seen": 151366958, "step": 261500, "train_runtime": 2768.6641, "train_tokens_per_second": 54671.478 }, { "epoch": 13.668614357262104, "grad_norm": 2.2142746448516846, "learning_rate": 1.582859453255426e-05, "loss": 1.2928, "num_input_tokens_seen": 151651278, "step": 262000, "train_runtime": 2787.4176, "train_tokens_per_second": 54405.653 }, { "epoch": 13.694699499165276, "grad_norm": 2.406888008117676, "learning_rate": 1.5763381677796327e-05, "loss": 1.2888, "num_input_tokens_seen": 151940174, "step": 262500, "train_runtime": 2806.1465, "train_tokens_per_second": 54145.488 }, { "epoch": 13.720784641068448, "grad_norm": 2.989021062850952, "learning_rate": 1.56981688230384e-05, "loss": 1.3058, "num_input_tokens_seen": 152226926, "step": 263000, "train_runtime": 2824.9214, "train_tokens_per_second": 53887.137 }, { "epoch": 13.746869782971618, "grad_norm": 2.4519472122192383, "learning_rate": 1.563295596828047e-05, "loss": 1.3242, "num_input_tokens_seen": 152519390, "step": 263500, "train_runtime": 2843.6976, "train_tokens_per_second": 53634.181 }, { "epoch": 13.772954924874792, "grad_norm": 3.375582456588745, "learning_rate": 1.5567743113522538e-05, "loss": 1.2878, "num_input_tokens_seen": 152810446, "step": 264000, "train_runtime": 2862.4801, "train_tokens_per_second": 53383.932 }, { "epoch": 13.799040066777962, "grad_norm": 2.5288329124450684, "learning_rate": 1.550253025876461e-05, "loss": 1.279, "num_input_tokens_seen": 153100030, "step": 264500, "train_runtime": 2881.2536, "train_tokens_per_second": 53136.604 }, { "epoch": 13.825125208681134, "grad_norm": 2.273123025894165, "learning_rate": 1.5437317404006678e-05, "loss": 1.2912, "num_input_tokens_seen": 153385646, "step": 265000, "train_runtime": 2900.0148, "train_tokens_per_second": 52891.332 }, { "epoch": 13.851210350584306, "grad_norm": 5.488306522369385, "learning_rate": 1.537210454924875e-05, "loss": 1.3079, "num_input_tokens_seen": 153672086, "step": 265500, "train_runtime": 2918.7985, "train_tokens_per_second": 52649.091 }, { "epoch": 13.877295492487479, "grad_norm": 2.2071919441223145, "learning_rate": 1.5306891694490817e-05, "loss": 1.3046, "num_input_tokens_seen": 153960638, "step": 266000, "train_runtime": 2937.5127, "train_tokens_per_second": 52411.906 }, { "epoch": 13.90338063439065, "grad_norm": 3.046309471130371, "learning_rate": 1.524167883973289e-05, "loss": 1.2832, "num_input_tokens_seen": 154246150, "step": 266500, "train_runtime": 2956.2532, "train_tokens_per_second": 52176.231 }, { "epoch": 13.929465776293823, "grad_norm": 2.4747865200042725, "learning_rate": 1.5176465984974958e-05, "loss": 1.2976, "num_input_tokens_seen": 154534870, "step": 267000, "train_runtime": 2975.0146, "train_tokens_per_second": 51944.238 }, { "epoch": 13.955550918196995, "grad_norm": 2.148017168045044, "learning_rate": 1.511125313021703e-05, "loss": 1.3016, "num_input_tokens_seen": 154821518, "step": 267500, "train_runtime": 2993.7975, "train_tokens_per_second": 51714.091 }, { "epoch": 13.981636060100167, "grad_norm": 2.248180389404297, "learning_rate": 1.50460402754591e-05, "loss": 1.2983, "num_input_tokens_seen": 155115046, "step": 268000, "train_runtime": 3012.5625, "train_tokens_per_second": 51489.403 }, { "epoch": 14.0, "eval_loss": 1.2995389699935913, "eval_runtime": 45.4147, "eval_samples_per_second": 844.109, "eval_steps_per_second": 105.516, "num_input_tokens_seen": 155319448, "step": 268352 }, { "epoch": 14.007721202003339, "grad_norm": 3.0312399864196777, "learning_rate": 1.4980827420701168e-05, "loss": 1.3027, "num_input_tokens_seen": 155408024, "step": 268500, "train_runtime": 3077.7818, "train_tokens_per_second": 50493.516 }, { "epoch": 14.03380634390651, "grad_norm": 4.309081077575684, "learning_rate": 1.4915614565943239e-05, "loss": 1.2652, "num_input_tokens_seen": 155690152, "step": 269000, "train_runtime": 3096.5771, "train_tokens_per_second": 50278.144 }, { "epoch": 14.059891485809683, "grad_norm": 2.96939754486084, "learning_rate": 1.485040171118531e-05, "loss": 1.271, "num_input_tokens_seen": 155981000, "step": 269500, "train_runtime": 3115.3826, "train_tokens_per_second": 50068.008 }, { "epoch": 14.085976627712855, "grad_norm": 2.4417145252227783, "learning_rate": 1.4785188856427379e-05, "loss": 1.2753, "num_input_tokens_seen": 156272536, "step": 270000, "train_runtime": 3134.181, "train_tokens_per_second": 49860.724 }, { "epoch": 14.112061769616027, "grad_norm": 3.6525328159332275, "learning_rate": 1.471997600166945e-05, "loss": 1.2708, "num_input_tokens_seen": 156564232, "step": 270500, "train_runtime": 3152.9933, "train_tokens_per_second": 49655.746 }, { "epoch": 14.138146911519199, "grad_norm": 2.702702045440674, "learning_rate": 1.4654763146911522e-05, "loss": 1.2644, "num_input_tokens_seen": 156847192, "step": 271000, "train_runtime": 3171.767, "train_tokens_per_second": 49451.045 }, { "epoch": 14.16423205342237, "grad_norm": 2.738504648208618, "learning_rate": 1.458955029215359e-05, "loss": 1.2735, "num_input_tokens_seen": 157138056, "step": 271500, "train_runtime": 3190.5858, "train_tokens_per_second": 49250.534 }, { "epoch": 14.190317195325543, "grad_norm": 2.680459976196289, "learning_rate": 1.4524337437395661e-05, "loss": 1.2923, "num_input_tokens_seen": 157427656, "step": 272000, "train_runtime": 3209.3923, "train_tokens_per_second": 49052.17 }, { "epoch": 14.216402337228715, "grad_norm": 2.5472817420959473, "learning_rate": 1.4459124582637731e-05, "loss": 1.2812, "num_input_tokens_seen": 157714904, "step": 272500, "train_runtime": 3228.1634, "train_tokens_per_second": 48855.924 }, { "epoch": 14.242487479131887, "grad_norm": 2.909809112548828, "learning_rate": 1.4393911727879799e-05, "loss": 1.3002, "num_input_tokens_seen": 158004216, "step": 273000, "train_runtime": 3246.9319, "train_tokens_per_second": 48662.621 }, { "epoch": 14.268572621035059, "grad_norm": 3.222720146179199, "learning_rate": 1.432869887312187e-05, "loss": 1.2887, "num_input_tokens_seen": 158292352, "step": 273500, "train_runtime": 3265.647, "train_tokens_per_second": 48471.973 }, { "epoch": 14.29465776293823, "grad_norm": 1.991113543510437, "learning_rate": 1.4263486018363942e-05, "loss": 1.2627, "num_input_tokens_seen": 158587024, "step": 274000, "train_runtime": 3284.4013, "train_tokens_per_second": 48284.91 }, { "epoch": 14.320742904841403, "grad_norm": 2.8505282402038574, "learning_rate": 1.419827316360601e-05, "loss": 1.2836, "num_input_tokens_seen": 158886520, "step": 274500, "train_runtime": 3303.3083, "train_tokens_per_second": 48099.209 }, { "epoch": 14.346828046744575, "grad_norm": 2.9469573497772217, "learning_rate": 1.4133060308848081e-05, "loss": 1.2749, "num_input_tokens_seen": 159177696, "step": 275000, "train_runtime": 3322.098, "train_tokens_per_second": 47914.811 }, { "epoch": 14.372913188647747, "grad_norm": 4.244631767272949, "learning_rate": 1.4067847454090153e-05, "loss": 1.2695, "num_input_tokens_seen": 159460280, "step": 275500, "train_runtime": 3340.8943, "train_tokens_per_second": 47729.819 }, { "epoch": 14.398998330550919, "grad_norm": 3.174166440963745, "learning_rate": 1.4002634599332221e-05, "loss": 1.2888, "num_input_tokens_seen": 159745000, "step": 276000, "train_runtime": 3359.6609, "train_tokens_per_second": 47547.953 }, { "epoch": 14.42508347245409, "grad_norm": 2.760267496109009, "learning_rate": 1.3937421744574292e-05, "loss": 1.2714, "num_input_tokens_seen": 160037624, "step": 276500, "train_runtime": 3378.4646, "train_tokens_per_second": 47369.928 }, { "epoch": 14.451168614357263, "grad_norm": 3.1717495918273926, "learning_rate": 1.387220888981636e-05, "loss": 1.2967, "num_input_tokens_seen": 160328736, "step": 277000, "train_runtime": 3397.3414, "train_tokens_per_second": 47192.412 }, { "epoch": 14.477253756260435, "grad_norm": 2.68973708152771, "learning_rate": 1.380699603505843e-05, "loss": 1.2688, "num_input_tokens_seen": 160619656, "step": 277500, "train_runtime": 3416.1542, "train_tokens_per_second": 47017.683 }, { "epoch": 14.503338898163605, "grad_norm": 2.4333648681640625, "learning_rate": 1.3741783180300502e-05, "loss": 1.2797, "num_input_tokens_seen": 160908592, "step": 278000, "train_runtime": 3434.8918, "train_tokens_per_second": 46845.316 }, { "epoch": 14.529424040066779, "grad_norm": 2.4637181758880615, "learning_rate": 1.367657032554257e-05, "loss": 1.2733, "num_input_tokens_seen": 161202600, "step": 278500, "train_runtime": 3453.6295, "train_tokens_per_second": 46676.287 }, { "epoch": 14.55550918196995, "grad_norm": 2.199878215789795, "learning_rate": 1.3611357470784641e-05, "loss": 1.2812, "num_input_tokens_seen": 161493960, "step": 279000, "train_runtime": 3472.3475, "train_tokens_per_second": 46508.583 }, { "epoch": 14.581594323873121, "grad_norm": 2.7561452388763428, "learning_rate": 1.3546144616026713e-05, "loss": 1.2981, "num_input_tokens_seen": 161780984, "step": 279500, "train_runtime": 3491.0873, "train_tokens_per_second": 46341.146 }, { "epoch": 14.607679465776293, "grad_norm": 2.5802223682403564, "learning_rate": 1.348093176126878e-05, "loss": 1.2772, "num_input_tokens_seen": 162067272, "step": 280000, "train_runtime": 3509.8281, "train_tokens_per_second": 46175.274 }, { "epoch": 14.633764607679465, "grad_norm": 2.8847203254699707, "learning_rate": 1.3415718906510852e-05, "loss": 1.2868, "num_input_tokens_seen": 162356640, "step": 280500, "train_runtime": 3528.574, "train_tokens_per_second": 46011.97 }, { "epoch": 14.659849749582637, "grad_norm": 2.8300564289093018, "learning_rate": 1.3350506051752924e-05, "loss": 1.3286, "num_input_tokens_seen": 162645952, "step": 281000, "train_runtime": 3547.3388, "train_tokens_per_second": 45850.132 }, { "epoch": 14.68593489148581, "grad_norm": 2.2055959701538086, "learning_rate": 1.3285293196994992e-05, "loss": 1.2874, "num_input_tokens_seen": 162937608, "step": 281500, "train_runtime": 3566.1498, "train_tokens_per_second": 45690.063 }, { "epoch": 14.712020033388981, "grad_norm": 2.794443368911743, "learning_rate": 1.3220080342237062e-05, "loss": 1.2976, "num_input_tokens_seen": 163226160, "step": 282000, "train_runtime": 3584.9392, "train_tokens_per_second": 45531.081 }, { "epoch": 14.738105175292153, "grad_norm": 2.3322718143463135, "learning_rate": 1.3154867487479133e-05, "loss": 1.3031, "num_input_tokens_seen": 163520392, "step": 282500, "train_runtime": 3603.7244, "train_tokens_per_second": 45375.388 }, { "epoch": 14.764190317195325, "grad_norm": 2.4972341060638428, "learning_rate": 1.3089654632721201e-05, "loss": 1.2688, "num_input_tokens_seen": 163814080, "step": 283000, "train_runtime": 3622.5289, "train_tokens_per_second": 45220.917 }, { "epoch": 14.790275459098497, "grad_norm": 2.5767734050750732, "learning_rate": 1.3024441777963273e-05, "loss": 1.2623, "num_input_tokens_seen": 164098944, "step": 283500, "train_runtime": 3641.3406, "train_tokens_per_second": 45065.531 }, { "epoch": 14.81636060100167, "grad_norm": 2.557332992553711, "learning_rate": 1.2959228923205344e-05, "loss": 1.2782, "num_input_tokens_seen": 164388472, "step": 284000, "train_runtime": 3660.0837, "train_tokens_per_second": 44913.856 }, { "epoch": 14.842445742904841, "grad_norm": 2.9156086444854736, "learning_rate": 1.2894016068447412e-05, "loss": 1.2929, "num_input_tokens_seen": 164678824, "step": 284500, "train_runtime": 3678.8815, "train_tokens_per_second": 44763.286 }, { "epoch": 14.868530884808013, "grad_norm": 2.550926685333252, "learning_rate": 1.2828803213689484e-05, "loss": 1.2843, "num_input_tokens_seen": 164964520, "step": 285000, "train_runtime": 3697.6895, "train_tokens_per_second": 44612.864 }, { "epoch": 14.894616026711185, "grad_norm": 3.0715761184692383, "learning_rate": 1.2763590358931555e-05, "loss": 1.2791, "num_input_tokens_seen": 165252424, "step": 285500, "train_runtime": 3716.4903, "train_tokens_per_second": 44464.646 }, { "epoch": 14.920701168614357, "grad_norm": 3.2298481464385986, "learning_rate": 1.2698377504173623e-05, "loss": 1.286, "num_input_tokens_seen": 165546752, "step": 286000, "train_runtime": 3735.2292, "train_tokens_per_second": 44320.373 }, { "epoch": 14.94678631051753, "grad_norm": 2.6789731979370117, "learning_rate": 1.2633164649415693e-05, "loss": 1.2922, "num_input_tokens_seen": 165831800, "step": 286500, "train_runtime": 3754.0295, "train_tokens_per_second": 44174.346 }, { "epoch": 14.972871452420701, "grad_norm": 2.6322739124298096, "learning_rate": 1.2567951794657764e-05, "loss": 1.2873, "num_input_tokens_seen": 166125192, "step": 287000, "train_runtime": 3772.8414, "train_tokens_per_second": 44031.852 }, { "epoch": 14.998956594323873, "grad_norm": 2.762434244155884, "learning_rate": 1.2502738939899832e-05, "loss": 1.2715, "num_input_tokens_seen": 166410264, "step": 287500, "train_runtime": 3791.6838, "train_tokens_per_second": 43888.222 }, { "epoch": 15.0, "eval_loss": 1.2970120906829834, "eval_runtime": 45.5176, "eval_samples_per_second": 842.201, "eval_steps_per_second": 105.278, "num_input_tokens_seen": 166422516, "step": 287520 }, { "epoch": 15.025041736227045, "grad_norm": 2.177825927734375, "learning_rate": 1.2437526085141904e-05, "loss": 1.2801, "num_input_tokens_seen": 166697628, "step": 288000, "train_runtime": 3857.3769, "train_tokens_per_second": 43215.282 }, { "epoch": 15.051126878130217, "grad_norm": 3.206347703933716, "learning_rate": 1.2372313230383974e-05, "loss": 1.2709, "num_input_tokens_seen": 166992924, "step": 288500, "train_runtime": 3876.1711, "train_tokens_per_second": 43081.927 }, { "epoch": 15.07721202003339, "grad_norm": 2.4079601764678955, "learning_rate": 1.2307100375626043e-05, "loss": 1.2744, "num_input_tokens_seen": 167286132, "step": 289000, "train_runtime": 3895.0066, "train_tokens_per_second": 42948.869 }, { "epoch": 15.103297161936561, "grad_norm": 1.9692761898040771, "learning_rate": 1.2241887520868115e-05, "loss": 1.2559, "num_input_tokens_seen": 167572372, "step": 289500, "train_runtime": 3913.7506, "train_tokens_per_second": 42816.313 }, { "epoch": 15.129382303839733, "grad_norm": 2.694408416748047, "learning_rate": 1.2176674666110185e-05, "loss": 1.2661, "num_input_tokens_seen": 167863284, "step": 290000, "train_runtime": 3932.5501, "train_tokens_per_second": 42685.606 }, { "epoch": 15.155467445742905, "grad_norm": 2.9768283367156982, "learning_rate": 1.2111461811352254e-05, "loss": 1.2868, "num_input_tokens_seen": 168153292, "step": 290500, "train_runtime": 3951.2884, "train_tokens_per_second": 42556.573 }, { "epoch": 15.181552587646078, "grad_norm": 3.165743112564087, "learning_rate": 1.2046248956594324e-05, "loss": 1.2598, "num_input_tokens_seen": 168442780, "step": 291000, "train_runtime": 3970.108, "train_tokens_per_second": 42427.758 }, { "epoch": 15.20763772954925, "grad_norm": 2.1122047901153564, "learning_rate": 1.1981036101836394e-05, "loss": 1.2777, "num_input_tokens_seen": 168730764, "step": 291500, "train_runtime": 3989.0323, "train_tokens_per_second": 42298.671 }, { "epoch": 15.233722871452422, "grad_norm": 2.8908307552337646, "learning_rate": 1.1915823247078464e-05, "loss": 1.2524, "num_input_tokens_seen": 169023804, "step": 292000, "train_runtime": 4008.0188, "train_tokens_per_second": 42171.41 }, { "epoch": 15.259808013355592, "grad_norm": 5.693580627441406, "learning_rate": 1.1850610392320535e-05, "loss": 1.2636, "num_input_tokens_seen": 169313124, "step": 292500, "train_runtime": 4028.3264, "train_tokens_per_second": 42030.637 }, { "epoch": 15.285893155258764, "grad_norm": 2.3008134365081787, "learning_rate": 1.1785397537562605e-05, "loss": 1.2828, "num_input_tokens_seen": 169601124, "step": 293000, "train_runtime": 4048.6666, "train_tokens_per_second": 41890.613 }, { "epoch": 15.311978297161936, "grad_norm": 2.8285107612609863, "learning_rate": 1.1720184682804675e-05, "loss": 1.2528, "num_input_tokens_seen": 169887028, "step": 293500, "train_runtime": 4068.1864, "train_tokens_per_second": 41759.893 }, { "epoch": 15.338063439065108, "grad_norm": 2.4193263053894043, "learning_rate": 1.1654971828046746e-05, "loss": 1.272, "num_input_tokens_seen": 170171812, "step": 294000, "train_runtime": 4087.6299, "train_tokens_per_second": 41630.925 }, { "epoch": 15.36414858096828, "grad_norm": 2.8411006927490234, "learning_rate": 1.1589758973288816e-05, "loss": 1.2846, "num_input_tokens_seen": 170459652, "step": 294500, "train_runtime": 4106.8845, "train_tokens_per_second": 41505.83 }, { "epoch": 15.390233722871452, "grad_norm": 3.2765908241271973, "learning_rate": 1.1524546118530886e-05, "loss": 1.283, "num_input_tokens_seen": 170746052, "step": 295000, "train_runtime": 4125.6554, "train_tokens_per_second": 41386.407 }, { "epoch": 15.416318864774624, "grad_norm": 4.315444469451904, "learning_rate": 1.1459333263772955e-05, "loss": 1.2499, "num_input_tokens_seen": 171039820, "step": 295500, "train_runtime": 4144.6159, "train_tokens_per_second": 41267.954 }, { "epoch": 15.442404006677796, "grad_norm": 2.635226249694824, "learning_rate": 1.1394120409015025e-05, "loss": 1.271, "num_input_tokens_seen": 171325612, "step": 296000, "train_runtime": 4164.0018, "train_tokens_per_second": 41144.461 }, { "epoch": 15.468489148580968, "grad_norm": 2.699335813522339, "learning_rate": 1.1328907554257095e-05, "loss": 1.276, "num_input_tokens_seen": 171612740, "step": 296500, "train_runtime": 4184.2714, "train_tokens_per_second": 41013.768 }, { "epoch": 15.49457429048414, "grad_norm": 2.0063083171844482, "learning_rate": 1.1263694699499165e-05, "loss": 1.2596, "num_input_tokens_seen": 171906348, "step": 297000, "train_runtime": 4203.2579, "train_tokens_per_second": 40898.358 }, { "epoch": 15.520659432387312, "grad_norm": 2.836402654647827, "learning_rate": 1.1198481844741236e-05, "loss": 1.2578, "num_input_tokens_seen": 172189356, "step": 297500, "train_runtime": 4222.1833, "train_tokens_per_second": 40782.066 }, { "epoch": 15.546744574290484, "grad_norm": 3.0927999019622803, "learning_rate": 1.1133268989983306e-05, "loss": 1.2973, "num_input_tokens_seen": 172482468, "step": 298000, "train_runtime": 4241.2002, "train_tokens_per_second": 40668.316 }, { "epoch": 15.572829716193656, "grad_norm": 3.955559492111206, "learning_rate": 1.1068056135225376e-05, "loss": 1.272, "num_input_tokens_seen": 172775212, "step": 298500, "train_runtime": 4260.8077, "train_tokens_per_second": 40549.873 }, { "epoch": 15.598914858096828, "grad_norm": 2.954066753387451, "learning_rate": 1.1002843280467447e-05, "loss": 1.2696, "num_input_tokens_seen": 173066468, "step": 299000, "train_runtime": 4279.6208, "train_tokens_per_second": 40439.674 }, { "epoch": 15.625, "grad_norm": 2.927549362182617, "learning_rate": 1.0937630425709517e-05, "loss": 1.2947, "num_input_tokens_seen": 173362372, "step": 299500, "train_runtime": 4298.4621, "train_tokens_per_second": 40331.255 }, { "epoch": 15.651085141903172, "grad_norm": 3.2571945190429688, "learning_rate": 1.0872417570951587e-05, "loss": 1.2612, "num_input_tokens_seen": 173657292, "step": 300000, "train_runtime": 4317.6857, "train_tokens_per_second": 40219.994 }, { "epoch": 15.677170283806344, "grad_norm": 4.016629695892334, "learning_rate": 1.0807204716193657e-05, "loss": 1.2903, "num_input_tokens_seen": 173953028, "step": 300500, "train_runtime": 4337.5188, "train_tokens_per_second": 40104.27 }, { "epoch": 15.703255425709516, "grad_norm": 3.677175998687744, "learning_rate": 1.0741991861435726e-05, "loss": 1.2654, "num_input_tokens_seen": 174243612, "step": 301000, "train_runtime": 4357.6686, "train_tokens_per_second": 39985.512 }, { "epoch": 15.729340567612688, "grad_norm": 2.5401861667633057, "learning_rate": 1.0676779006677796e-05, "loss": 1.2785, "num_input_tokens_seen": 174528492, "step": 301500, "train_runtime": 4377.8182, "train_tokens_per_second": 39866.546 }, { "epoch": 15.75542570951586, "grad_norm": 3.0386669635772705, "learning_rate": 1.0611566151919868e-05, "loss": 1.2672, "num_input_tokens_seen": 174824740, "step": 302000, "train_runtime": 4397.7063, "train_tokens_per_second": 39753.619 }, { "epoch": 15.781510851419032, "grad_norm": 2.869920253753662, "learning_rate": 1.0546353297161937e-05, "loss": 1.2971, "num_input_tokens_seen": 175115884, "step": 302500, "train_runtime": 4417.5927, "train_tokens_per_second": 39640.568 }, { "epoch": 15.807595993322204, "grad_norm": 2.551456928253174, "learning_rate": 1.0481140442404007e-05, "loss": 1.2603, "num_input_tokens_seen": 175404964, "step": 303000, "train_runtime": 4437.075, "train_tokens_per_second": 39531.665 }, { "epoch": 15.833681135225376, "grad_norm": 2.8451788425445557, "learning_rate": 1.0415927587646079e-05, "loss": 1.3059, "num_input_tokens_seen": 175694332, "step": 303500, "train_runtime": 4456.4315, "train_tokens_per_second": 39424.893 }, { "epoch": 15.859766277128548, "grad_norm": 3.364713668823242, "learning_rate": 1.0350714732888148e-05, "loss": 1.2669, "num_input_tokens_seen": 175983324, "step": 304000, "train_runtime": 4475.8992, "train_tokens_per_second": 39317.982 }, { "epoch": 15.88585141903172, "grad_norm": 3.5180881023406982, "learning_rate": 1.0285501878130218e-05, "loss": 1.2704, "num_input_tokens_seen": 176271988, "step": 304500, "train_runtime": 4494.9616, "train_tokens_per_second": 39215.46 }, { "epoch": 15.911936560934892, "grad_norm": 3.1893362998962402, "learning_rate": 1.0220289023372288e-05, "loss": 1.2689, "num_input_tokens_seen": 176565276, "step": 305000, "train_runtime": 4513.98, "train_tokens_per_second": 39115.21 }, { "epoch": 15.938021702838064, "grad_norm": 3.272306442260742, "learning_rate": 1.0155076168614358e-05, "loss": 1.27, "num_input_tokens_seen": 176847788, "step": 305500, "train_runtime": 4533.1414, "train_tokens_per_second": 39012.193 }, { "epoch": 15.964106844741236, "grad_norm": 2.6090383529663086, "learning_rate": 1.0089863313856427e-05, "loss": 1.2684, "num_input_tokens_seen": 177132460, "step": 306000, "train_runtime": 4551.9653, "train_tokens_per_second": 38913.403 }, { "epoch": 15.990191986644408, "grad_norm": 2.874281644821167, "learning_rate": 1.0024650459098497e-05, "loss": 1.2839, "num_input_tokens_seen": 177417428, "step": 306500, "train_runtime": 4571.3016, "train_tokens_per_second": 38811.141 }, { "epoch": 16.0, "eval_loss": 1.2972913980484009, "eval_runtime": 46.7515, "eval_samples_per_second": 819.974, "eval_steps_per_second": 102.499, "num_input_tokens_seen": 177522072, "step": 306688 }, { "epoch": 16.01627712854758, "grad_norm": 2.4503226280212402, "learning_rate": 9.959437604340569e-06, "loss": 1.2666, "num_input_tokens_seen": 177704312, "step": 307000, "train_runtime": 4639.2676, "train_tokens_per_second": 38304.389 }, { "epoch": 16.042362270450752, "grad_norm": 2.57148814201355, "learning_rate": 9.894224749582638e-06, "loss": 1.2827, "num_input_tokens_seen": 177987984, "step": 307500, "train_runtime": 4658.9243, "train_tokens_per_second": 38203.665 }, { "epoch": 16.068447412353922, "grad_norm": 2.241555690765381, "learning_rate": 9.829011894824708e-06, "loss": 1.2417, "num_input_tokens_seen": 178276096, "step": 308000, "train_runtime": 4678.6926, "train_tokens_per_second": 38103.828 }, { "epoch": 16.094532554257096, "grad_norm": 3.140139579772949, "learning_rate": 9.76379904006678e-06, "loss": 1.2696, "num_input_tokens_seen": 178568312, "step": 308500, "train_runtime": 4698.0151, "train_tokens_per_second": 38009.31 }, { "epoch": 16.120617696160267, "grad_norm": 2.9327456951141357, "learning_rate": 9.69858618530885e-06, "loss": 1.2835, "num_input_tokens_seen": 178856160, "step": 309000, "train_runtime": 4717.2135, "train_tokens_per_second": 37915.638 }, { "epoch": 16.14670283806344, "grad_norm": 3.2067556381225586, "learning_rate": 9.633373330550919e-06, "loss": 1.2688, "num_input_tokens_seen": 179143944, "step": 309500, "train_runtime": 4736.2008, "train_tokens_per_second": 37824.398 }, { "epoch": 16.17278797996661, "grad_norm": 2.4767651557922363, "learning_rate": 9.568160475792989e-06, "loss": 1.2721, "num_input_tokens_seen": 179434664, "step": 310000, "train_runtime": 4755.3615, "train_tokens_per_second": 37733.128 }, { "epoch": 16.198873121869784, "grad_norm": 2.9996862411499023, "learning_rate": 9.502947621035059e-06, "loss": 1.2569, "num_input_tokens_seen": 179724792, "step": 310500, "train_runtime": 4774.7367, "train_tokens_per_second": 37640.776 }, { "epoch": 16.224958263772955, "grad_norm": 2.587339162826538, "learning_rate": 9.437734766277128e-06, "loss": 1.2562, "num_input_tokens_seen": 180020736, "step": 311000, "train_runtime": 4794.2064, "train_tokens_per_second": 37549.642 }, { "epoch": 16.25104340567613, "grad_norm": 2.425332546234131, "learning_rate": 9.3725219115192e-06, "loss": 1.2859, "num_input_tokens_seen": 180308088, "step": 311500, "train_runtime": 4813.4723, "train_tokens_per_second": 37459.048 }, { "epoch": 16.2771285475793, "grad_norm": 3.213170289993286, "learning_rate": 9.30730905676127e-06, "loss": 1.2648, "num_input_tokens_seen": 180593256, "step": 312000, "train_runtime": 4832.7472, "train_tokens_per_second": 37368.653 }, { "epoch": 16.303213689482472, "grad_norm": 2.971393346786499, "learning_rate": 9.24209620200334e-06, "loss": 1.2565, "num_input_tokens_seen": 180883912, "step": 312500, "train_runtime": 4853.0289, "train_tokens_per_second": 37272.375 }, { "epoch": 16.329298831385643, "grad_norm": 3.2865586280822754, "learning_rate": 9.17688334724541e-06, "loss": 1.2695, "num_input_tokens_seen": 181172920, "step": 313000, "train_runtime": 4872.3486, "train_tokens_per_second": 37183.899 }, { "epoch": 16.355383973288816, "grad_norm": 2.691861867904663, "learning_rate": 9.11167049248748e-06, "loss": 1.2742, "num_input_tokens_seen": 181457952, "step": 313500, "train_runtime": 4891.6907, "train_tokens_per_second": 37095.14 }, { "epoch": 16.381469115191987, "grad_norm": 3.302048444747925, "learning_rate": 9.04645763772955e-06, "loss": 1.261, "num_input_tokens_seen": 181746184, "step": 314000, "train_runtime": 4911.0159, "train_tokens_per_second": 37007.859 }, { "epoch": 16.407554257095157, "grad_norm": 3.427002191543579, "learning_rate": 8.981244782971618e-06, "loss": 1.2763, "num_input_tokens_seen": 182036728, "step": 314500, "train_runtime": 4930.339, "train_tokens_per_second": 36921.747 }, { "epoch": 16.43363939899833, "grad_norm": 2.194302558898926, "learning_rate": 8.91603192821369e-06, "loss": 1.2347, "num_input_tokens_seen": 182327360, "step": 315000, "train_runtime": 4949.6263, "train_tokens_per_second": 36836.591 }, { "epoch": 16.4597245409015, "grad_norm": 2.6108365058898926, "learning_rate": 8.85081907345576e-06, "loss": 1.3033, "num_input_tokens_seen": 182614776, "step": 315500, "train_runtime": 4968.861, "train_tokens_per_second": 36751.839 }, { "epoch": 16.485809682804675, "grad_norm": 3.398846387863159, "learning_rate": 8.78560621869783e-06, "loss": 1.231, "num_input_tokens_seen": 182898920, "step": 316000, "train_runtime": 4988.2986, "train_tokens_per_second": 36665.592 }, { "epoch": 16.511894824707845, "grad_norm": 3.175825357437134, "learning_rate": 8.720393363939901e-06, "loss": 1.2653, "num_input_tokens_seen": 183194016, "step": 316500, "train_runtime": 5007.4717, "train_tokens_per_second": 36584.134 }, { "epoch": 16.53797996661102, "grad_norm": 3.3755290508270264, "learning_rate": 8.65518050918197e-06, "loss": 1.2382, "num_input_tokens_seen": 183486192, "step": 317000, "train_runtime": 5026.7596, "train_tokens_per_second": 36501.883 }, { "epoch": 16.56406510851419, "grad_norm": 3.120741128921509, "learning_rate": 8.58996765442404e-06, "loss": 1.2661, "num_input_tokens_seen": 183774000, "step": 317500, "train_runtime": 5045.8839, "train_tokens_per_second": 36420.577 }, { "epoch": 16.590150250417363, "grad_norm": 4.2182440757751465, "learning_rate": 8.524754799666112e-06, "loss": 1.254, "num_input_tokens_seen": 184064816, "step": 318000, "train_runtime": 5065.2521, "train_tokens_per_second": 36338.727 }, { "epoch": 16.616235392320533, "grad_norm": 3.3010435104370117, "learning_rate": 8.459541944908182e-06, "loss": 1.2621, "num_input_tokens_seen": 184350480, "step": 318500, "train_runtime": 5084.6874, "train_tokens_per_second": 36256.011 }, { "epoch": 16.642320534223707, "grad_norm": 3.2120778560638428, "learning_rate": 8.39432909015025e-06, "loss": 1.2563, "num_input_tokens_seen": 184642440, "step": 319000, "train_runtime": 5103.9372, "train_tokens_per_second": 36176.472 }, { "epoch": 16.668405676126877, "grad_norm": 2.9939897060394287, "learning_rate": 8.329116235392321e-06, "loss": 1.2594, "num_input_tokens_seen": 184928112, "step": 319500, "train_runtime": 5123.0191, "train_tokens_per_second": 36097.486 }, { "epoch": 16.69449081803005, "grad_norm": 3.710550308227539, "learning_rate": 8.263903380634391e-06, "loss": 1.2634, "num_input_tokens_seen": 185211440, "step": 320000, "train_runtime": 5142.5889, "train_tokens_per_second": 36015.214 }, { "epoch": 16.72057595993322, "grad_norm": 2.5137531757354736, "learning_rate": 8.19869052587646e-06, "loss": 1.2601, "num_input_tokens_seen": 185506864, "step": 320500, "train_runtime": 5162.7072, "train_tokens_per_second": 35932.091 }, { "epoch": 16.746661101836395, "grad_norm": 4.654266834259033, "learning_rate": 8.13347767111853e-06, "loss": 1.282, "num_input_tokens_seen": 185792944, "step": 321000, "train_runtime": 5181.946, "train_tokens_per_second": 35853.894 }, { "epoch": 16.772746243739565, "grad_norm": 2.9473636150360107, "learning_rate": 8.068264816360602e-06, "loss": 1.2839, "num_input_tokens_seen": 186086024, "step": 321500, "train_runtime": 5201.2933, "train_tokens_per_second": 35776.876 }, { "epoch": 16.79883138564274, "grad_norm": 2.2345118522644043, "learning_rate": 8.003051961602672e-06, "loss": 1.249, "num_input_tokens_seen": 186378104, "step": 322000, "train_runtime": 5221.502, "train_tokens_per_second": 35694.347 }, { "epoch": 16.82491652754591, "grad_norm": 2.1228227615356445, "learning_rate": 7.937839106844742e-06, "loss": 1.2856, "num_input_tokens_seen": 186672776, "step": 322500, "train_runtime": 5242.0802, "train_tokens_per_second": 35610.438 }, { "epoch": 16.851001669449083, "grad_norm": 3.548326253890991, "learning_rate": 7.872626252086811e-06, "loss": 1.2777, "num_input_tokens_seen": 186964952, "step": 323000, "train_runtime": 5262.7553, "train_tokens_per_second": 35526.058 }, { "epoch": 16.877086811352253, "grad_norm": 3.222048044204712, "learning_rate": 7.807413397328881e-06, "loss": 1.288, "num_input_tokens_seen": 187250864, "step": 323500, "train_runtime": 5283.1207, "train_tokens_per_second": 35443.23 }, { "epoch": 16.903171953255427, "grad_norm": 3.267969846725464, "learning_rate": 7.74220054257095e-06, "loss": 1.2746, "num_input_tokens_seen": 187543856, "step": 324000, "train_runtime": 5303.6214, "train_tokens_per_second": 35361.471 }, { "epoch": 16.929257095158597, "grad_norm": 2.1591436862945557, "learning_rate": 7.676987687813022e-06, "loss": 1.2524, "num_input_tokens_seen": 187833368, "step": 324500, "train_runtime": 5324.0933, "train_tokens_per_second": 35279.879 }, { "epoch": 16.95534223706177, "grad_norm": 5.07979154586792, "learning_rate": 7.611774833055092e-06, "loss": 1.2888, "num_input_tokens_seen": 188120672, "step": 325000, "train_runtime": 5344.7171, "train_tokens_per_second": 35197.499 }, { "epoch": 16.98142737896494, "grad_norm": 3.134291410446167, "learning_rate": 7.546561978297162e-06, "loss": 1.2575, "num_input_tokens_seen": 188407336, "step": 325500, "train_runtime": 5365.0524, "train_tokens_per_second": 35117.52 }, { "epoch": 17.0, "eval_loss": 1.2976926565170288, "eval_runtime": 49.7121, "eval_samples_per_second": 771.141, "eval_steps_per_second": 96.395, "num_input_tokens_seen": 188612114, "step": 325856 }, { "epoch": 17.007512520868115, "grad_norm": 2.3629326820373535, "learning_rate": 7.481349123539233e-06, "loss": 1.2388, "num_input_tokens_seen": 188700266, "step": 326000, "train_runtime": 5436.6369, "train_tokens_per_second": 34709.007 }, { "epoch": 17.033597662771285, "grad_norm": 2.8408102989196777, "learning_rate": 7.416136268781303e-06, "loss": 1.2502, "num_input_tokens_seen": 188990786, "step": 326500, "train_runtime": 5458.7072, "train_tokens_per_second": 34621.894 }, { "epoch": 17.05968280467446, "grad_norm": 3.5564496517181396, "learning_rate": 7.350923414023372e-06, "loss": 1.2586, "num_input_tokens_seen": 189289114, "step": 327000, "train_runtime": 5481.0865, "train_tokens_per_second": 34534.962 }, { "epoch": 17.08576794657763, "grad_norm": 2.573309898376465, "learning_rate": 7.2857105592654434e-06, "loss": 1.255, "num_input_tokens_seen": 189582338, "step": 327500, "train_runtime": 5502.6097, "train_tokens_per_second": 34453.168 }, { "epoch": 17.1118530884808, "grad_norm": 2.900810718536377, "learning_rate": 7.220497704507513e-06, "loss": 1.2625, "num_input_tokens_seen": 189873506, "step": 328000, "train_runtime": 5523.9124, "train_tokens_per_second": 34373.012 }, { "epoch": 17.137938230383973, "grad_norm": 2.80328106880188, "learning_rate": 7.155284849749583e-06, "loss": 1.2621, "num_input_tokens_seen": 190163986, "step": 328500, "train_runtime": 5545.2522, "train_tokens_per_second": 34293.117 }, { "epoch": 17.164023372287144, "grad_norm": 2.8359973430633545, "learning_rate": 7.090071994991653e-06, "loss": 1.2276, "num_input_tokens_seen": 190454602, "step": 329000, "train_runtime": 5566.6958, "train_tokens_per_second": 34213.223 }, { "epoch": 17.190108514190317, "grad_norm": 2.6880123615264893, "learning_rate": 7.024859140233723e-06, "loss": 1.2414, "num_input_tokens_seen": 190749178, "step": 329500, "train_runtime": 5587.661, "train_tokens_per_second": 34137.572 }, { "epoch": 17.216193656093488, "grad_norm": 2.2190914154052734, "learning_rate": 6.959646285475793e-06, "loss": 1.2697, "num_input_tokens_seen": 191041514, "step": 330000, "train_runtime": 5608.8953, "train_tokens_per_second": 34060.453 }, { "epoch": 17.24227879799666, "grad_norm": 2.855161428451538, "learning_rate": 6.894433430717863e-06, "loss": 1.2656, "num_input_tokens_seen": 191333666, "step": 330500, "train_runtime": 5629.9424, "train_tokens_per_second": 33985.013 }, { "epoch": 17.26836393989983, "grad_norm": 2.8625779151916504, "learning_rate": 6.829220575959934e-06, "loss": 1.2595, "num_input_tokens_seen": 191622570, "step": 331000, "train_runtime": 5650.9098, "train_tokens_per_second": 33910.039 }, { "epoch": 17.294449081803005, "grad_norm": 2.630918502807617, "learning_rate": 6.764007721202003e-06, "loss": 1.2521, "num_input_tokens_seen": 191911738, "step": 331500, "train_runtime": 5671.7949, "train_tokens_per_second": 33836.156 }, { "epoch": 17.320534223706176, "grad_norm": 2.7609314918518066, "learning_rate": 6.698794866444073e-06, "loss": 1.2586, "num_input_tokens_seen": 192200466, "step": 332000, "train_runtime": 5692.6673, "train_tokens_per_second": 33762.814 }, { "epoch": 17.34661936560935, "grad_norm": 2.250659465789795, "learning_rate": 6.6335820116861445e-06, "loss": 1.2388, "num_input_tokens_seen": 192489178, "step": 332500, "train_runtime": 5713.6569, "train_tokens_per_second": 33689.313 }, { "epoch": 17.37270450751252, "grad_norm": 3.1896932125091553, "learning_rate": 6.568369156928214e-06, "loss": 1.2559, "num_input_tokens_seen": 192778922, "step": 333000, "train_runtime": 5734.7257, "train_tokens_per_second": 33616.067 }, { "epoch": 17.398789649415694, "grad_norm": 3.3856568336486816, "learning_rate": 6.503156302170284e-06, "loss": 1.267, "num_input_tokens_seen": 193066674, "step": 333500, "train_runtime": 5755.6678, "train_tokens_per_second": 33543.748 }, { "epoch": 17.424874791318864, "grad_norm": 2.031611919403076, "learning_rate": 6.437943447412355e-06, "loss": 1.2624, "num_input_tokens_seen": 193346250, "step": 334000, "train_runtime": 5776.7111, "train_tokens_per_second": 33469.953 }, { "epoch": 17.450959933222038, "grad_norm": 6.999661922454834, "learning_rate": 6.3727305926544244e-06, "loss": 1.25, "num_input_tokens_seen": 193636658, "step": 334500, "train_runtime": 5797.5543, "train_tokens_per_second": 33399.715 }, { "epoch": 17.477045075125208, "grad_norm": 3.335151433944702, "learning_rate": 6.307517737896494e-06, "loss": 1.2646, "num_input_tokens_seen": 193927418, "step": 335000, "train_runtime": 5818.5663, "train_tokens_per_second": 33329.072 }, { "epoch": 17.50313021702838, "grad_norm": 3.0118470191955566, "learning_rate": 6.242304883138565e-06, "loss": 1.2595, "num_input_tokens_seen": 194220626, "step": 335500, "train_runtime": 5839.6078, "train_tokens_per_second": 33259.19 }, { "epoch": 17.529215358931552, "grad_norm": 2.819512128829956, "learning_rate": 6.177092028380635e-06, "loss": 1.2604, "num_input_tokens_seen": 194507882, "step": 336000, "train_runtime": 5860.9533, "train_tokens_per_second": 33187.072 }, { "epoch": 17.555300500834726, "grad_norm": 2.87508225440979, "learning_rate": 6.111879173622704e-06, "loss": 1.2855, "num_input_tokens_seen": 194796762, "step": 336500, "train_runtime": 5882.4096, "train_tokens_per_second": 33115.13 }, { "epoch": 17.581385642737896, "grad_norm": 2.2459728717803955, "learning_rate": 6.046666318864775e-06, "loss": 1.2522, "num_input_tokens_seen": 195084282, "step": 337000, "train_runtime": 5904.0114, "train_tokens_per_second": 33042.667 }, { "epoch": 17.60747078464107, "grad_norm": 2.935845375061035, "learning_rate": 5.981453464106846e-06, "loss": 1.2545, "num_input_tokens_seen": 195375162, "step": 337500, "train_runtime": 5925.5481, "train_tokens_per_second": 32971.661 }, { "epoch": 17.63355592654424, "grad_norm": 3.0520784854888916, "learning_rate": 5.916240609348915e-06, "loss": 1.2587, "num_input_tokens_seen": 195666498, "step": 338000, "train_runtime": 5946.9563, "train_tokens_per_second": 32901.957 }, { "epoch": 17.659641068447414, "grad_norm": 1.9762933254241943, "learning_rate": 5.851027754590985e-06, "loss": 1.2714, "num_input_tokens_seen": 195952418, "step": 338500, "train_runtime": 5968.684, "train_tokens_per_second": 32830.087 }, { "epoch": 17.685726210350584, "grad_norm": 3.0459036827087402, "learning_rate": 5.785814899833055e-06, "loss": 1.2819, "num_input_tokens_seen": 196243738, "step": 339000, "train_runtime": 5990.4534, "train_tokens_per_second": 32759.413 }, { "epoch": 17.711811352253758, "grad_norm": 2.7781834602355957, "learning_rate": 5.7206020450751255e-06, "loss": 1.253, "num_input_tokens_seen": 196532034, "step": 339500, "train_runtime": 6011.9799, "train_tokens_per_second": 32690.068 }, { "epoch": 17.737896494156928, "grad_norm": 3.383931875228882, "learning_rate": 5.655389190317196e-06, "loss": 1.2521, "num_input_tokens_seen": 196822202, "step": 340000, "train_runtime": 6033.5216, "train_tokens_per_second": 32621.446 }, { "epoch": 17.7639816360601, "grad_norm": 2.72835373878479, "learning_rate": 5.590176335559266e-06, "loss": 1.2494, "num_input_tokens_seen": 197110802, "step": 340500, "train_runtime": 6054.9604, "train_tokens_per_second": 32553.607 }, { "epoch": 17.790066777963272, "grad_norm": 2.868680000305176, "learning_rate": 5.524963480801336e-06, "loss": 1.2436, "num_input_tokens_seen": 197396914, "step": 341000, "train_runtime": 6076.2211, "train_tokens_per_second": 32486.789 }, { "epoch": 17.816151919866446, "grad_norm": 2.985006809234619, "learning_rate": 5.459750626043405e-06, "loss": 1.269, "num_input_tokens_seen": 197687178, "step": 341500, "train_runtime": 6097.2778, "train_tokens_per_second": 32422.203 }, { "epoch": 17.842237061769616, "grad_norm": 2.457155704498291, "learning_rate": 5.394537771285476e-06, "loss": 1.2725, "num_input_tokens_seen": 197978106, "step": 342000, "train_runtime": 6118.3065, "train_tokens_per_second": 32358.318 }, { "epoch": 17.86832220367279, "grad_norm": 2.6323978900909424, "learning_rate": 5.329324916527547e-06, "loss": 1.2691, "num_input_tokens_seen": 198267826, "step": 342500, "train_runtime": 6138.9907, "train_tokens_per_second": 32296.486 }, { "epoch": 17.89440734557596, "grad_norm": 2.9683570861816406, "learning_rate": 5.264112061769616e-06, "loss": 1.2606, "num_input_tokens_seen": 198555794, "step": 343000, "train_runtime": 6159.8347, "train_tokens_per_second": 32233.948 }, { "epoch": 17.92049248747913, "grad_norm": 2.6426734924316406, "learning_rate": 5.198899207011686e-06, "loss": 1.2572, "num_input_tokens_seen": 198837802, "step": 343500, "train_runtime": 6180.7096, "train_tokens_per_second": 32170.708 }, { "epoch": 17.946577629382304, "grad_norm": 2.743959426879883, "learning_rate": 5.133686352253757e-06, "loss": 1.2584, "num_input_tokens_seen": 199125674, "step": 344000, "train_runtime": 6201.5223, "train_tokens_per_second": 32109.16 }, { "epoch": 17.972662771285474, "grad_norm": 2.5115082263946533, "learning_rate": 5.0684734974958266e-06, "loss": 1.2496, "num_input_tokens_seen": 199418034, "step": 344500, "train_runtime": 6222.8136, "train_tokens_per_second": 32046.281 }, { "epoch": 17.998747913188648, "grad_norm": 2.3742177486419678, "learning_rate": 5.003260642737897e-06, "loss": 1.2601, "num_input_tokens_seen": 199702842, "step": 345000, "train_runtime": 6244.2476, "train_tokens_per_second": 31981.89 }, { "epoch": 18.0, "eval_loss": 1.296248197555542, "eval_runtime": 50.1469, "eval_samples_per_second": 764.454, "eval_steps_per_second": 95.559, "num_input_tokens_seen": 199715854, "step": 345024 }, { "epoch": 18.02483305509182, "grad_norm": 2.2281575202941895, "learning_rate": 4.938047787979966e-06, "loss": 1.2228, "num_input_tokens_seen": 199990102, "step": 345500, "train_runtime": 6316.4242, "train_tokens_per_second": 31661.917 }, { "epoch": 18.050918196994992, "grad_norm": 2.840803384780884, "learning_rate": 4.872834933222037e-06, "loss": 1.2581, "num_input_tokens_seen": 200279302, "step": 346000, "train_runtime": 6337.2304, "train_tokens_per_second": 31603.601 }, { "epoch": 18.077003338898162, "grad_norm": 2.4082562923431396, "learning_rate": 4.807622078464107e-06, "loss": 1.2566, "num_input_tokens_seen": 200566038, "step": 346500, "train_runtime": 6358.142, "train_tokens_per_second": 31544.756 }, { "epoch": 18.103088480801336, "grad_norm": 3.136262893676758, "learning_rate": 4.742409223706177e-06, "loss": 1.2631, "num_input_tokens_seen": 200854406, "step": 347000, "train_runtime": 6379.2543, "train_tokens_per_second": 31485.562 }, { "epoch": 18.129173622704506, "grad_norm": 2.251553535461426, "learning_rate": 4.677196368948248e-06, "loss": 1.2434, "num_input_tokens_seen": 201141734, "step": 347500, "train_runtime": 6400.5038, "train_tokens_per_second": 31425.922 }, { "epoch": 18.15525876460768, "grad_norm": 2.587162971496582, "learning_rate": 4.6119835141903175e-06, "loss": 1.2481, "num_input_tokens_seen": 201429926, "step": 348000, "train_runtime": 6421.5455, "train_tokens_per_second": 31367.827 }, { "epoch": 18.18134390651085, "grad_norm": 2.8229830265045166, "learning_rate": 4.546770659432387e-06, "loss": 1.2536, "num_input_tokens_seen": 201720902, "step": 348500, "train_runtime": 6442.673, "train_tokens_per_second": 31310.126 }, { "epoch": 18.207429048414024, "grad_norm": 2.943593740463257, "learning_rate": 4.481557804674458e-06, "loss": 1.2687, "num_input_tokens_seen": 202015494, "step": 349000, "train_runtime": 6463.8115, "train_tokens_per_second": 31253.308 }, { "epoch": 18.233514190317194, "grad_norm": 2.8468620777130127, "learning_rate": 4.416344949916528e-06, "loss": 1.2475, "num_input_tokens_seen": 202301734, "step": 349500, "train_runtime": 6484.7729, "train_tokens_per_second": 31196.426 }, { "epoch": 18.25959933222037, "grad_norm": 2.5584495067596436, "learning_rate": 4.351132095158597e-06, "loss": 1.2464, "num_input_tokens_seen": 202582798, "step": 350000, "train_runtime": 6505.8233, "train_tokens_per_second": 31138.688 }, { "epoch": 18.28568447412354, "grad_norm": 3.42409348487854, "learning_rate": 4.285919240400668e-06, "loss": 1.2696, "num_input_tokens_seen": 202872662, "step": 350500, "train_runtime": 6526.8475, "train_tokens_per_second": 31082.795 }, { "epoch": 18.311769616026712, "grad_norm": 2.7311031818389893, "learning_rate": 4.220706385642738e-06, "loss": 1.249, "num_input_tokens_seen": 203159246, "step": 351000, "train_runtime": 6547.7257, "train_tokens_per_second": 31027.452 }, { "epoch": 18.337854757929883, "grad_norm": 3.2200024127960205, "learning_rate": 4.155493530884808e-06, "loss": 1.2766, "num_input_tokens_seen": 203449598, "step": 351500, "train_runtime": 6568.6802, "train_tokens_per_second": 30972.675 }, { "epoch": 18.363939899833056, "grad_norm": 3.4853382110595703, "learning_rate": 4.090280676126879e-06, "loss": 1.2478, "num_input_tokens_seen": 203737350, "step": 352000, "train_runtime": 6589.7847, "train_tokens_per_second": 30917.148 }, { "epoch": 18.390025041736227, "grad_norm": 2.6248600482940674, "learning_rate": 4.025067821368948e-06, "loss": 1.2461, "num_input_tokens_seen": 204033470, "step": 352500, "train_runtime": 6610.8585, "train_tokens_per_second": 30863.385 }, { "epoch": 18.4161101836394, "grad_norm": 3.1528148651123047, "learning_rate": 3.9598549666110185e-06, "loss": 1.2487, "num_input_tokens_seen": 204320822, "step": 353000, "train_runtime": 6632.008, "train_tokens_per_second": 30808.289 }, { "epoch": 18.44219532554257, "grad_norm": 2.4708855152130127, "learning_rate": 3.894642111853088e-06, "loss": 1.2493, "num_input_tokens_seen": 204615126, "step": 353500, "train_runtime": 6653.2853, "train_tokens_per_second": 30753.998 }, { "epoch": 18.468280467445744, "grad_norm": 2.8539340496063232, "learning_rate": 3.829429257095159e-06, "loss": 1.2469, "num_input_tokens_seen": 204908238, "step": 354000, "train_runtime": 6674.3465, "train_tokens_per_second": 30700.869 }, { "epoch": 18.494365609348915, "grad_norm": 3.047869920730591, "learning_rate": 3.764216402337229e-06, "loss": 1.2571, "num_input_tokens_seen": 205200078, "step": 354500, "train_runtime": 6695.2164, "train_tokens_per_second": 30648.759 }, { "epoch": 18.52045075125209, "grad_norm": 3.70831298828125, "learning_rate": 3.699003547579299e-06, "loss": 1.2544, "num_input_tokens_seen": 205493198, "step": 355000, "train_runtime": 6716.2607, "train_tokens_per_second": 30596.37 }, { "epoch": 18.54653589315526, "grad_norm": 2.9419515132904053, "learning_rate": 3.633790692821369e-06, "loss": 1.2406, "num_input_tokens_seen": 205782654, "step": 355500, "train_runtime": 6737.4279, "train_tokens_per_second": 30543.207 }, { "epoch": 18.572621035058432, "grad_norm": 3.3979151248931885, "learning_rate": 3.5685778380634397e-06, "loss": 1.2387, "num_input_tokens_seen": 206078310, "step": 356000, "train_runtime": 6758.4267, "train_tokens_per_second": 30492.054 }, { "epoch": 18.598706176961603, "grad_norm": 2.5537753105163574, "learning_rate": 3.503364983305509e-06, "loss": 1.2454, "num_input_tokens_seen": 206364678, "step": 356500, "train_runtime": 6779.3225, "train_tokens_per_second": 30440.31 }, { "epoch": 18.624791318864773, "grad_norm": 3.0519020557403564, "learning_rate": 3.4381521285475796e-06, "loss": 1.2617, "num_input_tokens_seen": 206651694, "step": 357000, "train_runtime": 6800.2161, "train_tokens_per_second": 30388.989 }, { "epoch": 18.650876460767947, "grad_norm": 2.832632541656494, "learning_rate": 3.3729392737896494e-06, "loss": 1.2594, "num_input_tokens_seen": 206935862, "step": 357500, "train_runtime": 6821.3364, "train_tokens_per_second": 30336.557 }, { "epoch": 18.676961602671117, "grad_norm": 3.5510575771331787, "learning_rate": 3.3077264190317196e-06, "loss": 1.2576, "num_input_tokens_seen": 207225006, "step": 358000, "train_runtime": 6842.5612, "train_tokens_per_second": 30284.713 }, { "epoch": 18.70304674457429, "grad_norm": 2.7018370628356934, "learning_rate": 3.24251356427379e-06, "loss": 1.2524, "num_input_tokens_seen": 207518494, "step": 358500, "train_runtime": 6863.6965, "train_tokens_per_second": 30234.218 }, { "epoch": 18.72913188647746, "grad_norm": 2.3896238803863525, "learning_rate": 3.1773007095158596e-06, "loss": 1.2787, "num_input_tokens_seen": 207806854, "step": 359000, "train_runtime": 6884.7293, "train_tokens_per_second": 30183.736 }, { "epoch": 18.755217028380635, "grad_norm": 2.3457329273223877, "learning_rate": 3.11208785475793e-06, "loss": 1.2612, "num_input_tokens_seen": 208104358, "step": 359500, "train_runtime": 6906.0499, "train_tokens_per_second": 30133.631 }, { "epoch": 18.781302170283805, "grad_norm": 3.7799017429351807, "learning_rate": 3.046875e-06, "loss": 1.2278, "num_input_tokens_seen": 208395230, "step": 360000, "train_runtime": 6927.2417, "train_tokens_per_second": 30083.436 }, { "epoch": 18.80738731218698, "grad_norm": 2.9162731170654297, "learning_rate": 2.98166214524207e-06, "loss": 1.2495, "num_input_tokens_seen": 208684190, "step": 360500, "train_runtime": 6948.3051, "train_tokens_per_second": 30033.826 }, { "epoch": 18.83347245409015, "grad_norm": 3.2956576347351074, "learning_rate": 2.9164492904841403e-06, "loss": 1.2556, "num_input_tokens_seen": 208972206, "step": 361000, "train_runtime": 6969.2518, "train_tokens_per_second": 29984.884 }, { "epoch": 18.859557595993323, "grad_norm": 2.974874496459961, "learning_rate": 2.8512364357262105e-06, "loss": 1.2433, "num_input_tokens_seen": 209260382, "step": 361500, "train_runtime": 6990.1944, "train_tokens_per_second": 29936.275 }, { "epoch": 18.885642737896493, "grad_norm": 2.385434150695801, "learning_rate": 2.7860235809682807e-06, "loss": 1.2529, "num_input_tokens_seen": 209544430, "step": 362000, "train_runtime": 7011.1006, "train_tokens_per_second": 29887.523 }, { "epoch": 18.911727879799667, "grad_norm": 2.289966344833374, "learning_rate": 2.7208107262103505e-06, "loss": 1.262, "num_input_tokens_seen": 209834774, "step": 362500, "train_runtime": 7032.3451, "train_tokens_per_second": 29838.521 }, { "epoch": 18.937813021702837, "grad_norm": 2.8906939029693604, "learning_rate": 2.655597871452421e-06, "loss": 1.2716, "num_input_tokens_seen": 210123054, "step": 363000, "train_runtime": 7053.702, "train_tokens_per_second": 29789.046 }, { "epoch": 18.96389816360601, "grad_norm": 3.4153401851654053, "learning_rate": 2.590385016694491e-06, "loss": 1.2774, "num_input_tokens_seen": 210412382, "step": 363500, "train_runtime": 7075.1001, "train_tokens_per_second": 29739.845 }, { "epoch": 18.98998330550918, "grad_norm": 3.0862789154052734, "learning_rate": 2.525172161936561e-06, "loss": 1.2665, "num_input_tokens_seen": 210705166, "step": 364000, "train_runtime": 7096.4466, "train_tokens_per_second": 29691.644 }, { "epoch": 19.0, "eval_loss": 1.296281337738037, "eval_runtime": 51.4225, "eval_samples_per_second": 745.49, "eval_steps_per_second": 93.189, "num_input_tokens_seen": 210813428, "step": 364192 }, { "epoch": 19.016068447412355, "grad_norm": 2.282921314239502, "learning_rate": 2.459959307178631e-06, "loss": 1.2246, "num_input_tokens_seen": 210992604, "step": 364500, "train_runtime": 7170.5972, "train_tokens_per_second": 29424.69 }, { "epoch": 19.042153589315525, "grad_norm": 2.1377789974212646, "learning_rate": 2.3947464524207014e-06, "loss": 1.2377, "num_input_tokens_seen": 211280204, "step": 365000, "train_runtime": 7192.2041, "train_tokens_per_second": 29376.28 }, { "epoch": 19.0682387312187, "grad_norm": 3.454662799835205, "learning_rate": 2.3295335976627716e-06, "loss": 1.2658, "num_input_tokens_seen": 211569500, "step": 365500, "train_runtime": 7213.663, "train_tokens_per_second": 29328.997 }, { "epoch": 19.09432387312187, "grad_norm": 2.45365309715271, "learning_rate": 2.2643207429048414e-06, "loss": 1.2296, "num_input_tokens_seen": 211851156, "step": 366000, "train_runtime": 7235.0182, "train_tokens_per_second": 29281.358 }, { "epoch": 19.120409015025043, "grad_norm": 2.841344118118286, "learning_rate": 2.1991078881469116e-06, "loss": 1.2817, "num_input_tokens_seen": 212137660, "step": 366500, "train_runtime": 7256.1778, "train_tokens_per_second": 29235.455 }, { "epoch": 19.146494156928213, "grad_norm": 2.386323928833008, "learning_rate": 2.1338950333889818e-06, "loss": 1.2336, "num_input_tokens_seen": 212425948, "step": 367000, "train_runtime": 7277.2221, "train_tokens_per_second": 29190.527 }, { "epoch": 19.172579298831387, "grad_norm": 3.1663670539855957, "learning_rate": 2.068682178631052e-06, "loss": 1.2755, "num_input_tokens_seen": 212713028, "step": 367500, "train_runtime": 7298.2567, "train_tokens_per_second": 29145.731 }, { "epoch": 19.198664440734557, "grad_norm": 2.1720612049102783, "learning_rate": 2.0034693238731217e-06, "loss": 1.2636, "num_input_tokens_seen": 213002716, "step": 368000, "train_runtime": 7318.2656, "train_tokens_per_second": 29105.628 }, { "epoch": 19.22474958263773, "grad_norm": 2.9212682247161865, "learning_rate": 1.938256469115192e-06, "loss": 1.2423, "num_input_tokens_seen": 213288196, "step": 368500, "train_runtime": 7337.9518, "train_tokens_per_second": 29066.448 }, { "epoch": 19.2508347245409, "grad_norm": 2.7475364208221436, "learning_rate": 1.8730436143572623e-06, "loss": 1.2443, "num_input_tokens_seen": 213574692, "step": 369000, "train_runtime": 7356.9693, "train_tokens_per_second": 29030.255 }, { "epoch": 19.276919866444075, "grad_norm": 2.422600030899048, "learning_rate": 1.8078307595993323e-06, "loss": 1.2201, "num_input_tokens_seen": 213864116, "step": 369500, "train_runtime": 7375.605, "train_tokens_per_second": 28996.146 }, { "epoch": 19.303005008347245, "grad_norm": 2.7195160388946533, "learning_rate": 1.7426179048414023e-06, "loss": 1.2481, "num_input_tokens_seen": 214150676, "step": 370000, "train_runtime": 7396.4144, "train_tokens_per_second": 28953.31 }, { "epoch": 19.32909015025042, "grad_norm": 2.50443172454834, "learning_rate": 1.6774050500834725e-06, "loss": 1.2302, "num_input_tokens_seen": 214440244, "step": 370500, "train_runtime": 7416.7831, "train_tokens_per_second": 28912.837 }, { "epoch": 19.35517529215359, "grad_norm": 2.887474775314331, "learning_rate": 1.6121921953255427e-06, "loss": 1.2449, "num_input_tokens_seen": 214730132, "step": 371000, "train_runtime": 7437.2553, "train_tokens_per_second": 28872.228 }, { "epoch": 19.38126043405676, "grad_norm": 2.5884950160980225, "learning_rate": 1.5469793405676129e-06, "loss": 1.2521, "num_input_tokens_seen": 215019220, "step": 371500, "train_runtime": 7457.6502, "train_tokens_per_second": 28832.033 }, { "epoch": 19.407345575959933, "grad_norm": 2.357685089111328, "learning_rate": 1.4817664858096828e-06, "loss": 1.2443, "num_input_tokens_seen": 215310132, "step": 372000, "train_runtime": 7478.2575, "train_tokens_per_second": 28791.484 }, { "epoch": 19.433430717863104, "grad_norm": 2.3335018157958984, "learning_rate": 1.416553631051753e-06, "loss": 1.2469, "num_input_tokens_seen": 215600084, "step": 372500, "train_runtime": 7498.6623, "train_tokens_per_second": 28751.806 }, { "epoch": 19.459515859766277, "grad_norm": 2.7641124725341797, "learning_rate": 1.351340776293823e-06, "loss": 1.228, "num_input_tokens_seen": 215888340, "step": 373000, "train_runtime": 7519.0798, "train_tokens_per_second": 28712.069 }, { "epoch": 19.485601001669448, "grad_norm": 2.7597529888153076, "learning_rate": 1.2861279215358932e-06, "loss": 1.2499, "num_input_tokens_seen": 216178932, "step": 373500, "train_runtime": 7539.4463, "train_tokens_per_second": 28673.051 }, { "epoch": 19.51168614357262, "grad_norm": 2.3733975887298584, "learning_rate": 1.2209150667779632e-06, "loss": 1.2484, "num_input_tokens_seen": 216470580, "step": 374000, "train_runtime": 7559.9641, "train_tokens_per_second": 28633.811 }, { "epoch": 19.53777128547579, "grad_norm": 2.3238165378570557, "learning_rate": 1.1557022120200334e-06, "loss": 1.2364, "num_input_tokens_seen": 216763740, "step": 374500, "train_runtime": 7579.7748, "train_tokens_per_second": 28597.649 }, { "epoch": 19.563856427378965, "grad_norm": 2.8229446411132812, "learning_rate": 1.0904893572621036e-06, "loss": 1.2358, "num_input_tokens_seen": 217053292, "step": 375000, "train_runtime": 7598.7817, "train_tokens_per_second": 28564.223 }, { "epoch": 19.589941569282136, "grad_norm": 2.4836158752441406, "learning_rate": 1.0252765025041738e-06, "loss": 1.2606, "num_input_tokens_seen": 217344428, "step": 375500, "train_runtime": 7618.304, "train_tokens_per_second": 28529.241 }, { "epoch": 19.61602671118531, "grad_norm": 2.7675931453704834, "learning_rate": 9.600636477462437e-07, "loss": 1.2629, "num_input_tokens_seen": 217634524, "step": 376000, "train_runtime": 7637.288, "train_tokens_per_second": 28496.31 }, { "epoch": 19.64211185308848, "grad_norm": 2.331380844116211, "learning_rate": 8.948507929883139e-07, "loss": 1.2521, "num_input_tokens_seen": 217924508, "step": 376500, "train_runtime": 7656.4955, "train_tokens_per_second": 28462.697 }, { "epoch": 19.668196994991654, "grad_norm": 3.3577489852905273, "learning_rate": 8.29637938230384e-07, "loss": 1.2571, "num_input_tokens_seen": 218217084, "step": 377000, "train_runtime": 7675.4197, "train_tokens_per_second": 28430.639 }, { "epoch": 19.694282136894824, "grad_norm": 2.872344970703125, "learning_rate": 7.644250834724542e-07, "loss": 1.271, "num_input_tokens_seen": 218508180, "step": 377500, "train_runtime": 7694.4779, "train_tokens_per_second": 28398.052 }, { "epoch": 19.720367278797998, "grad_norm": 2.9395909309387207, "learning_rate": 6.992122287145243e-07, "loss": 1.25, "num_input_tokens_seen": 218798076, "step": 378000, "train_runtime": 7712.7627, "train_tokens_per_second": 28368.314 }, { "epoch": 19.746452420701168, "grad_norm": 2.5424513816833496, "learning_rate": 6.339993739565944e-07, "loss": 1.2817, "num_input_tokens_seen": 219089308, "step": 378500, "train_runtime": 7731.9549, "train_tokens_per_second": 28335.565 }, { "epoch": 19.77253756260434, "grad_norm": 2.9725682735443115, "learning_rate": 5.687865191986645e-07, "loss": 1.2418, "num_input_tokens_seen": 219383604, "step": 379000, "train_runtime": 7751.6984, "train_tokens_per_second": 28301.36 }, { "epoch": 19.798622704507512, "grad_norm": 3.3688950538635254, "learning_rate": 5.035736644407346e-07, "loss": 1.2449, "num_input_tokens_seen": 219679124, "step": 379500, "train_runtime": 7771.8118, "train_tokens_per_second": 28266.14 }, { "epoch": 19.824707846410686, "grad_norm": 2.398789882659912, "learning_rate": 4.3836080968280473e-07, "loss": 1.2362, "num_input_tokens_seen": 219963660, "step": 380000, "train_runtime": 7790.6642, "train_tokens_per_second": 28234.263 }, { "epoch": 19.850792988313856, "grad_norm": 2.845128059387207, "learning_rate": 3.731479549248748e-07, "loss": 1.2803, "num_input_tokens_seen": 220255900, "step": 380500, "train_runtime": 7809.3731, "train_tokens_per_second": 28204.044 }, { "epoch": 19.87687813021703, "grad_norm": 2.6180248260498047, "learning_rate": 3.079351001669449e-07, "loss": 1.2634, "num_input_tokens_seen": 220547100, "step": 381000, "train_runtime": 7827.8518, "train_tokens_per_second": 28174.665 }, { "epoch": 19.9029632721202, "grad_norm": 2.5833303928375244, "learning_rate": 2.4272224540901504e-07, "loss": 1.2482, "num_input_tokens_seen": 220835100, "step": 381500, "train_runtime": 7847.7813, "train_tokens_per_second": 28139.813 }, { "epoch": 19.929048414023374, "grad_norm": 2.800402879714966, "learning_rate": 1.7750939065108515e-07, "loss": 1.2335, "num_input_tokens_seen": 221122004, "step": 382000, "train_runtime": 7866.365, "train_tokens_per_second": 28109.807 }, { "epoch": 19.955133555926544, "grad_norm": 2.8612380027770996, "learning_rate": 1.1229653589315525e-07, "loss": 1.2251, "num_input_tokens_seen": 221409964, "step": 382500, "train_runtime": 7884.8107, "train_tokens_per_second": 28080.568 }, { "epoch": 19.981218697829718, "grad_norm": 3.3842055797576904, "learning_rate": 4.7083681135225376e-08, "loss": 1.2888, "num_input_tokens_seen": 221700476, "step": 383000, "train_runtime": 7903.5539, "train_tokens_per_second": 28050.732 }, { "epoch": 20.0, "eval_loss": 1.2961275577545166, "eval_runtime": 46.2863, "eval_samples_per_second": 828.215, "eval_steps_per_second": 103.53, "num_input_tokens_seen": 221910640, "step": 383360 }, { "epoch": 20.0, "num_input_tokens_seen": 221910640, "step": 383360, "total_flos": 8.056851732185088e+16, "train_loss": 0.641815161904031, "train_runtime": 7964.4512, "train_samples_per_second": 385.056, "train_steps_per_second": 48.134, "train_tokens_per_second": 27853.103 } ], "logging_steps": 500, "max_steps": 383360, "num_input_tokens_seen": 221910640, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.056851732185088e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }