{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.162230671736376, "eval_steps": 100, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.050697084917617236, "grad_norm": 3.785501003265381, "learning_rate": 4.999948617395915e-05, "loss": 0.8619, "num_input_tokens_seen": 162496, "step": 5 }, { "epoch": 0.10139416983523447, "grad_norm": 1.068943738937378, "learning_rate": 4.9997944716957985e-05, "loss": 0.2854, "num_input_tokens_seen": 326848, "step": 10 }, { "epoch": 0.1520912547528517, "grad_norm": 0.6261367201805115, "learning_rate": 4.9995375692359755e-05, "loss": 0.2279, "num_input_tokens_seen": 491648, "step": 15 }, { "epoch": 0.20278833967046894, "grad_norm": 0.27561241388320923, "learning_rate": 4.9991779205767e-05, "loss": 0.1964, "num_input_tokens_seen": 656640, "step": 20 }, { "epoch": 0.2534854245880862, "grad_norm": 0.2867205739021301, "learning_rate": 4.99871554050172e-05, "loss": 0.2034, "num_input_tokens_seen": 820416, "step": 25 }, { "epoch": 0.3041825095057034, "grad_norm": 0.2154628187417984, "learning_rate": 4.9981504480176696e-05, "loss": 0.1831, "num_input_tokens_seen": 984000, "step": 30 }, { "epoch": 0.3548795944233207, "grad_norm": 0.39752644300460815, "learning_rate": 4.997482666353287e-05, "loss": 0.1736, "num_input_tokens_seen": 1150080, "step": 35 }, { "epoch": 0.4055766793409379, "grad_norm": 0.40215054154396057, "learning_rate": 4.996712222958461e-05, "loss": 0.167, "num_input_tokens_seen": 1313152, "step": 40 }, { "epoch": 0.45627376425855515, "grad_norm": 0.24769514799118042, "learning_rate": 4.9958391495031026e-05, "loss": 0.176, "num_input_tokens_seen": 1479104, "step": 45 }, { "epoch": 0.5069708491761724, "grad_norm": 0.5237451195716858, "learning_rate": 4.994863481875841e-05, "loss": 0.1769, "num_input_tokens_seen": 1645120, "step": 50 }, { "epoch": 0.5576679340937896, "grad_norm": 0.4815216362476349, "learning_rate": 4.993785260182552e-05, "loss": 0.1544, "num_input_tokens_seen": 1809024, "step": 55 }, { "epoch": 0.6083650190114068, "grad_norm": 0.24125471711158752, "learning_rate": 4.992604528744705e-05, "loss": 0.1466, "num_input_tokens_seen": 1973696, "step": 60 }, { "epoch": 0.6590621039290241, "grad_norm": 0.43303415179252625, "learning_rate": 4.991321336097546e-05, "loss": 0.1475, "num_input_tokens_seen": 2138560, "step": 65 }, { "epoch": 0.7097591888466414, "grad_norm": 0.41517049074172974, "learning_rate": 4.989935734988098e-05, "loss": 0.1248, "num_input_tokens_seen": 2302016, "step": 70 }, { "epoch": 0.7604562737642585, "grad_norm": 0.3203924000263214, "learning_rate": 4.9884477823729956e-05, "loss": 0.116, "num_input_tokens_seen": 2465664, "step": 75 }, { "epoch": 0.8111533586818758, "grad_norm": 0.3540217876434326, "learning_rate": 4.986857539416144e-05, "loss": 0.1304, "num_input_tokens_seen": 2628544, "step": 80 }, { "epoch": 0.861850443599493, "grad_norm": 0.6859760880470276, "learning_rate": 4.9851650714862006e-05, "loss": 0.1265, "num_input_tokens_seen": 2792064, "step": 85 }, { "epoch": 0.9125475285171103, "grad_norm": 0.43165433406829834, "learning_rate": 4.983370448153896e-05, "loss": 0.1324, "num_input_tokens_seen": 2955968, "step": 90 }, { "epoch": 0.9632446134347274, "grad_norm": 0.37339577078819275, "learning_rate": 4.981473743189163e-05, "loss": 0.1093, "num_input_tokens_seen": 3118976, "step": 95 }, { "epoch": 1.020278833967047, "grad_norm": 0.3954649269580841, "learning_rate": 4.979475034558115e-05, "loss": 0.1196, "num_input_tokens_seen": 3291584, "step": 100 }, { "epoch": 1.020278833967047, "eval_loss": 0.09924378991127014, "eval_runtime": 104.8348, "eval_samples_per_second": 6.687, "eval_steps_per_second": 0.839, "num_input_tokens_seen": 3291584, "step": 100 }, { "epoch": 1.0709759188846641, "grad_norm": 0.405510812997818, "learning_rate": 4.977374404419837e-05, "loss": 0.0985, "num_input_tokens_seen": 3453568, "step": 105 }, { "epoch": 1.1216730038022813, "grad_norm": 0.3010367453098297, "learning_rate": 4.975171939123005e-05, "loss": 0.0914, "num_input_tokens_seen": 3618304, "step": 110 }, { "epoch": 1.1723700887198987, "grad_norm": 0.32824525237083435, "learning_rate": 4.9728677292023405e-05, "loss": 0.0872, "num_input_tokens_seen": 3783424, "step": 115 }, { "epoch": 1.2230671736375158, "grad_norm": 0.6524387001991272, "learning_rate": 4.970461869374889e-05, "loss": 0.0986, "num_input_tokens_seen": 3947136, "step": 120 }, { "epoch": 1.2737642585551332, "grad_norm": 0.5022702813148499, "learning_rate": 4.967954458536126e-05, "loss": 0.0929, "num_input_tokens_seen": 4113344, "step": 125 }, { "epoch": 1.3244613434727504, "grad_norm": 0.6579980850219727, "learning_rate": 4.965345599755887e-05, "loss": 0.1045, "num_input_tokens_seen": 4277312, "step": 130 }, { "epoch": 1.3751584283903675, "grad_norm": 0.3814283013343811, "learning_rate": 4.962635400274142e-05, "loss": 0.0759, "num_input_tokens_seen": 4441344, "step": 135 }, { "epoch": 1.4258555133079849, "grad_norm": 0.4254150986671448, "learning_rate": 4.959823971496574e-05, "loss": 0.1045, "num_input_tokens_seen": 4605056, "step": 140 }, { "epoch": 1.476552598225602, "grad_norm": 0.7336615324020386, "learning_rate": 4.95691142899001e-05, "loss": 0.1082, "num_input_tokens_seen": 4767872, "step": 145 }, { "epoch": 1.5272496831432192, "grad_norm": 0.42282769083976746, "learning_rate": 4.9538978924776634e-05, "loss": 0.0754, "num_input_tokens_seen": 4931392, "step": 150 }, { "epoch": 1.5779467680608366, "grad_norm": 0.3356935381889343, "learning_rate": 4.9507834858342186e-05, "loss": 0.0889, "num_input_tokens_seen": 5095552, "step": 155 }, { "epoch": 1.6286438529784537, "grad_norm": 0.2882402539253235, "learning_rate": 4.9475683370807326e-05, "loss": 0.0809, "num_input_tokens_seen": 5258624, "step": 160 }, { "epoch": 1.6793409378960709, "grad_norm": 0.6393563151359558, "learning_rate": 4.9442525783793794e-05, "loss": 0.0765, "num_input_tokens_seen": 5422464, "step": 165 }, { "epoch": 1.7300380228136882, "grad_norm": 0.46365517377853394, "learning_rate": 4.940836346028011e-05, "loss": 0.0777, "num_input_tokens_seen": 5586944, "step": 170 }, { "epoch": 1.7807351077313056, "grad_norm": 0.7445991039276123, "learning_rate": 4.937319780454559e-05, "loss": 0.0776, "num_input_tokens_seen": 5751104, "step": 175 }, { "epoch": 1.8314321926489225, "grad_norm": 0.4604029357433319, "learning_rate": 4.933703026211262e-05, "loss": 0.0753, "num_input_tokens_seen": 5916608, "step": 180 }, { "epoch": 1.88212927756654, "grad_norm": 0.5083752274513245, "learning_rate": 4.9299862319687204e-05, "loss": 0.0787, "num_input_tokens_seen": 6080768, "step": 185 }, { "epoch": 1.9328263624841573, "grad_norm": 0.3158109188079834, "learning_rate": 4.926169550509787e-05, "loss": 0.0712, "num_input_tokens_seen": 6244992, "step": 190 }, { "epoch": 1.9835234474017744, "grad_norm": 0.2848895788192749, "learning_rate": 4.9222531387232885e-05, "loss": 0.0657, "num_input_tokens_seen": 6409536, "step": 195 }, { "epoch": 2.040557667934094, "grad_norm": 0.6200438737869263, "learning_rate": 4.9182371575975736e-05, "loss": 0.0632, "num_input_tokens_seen": 6580928, "step": 200 }, { "epoch": 2.040557667934094, "eval_loss": 0.07071041315793991, "eval_runtime": 104.6768, "eval_samples_per_second": 6.697, "eval_steps_per_second": 0.841, "num_input_tokens_seen": 6580928, "step": 200 }, { "epoch": 2.091254752851711, "grad_norm": 0.3254186511039734, "learning_rate": 4.914121772213898e-05, "loss": 0.0709, "num_input_tokens_seen": 6746432, "step": 205 }, { "epoch": 2.1419518377693283, "grad_norm": 0.42892560362815857, "learning_rate": 4.909907151739633e-05, "loss": 0.0523, "num_input_tokens_seen": 6909760, "step": 210 }, { "epoch": 2.1926489226869457, "grad_norm": 0.349162757396698, "learning_rate": 4.905593469421323e-05, "loss": 0.0665, "num_input_tokens_seen": 7073728, "step": 215 }, { "epoch": 2.2433460076045626, "grad_norm": 0.4294612407684326, "learning_rate": 4.9011809025775486e-05, "loss": 0.0591, "num_input_tokens_seen": 7238912, "step": 220 }, { "epoch": 2.29404309252218, "grad_norm": 0.6912519335746765, "learning_rate": 4.8966696325916515e-05, "loss": 0.0599, "num_input_tokens_seen": 7404480, "step": 225 }, { "epoch": 2.3447401774397973, "grad_norm": 0.8554105758666992, "learning_rate": 4.892059844904272e-05, "loss": 0.0732, "num_input_tokens_seen": 7568384, "step": 230 }, { "epoch": 2.3954372623574143, "grad_norm": 0.4321288764476776, "learning_rate": 4.887351729005726e-05, "loss": 0.0561, "num_input_tokens_seen": 7732544, "step": 235 }, { "epoch": 2.4461343472750317, "grad_norm": 0.43583258986473083, "learning_rate": 4.882545478428218e-05, "loss": 0.064, "num_input_tokens_seen": 7894656, "step": 240 }, { "epoch": 2.496831432192649, "grad_norm": 0.4895538091659546, "learning_rate": 4.877641290737884e-05, "loss": 0.054, "num_input_tokens_seen": 8057216, "step": 245 }, { "epoch": 2.5475285171102664, "grad_norm": 0.4905809462070465, "learning_rate": 4.8726393675266716e-05, "loss": 0.0635, "num_input_tokens_seen": 8221248, "step": 250 }, { "epoch": 2.5982256020278833, "grad_norm": 0.8135025501251221, "learning_rate": 4.8675399144040537e-05, "loss": 0.0519, "num_input_tokens_seen": 8384512, "step": 255 }, { "epoch": 2.6489226869455007, "grad_norm": 0.4855857193470001, "learning_rate": 4.862343140988573e-05, "loss": 0.0554, "num_input_tokens_seen": 8549440, "step": 260 }, { "epoch": 2.6996197718631176, "grad_norm": 0.39417335391044617, "learning_rate": 4.8570492608992325e-05, "loss": 0.0547, "num_input_tokens_seen": 8713856, "step": 265 }, { "epoch": 2.750316856780735, "grad_norm": 0.48521384596824646, "learning_rate": 4.851658491746707e-05, "loss": 0.0814, "num_input_tokens_seen": 8877888, "step": 270 }, { "epoch": 2.8010139416983524, "grad_norm": 0.31612926721572876, "learning_rate": 4.846171055124401e-05, "loss": 0.0619, "num_input_tokens_seen": 9041920, "step": 275 }, { "epoch": 2.8517110266159698, "grad_norm": 0.5412792563438416, "learning_rate": 4.8405871765993433e-05, "loss": 0.061, "num_input_tokens_seen": 9205824, "step": 280 }, { "epoch": 2.9024081115335867, "grad_norm": 0.3346579074859619, "learning_rate": 4.834907085702908e-05, "loss": 0.052, "num_input_tokens_seen": 9369280, "step": 285 }, { "epoch": 2.953105196451204, "grad_norm": 0.5361500382423401, "learning_rate": 4.829131015921385e-05, "loss": 0.0547, "num_input_tokens_seen": 9534464, "step": 290 }, { "epoch": 3.0101394169835234, "grad_norm": 0.6115812063217163, "learning_rate": 4.82325920468638e-05, "loss": 0.0796, "num_input_tokens_seen": 9708032, "step": 295 }, { "epoch": 3.0608365019011408, "grad_norm": 0.3585353195667267, "learning_rate": 4.817291893365055e-05, "loss": 0.0481, "num_input_tokens_seen": 9871936, "step": 300 }, { "epoch": 3.0608365019011408, "eval_loss": 0.061122000217437744, "eval_runtime": 104.817, "eval_samples_per_second": 6.688, "eval_steps_per_second": 0.84, "num_input_tokens_seen": 9871936, "step": 300 }, { "epoch": 3.111533586818758, "grad_norm": 0.5046743154525757, "learning_rate": 4.8112293272502043e-05, "loss": 0.0568, "num_input_tokens_seen": 10036352, "step": 305 }, { "epoch": 3.162230671736375, "grad_norm": 0.4078225791454315, "learning_rate": 4.805071755550177e-05, "loss": 0.0614, "num_input_tokens_seen": 10201088, "step": 310 }, { "epoch": 3.2129277566539924, "grad_norm": 0.337298184633255, "learning_rate": 4.7988194313786275e-05, "loss": 0.0287, "num_input_tokens_seen": 10366016, "step": 315 }, { "epoch": 3.26362484157161, "grad_norm": 0.6969597339630127, "learning_rate": 4.7924726117441135e-05, "loss": 0.0452, "num_input_tokens_seen": 10528704, "step": 320 }, { "epoch": 3.3143219264892267, "grad_norm": 0.6235790252685547, "learning_rate": 4.7860315575395316e-05, "loss": 0.0497, "num_input_tokens_seen": 10693760, "step": 325 }, { "epoch": 3.365019011406844, "grad_norm": 0.3542743921279907, "learning_rate": 4.7794965335313926e-05, "loss": 0.0523, "num_input_tokens_seen": 10859200, "step": 330 }, { "epoch": 3.4157160963244615, "grad_norm": 0.45905178785324097, "learning_rate": 4.772867808348938e-05, "loss": 0.0618, "num_input_tokens_seen": 11022400, "step": 335 }, { "epoch": 3.4664131812420784, "grad_norm": 0.7552258372306824, "learning_rate": 4.766145654473095e-05, "loss": 0.0516, "num_input_tokens_seen": 11187328, "step": 340 }, { "epoch": 3.517110266159696, "grad_norm": 0.4128399193286896, "learning_rate": 4.759330348225284e-05, "loss": 0.0428, "num_input_tokens_seen": 11352832, "step": 345 }, { "epoch": 3.567807351077313, "grad_norm": 0.67793869972229, "learning_rate": 4.752422169756048e-05, "loss": 0.0582, "num_input_tokens_seen": 11518016, "step": 350 }, { "epoch": 3.6185044359949305, "grad_norm": 0.4128822684288025, "learning_rate": 4.745421403033548e-05, "loss": 0.0393, "num_input_tokens_seen": 11680640, "step": 355 }, { "epoch": 3.6692015209125475, "grad_norm": 0.475558340549469, "learning_rate": 4.738328335831883e-05, "loss": 0.0533, "num_input_tokens_seen": 11842880, "step": 360 }, { "epoch": 3.719898605830165, "grad_norm": 0.3762592077255249, "learning_rate": 4.731143259719265e-05, "loss": 0.0362, "num_input_tokens_seen": 12009344, "step": 365 }, { "epoch": 3.770595690747782, "grad_norm": 0.5744361281394958, "learning_rate": 4.72386647004603e-05, "loss": 0.0553, "num_input_tokens_seen": 12173056, "step": 370 }, { "epoch": 3.821292775665399, "grad_norm": 0.5609379410743713, "learning_rate": 4.716498265932501e-05, "loss": 0.0501, "num_input_tokens_seen": 12337408, "step": 375 }, { "epoch": 3.8719898605830165, "grad_norm": 0.42430299520492554, "learning_rate": 4.709038950256688e-05, "loss": 0.0384, "num_input_tokens_seen": 12501312, "step": 380 }, { "epoch": 3.922686945500634, "grad_norm": 0.5434412956237793, "learning_rate": 4.701488829641845e-05, "loss": 0.0354, "num_input_tokens_seen": 12665152, "step": 385 }, { "epoch": 3.973384030418251, "grad_norm": 0.5311803221702576, "learning_rate": 4.693848214443858e-05, "loss": 0.0348, "num_input_tokens_seen": 12828608, "step": 390 }, { "epoch": 4.030418250950571, "grad_norm": 0.5105487704277039, "learning_rate": 4.686117418738489e-05, "loss": 0.036, "num_input_tokens_seen": 13001024, "step": 395 }, { "epoch": 4.081115335868188, "grad_norm": 0.7211757898330688, "learning_rate": 4.678296760308474e-05, "loss": 0.0352, "num_input_tokens_seen": 13164032, "step": 400 }, { "epoch": 4.081115335868188, "eval_loss": 0.06444713473320007, "eval_runtime": 104.6657, "eval_samples_per_second": 6.698, "eval_steps_per_second": 0.841, "num_input_tokens_seen": 13164032, "step": 400 }, { "epoch": 4.1318124207858045, "grad_norm": 0.6069203615188599, "learning_rate": 4.6703865606304465e-05, "loss": 0.0295, "num_input_tokens_seen": 13329344, "step": 405 }, { "epoch": 4.182509505703422, "grad_norm": 0.6209607720375061, "learning_rate": 4.662387144861734e-05, "loss": 0.0323, "num_input_tokens_seen": 13492800, "step": 410 }, { "epoch": 4.233206590621039, "grad_norm": 0.772402822971344, "learning_rate": 4.6542988418269876e-05, "loss": 0.0381, "num_input_tokens_seen": 13656704, "step": 415 }, { "epoch": 4.283903675538657, "grad_norm": 1.5642151832580566, "learning_rate": 4.6461219840046654e-05, "loss": 0.0397, "num_input_tokens_seen": 13821568, "step": 420 }, { "epoch": 4.334600760456274, "grad_norm": 0.7930517792701721, "learning_rate": 4.637856907513366e-05, "loss": 0.0291, "num_input_tokens_seen": 13985792, "step": 425 }, { "epoch": 4.385297845373891, "grad_norm": 1.0651878118515015, "learning_rate": 4.629503952098011e-05, "loss": 0.037, "num_input_tokens_seen": 14150080, "step": 430 }, { "epoch": 4.435994930291509, "grad_norm": 0.7544033527374268, "learning_rate": 4.6210634611158816e-05, "loss": 0.0309, "num_input_tokens_seen": 14314432, "step": 435 }, { "epoch": 4.486692015209125, "grad_norm": 0.6338365077972412, "learning_rate": 4.612535781522504e-05, "loss": 0.0256, "num_input_tokens_seen": 14480128, "step": 440 }, { "epoch": 4.537389100126743, "grad_norm": 0.4086742103099823, "learning_rate": 4.6039212638573833e-05, "loss": 0.0213, "num_input_tokens_seen": 14643712, "step": 445 }, { "epoch": 4.58808618504436, "grad_norm": 0.750950038433075, "learning_rate": 4.595220262229601e-05, "loss": 0.027, "num_input_tokens_seen": 14809600, "step": 450 }, { "epoch": 4.638783269961977, "grad_norm": 0.5362918376922607, "learning_rate": 4.586433134303257e-05, "loss": 0.0206, "num_input_tokens_seen": 14973248, "step": 455 }, { "epoch": 4.689480354879595, "grad_norm": 1.1060535907745361, "learning_rate": 4.5775602412827604e-05, "loss": 0.0326, "num_input_tokens_seen": 15136128, "step": 460 }, { "epoch": 4.740177439797211, "grad_norm": 0.9209577441215515, "learning_rate": 4.5686019478979915e-05, "loss": 0.0368, "num_input_tokens_seen": 15299008, "step": 465 }, { "epoch": 4.7908745247148286, "grad_norm": 0.5697187781333923, "learning_rate": 4.559558622389304e-05, "loss": 0.0318, "num_input_tokens_seen": 15462528, "step": 470 }, { "epoch": 4.841571609632446, "grad_norm": 0.382828950881958, "learning_rate": 4.55043063649239e-05, "loss": 0.0285, "num_input_tokens_seen": 15626240, "step": 475 }, { "epoch": 4.892268694550063, "grad_norm": 0.3667276203632355, "learning_rate": 4.5412183654229965e-05, "loss": 0.0254, "num_input_tokens_seen": 15791168, "step": 480 }, { "epoch": 4.942965779467681, "grad_norm": 0.4793355166912079, "learning_rate": 4.531922187861507e-05, "loss": 0.0407, "num_input_tokens_seen": 15955712, "step": 485 }, { "epoch": 4.993662864385298, "grad_norm": 0.6076450347900391, "learning_rate": 4.522542485937369e-05, "loss": 0.0307, "num_input_tokens_seen": 16121216, "step": 490 }, { "epoch": 5.050697084917617, "grad_norm": 0.33613860607147217, "learning_rate": 4.51307964521339e-05, "loss": 0.0189, "num_input_tokens_seen": 16292672, "step": 495 }, { "epoch": 5.101394169835235, "grad_norm": 0.6635271310806274, "learning_rate": 4.503534054669892e-05, "loss": 0.0188, "num_input_tokens_seen": 16456000, "step": 500 }, { "epoch": 5.101394169835235, "eval_loss": 0.05884933844208717, "eval_runtime": 104.9474, "eval_samples_per_second": 6.68, "eval_steps_per_second": 0.839, "num_input_tokens_seen": 16456000, "step": 500 }, { "epoch": 5.152091254752852, "grad_norm": 0.3716290593147278, "learning_rate": 4.493906106688712e-05, "loss": 0.0092, "num_input_tokens_seen": 16620608, "step": 505 }, { "epoch": 5.202788339670469, "grad_norm": 0.7663172483444214, "learning_rate": 4.484196197037082e-05, "loss": 0.0174, "num_input_tokens_seen": 16783936, "step": 510 }, { "epoch": 5.253485424588086, "grad_norm": 0.9232292771339417, "learning_rate": 4.474404724851356e-05, "loss": 0.0093, "num_input_tokens_seen": 16948544, "step": 515 }, { "epoch": 5.304182509505703, "grad_norm": 0.5671281218528748, "learning_rate": 4.4645320926206064e-05, "loss": 0.0141, "num_input_tokens_seen": 17112640, "step": 520 }, { "epoch": 5.354879594423321, "grad_norm": 0.7603790163993835, "learning_rate": 4.454578706170075e-05, "loss": 0.0197, "num_input_tokens_seen": 17275968, "step": 525 }, { "epoch": 5.405576679340938, "grad_norm": 0.7700641751289368, "learning_rate": 4.444544974644493e-05, "loss": 0.0125, "num_input_tokens_seen": 17439680, "step": 530 }, { "epoch": 5.4562737642585555, "grad_norm": 0.876714825630188, "learning_rate": 4.434431310491267e-05, "loss": 0.0218, "num_input_tokens_seen": 17603136, "step": 535 }, { "epoch": 5.506970849176172, "grad_norm": 0.6164574027061462, "learning_rate": 4.4242381294435154e-05, "loss": 0.0235, "num_input_tokens_seen": 17768192, "step": 540 }, { "epoch": 5.557667934093789, "grad_norm": 0.500507652759552, "learning_rate": 4.413965850502987e-05, "loss": 0.0181, "num_input_tokens_seen": 17932928, "step": 545 }, { "epoch": 5.608365019011407, "grad_norm": 0.6840673089027405, "learning_rate": 4.4036148959228365e-05, "loss": 0.0214, "num_input_tokens_seen": 18096768, "step": 550 }, { "epoch": 5.659062103929024, "grad_norm": 0.41719505190849304, "learning_rate": 4.393185691190264e-05, "loss": 0.0196, "num_input_tokens_seen": 18261184, "step": 555 }, { "epoch": 5.7097591888466415, "grad_norm": 0.30571097135543823, "learning_rate": 4.382678665009028e-05, "loss": 0.0177, "num_input_tokens_seen": 18426880, "step": 560 }, { "epoch": 5.760456273764259, "grad_norm": 0.7241319417953491, "learning_rate": 4.372094249281821e-05, "loss": 0.0181, "num_input_tokens_seen": 18591488, "step": 565 }, { "epoch": 5.811153358681876, "grad_norm": 1.0090749263763428, "learning_rate": 4.3614328790925177e-05, "loss": 0.017, "num_input_tokens_seen": 18755584, "step": 570 }, { "epoch": 5.861850443599493, "grad_norm": 1.0040756464004517, "learning_rate": 4.350694992688289e-05, "loss": 0.02, "num_input_tokens_seen": 18919360, "step": 575 }, { "epoch": 5.91254752851711, "grad_norm": 1.0349421501159668, "learning_rate": 4.3398810314615876e-05, "loss": 0.0238, "num_input_tokens_seen": 19082816, "step": 580 }, { "epoch": 5.9632446134347274, "grad_norm": 1.2083125114440918, "learning_rate": 4.3289914399320034e-05, "loss": 0.0148, "num_input_tokens_seen": 19247040, "step": 585 }, { "epoch": 6.020278833967047, "grad_norm": 0.5085653066635132, "learning_rate": 4.318026665727993e-05, "loss": 0.0116, "num_input_tokens_seen": 19420480, "step": 590 }, { "epoch": 6.070975918884664, "grad_norm": 0.5758472681045532, "learning_rate": 4.306987159568479e-05, "loss": 0.0126, "num_input_tokens_seen": 19583680, "step": 595 }, { "epoch": 6.1216730038022815, "grad_norm": 0.11992669850587845, "learning_rate": 4.2958733752443195e-05, "loss": 0.0041, "num_input_tokens_seen": 19746816, "step": 600 }, { "epoch": 6.1216730038022815, "eval_loss": 0.07074201852083206, "eval_runtime": 106.0122, "eval_samples_per_second": 6.612, "eval_steps_per_second": 0.83, "num_input_tokens_seen": 19746816, "step": 600 }, { "epoch": 6.172370088719899, "grad_norm": 0.5216241478919983, "learning_rate": 4.284685769599658e-05, "loss": 0.008, "num_input_tokens_seen": 19911552, "step": 605 }, { "epoch": 6.223067173637516, "grad_norm": 1.355669617652893, "learning_rate": 4.273424802513145e-05, "loss": 0.0069, "num_input_tokens_seen": 20075200, "step": 610 }, { "epoch": 6.273764258555133, "grad_norm": 0.1907280832529068, "learning_rate": 4.262090936879029e-05, "loss": 0.0016, "num_input_tokens_seen": 20241472, "step": 615 }, { "epoch": 6.32446134347275, "grad_norm": 0.0559777207672596, "learning_rate": 4.250684638588138e-05, "loss": 0.0091, "num_input_tokens_seen": 20403904, "step": 620 }, { "epoch": 6.3751584283903675, "grad_norm": 1.0960038900375366, "learning_rate": 4.239206376508717e-05, "loss": 0.0139, "num_input_tokens_seen": 20569536, "step": 625 }, { "epoch": 6.425855513307985, "grad_norm": 0.3808915317058563, "learning_rate": 4.227656622467162e-05, "loss": 0.0122, "num_input_tokens_seen": 20732032, "step": 630 }, { "epoch": 6.476552598225602, "grad_norm": 1.5081939697265625, "learning_rate": 4.216035851228626e-05, "loss": 0.0105, "num_input_tokens_seen": 20895552, "step": 635 }, { "epoch": 6.52724968314322, "grad_norm": 0.4840177893638611, "learning_rate": 4.204344540477499e-05, "loss": 0.0049, "num_input_tokens_seen": 21060736, "step": 640 }, { "epoch": 6.577946768060836, "grad_norm": 1.1747405529022217, "learning_rate": 4.192583170797774e-05, "loss": 0.0093, "num_input_tokens_seen": 21224320, "step": 645 }, { "epoch": 6.6286438529784535, "grad_norm": 0.9056284427642822, "learning_rate": 4.180752225653292e-05, "loss": 0.0137, "num_input_tokens_seen": 21387328, "step": 650 }, { "epoch": 6.679340937896071, "grad_norm": 1.1171244382858276, "learning_rate": 4.16885219136787e-05, "loss": 0.0054, "num_input_tokens_seen": 21551616, "step": 655 }, { "epoch": 6.730038022813688, "grad_norm": 0.4659402370452881, "learning_rate": 4.1568835571053075e-05, "loss": 0.0068, "num_input_tokens_seen": 21716864, "step": 660 }, { "epoch": 6.780735107731306, "grad_norm": 0.12265567481517792, "learning_rate": 4.144846814849282e-05, "loss": 0.0106, "num_input_tokens_seen": 21882304, "step": 665 }, { "epoch": 6.831432192648923, "grad_norm": 0.3785063326358795, "learning_rate": 4.132742459383122e-05, "loss": 0.0095, "num_input_tokens_seen": 22045504, "step": 670 }, { "epoch": 6.8821292775665395, "grad_norm": 0.9941157102584839, "learning_rate": 4.120570988269472e-05, "loss": 0.0079, "num_input_tokens_seen": 22210368, "step": 675 }, { "epoch": 6.932826362484157, "grad_norm": 2.0760669708251953, "learning_rate": 4.108332901829836e-05, "loss": 0.0113, "num_input_tokens_seen": 22374976, "step": 680 }, { "epoch": 6.983523447401774, "grad_norm": 1.1653289794921875, "learning_rate": 4.096028703124014e-05, "loss": 0.0101, "num_input_tokens_seen": 22540160, "step": 685 }, { "epoch": 7.0405576679340935, "grad_norm": 0.6426116824150085, "learning_rate": 4.083658897929426e-05, "loss": 0.0094, "num_input_tokens_seen": 22713344, "step": 690 }, { "epoch": 7.091254752851711, "grad_norm": 0.1562691628932953, "learning_rate": 4.071223994720309e-05, "loss": 0.0083, "num_input_tokens_seen": 22876672, "step": 695 }, { "epoch": 7.141951837769328, "grad_norm": 0.24841922521591187, "learning_rate": 4.058724504646834e-05, "loss": 0.0018, "num_input_tokens_seen": 23041536, "step": 700 }, { "epoch": 7.141951837769328, "eval_loss": 0.08085346221923828, "eval_runtime": 104.8865, "eval_samples_per_second": 6.683, "eval_steps_per_second": 0.839, "num_input_tokens_seen": 23041536, "step": 700 }, { "epoch": 7.192648922686946, "grad_norm": 0.3186471462249756, "learning_rate": 4.046160941514079e-05, "loss": 0.0131, "num_input_tokens_seen": 23206528, "step": 705 }, { "epoch": 7.243346007604563, "grad_norm": 0.1616538017988205, "learning_rate": 4.033533821760917e-05, "loss": 0.0045, "num_input_tokens_seen": 23370880, "step": 710 }, { "epoch": 7.29404309252218, "grad_norm": 0.5200349688529968, "learning_rate": 4.0208436644387834e-05, "loss": 0.006, "num_input_tokens_seen": 23533184, "step": 715 }, { "epoch": 7.344740177439797, "grad_norm": 0.3917842507362366, "learning_rate": 4.008090991190341e-05, "loss": 0.0049, "num_input_tokens_seen": 23696000, "step": 720 }, { "epoch": 7.395437262357414, "grad_norm": 0.20676745474338531, "learning_rate": 3.9952763262280405e-05, "loss": 0.0014, "num_input_tokens_seen": 23861120, "step": 725 }, { "epoch": 7.446134347275032, "grad_norm": 0.4431841969490051, "learning_rate": 3.982400196312564e-05, "loss": 0.0037, "num_input_tokens_seen": 24025280, "step": 730 }, { "epoch": 7.496831432192649, "grad_norm": 0.7824487686157227, "learning_rate": 3.969463130731183e-05, "loss": 0.0083, "num_input_tokens_seen": 24190144, "step": 735 }, { "epoch": 7.547528517110266, "grad_norm": 1.250172734260559, "learning_rate": 3.95646566127599e-05, "loss": 0.002, "num_input_tokens_seen": 24356352, "step": 740 }, { "epoch": 7.598225602027884, "grad_norm": 0.06873621046543121, "learning_rate": 3.943408322222049e-05, "loss": 0.0014, "num_input_tokens_seen": 24521024, "step": 745 }, { "epoch": 7.6489226869455, "grad_norm": 0.9064635038375854, "learning_rate": 3.9302916503054246e-05, "loss": 0.0062, "num_input_tokens_seen": 24683072, "step": 750 }, { "epoch": 7.699619771863118, "grad_norm": 0.5123156905174255, "learning_rate": 3.917116184701125e-05, "loss": 0.0031, "num_input_tokens_seen": 24848384, "step": 755 }, { "epoch": 7.750316856780735, "grad_norm": 1.4605395793914795, "learning_rate": 3.903882467000937e-05, "loss": 0.0048, "num_input_tokens_seen": 25011648, "step": 760 }, { "epoch": 7.801013941698352, "grad_norm": 0.05572497099637985, "learning_rate": 3.8905910411911625e-05, "loss": 0.0066, "num_input_tokens_seen": 25175360, "step": 765 }, { "epoch": 7.85171102661597, "grad_norm": 0.5212889909744263, "learning_rate": 3.8772424536302564e-05, "loss": 0.0033, "num_input_tokens_seen": 25339072, "step": 770 }, { "epoch": 7.902408111533587, "grad_norm": 0.07258739322423935, "learning_rate": 3.8638372530263715e-05, "loss": 0.0024, "num_input_tokens_seen": 25502656, "step": 775 }, { "epoch": 7.9531051964512045, "grad_norm": 0.0964067205786705, "learning_rate": 3.850375990414801e-05, "loss": 0.0004, "num_input_tokens_seen": 25666560, "step": 780 }, { "epoch": 8.010139416983524, "grad_norm": 0.596034824848175, "learning_rate": 3.836859219135324e-05, "loss": 0.0052, "num_input_tokens_seen": 25837632, "step": 785 }, { "epoch": 8.060836501901141, "grad_norm": 0.19794920086860657, "learning_rate": 3.823287494809469e-05, "loss": 0.0018, "num_input_tokens_seen": 26002176, "step": 790 }, { "epoch": 8.111533586818759, "grad_norm": 0.016648059710860252, "learning_rate": 3.8096613753176634e-05, "loss": 0.0032, "num_input_tokens_seen": 26164736, "step": 795 }, { "epoch": 8.162230671736376, "grad_norm": 0.25073590874671936, "learning_rate": 3.7959814207763135e-05, "loss": 0.0003, "num_input_tokens_seen": 26329664, "step": 800 }, { "epoch": 8.162230671736376, "eval_loss": 0.1101459190249443, "eval_runtime": 104.9186, "eval_samples_per_second": 6.681, "eval_steps_per_second": 0.839, "num_input_tokens_seen": 26329664, "step": 800 } ], "logging_steps": 5, "max_steps": 2450, "num_input_tokens_seen": 26329664, "num_train_epochs": 25, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.227070984070234e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }