{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9860291834833903, "eval_steps": 30, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024837007140639553, "grad_norm": 1.6665902137756348, "learning_rate": 4.390243902439025e-05, "loss": 1.8541, "step": 10 }, { "epoch": 0.04967401428127911, "grad_norm": 0.7774304747581482, "learning_rate": 9.26829268292683e-05, "loss": 0.5737, "step": 20 }, { "epoch": 0.07451102142191866, "grad_norm": 0.5949566960334778, "learning_rate": 0.00014146341463414634, "loss": 0.3491, "step": 30 }, { "epoch": 0.07451102142191866, "eval_loss": 0.2992604076862335, "eval_runtime": 38.9257, "eval_samples_per_second": 4.367, "eval_steps_per_second": 4.367, "step": 30 }, { "epoch": 0.09934802856255821, "grad_norm": 0.518982470035553, "learning_rate": 0.0001902439024390244, "loss": 0.2738, "step": 40 }, { "epoch": 0.12418503570319776, "grad_norm": 0.40878191590309143, "learning_rate": 0.00019994603803069594, "loss": 0.2339, "step": 50 }, { "epoch": 0.14902204284383733, "grad_norm": 0.3464236259460449, "learning_rate": 0.00019972691733857883, "loss": 0.199, "step": 60 }, { "epoch": 0.14902204284383733, "eval_loss": 0.1967637687921524, "eval_runtime": 38.4564, "eval_samples_per_second": 4.421, "eval_steps_per_second": 4.421, "step": 60 }, { "epoch": 0.17385904998447688, "grad_norm": 0.27865490317344666, "learning_rate": 0.00019933963450321945, "loss": 0.1858, "step": 70 }, { "epoch": 0.19869605712511643, "grad_norm": 0.31942445039749146, "learning_rate": 0.00019878484257109083, "loss": 0.1773, "step": 80 }, { "epoch": 0.22353306426575598, "grad_norm": 0.2719893157482147, "learning_rate": 0.00019806347704689778, "loss": 0.1689, "step": 90 }, { "epoch": 0.22353306426575598, "eval_loss": 0.16391794383525848, "eval_runtime": 38.3889, "eval_samples_per_second": 4.428, "eval_steps_per_second": 4.428, "step": 90 }, { "epoch": 0.24837007140639553, "grad_norm": 0.2718851566314697, "learning_rate": 0.00019717675431610415, "loss": 0.1679, "step": 100 }, { "epoch": 0.2732070785470351, "grad_norm": 0.403368204832077, "learning_rate": 0.0001961261695938319, "loss": 0.146, "step": 110 }, { "epoch": 0.29804408568767465, "grad_norm": 0.1546928435564041, "learning_rate": 0.00019491349440359015, "loss": 0.153, "step": 120 }, { "epoch": 0.29804408568767465, "eval_loss": 0.14231818914413452, "eval_runtime": 38.5329, "eval_samples_per_second": 4.412, "eval_steps_per_second": 4.412, "step": 120 }, { "epoch": 0.3228810928283142, "grad_norm": 0.3227437138557434, "learning_rate": 0.0001935407735900857, "loss": 0.1483, "step": 130 }, { "epoch": 0.34771809996895375, "grad_norm": 0.1824907660484314, "learning_rate": 0.00019201032187115234, "loss": 0.1519, "step": 140 }, { "epoch": 0.3725551071095933, "grad_norm": 0.19652438163757324, "learning_rate": 0.0001903247199346129, "loss": 0.1455, "step": 150 }, { "epoch": 0.3725551071095933, "eval_loss": 0.13592763245105743, "eval_runtime": 38.412, "eval_samples_per_second": 4.426, "eval_steps_per_second": 4.426, "step": 150 }, { "epoch": 0.39739211425023285, "grad_norm": 0.18074853718280792, "learning_rate": 0.00018848681008665582, "loss": 0.1466, "step": 160 }, { "epoch": 0.4222291213908724, "grad_norm": 0.3232487142086029, "learning_rate": 0.0001864996914590638, "loss": 0.1408, "step": 170 }, { "epoch": 0.44706612853151195, "grad_norm": 0.15204332768917084, "learning_rate": 0.00018436671478337666, "loss": 0.1549, "step": 180 }, { "epoch": 0.44706612853151195, "eval_loss": 0.13638228178024292, "eval_runtime": 38.3896, "eval_samples_per_second": 4.428, "eval_steps_per_second": 4.428, "step": 180 }, { "epoch": 0.47190313567215153, "grad_norm": 0.2306644767522812, "learning_rate": 0.00018209147674079983, "loss": 0.1444, "step": 190 }, { "epoch": 0.49674014281279105, "grad_norm": 0.14888478815555573, "learning_rate": 0.00017967781389738625, "loss": 0.1455, "step": 200 }, { "epoch": 0.5215771499534306, "grad_norm": 0.13995014131069183, "learning_rate": 0.00017712979623471807, "loss": 0.1413, "step": 210 }, { "epoch": 0.5215771499534306, "eval_loss": 0.13154758512973785, "eval_runtime": 38.4362, "eval_samples_per_second": 4.423, "eval_steps_per_second": 4.423, "step": 210 }, { "epoch": 0.5464141570940702, "grad_norm": 0.1627102494239807, "learning_rate": 0.000174451720286997, "loss": 0.1396, "step": 220 }, { "epoch": 0.5712511642347097, "grad_norm": 0.13829496502876282, "learning_rate": 0.0001716481018961156, "loss": 0.1444, "step": 230 }, { "epoch": 0.5960881713753493, "grad_norm": 0.14025089144706726, "learning_rate": 0.00016872366859692627, "loss": 0.1474, "step": 240 }, { "epoch": 0.5960881713753493, "eval_loss": 0.13241083920001984, "eval_runtime": 38.4912, "eval_samples_per_second": 4.417, "eval_steps_per_second": 4.417, "step": 240 }, { "epoch": 0.6209251785159888, "grad_norm": 0.11233100295066833, "learning_rate": 0.00016568335164554812, "loss": 0.1383, "step": 250 }, { "epoch": 0.6457621856566284, "grad_norm": 0.12720470130443573, "learning_rate": 0.0001625322777041534, "loss": 0.1359, "step": 260 }, { "epoch": 0.670599192797268, "grad_norm": 0.11088231950998306, "learning_rate": 0.0001592757601962555, "loss": 0.1437, "step": 270 }, { "epoch": 0.670599192797268, "eval_loss": 0.12654946744441986, "eval_runtime": 38.7011, "eval_samples_per_second": 4.393, "eval_steps_per_second": 4.393, "step": 270 }, { "epoch": 0.6954361999379075, "grad_norm": 0.13584497570991516, "learning_rate": 0.0001559192903470747, "loss": 0.1312, "step": 280 }, { "epoch": 0.720273207078547, "grad_norm": 0.32189086079597473, "learning_rate": 0.00015246852792409033, "loss": 0.1414, "step": 290 }, { "epoch": 0.7451102142191866, "grad_norm": 0.1525665819644928, "learning_rate": 0.00014892929169339235, "loss": 0.1496, "step": 300 }, { "epoch": 0.7451102142191866, "eval_loss": 0.12869440019130707, "eval_runtime": 38.7851, "eval_samples_per_second": 4.383, "eval_steps_per_second": 4.383, "step": 300 }, { "epoch": 0.7699472213598262, "grad_norm": 0.12182191759347916, "learning_rate": 0.00014530754960792553, "loss": 0.1436, "step": 310 }, { "epoch": 0.7947842285004657, "grad_norm": 1.4180772304534912, "learning_rate": 0.0001416094087441704, "loss": 0.145, "step": 320 }, { "epoch": 0.8196212356411052, "grad_norm": 0.14027242362499237, "learning_rate": 0.00013784110500423104, "loss": 0.1486, "step": 330 }, { "epoch": 0.8196212356411052, "eval_loss": 0.12838058173656464, "eval_runtime": 38.6316, "eval_samples_per_second": 4.401, "eval_steps_per_second": 4.401, "step": 330 }, { "epoch": 0.8444582427817448, "grad_norm": 0.13633093237876892, "learning_rate": 0.00013400899260069323, "loss": 0.1413, "step": 340 }, { "epoch": 0.8692952499223844, "grad_norm": 0.11211191117763519, "learning_rate": 0.00013011953334198466, "loss": 0.1361, "step": 350 }, { "epoch": 0.8941322570630239, "grad_norm": 0.30127570033073425, "learning_rate": 0.00012617928573630406, "loss": 0.1363, "step": 360 }, { "epoch": 0.8941322570630239, "eval_loss": 0.12504999339580536, "eval_runtime": 38.8649, "eval_samples_per_second": 4.374, "eval_steps_per_second": 4.374, "step": 360 }, { "epoch": 0.9189692642036634, "grad_norm": 0.10214308649301529, "learning_rate": 0.00012219489393249262, "loss": 0.1483, "step": 370 }, { "epoch": 0.9438062713443031, "grad_norm": 0.09070255607366562, "learning_rate": 0.00011817307651649616, "loss": 0.1349, "step": 380 }, { "epoch": 0.9686432784849426, "grad_norm": 0.10173656791448593, "learning_rate": 0.00011412061518230914, "loss": 0.1421, "step": 390 }, { "epoch": 0.9686432784849426, "eval_loss": 0.12429468333721161, "eval_runtime": 38.7145, "eval_samples_per_second": 4.391, "eval_steps_per_second": 4.391, "step": 390 }, { "epoch": 0.9934802856255821, "grad_norm": 0.10592233389616013, "learning_rate": 0.00011004434329650452, "loss": 0.1296, "step": 400 }, { "epoch": 1.0173859049984477, "grad_norm": 0.15041767060756683, "learning_rate": 0.00010595113437563176, "loss": 0.1367, "step": 410 }, { "epoch": 1.0422229121390871, "grad_norm": 0.10861553996801376, "learning_rate": 0.00010184789049591299, "loss": 0.1353, "step": 420 }, { "epoch": 1.0422229121390871, "eval_loss": 0.12352242320775986, "eval_runtime": 38.7533, "eval_samples_per_second": 4.387, "eval_steps_per_second": 4.387, "step": 420 }, { "epoch": 1.0670599192797268, "grad_norm": 0.0957934781908989, "learning_rate": 9.774153065478121e-05, "loss": 0.134, "step": 430 }, { "epoch": 1.0918969264203664, "grad_norm": 0.09080129116773605, "learning_rate": 9.36389791038851e-05, "loss": 0.1329, "step": 440 }, { "epoch": 1.1167339335610058, "grad_norm": 0.12591005861759186, "learning_rate": 8.954715367323468e-05, "loss": 0.121, "step": 450 }, { "epoch": 1.1167339335610058, "eval_loss": 0.12231362611055374, "eval_runtime": 38.7245, "eval_samples_per_second": 4.39, "eval_steps_per_second": 4.39, "step": 450 }, { "epoch": 1.1415709407016454, "grad_norm": 0.08681875467300415, "learning_rate": 8.547295410617453e-05, "loss": 0.1305, "step": 460 }, { "epoch": 1.166407947842285, "grad_norm": 0.0953899621963501, "learning_rate": 8.142325042485592e-05, "loss": 0.1309, "step": 470 }, { "epoch": 1.1912449549829245, "grad_norm": 0.07845437526702881, "learning_rate": 7.740487134582525e-05, "loss": 0.1298, "step": 480 }, { "epoch": 1.1912449549829245, "eval_loss": 0.12134242057800293, "eval_runtime": 38.8319, "eval_samples_per_second": 4.378, "eval_steps_per_second": 4.378, "step": 480 }, { "epoch": 1.2160819621235641, "grad_norm": 0.11160232126712799, "learning_rate": 7.342459276526302e-05, "loss": 0.1348, "step": 490 }, { "epoch": 1.2409189692642038, "grad_norm": 0.09501124173402786, "learning_rate": 6.948912633329007e-05, "loss": 0.1321, "step": 500 }, { "epoch": 1.2657559764048432, "grad_norm": 0.07836316525936127, "learning_rate": 6.560510813660719e-05, "loss": 0.1246, "step": 510 }, { "epoch": 1.2657559764048432, "eval_loss": 0.1200186014175415, "eval_runtime": 38.566, "eval_samples_per_second": 4.408, "eval_steps_per_second": 4.408, "step": 510 }, { "epoch": 1.2905929835454828, "grad_norm": 0.08444702625274658, "learning_rate": 6.177908750855164e-05, "loss": 0.1293, "step": 520 }, { "epoch": 1.3154299906861224, "grad_norm": 0.07452095299959183, "learning_rate": 5.8017515985439465e-05, "loss": 0.1319, "step": 530 }, { "epoch": 1.3402669978267618, "grad_norm": 0.06832710653543472, "learning_rate": 5.4326736427815946e-05, "loss": 0.1298, "step": 540 }, { "epoch": 1.3402669978267618, "eval_loss": 0.1198083758354187, "eval_runtime": 38.7172, "eval_samples_per_second": 4.391, "eval_steps_per_second": 4.391, "step": 540 }, { "epoch": 1.3651040049674015, "grad_norm": 0.10138995200395584, "learning_rate": 5.071297232495769e-05, "loss": 0.1274, "step": 550 }, { "epoch": 1.389941012108041, "grad_norm": 0.08988513052463531, "learning_rate": 4.7182317300661796e-05, "loss": 0.1309, "step": 560 }, { "epoch": 1.4147780192486805, "grad_norm": 0.08663639426231384, "learning_rate": 4.374072483801769e-05, "loss": 0.1305, "step": 570 }, { "epoch": 1.4147780192486805, "eval_loss": 0.1200200691819191, "eval_runtime": 38.6578, "eval_samples_per_second": 4.398, "eval_steps_per_second": 4.398, "step": 570 }, { "epoch": 1.4396150263893202, "grad_norm": 0.0839414894580841, "learning_rate": 4.039399824048777e-05, "loss": 0.1332, "step": 580 }, { "epoch": 1.4644520335299596, "grad_norm": 0.07634599506855011, "learning_rate": 3.714778084622492e-05, "loss": 0.1275, "step": 590 }, { "epoch": 1.4892890406705992, "grad_norm": 0.08726586401462555, "learning_rate": 3.400754651212776e-05, "loss": 0.1302, "step": 600 }, { "epoch": 1.4892890406705992, "eval_loss": 0.11870752274990082, "eval_runtime": 38.731, "eval_samples_per_second": 4.389, "eval_steps_per_second": 4.389, "step": 600 }, { "epoch": 1.5141260478112386, "grad_norm": 0.08947139978408813, "learning_rate": 3.097859038367947e-05, "loss": 0.1296, "step": 610 }, { "epoch": 1.5389630549518785, "grad_norm": 0.08135833591222763, "learning_rate": 2.8066019966134904e-05, "loss": 0.1281, "step": 620 }, { "epoch": 1.5638000620925179, "grad_norm": 0.08194943517446518, "learning_rate": 2.527474651211089e-05, "loss": 0.1296, "step": 630 }, { "epoch": 1.5638000620925179, "eval_loss": 0.11857092380523682, "eval_runtime": 38.7225, "eval_samples_per_second": 4.39, "eval_steps_per_second": 4.39, "step": 630 }, { "epoch": 1.5886370692331573, "grad_norm": 0.08808406442403793, "learning_rate": 2.260947674010372e-05, "loss": 0.1299, "step": 640 }, { "epoch": 1.613474076373797, "grad_norm": 0.08768365532159805, "learning_rate": 2.0074704897896558e-05, "loss": 0.1242, "step": 650 }, { "epoch": 1.6383110835144366, "grad_norm": 0.09054244309663773, "learning_rate": 1.767470518424129e-05, "loss": 0.1167, "step": 660 }, { "epoch": 1.6383110835144366, "eval_loss": 0.11811664700508118, "eval_runtime": 38.7626, "eval_samples_per_second": 4.386, "eval_steps_per_second": 4.386, "step": 660 }, { "epoch": 1.663148090655076, "grad_norm": 0.06916210800409317, "learning_rate": 1.541352454159237e-05, "loss": 0.1286, "step": 670 }, { "epoch": 1.6879850977957156, "grad_norm": 0.08965995907783508, "learning_rate": 1.3294975832046353e-05, "loss": 0.1293, "step": 680 }, { "epoch": 1.7128221049363552, "grad_norm": 0.09365396201610565, "learning_rate": 1.1322631407993811e-05, "loss": 0.128, "step": 690 }, { "epoch": 1.7128221049363552, "eval_loss": 0.1179969310760498, "eval_runtime": 38.7285, "eval_samples_per_second": 4.39, "eval_steps_per_second": 4.39, "step": 690 }, { "epoch": 1.7376591120769946, "grad_norm": 0.06408954411745071, "learning_rate": 9.499817088325102e-06, "loss": 0.1292, "step": 700 }, { "epoch": 1.7624961192176343, "grad_norm": 0.08411859720945358, "learning_rate": 7.829606550347313e-06, "loss": 0.1238, "step": 710 }, { "epoch": 1.787333126358274, "grad_norm": 0.08746035397052765, "learning_rate": 6.314816146868952e-06, "loss": 0.1354, "step": 720 }, { "epoch": 1.787333126358274, "eval_loss": 0.11767658591270447, "eval_runtime": 38.9009, "eval_samples_per_second": 4.37, "eval_steps_per_second": 4.37, "step": 720 }, { "epoch": 1.8121701334989133, "grad_norm": 0.10084281116724014, "learning_rate": 4.958000157192022e-06, "loss": 0.1277, "step": 730 }, { "epoch": 1.837007140639553, "grad_norm": 0.08554716408252716, "learning_rate": 3.761446480019315e-06, "loss": 0.1287, "step": 740 }, { "epoch": 1.8618441477801926, "grad_norm": 0.08761674165725708, "learning_rate": 2.7271727755395214e-06, "loss": 0.1289, "step": 750 }, { "epoch": 1.8618441477801926, "eval_loss": 0.11756357550621033, "eval_runtime": 38.7293, "eval_samples_per_second": 4.389, "eval_steps_per_second": 4.389, "step": 750 }, { "epoch": 1.886681154920832, "grad_norm": 0.08505561947822571, "learning_rate": 1.8569230631958256e-06, "loss": 0.1245, "step": 760 }, { "epoch": 1.9115181620614716, "grad_norm": 0.08099253475666046, "learning_rate": 1.1521647808744873e-06, "loss": 0.1215, "step": 770 }, { "epoch": 1.9363551692021113, "grad_norm": 0.08459154516458511, "learning_rate": 6.140863104726391e-07, "loss": 0.13, "step": 780 }, { "epoch": 1.9363551692021113, "eval_loss": 0.11756289005279541, "eval_runtime": 38.7944, "eval_samples_per_second": 4.382, "eval_steps_per_second": 4.382, "step": 780 }, { "epoch": 1.9611921763427507, "grad_norm": 0.08602018654346466, "learning_rate": 2.4359497401758024e-07, "loss": 0.1288, "step": 790 }, { "epoch": 1.9860291834833903, "grad_norm": 0.08176976442337036, "learning_rate": 4.131550371655468e-08, "loss": 0.128, "step": 800 } ], "logging_steps": 10, "max_steps": 806, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2864376597719245e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }