{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9860291834833903,
  "eval_steps": 30,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024837007140639553,
      "grad_norm": 1.6665902137756348,
      "learning_rate": 4.390243902439025e-05,
      "loss": 1.8541,
      "step": 10
    },
    {
      "epoch": 0.04967401428127911,
      "grad_norm": 0.7774304747581482,
      "learning_rate": 9.26829268292683e-05,
      "loss": 0.5737,
      "step": 20
    },
    {
      "epoch": 0.07451102142191866,
      "grad_norm": 0.5949566960334778,
      "learning_rate": 0.00014146341463414634,
      "loss": 0.3491,
      "step": 30
    },
    {
      "epoch": 0.07451102142191866,
      "eval_loss": 0.2992604076862335,
      "eval_runtime": 38.9257,
      "eval_samples_per_second": 4.367,
      "eval_steps_per_second": 4.367,
      "step": 30
    },
    {
      "epoch": 0.09934802856255821,
      "grad_norm": 0.518982470035553,
      "learning_rate": 0.0001902439024390244,
      "loss": 0.2738,
      "step": 40
    },
    {
      "epoch": 0.12418503570319776,
      "grad_norm": 0.40878191590309143,
      "learning_rate": 0.00019994603803069594,
      "loss": 0.2339,
      "step": 50
    },
    {
      "epoch": 0.14902204284383733,
      "grad_norm": 0.3464236259460449,
      "learning_rate": 0.00019972691733857883,
      "loss": 0.199,
      "step": 60
    },
    {
      "epoch": 0.14902204284383733,
      "eval_loss": 0.1967637687921524,
      "eval_runtime": 38.4564,
      "eval_samples_per_second": 4.421,
      "eval_steps_per_second": 4.421,
      "step": 60
    },
    {
      "epoch": 0.17385904998447688,
      "grad_norm": 0.27865490317344666,
      "learning_rate": 0.00019933963450321945,
      "loss": 0.1858,
      "step": 70
    },
    {
      "epoch": 0.19869605712511643,
      "grad_norm": 0.31942445039749146,
      "learning_rate": 0.00019878484257109083,
      "loss": 0.1773,
      "step": 80
    },
    {
      "epoch": 0.22353306426575598,
      "grad_norm": 0.2719893157482147,
      "learning_rate": 0.00019806347704689778,
      "loss": 0.1689,
      "step": 90
    },
    {
      "epoch": 0.22353306426575598,
      "eval_loss": 0.16391794383525848,
      "eval_runtime": 38.3889,
      "eval_samples_per_second": 4.428,
      "eval_steps_per_second": 4.428,
      "step": 90
    },
    {
      "epoch": 0.24837007140639553,
      "grad_norm": 0.2718851566314697,
      "learning_rate": 0.00019717675431610415,
      "loss": 0.1679,
      "step": 100
    },
    {
      "epoch": 0.2732070785470351,
      "grad_norm": 0.403368204832077,
      "learning_rate": 0.0001961261695938319,
      "loss": 0.146,
      "step": 110
    },
    {
      "epoch": 0.29804408568767465,
      "grad_norm": 0.1546928435564041,
      "learning_rate": 0.00019491349440359015,
      "loss": 0.153,
      "step": 120
    },
    {
      "epoch": 0.29804408568767465,
      "eval_loss": 0.14231818914413452,
      "eval_runtime": 38.5329,
      "eval_samples_per_second": 4.412,
      "eval_steps_per_second": 4.412,
      "step": 120
    },
    {
      "epoch": 0.3228810928283142,
      "grad_norm": 0.3227437138557434,
      "learning_rate": 0.0001935407735900857,
      "loss": 0.1483,
      "step": 130
    },
    {
      "epoch": 0.34771809996895375,
      "grad_norm": 0.1824907660484314,
      "learning_rate": 0.00019201032187115234,
      "loss": 0.1519,
      "step": 140
    },
    {
      "epoch": 0.3725551071095933,
      "grad_norm": 0.19652438163757324,
      "learning_rate": 0.0001903247199346129,
      "loss": 0.1455,
      "step": 150
    },
    {
      "epoch": 0.3725551071095933,
      "eval_loss": 0.13592763245105743,
      "eval_runtime": 38.412,
      "eval_samples_per_second": 4.426,
      "eval_steps_per_second": 4.426,
      "step": 150
    },
    {
      "epoch": 0.39739211425023285,
      "grad_norm": 0.18074853718280792,
      "learning_rate": 0.00018848681008665582,
      "loss": 0.1466,
      "step": 160
    },
    {
      "epoch": 0.4222291213908724,
      "grad_norm": 0.3232487142086029,
      "learning_rate": 0.0001864996914590638,
      "loss": 0.1408,
      "step": 170
    },
    {
      "epoch": 0.44706612853151195,
      "grad_norm": 0.15204332768917084,
      "learning_rate": 0.00018436671478337666,
      "loss": 0.1549,
      "step": 180
    },
    {
      "epoch": 0.44706612853151195,
      "eval_loss": 0.13638228178024292,
      "eval_runtime": 38.3896,
      "eval_samples_per_second": 4.428,
      "eval_steps_per_second": 4.428,
      "step": 180
    },
    {
      "epoch": 0.47190313567215153,
      "grad_norm": 0.2306644767522812,
      "learning_rate": 0.00018209147674079983,
      "loss": 0.1444,
      "step": 190
    },
    {
      "epoch": 0.49674014281279105,
      "grad_norm": 0.14888478815555573,
      "learning_rate": 0.00017967781389738625,
      "loss": 0.1455,
      "step": 200
    },
    {
      "epoch": 0.5215771499534306,
      "grad_norm": 0.13995014131069183,
      "learning_rate": 0.00017712979623471807,
      "loss": 0.1413,
      "step": 210
    },
    {
      "epoch": 0.5215771499534306,
      "eval_loss": 0.13154758512973785,
      "eval_runtime": 38.4362,
      "eval_samples_per_second": 4.423,
      "eval_steps_per_second": 4.423,
      "step": 210
    },
    {
      "epoch": 0.5464141570940702,
      "grad_norm": 0.1627102494239807,
      "learning_rate": 0.000174451720286997,
      "loss": 0.1396,
      "step": 220
    },
    {
      "epoch": 0.5712511642347097,
      "grad_norm": 0.13829496502876282,
      "learning_rate": 0.0001716481018961156,
      "loss": 0.1444,
      "step": 230
    },
    {
      "epoch": 0.5960881713753493,
      "grad_norm": 0.14025089144706726,
      "learning_rate": 0.00016872366859692627,
      "loss": 0.1474,
      "step": 240
    },
    {
      "epoch": 0.5960881713753493,
      "eval_loss": 0.13241083920001984,
      "eval_runtime": 38.4912,
      "eval_samples_per_second": 4.417,
      "eval_steps_per_second": 4.417,
      "step": 240
    },
    {
      "epoch": 0.6209251785159888,
      "grad_norm": 0.11233100295066833,
      "learning_rate": 0.00016568335164554812,
      "loss": 0.1383,
      "step": 250
    },
    {
      "epoch": 0.6457621856566284,
      "grad_norm": 0.12720470130443573,
      "learning_rate": 0.0001625322777041534,
      "loss": 0.1359,
      "step": 260
    },
    {
      "epoch": 0.670599192797268,
      "grad_norm": 0.11088231950998306,
      "learning_rate": 0.0001592757601962555,
      "loss": 0.1437,
      "step": 270
    },
    {
      "epoch": 0.670599192797268,
      "eval_loss": 0.12654946744441986,
      "eval_runtime": 38.7011,
      "eval_samples_per_second": 4.393,
      "eval_steps_per_second": 4.393,
      "step": 270
    },
    {
      "epoch": 0.6954361999379075,
      "grad_norm": 0.13584497570991516,
      "learning_rate": 0.0001559192903470747,
      "loss": 0.1312,
      "step": 280
    },
    {
      "epoch": 0.720273207078547,
      "grad_norm": 0.32189086079597473,
      "learning_rate": 0.00015246852792409033,
      "loss": 0.1414,
      "step": 290
    },
    {
      "epoch": 0.7451102142191866,
      "grad_norm": 0.1525665819644928,
      "learning_rate": 0.00014892929169339235,
      "loss": 0.1496,
      "step": 300
    },
    {
      "epoch": 0.7451102142191866,
      "eval_loss": 0.12869440019130707,
      "eval_runtime": 38.7851,
      "eval_samples_per_second": 4.383,
      "eval_steps_per_second": 4.383,
      "step": 300
    },
    {
      "epoch": 0.7699472213598262,
      "grad_norm": 0.12182191759347916,
      "learning_rate": 0.00014530754960792553,
      "loss": 0.1436,
      "step": 310
    },
    {
      "epoch": 0.7947842285004657,
      "grad_norm": 1.4180772304534912,
      "learning_rate": 0.0001416094087441704,
      "loss": 0.145,
      "step": 320
    },
    {
      "epoch": 0.8196212356411052,
      "grad_norm": 0.14027242362499237,
      "learning_rate": 0.00013784110500423104,
      "loss": 0.1486,
      "step": 330
    },
    {
      "epoch": 0.8196212356411052,
      "eval_loss": 0.12838058173656464,
      "eval_runtime": 38.6316,
      "eval_samples_per_second": 4.401,
      "eval_steps_per_second": 4.401,
      "step": 330
    },
    {
      "epoch": 0.8444582427817448,
      "grad_norm": 0.13633093237876892,
      "learning_rate": 0.00013400899260069323,
      "loss": 0.1413,
      "step": 340
    },
    {
      "epoch": 0.8692952499223844,
      "grad_norm": 0.11211191117763519,
      "learning_rate": 0.00013011953334198466,
      "loss": 0.1361,
      "step": 350
    },
    {
      "epoch": 0.8941322570630239,
      "grad_norm": 0.30127570033073425,
      "learning_rate": 0.00012617928573630406,
      "loss": 0.1363,
      "step": 360
    },
    {
      "epoch": 0.8941322570630239,
      "eval_loss": 0.12504999339580536,
      "eval_runtime": 38.8649,
      "eval_samples_per_second": 4.374,
      "eval_steps_per_second": 4.374,
      "step": 360
    },
    {
      "epoch": 0.9189692642036634,
      "grad_norm": 0.10214308649301529,
      "learning_rate": 0.00012219489393249262,
      "loss": 0.1483,
      "step": 370
    },
    {
      "epoch": 0.9438062713443031,
      "grad_norm": 0.09070255607366562,
      "learning_rate": 0.00011817307651649616,
      "loss": 0.1349,
      "step": 380
    },
    {
      "epoch": 0.9686432784849426,
      "grad_norm": 0.10173656791448593,
      "learning_rate": 0.00011412061518230914,
      "loss": 0.1421,
      "step": 390
    },
    {
      "epoch": 0.9686432784849426,
      "eval_loss": 0.12429468333721161,
      "eval_runtime": 38.7145,
      "eval_samples_per_second": 4.391,
      "eval_steps_per_second": 4.391,
      "step": 390
    },
    {
      "epoch": 0.9934802856255821,
      "grad_norm": 0.10592233389616013,
      "learning_rate": 0.00011004434329650452,
      "loss": 0.1296,
      "step": 400
    },
    {
      "epoch": 1.0173859049984477,
      "grad_norm": 0.15041767060756683,
      "learning_rate": 0.00010595113437563176,
      "loss": 0.1367,
      "step": 410
    },
    {
      "epoch": 1.0422229121390871,
      "grad_norm": 0.10861553996801376,
      "learning_rate": 0.00010184789049591299,
      "loss": 0.1353,
      "step": 420
    },
    {
      "epoch": 1.0422229121390871,
      "eval_loss": 0.12352242320775986,
      "eval_runtime": 38.7533,
      "eval_samples_per_second": 4.387,
      "eval_steps_per_second": 4.387,
      "step": 420
    },
    {
      "epoch": 1.0670599192797268,
      "grad_norm": 0.0957934781908989,
      "learning_rate": 9.774153065478121e-05,
      "loss": 0.134,
      "step": 430
    },
    {
      "epoch": 1.0918969264203664,
      "grad_norm": 0.09080129116773605,
      "learning_rate": 9.36389791038851e-05,
      "loss": 0.1329,
      "step": 440
    },
    {
      "epoch": 1.1167339335610058,
      "grad_norm": 0.12591005861759186,
      "learning_rate": 8.954715367323468e-05,
      "loss": 0.121,
      "step": 450
    },
    {
      "epoch": 1.1167339335610058,
      "eval_loss": 0.12231362611055374,
      "eval_runtime": 38.7245,
      "eval_samples_per_second": 4.39,
      "eval_steps_per_second": 4.39,
      "step": 450
    },
    {
      "epoch": 1.1415709407016454,
      "grad_norm": 0.08681875467300415,
      "learning_rate": 8.547295410617453e-05,
      "loss": 0.1305,
      "step": 460
    },
    {
      "epoch": 1.166407947842285,
      "grad_norm": 0.0953899621963501,
      "learning_rate": 8.142325042485592e-05,
      "loss": 0.1309,
      "step": 470
    },
    {
      "epoch": 1.1912449549829245,
      "grad_norm": 0.07845437526702881,
      "learning_rate": 7.740487134582525e-05,
      "loss": 0.1298,
      "step": 480
    },
    {
      "epoch": 1.1912449549829245,
      "eval_loss": 0.12134242057800293,
      "eval_runtime": 38.8319,
      "eval_samples_per_second": 4.378,
      "eval_steps_per_second": 4.378,
      "step": 480
    },
    {
      "epoch": 1.2160819621235641,
      "grad_norm": 0.11160232126712799,
      "learning_rate": 7.342459276526302e-05,
      "loss": 0.1348,
      "step": 490
    },
    {
      "epoch": 1.2409189692642038,
      "grad_norm": 0.09501124173402786,
      "learning_rate": 6.948912633329007e-05,
      "loss": 0.1321,
      "step": 500
    },
    {
      "epoch": 1.2657559764048432,
      "grad_norm": 0.07836316525936127,
      "learning_rate": 6.560510813660719e-05,
      "loss": 0.1246,
      "step": 510
    },
    {
      "epoch": 1.2657559764048432,
      "eval_loss": 0.1200186014175415,
      "eval_runtime": 38.566,
      "eval_samples_per_second": 4.408,
      "eval_steps_per_second": 4.408,
      "step": 510
    },
    {
      "epoch": 1.2905929835454828,
      "grad_norm": 0.08444702625274658,
      "learning_rate": 6.177908750855164e-05,
      "loss": 0.1293,
      "step": 520
    },
    {
      "epoch": 1.3154299906861224,
      "grad_norm": 0.07452095299959183,
      "learning_rate": 5.8017515985439465e-05,
      "loss": 0.1319,
      "step": 530
    },
    {
      "epoch": 1.3402669978267618,
      "grad_norm": 0.06832710653543472,
      "learning_rate": 5.4326736427815946e-05,
      "loss": 0.1298,
      "step": 540
    },
    {
      "epoch": 1.3402669978267618,
      "eval_loss": 0.1198083758354187,
      "eval_runtime": 38.7172,
      "eval_samples_per_second": 4.391,
      "eval_steps_per_second": 4.391,
      "step": 540
    },
    {
      "epoch": 1.3651040049674015,
      "grad_norm": 0.10138995200395584,
      "learning_rate": 5.071297232495769e-05,
      "loss": 0.1274,
      "step": 550
    },
    {
      "epoch": 1.389941012108041,
      "grad_norm": 0.08988513052463531,
      "learning_rate": 4.7182317300661796e-05,
      "loss": 0.1309,
      "step": 560
    },
    {
      "epoch": 1.4147780192486805,
      "grad_norm": 0.08663639426231384,
      "learning_rate": 4.374072483801769e-05,
      "loss": 0.1305,
      "step": 570
    },
    {
      "epoch": 1.4147780192486805,
      "eval_loss": 0.1200200691819191,
      "eval_runtime": 38.6578,
      "eval_samples_per_second": 4.398,
      "eval_steps_per_second": 4.398,
      "step": 570
    },
    {
      "epoch": 1.4396150263893202,
      "grad_norm": 0.0839414894580841,
      "learning_rate": 4.039399824048777e-05,
      "loss": 0.1332,
      "step": 580
    },
    {
      "epoch": 1.4644520335299596,
      "grad_norm": 0.07634599506855011,
      "learning_rate": 3.714778084622492e-05,
      "loss": 0.1275,
      "step": 590
    },
    {
      "epoch": 1.4892890406705992,
      "grad_norm": 0.08726586401462555,
      "learning_rate": 3.400754651212776e-05,
      "loss": 0.1302,
      "step": 600
    },
    {
      "epoch": 1.4892890406705992,
      "eval_loss": 0.11870752274990082,
      "eval_runtime": 38.731,
      "eval_samples_per_second": 4.389,
      "eval_steps_per_second": 4.389,
      "step": 600
    },
    {
      "epoch": 1.5141260478112386,
      "grad_norm": 0.08947139978408813,
      "learning_rate": 3.097859038367947e-05,
      "loss": 0.1296,
      "step": 610
    },
    {
      "epoch": 1.5389630549518785,
      "grad_norm": 0.08135833591222763,
      "learning_rate": 2.8066019966134904e-05,
      "loss": 0.1281,
      "step": 620
    },
    {
      "epoch": 1.5638000620925179,
      "grad_norm": 0.08194943517446518,
      "learning_rate": 2.527474651211089e-05,
      "loss": 0.1296,
      "step": 630
    },
    {
      "epoch": 1.5638000620925179,
      "eval_loss": 0.11857092380523682,
      "eval_runtime": 38.7225,
      "eval_samples_per_second": 4.39,
      "eval_steps_per_second": 4.39,
      "step": 630
    },
    {
      "epoch": 1.5886370692331573,
      "grad_norm": 0.08808406442403793,
      "learning_rate": 2.260947674010372e-05,
      "loss": 0.1299,
      "step": 640
    },
    {
      "epoch": 1.613474076373797,
      "grad_norm": 0.08768365532159805,
      "learning_rate": 2.0074704897896558e-05,
      "loss": 0.1242,
      "step": 650
    },
    {
      "epoch": 1.6383110835144366,
      "grad_norm": 0.09054244309663773,
      "learning_rate": 1.767470518424129e-05,
      "loss": 0.1167,
      "step": 660
    },
    {
      "epoch": 1.6383110835144366,
      "eval_loss": 0.11811664700508118,
      "eval_runtime": 38.7626,
      "eval_samples_per_second": 4.386,
      "eval_steps_per_second": 4.386,
      "step": 660
    },
    {
      "epoch": 1.663148090655076,
      "grad_norm": 0.06916210800409317,
      "learning_rate": 1.541352454159237e-05,
      "loss": 0.1286,
      "step": 670
    },
    {
      "epoch": 1.6879850977957156,
      "grad_norm": 0.08965995907783508,
      "learning_rate": 1.3294975832046353e-05,
      "loss": 0.1293,
      "step": 680
    },
    {
      "epoch": 1.7128221049363552,
      "grad_norm": 0.09365396201610565,
      "learning_rate": 1.1322631407993811e-05,
      "loss": 0.128,
      "step": 690
    },
    {
      "epoch": 1.7128221049363552,
      "eval_loss": 0.1179969310760498,
      "eval_runtime": 38.7285,
      "eval_samples_per_second": 4.39,
      "eval_steps_per_second": 4.39,
      "step": 690
    },
    {
      "epoch": 1.7376591120769946,
      "grad_norm": 0.06408954411745071,
      "learning_rate": 9.499817088325102e-06,
      "loss": 0.1292,
      "step": 700
    },
    {
      "epoch": 1.7624961192176343,
      "grad_norm": 0.08411859720945358,
      "learning_rate": 7.829606550347313e-06,
      "loss": 0.1238,
      "step": 710
    },
    {
      "epoch": 1.787333126358274,
      "grad_norm": 0.08746035397052765,
      "learning_rate": 6.314816146868952e-06,
      "loss": 0.1354,
      "step": 720
    },
    {
      "epoch": 1.787333126358274,
      "eval_loss": 0.11767658591270447,
      "eval_runtime": 38.9009,
      "eval_samples_per_second": 4.37,
      "eval_steps_per_second": 4.37,
      "step": 720
    },
    {
      "epoch": 1.8121701334989133,
      "grad_norm": 0.10084281116724014,
      "learning_rate": 4.958000157192022e-06,
      "loss": 0.1277,
      "step": 730
    },
    {
      "epoch": 1.837007140639553,
      "grad_norm": 0.08554716408252716,
      "learning_rate": 3.761446480019315e-06,
      "loss": 0.1287,
      "step": 740
    },
    {
      "epoch": 1.8618441477801926,
      "grad_norm": 0.08761674165725708,
      "learning_rate": 2.7271727755395214e-06,
      "loss": 0.1289,
      "step": 750
    },
    {
      "epoch": 1.8618441477801926,
      "eval_loss": 0.11756357550621033,
      "eval_runtime": 38.7293,
      "eval_samples_per_second": 4.389,
      "eval_steps_per_second": 4.389,
      "step": 750
    },
    {
      "epoch": 1.886681154920832,
      "grad_norm": 0.08505561947822571,
      "learning_rate": 1.8569230631958256e-06,
      "loss": 0.1245,
      "step": 760
    },
    {
      "epoch": 1.9115181620614716,
      "grad_norm": 0.08099253475666046,
      "learning_rate": 1.1521647808744873e-06,
      "loss": 0.1215,
      "step": 770
    },
    {
      "epoch": 1.9363551692021113,
      "grad_norm": 0.08459154516458511,
      "learning_rate": 6.140863104726391e-07,
      "loss": 0.13,
      "step": 780
    },
    {
      "epoch": 1.9363551692021113,
      "eval_loss": 0.11756289005279541,
      "eval_runtime": 38.7944,
      "eval_samples_per_second": 4.382,
      "eval_steps_per_second": 4.382,
      "step": 780
    },
    {
      "epoch": 1.9611921763427507,
      "grad_norm": 0.08602018654346466,
      "learning_rate": 2.4359497401758024e-07,
      "loss": 0.1288,
      "step": 790
    },
    {
      "epoch": 1.9860291834833903,
      "grad_norm": 0.08176976442337036,
      "learning_rate": 4.131550371655468e-08,
      "loss": 0.128,
      "step": 800
    }
  ],
  "logging_steps": 10,
  "max_steps": 806,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2864376597719245e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}