{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9785780813411984, "eval_steps": 30, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024837007140639553, "grad_norm": 8.359491348266602, "learning_rate": 1.487603305785124e-05, "loss": 2.6721, "step": 10 }, { "epoch": 0.04967401428127911, "grad_norm": 0.796862006187439, "learning_rate": 3.1404958677685955e-05, "loss": 1.0523, "step": 20 }, { "epoch": 0.07451102142191866, "grad_norm": 0.9021220803260803, "learning_rate": 4.793388429752066e-05, "loss": 0.6256, "step": 30 }, { "epoch": 0.07451102142191866, "eval_loss": 0.5602371096611023, "eval_runtime": 38.7856, "eval_samples_per_second": 4.383, "eval_steps_per_second": 4.383, "step": 30 }, { "epoch": 0.09934802856255821, "grad_norm": 0.5970816612243652, "learning_rate": 6.446280991735537e-05, "loss": 0.4598, "step": 40 }, { "epoch": 0.12418503570319776, "grad_norm": 0.6478993892669678, "learning_rate": 8.099173553719009e-05, "loss": 0.3539, "step": 50 }, { "epoch": 0.14902204284383733, "grad_norm": 0.4981272518634796, "learning_rate": 9.75206611570248e-05, "loss": 0.2828, "step": 60 }, { "epoch": 0.14902204284383733, "eval_loss": 0.28084230422973633, "eval_runtime": 38.8471, "eval_samples_per_second": 4.376, "eval_steps_per_second": 4.376, "step": 60 }, { "epoch": 0.17385904998447688, "grad_norm": 0.5028337836265564, "learning_rate": 0.0001140495867768595, "loss": 0.2523, "step": 70 }, { "epoch": 0.19869605712511643, "grad_norm": 0.7332940697669983, "learning_rate": 0.00013057851239669423, "loss": 0.2251, "step": 80 }, { "epoch": 0.22353306426575598, "grad_norm": 0.43022558093070984, "learning_rate": 0.00014710743801652894, "loss": 0.1988, "step": 90 }, { "epoch": 0.22353306426575598, "eval_loss": 0.19827328622341156, "eval_runtime": 38.898, "eval_samples_per_second": 4.37, "eval_steps_per_second": 4.37, "step": 90 }, { "epoch": 0.24837007140639553, "grad_norm": 0.33378636837005615, "learning_rate": 0.00016363636363636366, "loss": 0.1873, "step": 100 }, { "epoch": 0.2732070785470351, "grad_norm": 0.6605438590049744, "learning_rate": 0.00018016528925619835, "loss": 0.1663, "step": 110 }, { "epoch": 0.29804408568767465, "grad_norm": 0.26731953024864197, "learning_rate": 0.0001966942148760331, "loss": 0.1704, "step": 120 }, { "epoch": 0.29804408568767465, "eval_loss": 0.16433002054691315, "eval_runtime": 38.9658, "eval_samples_per_second": 4.363, "eval_steps_per_second": 4.363, "step": 120 }, { "epoch": 0.3228810928283142, "grad_norm": 0.33725589513778687, "learning_rate": 0.00019997332081116373, "loss": 0.166, "step": 130 }, { "epoch": 0.34771809996895375, "grad_norm": 0.41751629114151, "learning_rate": 0.00019986496100395275, "loss": 0.1693, "step": 140 }, { "epoch": 0.3725551071095933, "grad_norm": 0.34873735904693604, "learning_rate": 0.000199673343399533, "loss": 0.1571, "step": 150 }, { "epoch": 0.3725551071095933, "eval_loss": 0.15006589889526367, "eval_runtime": 38.8567, "eval_samples_per_second": 4.375, "eval_steps_per_second": 4.375, "step": 150 }, { "epoch": 0.39739211425023285, "grad_norm": 0.31080615520477295, "learning_rate": 0.00019939862775022893, "loss": 0.1608, "step": 160 }, { "epoch": 0.4222291213908724, "grad_norm": 0.18451546132564545, "learning_rate": 0.0001990410430875205, "loss": 0.1477, "step": 170 }, { "epoch": 0.44706612853151195, "grad_norm": 0.21936577558517456, "learning_rate": 0.00019860088753109896, "loss": 0.1582, "step": 180 }, { "epoch": 0.44706612853151195, "eval_loss": 0.14357255399227142, "eval_runtime": 38.8454, "eval_samples_per_second": 4.376, "eval_steps_per_second": 4.376, "step": 180 }, { "epoch": 0.47190313567215153, "grad_norm": 0.16537490487098694, "learning_rate": 0.00019807852804032305, "loss": 0.1474, "step": 190 }, { "epoch": 0.49674014281279105, "grad_norm": 0.21974419057369232, "learning_rate": 0.00019747440010828383, "loss": 0.1475, "step": 200 }, { "epoch": 0.5215771499534306, "grad_norm": 0.26473143696784973, "learning_rate": 0.00019678900739873226, "loss": 0.1443, "step": 210 }, { "epoch": 0.5215771499534306, "eval_loss": 0.13610073924064636, "eval_runtime": 38.8034, "eval_samples_per_second": 4.381, "eval_steps_per_second": 4.381, "step": 210 }, { "epoch": 0.5464141570940702, "grad_norm": 0.24532222747802734, "learning_rate": 0.000196022921326173, "loss": 0.1418, "step": 220 }, { "epoch": 0.5712511642347097, "grad_norm": 0.189656063914299, "learning_rate": 0.00019517678057947384, "loss": 0.1463, "step": 230 }, { "epoch": 0.5960881713753493, "grad_norm": 0.16695067286491394, "learning_rate": 0.00019425129058938832, "loss": 0.1518, "step": 240 }, { "epoch": 0.5960881713753493, "eval_loss": 0.12985987961292267, "eval_runtime": 38.9149, "eval_samples_per_second": 4.369, "eval_steps_per_second": 4.369, "step": 240 }, { "epoch": 0.6209251785159888, "grad_norm": 0.20477107167243958, "learning_rate": 0.00019324722294043558, "loss": 0.1365, "step": 250 }, { "epoch": 0.6457621856566284, "grad_norm": 0.14411672949790955, "learning_rate": 0.00019216541472762735, "loss": 0.1396, "step": 260 }, { "epoch": 0.670599192797268, "grad_norm": 0.12490475922822952, "learning_rate": 0.0001910067678585786, "loss": 0.1462, "step": 270 }, { "epoch": 0.670599192797268, "eval_loss": 0.13094556331634521, "eval_runtime": 38.8929, "eval_samples_per_second": 4.371, "eval_steps_per_second": 4.371, "step": 270 }, { "epoch": 0.6954361999379075, "grad_norm": 0.2038726955652237, "learning_rate": 0.0001897722483015838, "loss": 0.1331, "step": 280 }, { "epoch": 0.720273207078547, "grad_norm": 0.1514689326286316, "learning_rate": 0.00018846288528028555, "loss": 0.138, "step": 290 }, { "epoch": 0.7451102142191866, "grad_norm": 0.22410112619400024, "learning_rate": 0.0001870797704156067, "loss": 0.1478, "step": 300 }, { "epoch": 0.7451102142191866, "eval_loss": 0.1277894824743271, "eval_runtime": 38.922, "eval_samples_per_second": 4.368, "eval_steps_per_second": 4.368, "step": 300 }, { "epoch": 0.7699472213598262, "grad_norm": 0.1832033395767212, "learning_rate": 0.00018562405681566216, "loss": 0.1431, "step": 310 }, { "epoch": 0.7947842285004657, "grad_norm": 0.11910755932331085, "learning_rate": 0.00018409695811440796, "loss": 0.1427, "step": 320 }, { "epoch": 0.8196212356411052, "grad_norm": 0.13471344113349915, "learning_rate": 0.00018249974745983023, "loss": 0.1471, "step": 330 }, { "epoch": 0.8196212356411052, "eval_loss": 0.12796619534492493, "eval_runtime": 38.9144, "eval_samples_per_second": 4.369, "eval_steps_per_second": 4.369, "step": 330 }, { "epoch": 0.8444582427817448, "grad_norm": 0.1409328728914261, "learning_rate": 0.00018083375645251684, "loss": 0.1413, "step": 340 }, { "epoch": 0.8692952499223844, "grad_norm": 0.26645776629447937, "learning_rate": 0.00017910037403549693, "loss": 0.1383, "step": 350 }, { "epoch": 0.8941322570630239, "grad_norm": 0.21028681099414825, "learning_rate": 0.0001773010453362737, "loss": 0.138, "step": 360 }, { "epoch": 0.8941322570630239, "eval_loss": 0.12485021352767944, "eval_runtime": 39.0821, "eval_samples_per_second": 4.35, "eval_steps_per_second": 4.35, "step": 360 }, { "epoch": 0.9189692642036634, "grad_norm": 0.13296136260032654, "learning_rate": 0.0001754372704620164, "loss": 0.1494, "step": 370 }, { "epoch": 0.9438062713443031, "grad_norm": 0.16724663972854614, "learning_rate": 0.00017351060324891502, "loss": 0.1391, "step": 380 }, { "epoch": 0.9686432784849426, "grad_norm": 0.11043282598257065, "learning_rate": 0.00017152264996674136, "loss": 0.1425, "step": 390 }, { "epoch": 0.9686432784849426, "eval_loss": 0.127302885055542, "eval_runtime": 38.9, "eval_samples_per_second": 4.37, "eval_steps_per_second": 4.37, "step": 390 }, { "epoch": 0.9934802856255821, "grad_norm": 0.28038299083709717, "learning_rate": 0.00016947506797969562, "loss": 0.1323, "step": 400 }, { "epoch": 1.0173859049984477, "grad_norm": 0.13846513628959656, "learning_rate": 0.00016736956436465573, "loss": 0.1375, "step": 410 }, { "epoch": 1.0422229121390871, "grad_norm": 0.13630150258541107, "learning_rate": 0.00016520789448798087, "loss": 0.138, "step": 420 }, { "epoch": 1.0422229121390871, "eval_loss": 0.12523552775382996, "eval_runtime": 38.8684, "eval_samples_per_second": 4.374, "eval_steps_per_second": 4.374, "step": 420 }, { "epoch": 1.0670599192797268, "grad_norm": 0.6552147269248962, "learning_rate": 0.00016299186054205577, "loss": 0.1347, "step": 430 }, { "epoch": 1.0918969264203664, "grad_norm": 0.14625756442546844, "learning_rate": 0.00016072331004279614, "loss": 0.1388, "step": 440 }, { "epoch": 1.1167339335610058, "grad_norm": 0.29366788268089294, "learning_rate": 0.00015840413428936767, "loss": 0.1237, "step": 450 }, { "epoch": 1.1167339335610058, "eval_loss": 0.1281893253326416, "eval_runtime": 38.858, "eval_samples_per_second": 4.375, "eval_steps_per_second": 4.375, "step": 450 }, { "epoch": 1.1415709407016454, "grad_norm": 0.4872620105743408, "learning_rate": 0.00015603626678740263, "loss": 0.1364, "step": 460 }, { "epoch": 1.166407947842285, "grad_norm": 0.14464133977890015, "learning_rate": 0.000153621681637029, "loss": 0.1354, "step": 470 }, { "epoch": 1.1912449549829245, "grad_norm": 0.11566495150327682, "learning_rate": 0.00015116239188705556, "loss": 0.1337, "step": 480 }, { "epoch": 1.1912449549829245, "eval_loss": 0.1254083812236786, "eval_runtime": 39.142, "eval_samples_per_second": 4.343, "eval_steps_per_second": 4.343, "step": 480 }, { "epoch": 1.2160819621235641, "grad_norm": 0.14898011088371277, "learning_rate": 0.00014866044785668563, "loss": 0.1383, "step": 490 }, { "epoch": 1.2409189692642038, "grad_norm": 0.09731869399547577, "learning_rate": 0.00014611793542615803, "loss": 0.1353, "step": 500 }, { "epoch": 1.2657559764048432, "grad_norm": 0.088087297976017, "learning_rate": 0.00014353697429774084, "loss": 0.1277, "step": 510 }, { "epoch": 1.2657559764048432, "eval_loss": 0.1225721463561058, "eval_runtime": 38.987, "eval_samples_per_second": 4.36, "eval_steps_per_second": 4.36, "step": 510 }, { "epoch": 1.2905929835454828, "grad_norm": 0.0956592932343483, "learning_rate": 0.0001409197162285275, "loss": 0.1334, "step": 520 }, { "epoch": 1.3154299906861224, "grad_norm": 0.09264901280403137, "learning_rate": 0.000138268343236509, "loss": 0.1355, "step": 530 }, { "epoch": 1.3402669978267618, "grad_norm": 0.0848374217748642, "learning_rate": 0.00013558506578141682, "loss": 0.1331, "step": 540 }, { "epoch": 1.3402669978267618, "eval_loss": 0.12285340577363968, "eval_runtime": 39.0259, "eval_samples_per_second": 4.356, "eval_steps_per_second": 4.356, "step": 540 }, { "epoch": 1.3651040049674015, "grad_norm": 0.13393981754779816, "learning_rate": 0.00013287212092185464, "loss": 0.1293, "step": 550 }, { "epoch": 1.389941012108041, "grad_norm": 0.1015724316239357, "learning_rate": 0.00013013177045025374, "loss": 0.1336, "step": 560 }, { "epoch": 1.4147780192486805, "grad_norm": 0.09620890021324158, "learning_rate": 0.0001273662990072083, "loss": 0.1327, "step": 570 }, { "epoch": 1.4147780192486805, "eval_loss": 0.12293345481157303, "eval_runtime": 39.0058, "eval_samples_per_second": 4.358, "eval_steps_per_second": 4.358, "step": 570 }, { "epoch": 1.4396150263893202, "grad_norm": 0.10039085894823074, "learning_rate": 0.00012457801217676182, "loss": 0.1358, "step": 580 }, { "epoch": 1.4644520335299596, "grad_norm": 0.0853387713432312, "learning_rate": 0.00012176923456423284, "loss": 0.1294, "step": 590 }, { "epoch": 1.4892890406705992, "grad_norm": 0.10647860914468765, "learning_rate": 0.00011894230785818284, "loss": 0.1344, "step": 600 }, { "epoch": 1.4892890406705992, "eval_loss": 0.12142268568277359, "eval_runtime": 38.984, "eval_samples_per_second": 4.361, "eval_steps_per_second": 4.361, "step": 600 }, { "epoch": 1.5141260478112386, "grad_norm": 0.10750167816877365, "learning_rate": 0.00011609958887814129, "loss": 0.1328, "step": 610 }, { "epoch": 1.5389630549518785, "grad_norm": 0.10130150616168976, "learning_rate": 0.00011324344760971671, "loss": 0.1305, "step": 620 }, { "epoch": 1.5638000620925179, "grad_norm": 0.08659154921770096, "learning_rate": 0.00011037626522873019, "loss": 0.1329, "step": 630 }, { "epoch": 1.5638000620925179, "eval_loss": 0.12055275589227676, "eval_runtime": 38.9483, "eval_samples_per_second": 4.365, "eval_steps_per_second": 4.365, "step": 630 }, { "epoch": 1.5886370692331573, "grad_norm": 0.10068133473396301, "learning_rate": 0.00010750043211602045, "loss": 0.1332, "step": 640 }, { "epoch": 1.613474076373797, "grad_norm": 0.10626350343227386, "learning_rate": 0.00010461834586457398, "loss": 0.1265, "step": 650 }, { "epoch": 1.6383110835144366, "grad_norm": 0.08024556934833527, "learning_rate": 0.00010173240928064285, "loss": 0.1188, "step": 660 }, { "epoch": 1.6383110835144366, "eval_loss": 0.12125765532255173, "eval_runtime": 39.0777, "eval_samples_per_second": 4.35, "eval_steps_per_second": 4.35, "step": 660 }, { "epoch": 1.663148090655076, "grad_norm": 0.08063840121030807, "learning_rate": 9.884502838051595e-05, "loss": 0.1325, "step": 670 }, { "epoch": 1.6879850977957156, "grad_norm": 0.0849422737956047, "learning_rate": 9.595861038461398e-05, "loss": 0.1322, "step": 680 }, { "epoch": 1.7128221049363552, "grad_norm": 0.09707438945770264, "learning_rate": 9.307556171058085e-05, "loss": 0.1303, "step": 690 }, { "epoch": 1.7128221049363552, "eval_loss": 0.119967520236969, "eval_runtime": 38.9279, "eval_samples_per_second": 4.367, "eval_steps_per_second": 4.367, "step": 690 }, { "epoch": 1.7376591120769946, "grad_norm": 0.06874745339155197, "learning_rate": 9.019828596704394e-05, "loss": 0.1323, "step": 700 }, { "epoch": 1.7624961192176343, "grad_norm": 0.09782899171113968, "learning_rate": 8.732918194971664e-05, "loss": 0.1269, "step": 710 }, { "epoch": 1.787333126358274, "grad_norm": 0.08459767699241638, "learning_rate": 8.447064164151304e-05, "loss": 0.1383, "step": 720 }, { "epoch": 1.787333126358274, "eval_loss": 0.11947113275527954, "eval_runtime": 39.0046, "eval_samples_per_second": 4.358, "eval_steps_per_second": 4.358, "step": 720 }, { "epoch": 1.8121701334989133, "grad_norm": 0.10213370621204376, "learning_rate": 8.162504821834295e-05, "loss": 0.1306, "step": 730 }, { "epoch": 1.837007140639553, "grad_norm": 0.08585705608129501, "learning_rate": 7.879477406224894e-05, "loss": 0.1316, "step": 740 }, { "epoch": 1.8618441477801926, "grad_norm": 0.08754217624664307, "learning_rate": 7.598217878354237e-05, "loss": 0.1314, "step": 750 }, { "epoch": 1.8618441477801926, "eval_loss": 0.11994090676307678, "eval_runtime": 39.0279, "eval_samples_per_second": 4.356, "eval_steps_per_second": 4.356, "step": 750 }, { "epoch": 1.886681154920832, "grad_norm": 0.09176009893417358, "learning_rate": 7.318960725358741e-05, "loss": 0.1272, "step": 760 }, { "epoch": 1.9115181620614716, "grad_norm": 0.07954169064760208, "learning_rate": 7.041938764987297e-05, "loss": 0.1254, "step": 770 }, { "epoch": 1.9363551692021113, "grad_norm": 0.10337759554386139, "learning_rate": 6.767382951500204e-05, "loss": 0.1324, "step": 780 }, { "epoch": 1.9363551692021113, "eval_loss": 0.12009570002555847, "eval_runtime": 39.0135, "eval_samples_per_second": 4.357, "eval_steps_per_second": 4.357, "step": 780 }, { "epoch": 1.9611921763427507, "grad_norm": 0.0923035591840744, "learning_rate": 6.495522183121741e-05, "loss": 0.1312, "step": 790 }, { "epoch": 1.9860291834833903, "grad_norm": 0.08417502790689468, "learning_rate": 6.226583111206856e-05, "loss": 0.1308, "step": 800 }, { "epoch": 2.009934802856256, "grad_norm": 0.0744447335600853, "learning_rate": 5.960789951281052e-05, "loss": 0.1226, "step": 810 }, { "epoch": 2.009934802856256, "eval_loss": 0.11844488978385925, "eval_runtime": 39.1559, "eval_samples_per_second": 4.342, "eval_steps_per_second": 4.342, "step": 810 }, { "epoch": 2.0347718099968954, "grad_norm": 0.07120949774980545, "learning_rate": 5.698364296111056e-05, "loss": 0.1307, "step": 820 }, { "epoch": 2.059608817137535, "grad_norm": 0.10359616577625275, "learning_rate": 5.43952493096211e-05, "loss": 0.1235, "step": 830 }, { "epoch": 2.0844458242781743, "grad_norm": 0.07456395030021667, "learning_rate": 5.184487651195825e-05, "loss": 0.1251, "step": 840 }, { "epoch": 2.0844458242781743, "eval_loss": 0.11879772692918777, "eval_runtime": 39.037, "eval_samples_per_second": 4.355, "eval_steps_per_second": 4.355, "step": 840 }, { "epoch": 2.109282831418814, "grad_norm": 0.08009591698646545, "learning_rate": 4.933465082360807e-05, "loss": 0.1349, "step": 850 }, { "epoch": 2.1341198385594535, "grad_norm": 0.09120004624128342, "learning_rate": 4.686666502925908e-05, "loss": 0.1268, "step": 860 }, { "epoch": 2.158956845700093, "grad_norm": 0.06066734343767166, "learning_rate": 4.444297669803981e-05, "loss": 0.1259, "step": 870 }, { "epoch": 2.158956845700093, "eval_loss": 0.11877793818712234, "eval_runtime": 38.9985, "eval_samples_per_second": 4.359, "eval_steps_per_second": 4.359, "step": 870 }, { "epoch": 2.183793852840733, "grad_norm": 0.09893694519996643, "learning_rate": 4.206560646811545e-05, "loss": 0.1295, "step": 880 }, { "epoch": 2.208630859981372, "grad_norm": 0.0868426039814949, "learning_rate": 3.973653636207437e-05, "loss": 0.1193, "step": 890 }, { "epoch": 2.2334678671220116, "grad_norm": 0.0975928083062172, "learning_rate": 3.745770813450824e-05, "loss": 0.1282, "step": 900 }, { "epoch": 2.2334678671220116, "eval_loss": 0.1184425801038742, "eval_runtime": 38.8574, "eval_samples_per_second": 4.375, "eval_steps_per_second": 4.375, "step": 900 }, { "epoch": 2.2583048742626515, "grad_norm": 0.07092616707086563, "learning_rate": 3.523102165316381e-05, "loss": 0.1212, "step": 910 }, { "epoch": 2.283141881403291, "grad_norm": 0.0809750035405159, "learning_rate": 3.3058333315016065e-05, "loss": 0.1252, "step": 920 }, { "epoch": 2.3079788885439303, "grad_norm": 0.10307040810585022, "learning_rate": 3.094145449858285e-05, "loss": 0.1263, "step": 930 }, { "epoch": 2.3079788885439303, "eval_loss": 0.11812498420476913, "eval_runtime": 38.9098, "eval_samples_per_second": 4.369, "eval_steps_per_second": 4.369, "step": 930 }, { "epoch": 2.33281589568457, "grad_norm": 0.087245412170887, "learning_rate": 2.8882150053771995e-05, "loss": 0.1261, "step": 940 }, { "epoch": 2.3576529028252096, "grad_norm": 0.08954016864299774, "learning_rate": 2.688213683051892e-05, "loss": 0.13, "step": 950 }, { "epoch": 2.382489909965849, "grad_norm": 0.08771616220474243, "learning_rate": 2.4943082247442585e-05, "loss": 0.1232, "step": 960 }, { "epoch": 2.382489909965849, "eval_loss": 0.1186952143907547, "eval_runtime": 38.9954, "eval_samples_per_second": 4.359, "eval_steps_per_second": 4.359, "step": 960 }, { "epoch": 2.407326917106489, "grad_norm": 0.09044978022575378, "learning_rate": 2.3066602901712108e-05, "loss": 0.1269, "step": 970 }, { "epoch": 2.4321639242471282, "grad_norm": 0.07978509366512299, "learning_rate": 2.1254263221283654e-05, "loss": 0.1234, "step": 980 }, { "epoch": 2.4570009313877677, "grad_norm": 0.09153130650520325, "learning_rate": 1.950757416063077e-05, "loss": 0.1219, "step": 990 }, { "epoch": 2.4570009313877677, "eval_loss": 0.11810684204101562, "eval_runtime": 39.0556, "eval_samples_per_second": 4.353, "eval_steps_per_second": 4.353, "step": 990 }, { "epoch": 2.4818379385284075, "grad_norm": 0.09865284711122513, "learning_rate": 1.7827991941056177e-05, "loss": 0.1196, "step": 1000 }, { "epoch": 2.506674945669047, "grad_norm": 0.08960665762424469, "learning_rate": 1.621691683663418e-05, "loss": 0.1279, "step": 1010 }, { "epoch": 2.5315119528096863, "grad_norm": 0.09712927043437958, "learning_rate": 1.4675692006797137e-05, "loss": 0.125, "step": 1020 }, { "epoch": 2.5315119528096863, "eval_loss": 0.11813132464885712, "eval_runtime": 38.9216, "eval_samples_per_second": 4.368, "eval_steps_per_second": 4.368, "step": 1020 }, { "epoch": 2.556348959950326, "grad_norm": 0.10061313211917877, "learning_rate": 1.3205602376538163e-05, "loss": 0.128, "step": 1030 }, { "epoch": 2.5811859670909656, "grad_norm": 0.07715890556573868, "learning_rate": 1.1807873565164506e-05, "loss": 0.1231, "step": 1040 }, { "epoch": 2.606022974231605, "grad_norm": 0.09734167903661728, "learning_rate": 1.0483670864493778e-05, "loss": 0.133, "step": 1050 }, { "epoch": 2.606022974231605, "eval_loss": 0.11806084215641022, "eval_runtime": 39.1009, "eval_samples_per_second": 4.348, "eval_steps_per_second": 4.348, "step": 1050 }, { "epoch": 2.630859981372245, "grad_norm": 0.07243157923221588, "learning_rate": 9.234098267345958e-06, "loss": 0.1278, "step": 1060 }, { "epoch": 2.6556969885128843, "grad_norm": 0.12202966213226318, "learning_rate": 8.060197547140347e-06, "loss": 0.1228, "step": 1070 }, { "epoch": 2.6805339956535237, "grad_norm": 0.07589470595121384, "learning_rate": 6.962947389365071e-06, "loss": 0.1225, "step": 1080 }, { "epoch": 2.6805339956535237, "eval_loss": 0.11801353096961975, "eval_runtime": 39.1137, "eval_samples_per_second": 4.346, "eval_steps_per_second": 4.346, "step": 1080 }, { "epoch": 2.7053710027941635, "grad_norm": 0.11445856839418411, "learning_rate": 5.943262575643238e-06, "loss": 0.135, "step": 1090 }, { "epoch": 2.730208009934803, "grad_norm": 0.09918007999658585, "learning_rate": 5.001993221076162e-06, "loss": 0.1312, "step": 1100 }, { "epoch": 2.7550450170754424, "grad_norm": 0.08956551551818848, "learning_rate": 4.139924065499035e-06, "loss": 0.1188, "step": 1110 }, { "epoch": 2.7550450170754424, "eval_loss": 0.11794496327638626, "eval_runtime": 39.0409, "eval_samples_per_second": 4.354, "eval_steps_per_second": 4.354, "step": 1110 }, { "epoch": 2.779882024216082, "grad_norm": 0.09554298222064972, "learning_rate": 3.3577738192404395e-06, "loss": 0.124, "step": 1120 }, { "epoch": 2.8047190313567216, "grad_norm": 0.09960418939590454, "learning_rate": 2.656194563930714e-06, "loss": 0.1295, "step": 1130 }, { "epoch": 2.829556038497361, "grad_norm": 0.11168912798166275, "learning_rate": 2.035771208859194e-06, "loss": 0.1243, "step": 1140 }, { "epoch": 2.829556038497361, "eval_loss": 0.11804591119289398, "eval_runtime": 39.2098, "eval_samples_per_second": 4.336, "eval_steps_per_second": 4.336, "step": 1140 }, { "epoch": 2.854393045638001, "grad_norm": 0.08320163935422897, "learning_rate": 1.49702100333291e-06, "loss": 0.1238, "step": 1150 }, { "epoch": 2.8792300527786403, "grad_norm": 0.11576654016971588, "learning_rate": 1.0403931054440374e-06, "loss": 0.1206, "step": 1160 }, { "epoch": 2.9040670599192797, "grad_norm": 0.09662230312824249, "learning_rate": 6.662682076050031e-07, "loss": 0.1205, "step": 1170 }, { "epoch": 2.9040670599192797, "eval_loss": 0.11792467534542084, "eval_runtime": 39.0045, "eval_samples_per_second": 4.358, "eval_steps_per_second": 4.358, "step": 1170 }, { "epoch": 2.928904067059919, "grad_norm": 0.09648039191961288, "learning_rate": 3.7495821916382344e-07, "loss": 0.129, "step": 1180 }, { "epoch": 2.953741074200559, "grad_norm": 0.08923009783029556, "learning_rate": 1.6670600636403687e-07, "loss": 0.1237, "step": 1190 }, { "epoch": 2.9785780813411984, "grad_norm": 0.08443022519350052, "learning_rate": 4.168518986628067e-08, "loss": 0.1196, "step": 1200 }, { "epoch": 2.9785780813411984, "eval_loss": 0.11797202378511429, "eval_runtime": 39.051, "eval_samples_per_second": 4.353, "eval_steps_per_second": 4.353, "step": 1200 } ], "logging_steps": 10, "max_steps": 1209, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8653749005815194e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }