{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9895261845386534, "eval_steps": 30, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0199501246882793, "grad_norm": 9.905351638793945, "learning_rate": 1.1920529801324503e-05, "loss": 2.4681, "step": 10 }, { "epoch": 0.0399002493765586, "grad_norm": 1.0738855600357056, "learning_rate": 2.5165562913907287e-05, "loss": 1.192, "step": 20 }, { "epoch": 0.059850374064837904, "grad_norm": 0.9622387290000916, "learning_rate": 3.841059602649007e-05, "loss": 0.651, "step": 30 }, { "epoch": 0.059850374064837904, "eval_loss": 0.5571362376213074, "eval_runtime": 48.7371, "eval_samples_per_second": 4.329, "eval_steps_per_second": 2.175, "step": 30 }, { "epoch": 0.0798004987531172, "grad_norm": 0.6049273610115051, "learning_rate": 5.165562913907285e-05, "loss": 0.5074, "step": 40 }, { "epoch": 0.09975062344139651, "grad_norm": 0.6833104491233826, "learning_rate": 6.490066225165563e-05, "loss": 0.3756, "step": 50 }, { "epoch": 0.11970074812967581, "grad_norm": 0.5208126902580261, "learning_rate": 7.814569536423842e-05, "loss": 0.3065, "step": 60 }, { "epoch": 0.11970074812967581, "eval_loss": 0.25738316774368286, "eval_runtime": 48.1494, "eval_samples_per_second": 4.382, "eval_steps_per_second": 2.201, "step": 60 }, { "epoch": 0.1396508728179551, "grad_norm": 0.5192306041717529, "learning_rate": 9.13907284768212e-05, "loss": 0.2578, "step": 70 }, { "epoch": 0.1596009975062344, "grad_norm": 0.5098096132278442, "learning_rate": 0.00010463576158940399, "loss": 0.2096, "step": 80 }, { "epoch": 0.17955112219451372, "grad_norm": 0.5306766629219055, "learning_rate": 0.00011788079470198677, "loss": 0.1899, "step": 90 }, { "epoch": 0.17955112219451372, "eval_loss": 0.1828010529279709, "eval_runtime": 48.0843, "eval_samples_per_second": 4.388, "eval_steps_per_second": 2.204, "step": 90 }, { "epoch": 0.19950124688279303, "grad_norm": 0.3457268178462982, "learning_rate": 0.00013112582781456955, "loss": 0.1848, "step": 100 }, { "epoch": 0.2194513715710723, "grad_norm": 0.47407814860343933, "learning_rate": 0.00014437086092715232, "loss": 0.1736, "step": 110 }, { "epoch": 0.23940149625935161, "grad_norm": 0.3515409827232361, "learning_rate": 0.00015761589403973512, "loss": 0.1676, "step": 120 }, { "epoch": 0.23940149625935161, "eval_loss": 0.1554146409034729, "eval_runtime": 48.3317, "eval_samples_per_second": 4.366, "eval_steps_per_second": 2.193, "step": 120 }, { "epoch": 0.2593516209476309, "grad_norm": 0.4131653904914856, "learning_rate": 0.0001708609271523179, "loss": 0.1643, "step": 130 }, { "epoch": 0.2793017456359102, "grad_norm": 0.7977883219718933, "learning_rate": 0.0001841059602649007, "loss": 0.1543, "step": 140 }, { "epoch": 0.29925187032418954, "grad_norm": 0.32713833451271057, "learning_rate": 0.00019735099337748346, "loss": 0.1569, "step": 150 }, { "epoch": 0.29925187032418954, "eval_loss": 0.14544202387332916, "eval_runtime": 48.2076, "eval_samples_per_second": 4.377, "eval_steps_per_second": 2.199, "step": 150 }, { "epoch": 0.3192019950124688, "grad_norm": 0.3618054986000061, "learning_rate": 0.00019998279880250371, "loss": 0.1661, "step": 160 }, { "epoch": 0.33915211970074816, "grad_norm": 0.5105156302452087, "learning_rate": 0.0001999129290795728, "loss": 0.1542, "step": 170 }, { "epoch": 0.35910224438902744, "grad_norm": 0.3221360445022583, "learning_rate": 0.00019978935328445287, "loss": 0.1437, "step": 180 }, { "epoch": 0.35910224438902744, "eval_loss": 0.1328973025083542, "eval_runtime": 48.1577, "eval_samples_per_second": 4.381, "eval_steps_per_second": 2.201, "step": 180 }, { "epoch": 0.3790523690773067, "grad_norm": 0.39137646555900574, "learning_rate": 0.000199612137842687, "loss": 0.1511, "step": 190 }, { "epoch": 0.39900249376558605, "grad_norm": 0.29165342450141907, "learning_rate": 0.00019938137801267064, "loss": 0.1456, "step": 200 }, { "epoch": 0.41895261845386533, "grad_norm": 0.2816756069660187, "learning_rate": 0.0001990971978344475, "loss": 0.1271, "step": 210 }, { "epoch": 0.41895261845386533, "eval_loss": 0.13140253722667694, "eval_runtime": 47.9806, "eval_samples_per_second": 4.398, "eval_steps_per_second": 2.209, "step": 210 }, { "epoch": 0.4389027431421446, "grad_norm": 0.6542349457740784, "learning_rate": 0.00019875975006303435, "loss": 0.1409, "step": 220 }, { "epoch": 0.45885286783042395, "grad_norm": 0.2651802599430084, "learning_rate": 0.00019836921608631114, "loss": 0.1474, "step": 230 }, { "epoch": 0.47880299251870323, "grad_norm": 0.2070140242576599, "learning_rate": 0.00019792580582751935, "loss": 0.1369, "step": 240 }, { "epoch": 0.47880299251870323, "eval_loss": 0.12214324623346329, "eval_runtime": 48.2216, "eval_samples_per_second": 4.376, "eval_steps_per_second": 2.198, "step": 240 }, { "epoch": 0.49875311720698257, "grad_norm": 0.16217675805091858, "learning_rate": 0.00019742975763242248, "loss": 0.1342, "step": 250 }, { "epoch": 0.5187032418952618, "grad_norm": 0.17511174082756042, "learning_rate": 0.00019688133814118843, "loss": 0.1233, "step": 260 }, { "epoch": 0.5386533665835411, "grad_norm": 0.16075153648853302, "learning_rate": 0.0001962808421450624, "loss": 0.1333, "step": 270 }, { "epoch": 0.5386533665835411, "eval_loss": 0.11943639069795609, "eval_runtime": 48.0763, "eval_samples_per_second": 4.389, "eval_steps_per_second": 2.205, "step": 270 }, { "epoch": 0.5586034912718204, "grad_norm": 0.1652883142232895, "learning_rate": 0.00019562859242790853, "loss": 0.1308, "step": 280 }, { "epoch": 0.5785536159600998, "grad_norm": 0.2698552906513214, "learning_rate": 0.00019492493959270398, "loss": 0.1378, "step": 290 }, { "epoch": 0.5985037406483791, "grad_norm": 0.11757837980985641, "learning_rate": 0.00019417026187307985, "loss": 0.1186, "step": 300 }, { "epoch": 0.5985037406483791, "eval_loss": 0.11812838912010193, "eval_runtime": 48.1525, "eval_samples_per_second": 4.382, "eval_steps_per_second": 2.201, "step": 300 }, { "epoch": 0.6184538653366584, "grad_norm": 0.1514044553041458, "learning_rate": 0.00019336496493000985, "loss": 0.1329, "step": 310 }, { "epoch": 0.6384039900249376, "grad_norm": 0.1278185248374939, "learning_rate": 0.00019250948163375563, "loss": 0.1311, "step": 320 }, { "epoch": 0.6583541147132169, "grad_norm": 0.14280234277248383, "learning_rate": 0.00019160427183118674, "loss": 0.1281, "step": 330 }, { "epoch": 0.6583541147132169, "eval_loss": 0.11799625307321548, "eval_runtime": 48.1223, "eval_samples_per_second": 4.385, "eval_steps_per_second": 2.203, "step": 330 }, { "epoch": 0.6783042394014963, "grad_norm": 0.141315296292305, "learning_rate": 0.0001906498220985997, "loss": 0.1131, "step": 340 }, { "epoch": 0.6982543640897756, "grad_norm": 0.11978308856487274, "learning_rate": 0.0001896466454801692, "loss": 0.1294, "step": 350 }, { "epoch": 0.7182044887780549, "grad_norm": 0.2772483825683594, "learning_rate": 0.00018859528121217204, "loss": 0.1297, "step": 360 }, { "epoch": 0.7182044887780549, "eval_loss": 0.11690443754196167, "eval_runtime": 48.1421, "eval_samples_per_second": 4.383, "eval_steps_per_second": 2.202, "step": 360 }, { "epoch": 0.7381546134663342, "grad_norm": 0.13961242139339447, "learning_rate": 0.00018749629443313233, "loss": 0.1249, "step": 370 }, { "epoch": 0.7581047381546134, "grad_norm": 0.17335286736488342, "learning_rate": 0.0001863502758800431, "loss": 0.1255, "step": 380 }, { "epoch": 0.7780548628428927, "grad_norm": 0.15879972279071808, "learning_rate": 0.00018515784157082822, "loss": 0.1175, "step": 390 }, { "epoch": 0.7780548628428927, "eval_loss": 0.11526743322610855, "eval_runtime": 48.2061, "eval_samples_per_second": 4.377, "eval_steps_per_second": 2.199, "step": 390 }, { "epoch": 0.7980049875311721, "grad_norm": 0.11807694286108017, "learning_rate": 0.00018391963247321513, "loss": 0.1178, "step": 400 }, { "epoch": 0.8179551122194514, "grad_norm": 0.11275653541088104, "learning_rate": 0.00018263631416019617, "loss": 0.1195, "step": 410 }, { "epoch": 0.8379052369077307, "grad_norm": 0.12657803297042847, "learning_rate": 0.000181308576452264, "loss": 0.1182, "step": 420 }, { "epoch": 0.8379052369077307, "eval_loss": 0.11410157382488251, "eval_runtime": 48.3064, "eval_samples_per_second": 4.368, "eval_steps_per_second": 2.194, "step": 420 }, { "epoch": 0.85785536159601, "grad_norm": 0.15267746150493622, "learning_rate": 0.00017993713304661322, "loss": 0.1216, "step": 430 }, { "epoch": 0.8778054862842892, "grad_norm": 0.11593750864267349, "learning_rate": 0.00017852272113350767, "loss": 0.1329, "step": 440 }, { "epoch": 0.8977556109725686, "grad_norm": 0.12276951223611832, "learning_rate": 0.0001770661010000194, "loss": 0.1189, "step": 450 }, { "epoch": 0.8977556109725686, "eval_loss": 0.11431698501110077, "eval_runtime": 48.1715, "eval_samples_per_second": 4.38, "eval_steps_per_second": 2.2, "step": 450 }, { "epoch": 0.9177057356608479, "grad_norm": 0.11874907463788986, "learning_rate": 0.00017556805562135255, "loss": 0.1313, "step": 460 }, { "epoch": 0.9376558603491272, "grad_norm": 0.22391599416732788, "learning_rate": 0.00017402939023997157, "loss": 0.1127, "step": 470 }, { "epoch": 0.9576059850374065, "grad_norm": 0.09968144446611404, "learning_rate": 0.00017245093193276047, "loss": 0.118, "step": 480 }, { "epoch": 0.9576059850374065, "eval_loss": 0.11578261852264404, "eval_runtime": 48.1681, "eval_samples_per_second": 4.38, "eval_steps_per_second": 2.201, "step": 480 }, { "epoch": 0.9775561097256857, "grad_norm": 0.09711634367704391, "learning_rate": 0.00017083352916644494, "loss": 0.1184, "step": 490 }, { "epoch": 0.9975062344139651, "grad_norm": 0.0991770550608635, "learning_rate": 0.0001691780513415173, "loss": 0.1215, "step": 500 }, { "epoch": 1.0159600997506235, "grad_norm": 0.09751348197460175, "learning_rate": 0.00016748538832490857, "loss": 0.1206, "step": 510 }, { "epoch": 1.0159600997506235, "eval_loss": 0.1116185411810875, "eval_runtime": 48.1046, "eval_samples_per_second": 4.386, "eval_steps_per_second": 2.204, "step": 510 }, { "epoch": 1.0359102244389027, "grad_norm": 0.10556904226541519, "learning_rate": 0.0001657564499716595, "loss": 0.1135, "step": 520 }, { "epoch": 1.055860349127182, "grad_norm": 0.15284962952136993, "learning_rate": 0.00016399216563584736, "loss": 0.1218, "step": 530 }, { "epoch": 1.0758104738154612, "grad_norm": 0.09085577726364136, "learning_rate": 0.00016219348367103132, "loss": 0.1171, "step": 540 }, { "epoch": 1.0758104738154612, "eval_loss": 0.11126323789358139, "eval_runtime": 48.6749, "eval_samples_per_second": 4.335, "eval_steps_per_second": 2.178, "step": 540 }, { "epoch": 1.0957605985037406, "grad_norm": 0.12901732325553894, "learning_rate": 0.00016036137092048525, "loss": 0.1188, "step": 550 }, { "epoch": 1.11571072319202, "grad_norm": 0.09154735505580902, "learning_rate": 0.0001584968121974915, "loss": 0.1232, "step": 560 }, { "epoch": 1.1356608478802992, "grad_norm": 0.10134833306074142, "learning_rate": 0.00015660080975597553, "loss": 0.1205, "step": 570 }, { "epoch": 1.1356608478802992, "eval_loss": 0.11302559077739716, "eval_runtime": 48.709, "eval_samples_per_second": 4.332, "eval_steps_per_second": 2.176, "step": 570 }, { "epoch": 1.1556109725685786, "grad_norm": 0.14130111038684845, "learning_rate": 0.00015467438275176568, "loss": 0.1214, "step": 580 }, { "epoch": 1.1755610972568578, "grad_norm": 0.1193675845861435, "learning_rate": 0.0001527185666947675, "loss": 0.1173, "step": 590 }, { "epoch": 1.1955112219451371, "grad_norm": 0.06070871651172638, "learning_rate": 0.00015073441289234745, "loss": 0.1189, "step": 600 }, { "epoch": 1.1955112219451371, "eval_loss": 0.11231612414121628, "eval_runtime": 48.6333, "eval_samples_per_second": 4.339, "eval_steps_per_second": 2.18, "step": 600 }, { "epoch": 1.2154613466334165, "grad_norm": 0.08564524352550507, "learning_rate": 0.00014872298788422497, "loss": 0.1194, "step": 610 }, { "epoch": 1.2354114713216957, "grad_norm": 0.1110881119966507, "learning_rate": 0.00014668537286917664, "loss": 0.117, "step": 620 }, { "epoch": 1.255361596009975, "grad_norm": 0.10232548415660858, "learning_rate": 0.00014462266312386085, "loss": 0.1235, "step": 630 }, { "epoch": 1.255361596009975, "eval_loss": 0.11166342347860336, "eval_runtime": 48.6028, "eval_samples_per_second": 4.341, "eval_steps_per_second": 2.181, "step": 630 }, { "epoch": 1.2753117206982543, "grad_norm": 0.08676601946353912, "learning_rate": 0.00014253596741407507, "loss": 0.1281, "step": 640 }, { "epoch": 1.2952618453865337, "grad_norm": 0.09077729284763336, "learning_rate": 0.0001404264073987623, "loss": 0.1242, "step": 650 }, { "epoch": 1.315211970074813, "grad_norm": 0.3734651505947113, "learning_rate": 0.00013829511702708727, "loss": 0.1137, "step": 660 }, { "epoch": 1.315211970074813, "eval_loss": 0.11021307855844498, "eval_runtime": 48.6287, "eval_samples_per_second": 4.339, "eval_steps_per_second": 2.18, "step": 660 }, { "epoch": 1.3351620947630922, "grad_norm": 0.09780021756887436, "learning_rate": 0.00013614324192890592, "loss": 0.1219, "step": 670 }, { "epoch": 1.3551122194513716, "grad_norm": 0.09793366491794586, "learning_rate": 0.00013397193879895671, "loss": 0.1046, "step": 680 }, { "epoch": 1.3750623441396508, "grad_norm": 0.11061827093362808, "learning_rate": 0.00013178237477510374, "loss": 0.1174, "step": 690 }, { "epoch": 1.3750623441396508, "eval_loss": 0.1095186397433281, "eval_runtime": 48.691, "eval_samples_per_second": 4.333, "eval_steps_per_second": 2.177, "step": 690 }, { "epoch": 1.3950124688279302, "grad_norm": 0.11096978187561035, "learning_rate": 0.0001295757268109666, "loss": 0.1071, "step": 700 }, { "epoch": 1.4149625935162096, "grad_norm": 0.10013638436794281, "learning_rate": 0.0001273531810432741, "loss": 0.1262, "step": 710 }, { "epoch": 1.4349127182044887, "grad_norm": 0.12431971728801727, "learning_rate": 0.00012511593215428141, "loss": 0.1098, "step": 720 }, { "epoch": 1.4349127182044887, "eval_loss": 0.10981423407793045, "eval_runtime": 48.7018, "eval_samples_per_second": 4.332, "eval_steps_per_second": 2.177, "step": 720 }, { "epoch": 1.4548628428927681, "grad_norm": 0.13086780905723572, "learning_rate": 0.0001228651827295943, "loss": 0.1142, "step": 730 }, { "epoch": 1.4748129675810473, "grad_norm": 0.08466003090143204, "learning_rate": 0.00012060214261174465, "loss": 0.1144, "step": 740 }, { "epoch": 1.4947630922693267, "grad_norm": 0.11653965711593628, "learning_rate": 0.00011832802824986523, "loss": 0.1183, "step": 750 }, { "epoch": 1.4947630922693267, "eval_loss": 0.10894475877285004, "eval_runtime": 48.6934, "eval_samples_per_second": 4.333, "eval_steps_per_second": 2.177, "step": 750 }, { "epoch": 1.514713216957606, "grad_norm": 0.11816778779029846, "learning_rate": 0.00011604406204581346, "loss": 0.13, "step": 760 }, { "epoch": 1.5346633416458852, "grad_norm": 0.09233927726745605, "learning_rate": 0.00011375147169709519, "loss": 0.109, "step": 770 }, { "epoch": 1.5546134663341646, "grad_norm": 0.08808460831642151, "learning_rate": 0.00011145148953694195, "loss": 0.1145, "step": 780 }, { "epoch": 1.5546134663341646, "eval_loss": 0.10957030206918716, "eval_runtime": 48.6129, "eval_samples_per_second": 4.34, "eval_steps_per_second": 2.18, "step": 780 }, { "epoch": 1.5745635910224438, "grad_norm": 0.0909830778837204, "learning_rate": 0.00010914535187189654, "loss": 0.1115, "step": 790 }, { "epoch": 1.5945137157107232, "grad_norm": 0.08932233601808548, "learning_rate": 0.00010683429831726252, "loss": 0.1191, "step": 800 }, { "epoch": 1.6144638403990026, "grad_norm": 0.08355925977230072, "learning_rate": 0.0001045195711307756, "loss": 0.1127, "step": 810 }, { "epoch": 1.6144638403990026, "eval_loss": 0.10913572460412979, "eval_runtime": 48.4393, "eval_samples_per_second": 4.356, "eval_steps_per_second": 2.188, "step": 810 }, { "epoch": 1.6344139650872818, "grad_norm": 0.06819329410791397, "learning_rate": 0.00010220241454485406, "loss": 0.1098, "step": 820 }, { "epoch": 1.654364089775561, "grad_norm": 0.6049116849899292, "learning_rate": 9.988407409778838e-05, "loss": 0.1141, "step": 830 }, { "epoch": 1.6743142144638403, "grad_norm": 0.07323434203863144, "learning_rate": 9.756579596422839e-05, "loss": 0.1049, "step": 840 }, { "epoch": 1.6743142144638403, "eval_loss": 0.10999356210231781, "eval_runtime": 48.6539, "eval_samples_per_second": 4.337, "eval_steps_per_second": 2.179, "step": 840 }, { "epoch": 1.6942643391521197, "grad_norm": 0.10413742810487747, "learning_rate": 9.524882628532858e-05, "loss": 0.1158, "step": 850 }, { "epoch": 1.714214463840399, "grad_norm": 0.08523685485124588, "learning_rate": 9.293441049891148e-05, "loss": 0.115, "step": 860 }, { "epoch": 1.7341645885286783, "grad_norm": 0.12683580815792084, "learning_rate": 9.062379267000898e-05, "loss": 0.1185, "step": 870 }, { "epoch": 1.7341645885286783, "eval_loss": 0.10940343141555786, "eval_runtime": 48.6011, "eval_samples_per_second": 4.341, "eval_steps_per_second": 2.181, "step": 870 }, { "epoch": 1.7541147132169574, "grad_norm": 0.10133107006549835, "learning_rate": 8.831821482214159e-05, "loss": 0.1194, "step": 880 }, { "epoch": 1.7740648379052368, "grad_norm": 0.08002304285764694, "learning_rate": 8.601891626969514e-05, "loss": 0.1178, "step": 890 }, { "epoch": 1.7940149625935162, "grad_norm": 0.0782008096575737, "learning_rate": 8.372713295175352e-05, "loss": 0.1162, "step": 900 }, { "epoch": 1.7940149625935162, "eval_loss": 0.10899555683135986, "eval_runtime": 48.522, "eval_samples_per_second": 4.349, "eval_steps_per_second": 2.185, "step": 900 }, { "epoch": 1.8139650872817956, "grad_norm": 0.08133247494697571, "learning_rate": 8.14440967677461e-05, "loss": 0.1234, "step": 910 }, { "epoch": 1.8339152119700748, "grad_norm": 0.08592060953378677, "learning_rate": 7.917103491526617e-05, "loss": 0.1073, "step": 920 }, { "epoch": 1.853865336658354, "grad_norm": 0.07544530183076859, "learning_rate": 7.690916923041708e-05, "loss": 0.1072, "step": 930 }, { "epoch": 1.853865336658354, "eval_loss": 0.10886865854263306, "eval_runtime": 48.4695, "eval_samples_per_second": 4.353, "eval_steps_per_second": 2.187, "step": 930 }, { "epoch": 1.8738154613466333, "grad_norm": 0.10101612657308578, "learning_rate": 7.465971553104014e-05, "loss": 0.1109, "step": 940 }, { "epoch": 1.8937655860349127, "grad_norm": 0.07430823147296906, "learning_rate": 7.242388296317757e-05, "loss": 0.108, "step": 950 }, { "epoch": 1.9137157107231921, "grad_norm": 0.0772905945777893, "learning_rate": 7.020287335112179e-05, "loss": 0.1087, "step": 960 }, { "epoch": 1.9137157107231921, "eval_loss": 0.10754832625389099, "eval_runtime": 48.5946, "eval_samples_per_second": 4.342, "eval_steps_per_second": 2.181, "step": 960 }, { "epoch": 1.9336658354114713, "grad_norm": 0.08496296405792236, "learning_rate": 6.799788055140025e-05, "loss": 0.1203, "step": 970 }, { "epoch": 1.9536159600997505, "grad_norm": 0.07446739822626114, "learning_rate": 6.58100898110432e-05, "loss": 0.1158, "step": 980 }, { "epoch": 1.9735660847880299, "grad_norm": 0.08105003088712692, "learning_rate": 6.364067713047943e-05, "loss": 0.1184, "step": 990 }, { "epoch": 1.9735660847880299, "eval_loss": 0.1074095070362091, "eval_runtime": 48.6824, "eval_samples_per_second": 4.334, "eval_steps_per_second": 2.177, "step": 990 }, { "epoch": 1.9935162094763093, "grad_norm": 0.08692453801631927, "learning_rate": 6.149080863140208e-05, "loss": 0.1145, "step": 1000 }, { "epoch": 2.0119700748129676, "grad_norm": 0.0745435431599617, "learning_rate": 5.9361639929944867e-05, "loss": 0.1081, "step": 1010 }, { "epoch": 2.031920199501247, "grad_norm": 0.08805131912231445, "learning_rate": 5.7254315515505105e-05, "loss": 0.1092, "step": 1020 }, { "epoch": 2.031920199501247, "eval_loss": 0.10798249393701553, "eval_runtime": 48.9954, "eval_samples_per_second": 4.307, "eval_steps_per_second": 2.163, "step": 1020 }, { "epoch": 2.051870324189526, "grad_norm": 0.08542945981025696, "learning_rate": 5.5169968135547655e-05, "loss": 0.1185, "step": 1030 }, { "epoch": 2.0718204488778054, "grad_norm": 0.08667082339525223, "learning_rate": 5.310971818672077e-05, "loss": 0.1107, "step": 1040 }, { "epoch": 2.0917705735660848, "grad_norm": 0.0777161717414856, "learning_rate": 5.1074673112610584e-05, "loss": 0.1082, "step": 1050 }, { "epoch": 2.0917705735660848, "eval_loss": 0.10762733221054077, "eval_runtime": 49.1196, "eval_samples_per_second": 4.296, "eval_steps_per_second": 2.158, "step": 1050 }, { "epoch": 2.111720698254364, "grad_norm": 0.06888756155967712, "learning_rate": 4.906592680845829e-05, "loss": 0.1075, "step": 1060 }, { "epoch": 2.1316708229426435, "grad_norm": 0.09783507883548737, "learning_rate": 4.7084559033160135e-05, "loss": 0.1149, "step": 1070 }, { "epoch": 2.1516209476309225, "grad_norm": 0.09118674695491791, "learning_rate": 4.5131634828865845e-05, "loss": 0.1127, "step": 1080 }, { "epoch": 2.1516209476309225, "eval_loss": 0.10765193402767181, "eval_runtime": 49.1258, "eval_samples_per_second": 4.295, "eval_steps_per_second": 2.158, "step": 1080 }, { "epoch": 2.171571072319202, "grad_norm": 0.08113594353199005, "learning_rate": 4.320820394848794e-05, "loss": 0.1151, "step": 1090 }, { "epoch": 2.1915211970074813, "grad_norm": 0.08070435374975204, "learning_rate": 4.1315300291429174e-05, "loss": 0.1093, "step": 1100 }, { "epoch": 2.2114713216957607, "grad_norm": 0.07660377770662308, "learning_rate": 3.9453941347832146e-05, "loss": 0.1113, "step": 1110 }, { "epoch": 2.2114713216957607, "eval_loss": 0.10742755234241486, "eval_runtime": 48.9958, "eval_samples_per_second": 4.306, "eval_steps_per_second": 2.163, "step": 1110 }, { "epoch": 2.23142144638404, "grad_norm": 0.08561329543590546, "learning_rate": 3.762512765164895e-05, "loss": 0.1096, "step": 1120 }, { "epoch": 2.251371571072319, "grad_norm": 0.07384736835956573, "learning_rate": 3.5829842242825374e-05, "loss": 0.1071, "step": 1130 }, { "epoch": 2.2713216957605984, "grad_norm": 0.089817114174366, "learning_rate": 3.406905013888875e-05, "loss": 0.1128, "step": 1140 }, { "epoch": 2.2713216957605984, "eval_loss": 0.10679538547992706, "eval_runtime": 49.1277, "eval_samples_per_second": 4.295, "eval_steps_per_second": 2.158, "step": 1140 }, { "epoch": 2.291271820448878, "grad_norm": 0.0809311643242836, "learning_rate": 3.234369781622315e-05, "loss": 0.1055, "step": 1150 }, { "epoch": 2.311221945137157, "grad_norm": 0.11156892031431198, "learning_rate": 3.065471270131132e-05, "loss": 0.1118, "step": 1160 }, { "epoch": 2.3311720698254366, "grad_norm": 0.07891872525215149, "learning_rate": 2.9003002672216106e-05, "loss": 0.1146, "step": 1170 }, { "epoch": 2.3311720698254366, "eval_loss": 0.1066785454750061, "eval_runtime": 49.1277, "eval_samples_per_second": 4.295, "eval_steps_per_second": 2.158, "step": 1170 }, { "epoch": 2.3511221945137155, "grad_norm": 0.08137693256139755, "learning_rate": 2.738945557056999e-05, "loss": 0.1068, "step": 1180 }, { "epoch": 2.371072319201995, "grad_norm": 0.08419659733772278, "learning_rate": 2.5814938724334624e-05, "loss": 0.1039, "step": 1190 }, { "epoch": 2.3910224438902743, "grad_norm": 0.08609955757856369, "learning_rate": 2.4280298481587104e-05, "loss": 0.1138, "step": 1200 }, { "epoch": 2.3910224438902743, "eval_loss": 0.10642586648464203, "eval_runtime": 49.084, "eval_samples_per_second": 4.299, "eval_steps_per_second": 2.16, "step": 1200 }, { "epoch": 2.4109725685785537, "grad_norm": 0.10461217164993286, "learning_rate": 2.2786359755583632e-05, "loss": 0.1181, "step": 1210 }, { "epoch": 2.430922693266833, "grad_norm": 0.09914015233516693, "learning_rate": 2.133392558134483e-05, "loss": 0.1077, "step": 1220 }, { "epoch": 2.450872817955112, "grad_norm": 0.09925994277000427, "learning_rate": 1.992377668400136e-05, "loss": 0.1092, "step": 1230 }, { "epoch": 2.450872817955112, "eval_loss": 0.10633692890405655, "eval_runtime": 49.0203, "eval_samples_per_second": 4.304, "eval_steps_per_second": 2.162, "step": 1230 }, { "epoch": 2.4708229426433914, "grad_norm": 0.07879967987537384, "learning_rate": 1.855667105913176e-05, "loss": 0.105, "step": 1240 }, { "epoch": 2.490773067331671, "grad_norm": 0.08635237067937851, "learning_rate": 1.7233343565317928e-05, "loss": 0.1083, "step": 1250 }, { "epoch": 2.51072319201995, "grad_norm": 0.07894821465015411, "learning_rate": 1.5954505529137587e-05, "loss": 0.1025, "step": 1260 }, { "epoch": 2.51072319201995, "eval_loss": 0.10649015009403229, "eval_runtime": 49.129, "eval_samples_per_second": 4.295, "eval_steps_per_second": 2.158, "step": 1260 }, { "epoch": 2.5306733167082296, "grad_norm": 0.08609526604413986, "learning_rate": 1.472084436280582e-05, "loss": 0.1103, "step": 1270 }, { "epoch": 2.5506234413965085, "grad_norm": 0.12177404761314392, "learning_rate": 1.3533023194671057e-05, "loss": 0.1124, "step": 1280 }, { "epoch": 2.570573566084788, "grad_norm": 0.09495075047016144, "learning_rate": 1.2391680512764802e-05, "loss": 0.1099, "step": 1290 }, { "epoch": 2.570573566084788, "eval_loss": 0.10633713006973267, "eval_runtime": 49.2486, "eval_samples_per_second": 4.284, "eval_steps_per_second": 2.152, "step": 1290 }, { "epoch": 2.5905236907730673, "grad_norm": 0.08574570715427399, "learning_rate": 1.129742982159574e-05, "loss": 0.1173, "step": 1300 }, { "epoch": 2.6104738154613467, "grad_norm": 0.08771918714046478, "learning_rate": 1.0250859312373462e-05, "loss": 0.1102, "step": 1310 }, { "epoch": 2.630423940149626, "grad_norm": 0.07304377853870392, "learning_rate": 9.252531546838872e-06, "loss": 0.1046, "step": 1320 }, { "epoch": 2.630423940149626, "eval_loss": 0.10615119338035583, "eval_runtime": 48.6835, "eval_samples_per_second": 4.334, "eval_steps_per_second": 2.177, "step": 1320 }, { "epoch": 2.650374064837905, "grad_norm": 0.09657382220029831, "learning_rate": 8.30298315487098e-06, "loss": 0.1147, "step": 1330 }, { "epoch": 2.6703241895261844, "grad_norm": 0.09089631587266922, "learning_rate": 7.402724546032957e-06, "loss": 0.1102, "step": 1340 }, { "epoch": 2.690274314214464, "grad_norm": 0.09991210699081421, "learning_rate": 6.552239635212298e-06, "loss": 0.1202, "step": 1350 }, { "epoch": 2.690274314214464, "eval_loss": 0.10618466883897781, "eval_runtime": 48.5373, "eval_samples_per_second": 4.347, "eval_steps_per_second": 2.184, "step": 1350 }, { "epoch": 2.7102244389027432, "grad_norm": 0.13392846286296844, "learning_rate": 5.751985582502672e-06, "loss": 0.1176, "step": 1360 }, { "epoch": 2.7301745635910226, "grad_norm": 0.10461334884166718, "learning_rate": 5.002392547467194e-06, "loss": 0.1116, "step": 1370 }, { "epoch": 2.7501246882793016, "grad_norm": 0.0903896763920784, "learning_rate": 4.303863457915358e-06, "loss": 0.1048, "step": 1380 }, { "epoch": 2.7501246882793016, "eval_loss": 0.10620897263288498, "eval_runtime": 48.5684, "eval_samples_per_second": 4.344, "eval_steps_per_second": 2.182, "step": 1380 }, { "epoch": 2.770074812967581, "grad_norm": 0.09017899632453918, "learning_rate": 3.6567737933176916e-06, "loss": 0.1028, "step": 1390 }, { "epoch": 2.7900249376558603, "grad_norm": 0.08241838961839676, "learning_rate": 3.0614713829747456e-06, "loss": 0.1051, "step": 1400 }, { "epoch": 2.8099750623441397, "grad_norm": 0.12076063454151154, "learning_rate": 2.5182762190488873e-06, "loss": 0.1124, "step": 1410 }, { "epoch": 2.8099750623441397, "eval_loss": 0.10615089535713196, "eval_runtime": 48.4144, "eval_samples_per_second": 4.358, "eval_steps_per_second": 2.189, "step": 1410 }, { "epoch": 2.829925187032419, "grad_norm": 0.053800225257873535, "learning_rate": 2.0274802845593133e-06, "loss": 0.1149, "step": 1420 }, { "epoch": 2.849875311720698, "grad_norm": 0.08684830367565155, "learning_rate": 1.5893473964326832e-06, "loss": 0.1039, "step": 1430 }, { "epoch": 2.8698254364089775, "grad_norm": 0.10123500972986221, "learning_rate": 1.2041130636940678e-06, "loss": 0.1127, "step": 1440 }, { "epoch": 2.8698254364089775, "eval_loss": 0.10614271461963654, "eval_runtime": 48.6688, "eval_samples_per_second": 4.335, "eval_steps_per_second": 2.178, "step": 1440 }, { "epoch": 2.889775561097257, "grad_norm": 0.09655766934156418, "learning_rate": 8.71984360874023e-07, "loss": 0.1105, "step": 1450 }, { "epoch": 2.9097256857855363, "grad_norm": 0.08582356572151184, "learning_rate": 5.931398167000391e-07, "loss": 0.1038, "step": 1460 }, { "epoch": 2.9296758104738156, "grad_norm": 0.11041458696126938, "learning_rate": 3.677293181322594e-07, "loss": 0.1046, "step": 1470 }, { "epoch": 2.9296758104738156, "eval_loss": 0.10609422624111176, "eval_runtime": 48.6034, "eval_samples_per_second": 4.341, "eval_steps_per_second": 2.181, "step": 1470 }, { "epoch": 2.9496259351620946, "grad_norm": 0.08140657842159271, "learning_rate": 1.9587402979491797e-07, "loss": 0.1058, "step": 1480 }, { "epoch": 2.969576059850374, "grad_norm": 0.08797825127840042, "learning_rate": 7.766632884689262e-08, "loss": 0.1144, "step": 1490 }, { "epoch": 2.9895261845386534, "grad_norm": 0.1317855268716812, "learning_rate": 1.316975532625886e-08, "loss": 0.1069, "step": 1500 }, { "epoch": 2.9895261845386534, "eval_loss": 0.10605818033218384, "eval_runtime": 48.5753, "eval_samples_per_second": 4.344, "eval_steps_per_second": 2.182, "step": 1500 } ], "logging_steps": 10, "max_steps": 1506, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6345734237933363e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }