| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 30, | |
| "global_step": 1506, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0199501246882793, | |
| "grad_norm": 9.905351638793945, | |
| "learning_rate": 1.1920529801324503e-05, | |
| "loss": 2.4681, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0399002493765586, | |
| "grad_norm": 1.0738855600357056, | |
| "learning_rate": 2.5165562913907287e-05, | |
| "loss": 1.192, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.059850374064837904, | |
| "grad_norm": 0.9622387290000916, | |
| "learning_rate": 3.841059602649007e-05, | |
| "loss": 0.651, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.059850374064837904, | |
| "eval_loss": 0.5571362376213074, | |
| "eval_runtime": 48.7371, | |
| "eval_samples_per_second": 4.329, | |
| "eval_steps_per_second": 2.175, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0798004987531172, | |
| "grad_norm": 0.6049273610115051, | |
| "learning_rate": 5.165562913907285e-05, | |
| "loss": 0.5074, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09975062344139651, | |
| "grad_norm": 0.6833104491233826, | |
| "learning_rate": 6.490066225165563e-05, | |
| "loss": 0.3756, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.11970074812967581, | |
| "grad_norm": 0.5208126902580261, | |
| "learning_rate": 7.814569536423842e-05, | |
| "loss": 0.3065, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11970074812967581, | |
| "eval_loss": 0.25738316774368286, | |
| "eval_runtime": 48.1494, | |
| "eval_samples_per_second": 4.382, | |
| "eval_steps_per_second": 2.201, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1396508728179551, | |
| "grad_norm": 0.5192306041717529, | |
| "learning_rate": 9.13907284768212e-05, | |
| "loss": 0.2578, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1596009975062344, | |
| "grad_norm": 0.5098096132278442, | |
| "learning_rate": 0.00010463576158940399, | |
| "loss": 0.2096, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.17955112219451372, | |
| "grad_norm": 0.5306766629219055, | |
| "learning_rate": 0.00011788079470198677, | |
| "loss": 0.1899, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.17955112219451372, | |
| "eval_loss": 0.1828010529279709, | |
| "eval_runtime": 48.0843, | |
| "eval_samples_per_second": 4.388, | |
| "eval_steps_per_second": 2.204, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.19950124688279303, | |
| "grad_norm": 0.3457268178462982, | |
| "learning_rate": 0.00013112582781456955, | |
| "loss": 0.1848, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2194513715710723, | |
| "grad_norm": 0.47407814860343933, | |
| "learning_rate": 0.00014437086092715232, | |
| "loss": 0.1736, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.23940149625935161, | |
| "grad_norm": 0.3515409827232361, | |
| "learning_rate": 0.00015761589403973512, | |
| "loss": 0.1676, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.23940149625935161, | |
| "eval_loss": 0.1554146409034729, | |
| "eval_runtime": 48.3317, | |
| "eval_samples_per_second": 4.366, | |
| "eval_steps_per_second": 2.193, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2593516209476309, | |
| "grad_norm": 0.4131653904914856, | |
| "learning_rate": 0.0001708609271523179, | |
| "loss": 0.1643, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2793017456359102, | |
| "grad_norm": 0.7977883219718933, | |
| "learning_rate": 0.0001841059602649007, | |
| "loss": 0.1543, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.29925187032418954, | |
| "grad_norm": 0.32713833451271057, | |
| "learning_rate": 0.00019735099337748346, | |
| "loss": 0.1569, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.29925187032418954, | |
| "eval_loss": 0.14544202387332916, | |
| "eval_runtime": 48.2076, | |
| "eval_samples_per_second": 4.377, | |
| "eval_steps_per_second": 2.199, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3192019950124688, | |
| "grad_norm": 0.3618054986000061, | |
| "learning_rate": 0.00019998279880250371, | |
| "loss": 0.1661, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.33915211970074816, | |
| "grad_norm": 0.5105156302452087, | |
| "learning_rate": 0.0001999129290795728, | |
| "loss": 0.1542, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.35910224438902744, | |
| "grad_norm": 0.3221360445022583, | |
| "learning_rate": 0.00019978935328445287, | |
| "loss": 0.1437, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.35910224438902744, | |
| "eval_loss": 0.1328973025083542, | |
| "eval_runtime": 48.1577, | |
| "eval_samples_per_second": 4.381, | |
| "eval_steps_per_second": 2.201, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3790523690773067, | |
| "grad_norm": 0.39137646555900574, | |
| "learning_rate": 0.000199612137842687, | |
| "loss": 0.1511, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.39900249376558605, | |
| "grad_norm": 0.29165342450141907, | |
| "learning_rate": 0.00019938137801267064, | |
| "loss": 0.1456, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.41895261845386533, | |
| "grad_norm": 0.2816756069660187, | |
| "learning_rate": 0.0001990971978344475, | |
| "loss": 0.1271, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.41895261845386533, | |
| "eval_loss": 0.13140253722667694, | |
| "eval_runtime": 47.9806, | |
| "eval_samples_per_second": 4.398, | |
| "eval_steps_per_second": 2.209, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4389027431421446, | |
| "grad_norm": 0.6542349457740784, | |
| "learning_rate": 0.00019875975006303435, | |
| "loss": 0.1409, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.45885286783042395, | |
| "grad_norm": 0.2651802599430084, | |
| "learning_rate": 0.00019836921608631114, | |
| "loss": 0.1474, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.47880299251870323, | |
| "grad_norm": 0.2070140242576599, | |
| "learning_rate": 0.00019792580582751935, | |
| "loss": 0.1369, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.47880299251870323, | |
| "eval_loss": 0.12214324623346329, | |
| "eval_runtime": 48.2216, | |
| "eval_samples_per_second": 4.376, | |
| "eval_steps_per_second": 2.198, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.49875311720698257, | |
| "grad_norm": 0.16217675805091858, | |
| "learning_rate": 0.00019742975763242248, | |
| "loss": 0.1342, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5187032418952618, | |
| "grad_norm": 0.17511174082756042, | |
| "learning_rate": 0.00019688133814118843, | |
| "loss": 0.1233, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5386533665835411, | |
| "grad_norm": 0.16075153648853302, | |
| "learning_rate": 0.0001962808421450624, | |
| "loss": 0.1333, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5386533665835411, | |
| "eval_loss": 0.11943639069795609, | |
| "eval_runtime": 48.0763, | |
| "eval_samples_per_second": 4.389, | |
| "eval_steps_per_second": 2.205, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5586034912718204, | |
| "grad_norm": 0.1652883142232895, | |
| "learning_rate": 0.00019562859242790853, | |
| "loss": 0.1308, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5785536159600998, | |
| "grad_norm": 0.2698552906513214, | |
| "learning_rate": 0.00019492493959270398, | |
| "loss": 0.1378, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5985037406483791, | |
| "grad_norm": 0.11757837980985641, | |
| "learning_rate": 0.00019417026187307985, | |
| "loss": 0.1186, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5985037406483791, | |
| "eval_loss": 0.11812838912010193, | |
| "eval_runtime": 48.1525, | |
| "eval_samples_per_second": 4.382, | |
| "eval_steps_per_second": 2.201, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6184538653366584, | |
| "grad_norm": 0.1514044553041458, | |
| "learning_rate": 0.00019336496493000985, | |
| "loss": 0.1329, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6384039900249376, | |
| "grad_norm": 0.1278185248374939, | |
| "learning_rate": 0.00019250948163375563, | |
| "loss": 0.1311, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6583541147132169, | |
| "grad_norm": 0.14280234277248383, | |
| "learning_rate": 0.00019160427183118674, | |
| "loss": 0.1281, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6583541147132169, | |
| "eval_loss": 0.11799625307321548, | |
| "eval_runtime": 48.1223, | |
| "eval_samples_per_second": 4.385, | |
| "eval_steps_per_second": 2.203, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6783042394014963, | |
| "grad_norm": 0.141315296292305, | |
| "learning_rate": 0.0001906498220985997, | |
| "loss": 0.1131, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6982543640897756, | |
| "grad_norm": 0.11978308856487274, | |
| "learning_rate": 0.0001896466454801692, | |
| "loss": 0.1294, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7182044887780549, | |
| "grad_norm": 0.2772483825683594, | |
| "learning_rate": 0.00018859528121217204, | |
| "loss": 0.1297, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7182044887780549, | |
| "eval_loss": 0.11690443754196167, | |
| "eval_runtime": 48.1421, | |
| "eval_samples_per_second": 4.383, | |
| "eval_steps_per_second": 2.202, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7381546134663342, | |
| "grad_norm": 0.13961242139339447, | |
| "learning_rate": 0.00018749629443313233, | |
| "loss": 0.1249, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.7581047381546134, | |
| "grad_norm": 0.17335286736488342, | |
| "learning_rate": 0.0001863502758800431, | |
| "loss": 0.1255, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.7780548628428927, | |
| "grad_norm": 0.15879972279071808, | |
| "learning_rate": 0.00018515784157082822, | |
| "loss": 0.1175, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7780548628428927, | |
| "eval_loss": 0.11526743322610855, | |
| "eval_runtime": 48.2061, | |
| "eval_samples_per_second": 4.377, | |
| "eval_steps_per_second": 2.199, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7980049875311721, | |
| "grad_norm": 0.11807694286108017, | |
| "learning_rate": 0.00018391963247321513, | |
| "loss": 0.1178, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8179551122194514, | |
| "grad_norm": 0.11275653541088104, | |
| "learning_rate": 0.00018263631416019617, | |
| "loss": 0.1195, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8379052369077307, | |
| "grad_norm": 0.12657803297042847, | |
| "learning_rate": 0.000181308576452264, | |
| "loss": 0.1182, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8379052369077307, | |
| "eval_loss": 0.11410157382488251, | |
| "eval_runtime": 48.3064, | |
| "eval_samples_per_second": 4.368, | |
| "eval_steps_per_second": 2.194, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.85785536159601, | |
| "grad_norm": 0.15267746150493622, | |
| "learning_rate": 0.00017993713304661322, | |
| "loss": 0.1216, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.8778054862842892, | |
| "grad_norm": 0.11593750864267349, | |
| "learning_rate": 0.00017852272113350767, | |
| "loss": 0.1329, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8977556109725686, | |
| "grad_norm": 0.12276951223611832, | |
| "learning_rate": 0.0001770661010000194, | |
| "loss": 0.1189, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8977556109725686, | |
| "eval_loss": 0.11431698501110077, | |
| "eval_runtime": 48.1715, | |
| "eval_samples_per_second": 4.38, | |
| "eval_steps_per_second": 2.2, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9177057356608479, | |
| "grad_norm": 0.11874907463788986, | |
| "learning_rate": 0.00017556805562135255, | |
| "loss": 0.1313, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9376558603491272, | |
| "grad_norm": 0.22391599416732788, | |
| "learning_rate": 0.00017402939023997157, | |
| "loss": 0.1127, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9576059850374065, | |
| "grad_norm": 0.09968144446611404, | |
| "learning_rate": 0.00017245093193276047, | |
| "loss": 0.118, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.9576059850374065, | |
| "eval_loss": 0.11578261852264404, | |
| "eval_runtime": 48.1681, | |
| "eval_samples_per_second": 4.38, | |
| "eval_steps_per_second": 2.201, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.9775561097256857, | |
| "grad_norm": 0.09711634367704391, | |
| "learning_rate": 0.00017083352916644494, | |
| "loss": 0.1184, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.9975062344139651, | |
| "grad_norm": 0.0991770550608635, | |
| "learning_rate": 0.0001691780513415173, | |
| "loss": 0.1215, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0159600997506235, | |
| "grad_norm": 0.09751348197460175, | |
| "learning_rate": 0.00016748538832490857, | |
| "loss": 0.1206, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0159600997506235, | |
| "eval_loss": 0.1116185411810875, | |
| "eval_runtime": 48.1046, | |
| "eval_samples_per_second": 4.386, | |
| "eval_steps_per_second": 2.204, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0359102244389027, | |
| "grad_norm": 0.10556904226541519, | |
| "learning_rate": 0.0001657564499716595, | |
| "loss": 0.1135, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.055860349127182, | |
| "grad_norm": 0.15284962952136993, | |
| "learning_rate": 0.00016399216563584736, | |
| "loss": 0.1218, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.0758104738154612, | |
| "grad_norm": 0.09085577726364136, | |
| "learning_rate": 0.00016219348367103132, | |
| "loss": 0.1171, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.0758104738154612, | |
| "eval_loss": 0.11126323789358139, | |
| "eval_runtime": 48.6749, | |
| "eval_samples_per_second": 4.335, | |
| "eval_steps_per_second": 2.178, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.0957605985037406, | |
| "grad_norm": 0.12901732325553894, | |
| "learning_rate": 0.00016036137092048525, | |
| "loss": 0.1188, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.11571072319202, | |
| "grad_norm": 0.09154735505580902, | |
| "learning_rate": 0.0001584968121974915, | |
| "loss": 0.1232, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.1356608478802992, | |
| "grad_norm": 0.10134833306074142, | |
| "learning_rate": 0.00015660080975597553, | |
| "loss": 0.1205, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.1356608478802992, | |
| "eval_loss": 0.11302559077739716, | |
| "eval_runtime": 48.709, | |
| "eval_samples_per_second": 4.332, | |
| "eval_steps_per_second": 2.176, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.1556109725685786, | |
| "grad_norm": 0.14130111038684845, | |
| "learning_rate": 0.00015467438275176568, | |
| "loss": 0.1214, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.1755610972568578, | |
| "grad_norm": 0.1193675845861435, | |
| "learning_rate": 0.0001527185666947675, | |
| "loss": 0.1173, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.1955112219451371, | |
| "grad_norm": 0.06070871651172638, | |
| "learning_rate": 0.00015073441289234745, | |
| "loss": 0.1189, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.1955112219451371, | |
| "eval_loss": 0.11231612414121628, | |
| "eval_runtime": 48.6333, | |
| "eval_samples_per_second": 4.339, | |
| "eval_steps_per_second": 2.18, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2154613466334165, | |
| "grad_norm": 0.08564524352550507, | |
| "learning_rate": 0.00014872298788422497, | |
| "loss": 0.1194, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.2354114713216957, | |
| "grad_norm": 0.1110881119966507, | |
| "learning_rate": 0.00014668537286917664, | |
| "loss": 0.117, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.255361596009975, | |
| "grad_norm": 0.10232548415660858, | |
| "learning_rate": 0.00014462266312386085, | |
| "loss": 0.1235, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.255361596009975, | |
| "eval_loss": 0.11166342347860336, | |
| "eval_runtime": 48.6028, | |
| "eval_samples_per_second": 4.341, | |
| "eval_steps_per_second": 2.181, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.2753117206982543, | |
| "grad_norm": 0.08676601946353912, | |
| "learning_rate": 0.00014253596741407507, | |
| "loss": 0.1281, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.2952618453865337, | |
| "grad_norm": 0.09077729284763336, | |
| "learning_rate": 0.0001404264073987623, | |
| "loss": 0.1242, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.315211970074813, | |
| "grad_norm": 0.3734651505947113, | |
| "learning_rate": 0.00013829511702708727, | |
| "loss": 0.1137, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.315211970074813, | |
| "eval_loss": 0.11021307855844498, | |
| "eval_runtime": 48.6287, | |
| "eval_samples_per_second": 4.339, | |
| "eval_steps_per_second": 2.18, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.3351620947630922, | |
| "grad_norm": 0.09780021756887436, | |
| "learning_rate": 0.00013614324192890592, | |
| "loss": 0.1219, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.3551122194513716, | |
| "grad_norm": 0.09793366491794586, | |
| "learning_rate": 0.00013397193879895671, | |
| "loss": 0.1046, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.3750623441396508, | |
| "grad_norm": 0.11061827093362808, | |
| "learning_rate": 0.00013178237477510374, | |
| "loss": 0.1174, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.3750623441396508, | |
| "eval_loss": 0.1095186397433281, | |
| "eval_runtime": 48.691, | |
| "eval_samples_per_second": 4.333, | |
| "eval_steps_per_second": 2.177, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.3950124688279302, | |
| "grad_norm": 0.11096978187561035, | |
| "learning_rate": 0.0001295757268109666, | |
| "loss": 0.1071, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.4149625935162096, | |
| "grad_norm": 0.10013638436794281, | |
| "learning_rate": 0.0001273531810432741, | |
| "loss": 0.1262, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.4349127182044887, | |
| "grad_norm": 0.12431971728801727, | |
| "learning_rate": 0.00012511593215428141, | |
| "loss": 0.1098, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.4349127182044887, | |
| "eval_loss": 0.10981423407793045, | |
| "eval_runtime": 48.7018, | |
| "eval_samples_per_second": 4.332, | |
| "eval_steps_per_second": 2.177, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.4548628428927681, | |
| "grad_norm": 0.13086780905723572, | |
| "learning_rate": 0.0001228651827295943, | |
| "loss": 0.1142, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.4748129675810473, | |
| "grad_norm": 0.08466003090143204, | |
| "learning_rate": 0.00012060214261174465, | |
| "loss": 0.1144, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.4947630922693267, | |
| "grad_norm": 0.11653965711593628, | |
| "learning_rate": 0.00011832802824986523, | |
| "loss": 0.1183, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.4947630922693267, | |
| "eval_loss": 0.10894475877285004, | |
| "eval_runtime": 48.6934, | |
| "eval_samples_per_second": 4.333, | |
| "eval_steps_per_second": 2.177, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.514713216957606, | |
| "grad_norm": 0.11816778779029846, | |
| "learning_rate": 0.00011604406204581346, | |
| "loss": 0.13, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.5346633416458852, | |
| "grad_norm": 0.09233927726745605, | |
| "learning_rate": 0.00011375147169709519, | |
| "loss": 0.109, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.5546134663341646, | |
| "grad_norm": 0.08808460831642151, | |
| "learning_rate": 0.00011145148953694195, | |
| "loss": 0.1145, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.5546134663341646, | |
| "eval_loss": 0.10957030206918716, | |
| "eval_runtime": 48.6129, | |
| "eval_samples_per_second": 4.34, | |
| "eval_steps_per_second": 2.18, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.5745635910224438, | |
| "grad_norm": 0.0909830778837204, | |
| "learning_rate": 0.00010914535187189654, | |
| "loss": 0.1115, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.5945137157107232, | |
| "grad_norm": 0.08932233601808548, | |
| "learning_rate": 0.00010683429831726252, | |
| "loss": 0.1191, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.6144638403990026, | |
| "grad_norm": 0.08355925977230072, | |
| "learning_rate": 0.0001045195711307756, | |
| "loss": 0.1127, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.6144638403990026, | |
| "eval_loss": 0.10913572460412979, | |
| "eval_runtime": 48.4393, | |
| "eval_samples_per_second": 4.356, | |
| "eval_steps_per_second": 2.188, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.6344139650872818, | |
| "grad_norm": 0.06819329410791397, | |
| "learning_rate": 0.00010220241454485406, | |
| "loss": 0.1098, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.654364089775561, | |
| "grad_norm": 0.6049116849899292, | |
| "learning_rate": 9.988407409778838e-05, | |
| "loss": 0.1141, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.6743142144638403, | |
| "grad_norm": 0.07323434203863144, | |
| "learning_rate": 9.756579596422839e-05, | |
| "loss": 0.1049, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.6743142144638403, | |
| "eval_loss": 0.10999356210231781, | |
| "eval_runtime": 48.6539, | |
| "eval_samples_per_second": 4.337, | |
| "eval_steps_per_second": 2.179, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.6942643391521197, | |
| "grad_norm": 0.10413742810487747, | |
| "learning_rate": 9.524882628532858e-05, | |
| "loss": 0.1158, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.714214463840399, | |
| "grad_norm": 0.08523685485124588, | |
| "learning_rate": 9.293441049891148e-05, | |
| "loss": 0.115, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.7341645885286783, | |
| "grad_norm": 0.12683580815792084, | |
| "learning_rate": 9.062379267000898e-05, | |
| "loss": 0.1185, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.7341645885286783, | |
| "eval_loss": 0.10940343141555786, | |
| "eval_runtime": 48.6011, | |
| "eval_samples_per_second": 4.341, | |
| "eval_steps_per_second": 2.181, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.7541147132169574, | |
| "grad_norm": 0.10133107006549835, | |
| "learning_rate": 8.831821482214159e-05, | |
| "loss": 0.1194, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.7740648379052368, | |
| "grad_norm": 0.08002304285764694, | |
| "learning_rate": 8.601891626969514e-05, | |
| "loss": 0.1178, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.7940149625935162, | |
| "grad_norm": 0.0782008096575737, | |
| "learning_rate": 8.372713295175352e-05, | |
| "loss": 0.1162, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.7940149625935162, | |
| "eval_loss": 0.10899555683135986, | |
| "eval_runtime": 48.522, | |
| "eval_samples_per_second": 4.349, | |
| "eval_steps_per_second": 2.185, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.8139650872817956, | |
| "grad_norm": 0.08133247494697571, | |
| "learning_rate": 8.14440967677461e-05, | |
| "loss": 0.1234, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.8339152119700748, | |
| "grad_norm": 0.08592060953378677, | |
| "learning_rate": 7.917103491526617e-05, | |
| "loss": 0.1073, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.853865336658354, | |
| "grad_norm": 0.07544530183076859, | |
| "learning_rate": 7.690916923041708e-05, | |
| "loss": 0.1072, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.853865336658354, | |
| "eval_loss": 0.10886865854263306, | |
| "eval_runtime": 48.4695, | |
| "eval_samples_per_second": 4.353, | |
| "eval_steps_per_second": 2.187, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.8738154613466333, | |
| "grad_norm": 0.10101612657308578, | |
| "learning_rate": 7.465971553104014e-05, | |
| "loss": 0.1109, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.8937655860349127, | |
| "grad_norm": 0.07430823147296906, | |
| "learning_rate": 7.242388296317757e-05, | |
| "loss": 0.108, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.9137157107231921, | |
| "grad_norm": 0.0772905945777893, | |
| "learning_rate": 7.020287335112179e-05, | |
| "loss": 0.1087, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.9137157107231921, | |
| "eval_loss": 0.10754832625389099, | |
| "eval_runtime": 48.5946, | |
| "eval_samples_per_second": 4.342, | |
| "eval_steps_per_second": 2.181, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.9336658354114713, | |
| "grad_norm": 0.08496296405792236, | |
| "learning_rate": 6.799788055140025e-05, | |
| "loss": 0.1203, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.9536159600997505, | |
| "grad_norm": 0.07446739822626114, | |
| "learning_rate": 6.58100898110432e-05, | |
| "loss": 0.1158, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.9735660847880299, | |
| "grad_norm": 0.08105003088712692, | |
| "learning_rate": 6.364067713047943e-05, | |
| "loss": 0.1184, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.9735660847880299, | |
| "eval_loss": 0.1074095070362091, | |
| "eval_runtime": 48.6824, | |
| "eval_samples_per_second": 4.334, | |
| "eval_steps_per_second": 2.177, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.9935162094763093, | |
| "grad_norm": 0.08692453801631927, | |
| "learning_rate": 6.149080863140208e-05, | |
| "loss": 0.1145, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.0119700748129676, | |
| "grad_norm": 0.0745435431599617, | |
| "learning_rate": 5.9361639929944867e-05, | |
| "loss": 0.1081, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.031920199501247, | |
| "grad_norm": 0.08805131912231445, | |
| "learning_rate": 5.7254315515505105e-05, | |
| "loss": 0.1092, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.031920199501247, | |
| "eval_loss": 0.10798249393701553, | |
| "eval_runtime": 48.9954, | |
| "eval_samples_per_second": 4.307, | |
| "eval_steps_per_second": 2.163, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.051870324189526, | |
| "grad_norm": 0.08542945981025696, | |
| "learning_rate": 5.5169968135547655e-05, | |
| "loss": 0.1185, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.0718204488778054, | |
| "grad_norm": 0.08667082339525223, | |
| "learning_rate": 5.310971818672077e-05, | |
| "loss": 0.1107, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.0917705735660848, | |
| "grad_norm": 0.0777161717414856, | |
| "learning_rate": 5.1074673112610584e-05, | |
| "loss": 0.1082, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.0917705735660848, | |
| "eval_loss": 0.10762733221054077, | |
| "eval_runtime": 49.1196, | |
| "eval_samples_per_second": 4.296, | |
| "eval_steps_per_second": 2.158, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.111720698254364, | |
| "grad_norm": 0.06888756155967712, | |
| "learning_rate": 4.906592680845829e-05, | |
| "loss": 0.1075, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.1316708229426435, | |
| "grad_norm": 0.09783507883548737, | |
| "learning_rate": 4.7084559033160135e-05, | |
| "loss": 0.1149, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.1516209476309225, | |
| "grad_norm": 0.09118674695491791, | |
| "learning_rate": 4.5131634828865845e-05, | |
| "loss": 0.1127, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.1516209476309225, | |
| "eval_loss": 0.10765193402767181, | |
| "eval_runtime": 49.1258, | |
| "eval_samples_per_second": 4.295, | |
| "eval_steps_per_second": 2.158, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.171571072319202, | |
| "grad_norm": 0.08113594353199005, | |
| "learning_rate": 4.320820394848794e-05, | |
| "loss": 0.1151, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.1915211970074813, | |
| "grad_norm": 0.08070435374975204, | |
| "learning_rate": 4.1315300291429174e-05, | |
| "loss": 0.1093, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.2114713216957607, | |
| "grad_norm": 0.07660377770662308, | |
| "learning_rate": 3.9453941347832146e-05, | |
| "loss": 0.1113, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.2114713216957607, | |
| "eval_loss": 0.10742755234241486, | |
| "eval_runtime": 48.9958, | |
| "eval_samples_per_second": 4.306, | |
| "eval_steps_per_second": 2.163, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.23142144638404, | |
| "grad_norm": 0.08561329543590546, | |
| "learning_rate": 3.762512765164895e-05, | |
| "loss": 0.1096, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.251371571072319, | |
| "grad_norm": 0.07384736835956573, | |
| "learning_rate": 3.5829842242825374e-05, | |
| "loss": 0.1071, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.2713216957605984, | |
| "grad_norm": 0.089817114174366, | |
| "learning_rate": 3.406905013888875e-05, | |
| "loss": 0.1128, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.2713216957605984, | |
| "eval_loss": 0.10679538547992706, | |
| "eval_runtime": 49.1277, | |
| "eval_samples_per_second": 4.295, | |
| "eval_steps_per_second": 2.158, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.291271820448878, | |
| "grad_norm": 0.0809311643242836, | |
| "learning_rate": 3.234369781622315e-05, | |
| "loss": 0.1055, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.311221945137157, | |
| "grad_norm": 0.11156892031431198, | |
| "learning_rate": 3.065471270131132e-05, | |
| "loss": 0.1118, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.3311720698254366, | |
| "grad_norm": 0.07891872525215149, | |
| "learning_rate": 2.9003002672216106e-05, | |
| "loss": 0.1146, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.3311720698254366, | |
| "eval_loss": 0.1066785454750061, | |
| "eval_runtime": 49.1277, | |
| "eval_samples_per_second": 4.295, | |
| "eval_steps_per_second": 2.158, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.3511221945137155, | |
| "grad_norm": 0.08137693256139755, | |
| "learning_rate": 2.738945557056999e-05, | |
| "loss": 0.1068, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.371072319201995, | |
| "grad_norm": 0.08419659733772278, | |
| "learning_rate": 2.5814938724334624e-05, | |
| "loss": 0.1039, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.3910224438902743, | |
| "grad_norm": 0.08609955757856369, | |
| "learning_rate": 2.4280298481587104e-05, | |
| "loss": 0.1138, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.3910224438902743, | |
| "eval_loss": 0.10642586648464203, | |
| "eval_runtime": 49.084, | |
| "eval_samples_per_second": 4.299, | |
| "eval_steps_per_second": 2.16, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.4109725685785537, | |
| "grad_norm": 0.10461217164993286, | |
| "learning_rate": 2.2786359755583632e-05, | |
| "loss": 0.1181, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.430922693266833, | |
| "grad_norm": 0.09914015233516693, | |
| "learning_rate": 2.133392558134483e-05, | |
| "loss": 0.1077, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.450872817955112, | |
| "grad_norm": 0.09925994277000427, | |
| "learning_rate": 1.992377668400136e-05, | |
| "loss": 0.1092, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.450872817955112, | |
| "eval_loss": 0.10633692890405655, | |
| "eval_runtime": 49.0203, | |
| "eval_samples_per_second": 4.304, | |
| "eval_steps_per_second": 2.162, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.4708229426433914, | |
| "grad_norm": 0.07879967987537384, | |
| "learning_rate": 1.855667105913176e-05, | |
| "loss": 0.105, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.490773067331671, | |
| "grad_norm": 0.08635237067937851, | |
| "learning_rate": 1.7233343565317928e-05, | |
| "loss": 0.1083, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.51072319201995, | |
| "grad_norm": 0.07894821465015411, | |
| "learning_rate": 1.5954505529137587e-05, | |
| "loss": 0.1025, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.51072319201995, | |
| "eval_loss": 0.10649015009403229, | |
| "eval_runtime": 49.129, | |
| "eval_samples_per_second": 4.295, | |
| "eval_steps_per_second": 2.158, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.5306733167082296, | |
| "grad_norm": 0.08609526604413986, | |
| "learning_rate": 1.472084436280582e-05, | |
| "loss": 0.1103, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.5506234413965085, | |
| "grad_norm": 0.12177404761314392, | |
| "learning_rate": 1.3533023194671057e-05, | |
| "loss": 0.1124, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.570573566084788, | |
| "grad_norm": 0.09495075047016144, | |
| "learning_rate": 1.2391680512764802e-05, | |
| "loss": 0.1099, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.570573566084788, | |
| "eval_loss": 0.10633713006973267, | |
| "eval_runtime": 49.2486, | |
| "eval_samples_per_second": 4.284, | |
| "eval_steps_per_second": 2.152, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.5905236907730673, | |
| "grad_norm": 0.08574570715427399, | |
| "learning_rate": 1.129742982159574e-05, | |
| "loss": 0.1173, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.6104738154613467, | |
| "grad_norm": 0.08771918714046478, | |
| "learning_rate": 1.0250859312373462e-05, | |
| "loss": 0.1102, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.630423940149626, | |
| "grad_norm": 0.07304377853870392, | |
| "learning_rate": 9.252531546838872e-06, | |
| "loss": 0.1046, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.630423940149626, | |
| "eval_loss": 0.10615119338035583, | |
| "eval_runtime": 48.6835, | |
| "eval_samples_per_second": 4.334, | |
| "eval_steps_per_second": 2.177, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.650374064837905, | |
| "grad_norm": 0.09657382220029831, | |
| "learning_rate": 8.30298315487098e-06, | |
| "loss": 0.1147, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.6703241895261844, | |
| "grad_norm": 0.09089631587266922, | |
| "learning_rate": 7.402724546032957e-06, | |
| "loss": 0.1102, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.690274314214464, | |
| "grad_norm": 0.09991210699081421, | |
| "learning_rate": 6.552239635212298e-06, | |
| "loss": 0.1202, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.690274314214464, | |
| "eval_loss": 0.10618466883897781, | |
| "eval_runtime": 48.5373, | |
| "eval_samples_per_second": 4.347, | |
| "eval_steps_per_second": 2.184, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.7102244389027432, | |
| "grad_norm": 0.13392846286296844, | |
| "learning_rate": 5.751985582502672e-06, | |
| "loss": 0.1176, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.7301745635910226, | |
| "grad_norm": 0.10461334884166718, | |
| "learning_rate": 5.002392547467194e-06, | |
| "loss": 0.1116, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.7501246882793016, | |
| "grad_norm": 0.0903896763920784, | |
| "learning_rate": 4.303863457915358e-06, | |
| "loss": 0.1048, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.7501246882793016, | |
| "eval_loss": 0.10620897263288498, | |
| "eval_runtime": 48.5684, | |
| "eval_samples_per_second": 4.344, | |
| "eval_steps_per_second": 2.182, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.770074812967581, | |
| "grad_norm": 0.09017899632453918, | |
| "learning_rate": 3.6567737933176916e-06, | |
| "loss": 0.1028, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.7900249376558603, | |
| "grad_norm": 0.08241838961839676, | |
| "learning_rate": 3.0614713829747456e-06, | |
| "loss": 0.1051, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.8099750623441397, | |
| "grad_norm": 0.12076063454151154, | |
| "learning_rate": 2.5182762190488873e-06, | |
| "loss": 0.1124, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.8099750623441397, | |
| "eval_loss": 0.10615089535713196, | |
| "eval_runtime": 48.4144, | |
| "eval_samples_per_second": 4.358, | |
| "eval_steps_per_second": 2.189, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.829925187032419, | |
| "grad_norm": 0.053800225257873535, | |
| "learning_rate": 2.0274802845593133e-06, | |
| "loss": 0.1149, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.849875311720698, | |
| "grad_norm": 0.08684830367565155, | |
| "learning_rate": 1.5893473964326832e-06, | |
| "loss": 0.1039, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.8698254364089775, | |
| "grad_norm": 0.10123500972986221, | |
| "learning_rate": 1.2041130636940678e-06, | |
| "loss": 0.1127, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.8698254364089775, | |
| "eval_loss": 0.10614271461963654, | |
| "eval_runtime": 48.6688, | |
| "eval_samples_per_second": 4.335, | |
| "eval_steps_per_second": 2.178, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.889775561097257, | |
| "grad_norm": 0.09655766934156418, | |
| "learning_rate": 8.71984360874023e-07, | |
| "loss": 0.1105, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.9097256857855363, | |
| "grad_norm": 0.08582356572151184, | |
| "learning_rate": 5.931398167000391e-07, | |
| "loss": 0.1038, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.9296758104738156, | |
| "grad_norm": 0.11041458696126938, | |
| "learning_rate": 3.677293181322594e-07, | |
| "loss": 0.1046, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.9296758104738156, | |
| "eval_loss": 0.10609422624111176, | |
| "eval_runtime": 48.6034, | |
| "eval_samples_per_second": 4.341, | |
| "eval_steps_per_second": 2.181, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.9496259351620946, | |
| "grad_norm": 0.08140657842159271, | |
| "learning_rate": 1.9587402979491797e-07, | |
| "loss": 0.1058, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.969576059850374, | |
| "grad_norm": 0.08797825127840042, | |
| "learning_rate": 7.766632884689262e-08, | |
| "loss": 0.1144, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.9895261845386534, | |
| "grad_norm": 0.1317855268716812, | |
| "learning_rate": 1.316975532625886e-08, | |
| "loss": 0.1069, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.9895261845386534, | |
| "eval_loss": 0.10605818033218384, | |
| "eval_runtime": 48.5753, | |
| "eval_samples_per_second": 4.344, | |
| "eval_steps_per_second": 2.182, | |
| "step": 1500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1506, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.6429055635399885e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |