| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9928263988522237, | |
| "eval_steps": 500, | |
| "global_step": 1044, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.028694404591104734, | |
| "grad_norm": 6.9535064697265625, | |
| "learning_rate": 2.9993209101500904e-05, | |
| "loss": 0.8011, | |
| "num_input_tokens_seen": 26464, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05738880918220947, | |
| "grad_norm": 3.7668867111206055, | |
| "learning_rate": 2.997284255484393e-05, | |
| "loss": 0.1799, | |
| "num_input_tokens_seen": 53568, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08608321377331421, | |
| "grad_norm": 5.130955219268799, | |
| "learning_rate": 2.9938918800982563e-05, | |
| "loss": 0.1418, | |
| "num_input_tokens_seen": 79840, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.11477761836441894, | |
| "grad_norm": 5.7834391593933105, | |
| "learning_rate": 2.9891468556286034e-05, | |
| "loss": 0.1428, | |
| "num_input_tokens_seen": 106848, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.14347202295552366, | |
| "grad_norm": 3.7895705699920654, | |
| "learning_rate": 2.983053478472707e-05, | |
| "loss": 0.1297, | |
| "num_input_tokens_seen": 133600, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.17216642754662842, | |
| "grad_norm": 2.2129366397857666, | |
| "learning_rate": 2.975617265898004e-05, | |
| "loss": 0.1392, | |
| "num_input_tokens_seen": 159872, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.20086083213773315, | |
| "grad_norm": 5.727760314941406, | |
| "learning_rate": 2.9668449510464707e-05, | |
| "loss": 0.1418, | |
| "num_input_tokens_seen": 186912, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.22955523672883787, | |
| "grad_norm": 7.682604789733887, | |
| "learning_rate": 2.9567444768380745e-05, | |
| "loss": 0.1336, | |
| "num_input_tokens_seen": 213696, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2582496413199426, | |
| "grad_norm": 5.216188430786133, | |
| "learning_rate": 2.9453249887788343e-05, | |
| "loss": 0.0996, | |
| "num_input_tokens_seen": 240736, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.28694404591104733, | |
| "grad_norm": 4.260756015777588, | |
| "learning_rate": 2.9325968266799934e-05, | |
| "loss": 0.1102, | |
| "num_input_tokens_seen": 266816, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.31563845050215206, | |
| "grad_norm": 1.7236586809158325, | |
| "learning_rate": 2.918571515295803e-05, | |
| "loss": 0.1443, | |
| "num_input_tokens_seen": 293504, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.34433285509325684, | |
| "grad_norm": 3.35003399848938, | |
| "learning_rate": 2.9032617538884018e-05, | |
| "loss": 0.1224, | |
| "num_input_tokens_seen": 319872, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.37302725968436157, | |
| "grad_norm": 3.347268581390381, | |
| "learning_rate": 2.8866814047292232e-05, | |
| "loss": 0.1678, | |
| "num_input_tokens_seen": 346816, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4017216642754663, | |
| "grad_norm": 3.474947929382324, | |
| "learning_rate": 2.8688454805473647e-05, | |
| "loss": 0.1154, | |
| "num_input_tokens_seen": 374304, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.430416068866571, | |
| "grad_norm": 6.668752670288086, | |
| "learning_rate": 2.84977013093626e-05, | |
| "loss": 0.1285, | |
| "num_input_tokens_seen": 401568, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.45911047345767575, | |
| "grad_norm": 4.752432346343994, | |
| "learning_rate": 2.8294726277309815e-05, | |
| "loss": 0.1144, | |
| "num_input_tokens_seen": 428384, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4878048780487805, | |
| "grad_norm": 3.637117624282837, | |
| "learning_rate": 2.8079713493694024e-05, | |
| "loss": 0.0875, | |
| "num_input_tokens_seen": 454880, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5164992826398852, | |
| "grad_norm": 3.157396078109741, | |
| "learning_rate": 2.7852857642513838e-05, | |
| "loss": 0.151, | |
| "num_input_tokens_seen": 481856, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5451936872309899, | |
| "grad_norm": 2.444923162460327, | |
| "learning_rate": 2.7614364131110498e-05, | |
| "loss": 0.1153, | |
| "num_input_tokens_seen": 508192, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5738880918220947, | |
| "grad_norm": 3.9938931465148926, | |
| "learning_rate": 2.7364448904181152e-05, | |
| "loss": 0.0971, | |
| "num_input_tokens_seen": 534240, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6025824964131994, | |
| "grad_norm": 2.390864133834839, | |
| "learning_rate": 2.7103338248251055e-05, | |
| "loss": 0.1035, | |
| "num_input_tokens_seen": 561664, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6312769010043041, | |
| "grad_norm": 4.270871639251709, | |
| "learning_rate": 2.6831268586781746e-05, | |
| "loss": 0.0908, | |
| "num_input_tokens_seen": 588128, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6599713055954088, | |
| "grad_norm": 2.367370367050171, | |
| "learning_rate": 2.6548486266100645e-05, | |
| "loss": 0.1216, | |
| "num_input_tokens_seen": 614432, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6886657101865137, | |
| "grad_norm": 2.0463204383850098, | |
| "learning_rate": 2.6255247332346036e-05, | |
| "loss": 0.0952, | |
| "num_input_tokens_seen": 640832, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7173601147776184, | |
| "grad_norm": 3.1408205032348633, | |
| "learning_rate": 2.5951817299629266e-05, | |
| "loss": 0.1073, | |
| "num_input_tokens_seen": 668448, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7460545193687231, | |
| "grad_norm": 3.5036535263061523, | |
| "learning_rate": 2.5638470909624166e-05, | |
| "loss": 0.1048, | |
| "num_input_tokens_seen": 695008, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7747489239598279, | |
| "grad_norm": 2.891324996948242, | |
| "learning_rate": 2.531549188280135e-05, | |
| "loss": 0.1015, | |
| "num_input_tokens_seen": 722208, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8034433285509326, | |
| "grad_norm": 1.6332054138183594, | |
| "learning_rate": 2.498317266153262e-05, | |
| "loss": 0.0815, | |
| "num_input_tokens_seen": 749216, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8321377331420373, | |
| "grad_norm": 4.443371295928955, | |
| "learning_rate": 2.464181414529809e-05, | |
| "loss": 0.0843, | |
| "num_input_tokens_seen": 776000, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.860832137733142, | |
| "grad_norm": 3.0447335243225098, | |
| "learning_rate": 2.4291725418235848e-05, | |
| "loss": 0.1321, | |
| "num_input_tokens_seen": 802560, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8895265423242468, | |
| "grad_norm": 3.728531837463379, | |
| "learning_rate": 2.3933223469280704e-05, | |
| "loss": 0.0868, | |
| "num_input_tokens_seen": 828704, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9182209469153515, | |
| "grad_norm": 3.703484296798706, | |
| "learning_rate": 2.3566632905145604e-05, | |
| "loss": 0.1012, | |
| "num_input_tokens_seen": 855616, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9469153515064562, | |
| "grad_norm": 3.728098154067993, | |
| "learning_rate": 2.3192285656405456e-05, | |
| "loss": 0.1139, | |
| "num_input_tokens_seen": 882400, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.975609756097561, | |
| "grad_norm": 1.163082480430603, | |
| "learning_rate": 2.2810520676949537e-05, | |
| "loss": 0.1004, | |
| "num_input_tokens_seen": 909760, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.0028694404591105, | |
| "grad_norm": 3.1779849529266357, | |
| "learning_rate": 2.2421683637074648e-05, | |
| "loss": 0.1131, | |
| "num_input_tokens_seen": 934112, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0315638450502151, | |
| "grad_norm": 1.9103726148605347, | |
| "learning_rate": 2.2026126610496852e-05, | |
| "loss": 0.0896, | |
| "num_input_tokens_seen": 960320, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.06025824964132, | |
| "grad_norm": 2.4402358531951904, | |
| "learning_rate": 2.1624207755565232e-05, | |
| "loss": 0.0629, | |
| "num_input_tokens_seen": 986688, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.0889526542324246, | |
| "grad_norm": 3.0046820640563965, | |
| "learning_rate": 2.121629099096628e-05, | |
| "loss": 0.0658, | |
| "num_input_tokens_seen": 1013088, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.1176470588235294, | |
| "grad_norm": 2.693220376968384, | |
| "learning_rate": 2.0802745666212592e-05, | |
| "loss": 0.0777, | |
| "num_input_tokens_seen": 1039808, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.146341463414634, | |
| "grad_norm": 2.451016664505005, | |
| "learning_rate": 2.0383946227214188e-05, | |
| "loss": 0.0908, | |
| "num_input_tokens_seen": 1066144, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.175035868005739, | |
| "grad_norm": 0.8696689605712891, | |
| "learning_rate": 1.9960271877235306e-05, | |
| "loss": 0.0731, | |
| "num_input_tokens_seen": 1092576, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.2037302725968435, | |
| "grad_norm": 2.7256174087524414, | |
| "learning_rate": 1.953210623354359e-05, | |
| "loss": 0.0661, | |
| "num_input_tokens_seen": 1119104, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2324246771879483, | |
| "grad_norm": 2.751459836959839, | |
| "learning_rate": 1.909983698006266e-05, | |
| "loss": 0.0391, | |
| "num_input_tokens_seen": 1145376, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.2611190817790532, | |
| "grad_norm": 3.420569658279419, | |
| "learning_rate": 1.8663855516342468e-05, | |
| "loss": 0.0594, | |
| "num_input_tokens_seen": 1172736, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.2898134863701578, | |
| "grad_norm": 3.2319304943084717, | |
| "learning_rate": 1.8224556603165363e-05, | |
| "loss": 0.0492, | |
| "num_input_tokens_seen": 1199296, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.3185078909612624, | |
| "grad_norm": 6.221270561218262, | |
| "learning_rate": 1.7782338005108694e-05, | |
| "loss": 0.0465, | |
| "num_input_tokens_seen": 1226368, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.3472022955523673, | |
| "grad_norm": 1.9661167860031128, | |
| "learning_rate": 1.733760013038765e-05, | |
| "loss": 0.1173, | |
| "num_input_tokens_seen": 1252352, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.375896700143472, | |
| "grad_norm": 2.8439626693725586, | |
| "learning_rate": 1.689074566830434e-05, | |
| "loss": 0.0721, | |
| "num_input_tokens_seen": 1278912, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.4045911047345767, | |
| "grad_norm": 2.0552496910095215, | |
| "learning_rate": 1.6442179224631558e-05, | |
| "loss": 0.061, | |
| "num_input_tokens_seen": 1306176, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.4332855093256816, | |
| "grad_norm": 2.505808115005493, | |
| "learning_rate": 1.5992306955261175e-05, | |
| "loss": 0.0599, | |
| "num_input_tokens_seen": 1332608, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4619799139167862, | |
| "grad_norm": 1.2224159240722656, | |
| "learning_rate": 1.5541536198449044e-05, | |
| "loss": 0.0721, | |
| "num_input_tokens_seen": 1359520, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.490674318507891, | |
| "grad_norm": 1.0898300409317017, | |
| "learning_rate": 1.5090275105989284e-05, | |
| "loss": 0.0595, | |
| "num_input_tokens_seen": 1386368, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.5193687230989958, | |
| "grad_norm": 1.6561291217803955, | |
| "learning_rate": 1.463893227365195e-05, | |
| "loss": 0.0519, | |
| "num_input_tokens_seen": 1412992, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.5480631276901005, | |
| "grad_norm": 3.09395694732666, | |
| "learning_rate": 1.4187916371218739e-05, | |
| "loss": 0.0752, | |
| "num_input_tokens_seen": 1439616, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.576757532281205, | |
| "grad_norm": 3.0223190784454346, | |
| "learning_rate": 1.3737635772451642e-05, | |
| "loss": 0.0895, | |
| "num_input_tokens_seen": 1466464, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.60545193687231, | |
| "grad_norm": 2.7507283687591553, | |
| "learning_rate": 1.328849818532963e-05, | |
| "loss": 0.1118, | |
| "num_input_tokens_seen": 1493440, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.6341463414634148, | |
| "grad_norm": 1.4776769876480103, | |
| "learning_rate": 1.2840910282888211e-05, | |
| "loss": 0.0917, | |
| "num_input_tokens_seen": 1520256, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.6628407460545194, | |
| "grad_norm": 0.9096710681915283, | |
| "learning_rate": 1.2395277334996045e-05, | |
| "loss": 0.0797, | |
| "num_input_tokens_seen": 1547712, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.691535150645624, | |
| "grad_norm": 3.431025266647339, | |
| "learning_rate": 1.1952002841402057e-05, | |
| "loss": 0.0609, | |
| "num_input_tokens_seen": 1574528, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.7202295552367288, | |
| "grad_norm": 5.7561726570129395, | |
| "learning_rate": 1.1511488166385349e-05, | |
| "loss": 0.0602, | |
| "num_input_tokens_seen": 1600768, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.7489239598278337, | |
| "grad_norm": 2.8678972721099854, | |
| "learning_rate": 1.107413217533863e-05, | |
| "loss": 0.0714, | |
| "num_input_tokens_seen": 1627744, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.7776183644189383, | |
| "grad_norm": 3.1199018955230713, | |
| "learning_rate": 1.0640330873614336e-05, | |
| "loss": 0.0559, | |
| "num_input_tokens_seen": 1654784, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.806312769010043, | |
| "grad_norm": 4.480892181396484, | |
| "learning_rate": 1.0210477047960303e-05, | |
| "loss": 0.1277, | |
| "num_input_tokens_seen": 1680928, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.8350071736011477, | |
| "grad_norm": 3.975166082382202, | |
| "learning_rate": 9.78495991086979e-06, | |
| "loss": 0.0817, | |
| "num_input_tokens_seen": 1707232, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.8637015781922526, | |
| "grad_norm": 1.2731448411941528, | |
| "learning_rate": 9.364164748167806e-06, | |
| "loss": 0.073, | |
| "num_input_tokens_seen": 1733568, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.8923959827833574, | |
| "grad_norm": 2.2199535369873047, | |
| "learning_rate": 8.948472570152874e-06, | |
| "loss": 0.0637, | |
| "num_input_tokens_seen": 1760608, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.921090387374462, | |
| "grad_norm": 2.6249306201934814, | |
| "learning_rate": 8.538259766610019e-06, | |
| "loss": 0.0512, | |
| "num_input_tokens_seen": 1788064, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.9497847919655666, | |
| "grad_norm": 4.65223503112793, | |
| "learning_rate": 8.133897766007499e-06, | |
| "loss": 0.0608, | |
| "num_input_tokens_seen": 1814688, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.9784791965566715, | |
| "grad_norm": 6.477709770202637, | |
| "learning_rate": 7.735752699185711e-06, | |
| "loss": 0.0596, | |
| "num_input_tokens_seen": 1841056, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.005738880918221, | |
| "grad_norm": 1.0149333477020264, | |
| "learning_rate": 7.344185067842878e-06, | |
| "loss": 0.059, | |
| "num_input_tokens_seen": 1865472, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.034433285509326, | |
| "grad_norm": 1.824137806892395, | |
| "learning_rate": 6.959549418117669e-06, | |
| "loss": 0.0411, | |
| "num_input_tokens_seen": 1891968, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.0631276901004303, | |
| "grad_norm": 1.539084553718567, | |
| "learning_rate": 6.582194019564266e-06, | |
| "loss": 0.0296, | |
| "num_input_tokens_seen": 1918912, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.091822094691535, | |
| "grad_norm": 3.0269973278045654, | |
| "learning_rate": 6.2124605498106336e-06, | |
| "loss": 0.0433, | |
| "num_input_tokens_seen": 1945312, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.12051649928264, | |
| "grad_norm": 1.6702455282211304, | |
| "learning_rate": 5.850683785185409e-06, | |
| "loss": 0.0283, | |
| "num_input_tokens_seen": 1971520, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.149210903873745, | |
| "grad_norm": 2.5078916549682617, | |
| "learning_rate": 5.497191297593647e-06, | |
| "loss": 0.0419, | |
| "num_input_tokens_seen": 1998528, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.177905308464849, | |
| "grad_norm": 0.1790330410003662, | |
| "learning_rate": 5.1523031579157994e-06, | |
| "loss": 0.0601, | |
| "num_input_tokens_seen": 2025088, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.206599713055954, | |
| "grad_norm": 2.8430678844451904, | |
| "learning_rate": 4.816331646198556e-06, | |
| "loss": 0.0368, | |
| "num_input_tokens_seen": 2052032, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.235294117647059, | |
| "grad_norm": 2.523798704147339, | |
| "learning_rate": 4.4895809688998655e-06, | |
| "loss": 0.0474, | |
| "num_input_tokens_seen": 2079104, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.2639885222381637, | |
| "grad_norm": 4.507711410522461, | |
| "learning_rate": 4.172346983444269e-06, | |
| "loss": 0.0498, | |
| "num_input_tokens_seen": 2106496, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.292682926829268, | |
| "grad_norm": 5.649550914764404, | |
| "learning_rate": 3.864916930337852e-06, | |
| "loss": 0.0389, | |
| "num_input_tokens_seen": 2133248, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.321377331420373, | |
| "grad_norm": 2.364619255065918, | |
| "learning_rate": 3.567569173085455e-06, | |
| "loss": 0.0275, | |
| "num_input_tokens_seen": 2159840, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.350071736011478, | |
| "grad_norm": 2.0380613803863525, | |
| "learning_rate": 3.2805729461455307e-06, | |
| "loss": 0.0269, | |
| "num_input_tokens_seen": 2186528, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.3787661406025826, | |
| "grad_norm": 3.2729814052581787, | |
| "learning_rate": 3.0041881111509783e-06, | |
| "loss": 0.0586, | |
| "num_input_tokens_seen": 2213344, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.407460545193687, | |
| "grad_norm": 4.258326053619385, | |
| "learning_rate": 2.7386649216166233e-06, | |
| "loss": 0.046, | |
| "num_input_tokens_seen": 2239648, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.436154949784792, | |
| "grad_norm": 3.044335126876831, | |
| "learning_rate": 2.484243796346367e-06, | |
| "loss": 0.0414, | |
| "num_input_tokens_seen": 2267008, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.4648493543758967, | |
| "grad_norm": 1.9635220766067505, | |
| "learning_rate": 2.241155101745242e-06, | |
| "loss": 0.0375, | |
| "num_input_tokens_seen": 2293664, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.4935437589670015, | |
| "grad_norm": 0.2657749652862549, | |
| "learning_rate": 2.0096189432334194e-06, | |
| "loss": 0.0541, | |
| "num_input_tokens_seen": 2320032, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.5222381635581064, | |
| "grad_norm": 2.3013086318969727, | |
| "learning_rate": 1.7898449659510841e-06, | |
| "loss": 0.0324, | |
| "num_input_tokens_seen": 2346848, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.5509325681492108, | |
| "grad_norm": 3.4938905239105225, | |
| "learning_rate": 1.5820321649345582e-06, | |
| "loss": 0.0704, | |
| "num_input_tokens_seen": 2373408, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.5796269727403156, | |
| "grad_norm": 1.1432446241378784, | |
| "learning_rate": 1.3863687049356465e-06, | |
| "loss": 0.0376, | |
| "num_input_tokens_seen": 2400192, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.6083213773314204, | |
| "grad_norm": 3.4845967292785645, | |
| "learning_rate": 1.2030317500472572e-06, | |
| "loss": 0.0377, | |
| "num_input_tokens_seen": 2426720, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.637015781922525, | |
| "grad_norm": 0.9145282506942749, | |
| "learning_rate": 1.0321873032896328e-06, | |
| "loss": 0.0459, | |
| "num_input_tokens_seen": 2453504, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.6657101865136297, | |
| "grad_norm": 6.571470260620117, | |
| "learning_rate": 8.7399005630238e-07, | |
| "loss": 0.0352, | |
| "num_input_tokens_seen": 2480064, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.6944045911047345, | |
| "grad_norm": 1.0509833097457886, | |
| "learning_rate": 7.285832492784456e-07, | |
| "loss": 0.0308, | |
| "num_input_tokens_seen": 2507904, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.7230989956958394, | |
| "grad_norm": 1.271236777305603, | |
| "learning_rate": 5.960985412668457e-07, | |
| "loss": 0.0359, | |
| "num_input_tokens_seen": 2534624, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.751793400286944, | |
| "grad_norm": 4.02854061126709, | |
| "learning_rate": 4.766558909615504e-07, | |
| "loss": 0.0251, | |
| "num_input_tokens_seen": 2560384, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.7804878048780486, | |
| "grad_norm": 2.239231824874878, | |
| "learning_rate": 3.703634480845175e-07, | |
| "loss": 0.0278, | |
| "num_input_tokens_seen": 2586976, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.8091822094691534, | |
| "grad_norm": 1.5159451961517334, | |
| "learning_rate": 2.7731745546118295e-07, | |
| "loss": 0.0337, | |
| "num_input_tokens_seen": 2613568, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.8378766140602583, | |
| "grad_norm": 2.9302163124084473, | |
| "learning_rate": 1.9760216187710788e-07, | |
| "loss": 0.0418, | |
| "num_input_tokens_seen": 2640128, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.866571018651363, | |
| "grad_norm": 3.014836311340332, | |
| "learning_rate": 1.3128974579462771e-07, | |
| "loss": 0.042, | |
| "num_input_tokens_seen": 2666880, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.895265423242468, | |
| "grad_norm": 4.583406448364258, | |
| "learning_rate": 7.844024999865806e-08, | |
| "loss": 0.043, | |
| "num_input_tokens_seen": 2694272, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.9239598278335723, | |
| "grad_norm": 0.8867899179458618, | |
| "learning_rate": 3.910152723075322e-08, | |
| "loss": 0.0282, | |
| "num_input_tokens_seen": 2721024, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.952654232424677, | |
| "grad_norm": 2.2085297107696533, | |
| "learning_rate": 1.3309196860701867e-08, | |
| "loss": 0.049, | |
| "num_input_tokens_seen": 2746976, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.981348637015782, | |
| "grad_norm": 1.866648554801941, | |
| "learning_rate": 1.0866126348507699e-09, | |
| "loss": 0.0376, | |
| "num_input_tokens_seen": 2773152, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.9928263988522237, | |
| "num_input_tokens_seen": 2783872, | |
| "step": 1044, | |
| "total_flos": 1.191738412451758e+17, | |
| "train_loss": 0.08308002222828938, | |
| "train_runtime": 963.6682, | |
| "train_samples_per_second": 8.673, | |
| "train_steps_per_second": 1.083 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1044, | |
| "num_input_tokens_seen": 2783872, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.191738412451758e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |