{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9928263988522237, "eval_steps": 500, "global_step": 1044, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028694404591104734, "grad_norm": 6.9535064697265625, "learning_rate": 2.9993209101500904e-05, "loss": 0.8011, "num_input_tokens_seen": 26464, "step": 10 }, { "epoch": 0.05738880918220947, "grad_norm": 3.7668867111206055, "learning_rate": 2.997284255484393e-05, "loss": 0.1799, "num_input_tokens_seen": 53568, "step": 20 }, { "epoch": 0.08608321377331421, "grad_norm": 5.130955219268799, "learning_rate": 2.9938918800982563e-05, "loss": 0.1418, "num_input_tokens_seen": 79840, "step": 30 }, { "epoch": 0.11477761836441894, "grad_norm": 5.7834391593933105, "learning_rate": 2.9891468556286034e-05, "loss": 0.1428, "num_input_tokens_seen": 106848, "step": 40 }, { "epoch": 0.14347202295552366, "grad_norm": 3.7895705699920654, "learning_rate": 2.983053478472707e-05, "loss": 0.1297, "num_input_tokens_seen": 133600, "step": 50 }, { "epoch": 0.17216642754662842, "grad_norm": 2.2129366397857666, "learning_rate": 2.975617265898004e-05, "loss": 0.1392, "num_input_tokens_seen": 159872, "step": 60 }, { "epoch": 0.20086083213773315, "grad_norm": 5.727760314941406, "learning_rate": 2.9668449510464707e-05, "loss": 0.1418, "num_input_tokens_seen": 186912, "step": 70 }, { "epoch": 0.22955523672883787, "grad_norm": 7.682604789733887, "learning_rate": 2.9567444768380745e-05, "loss": 0.1336, "num_input_tokens_seen": 213696, "step": 80 }, { "epoch": 0.2582496413199426, "grad_norm": 5.216188430786133, "learning_rate": 2.9453249887788343e-05, "loss": 0.0996, "num_input_tokens_seen": 240736, "step": 90 }, { "epoch": 0.28694404591104733, "grad_norm": 4.260756015777588, "learning_rate": 2.9325968266799934e-05, "loss": 0.1102, "num_input_tokens_seen": 266816, "step": 100 }, { "epoch": 0.31563845050215206, "grad_norm": 1.7236586809158325, "learning_rate": 2.918571515295803e-05, "loss": 0.1443, "num_input_tokens_seen": 293504, "step": 110 }, { "epoch": 0.34433285509325684, "grad_norm": 3.35003399848938, "learning_rate": 2.9032617538884018e-05, "loss": 0.1224, "num_input_tokens_seen": 319872, "step": 120 }, { "epoch": 0.37302725968436157, "grad_norm": 3.347268581390381, "learning_rate": 2.8866814047292232e-05, "loss": 0.1678, "num_input_tokens_seen": 346816, "step": 130 }, { "epoch": 0.4017216642754663, "grad_norm": 3.474947929382324, "learning_rate": 2.8688454805473647e-05, "loss": 0.1154, "num_input_tokens_seen": 374304, "step": 140 }, { "epoch": 0.430416068866571, "grad_norm": 6.668752670288086, "learning_rate": 2.84977013093626e-05, "loss": 0.1285, "num_input_tokens_seen": 401568, "step": 150 }, { "epoch": 0.45911047345767575, "grad_norm": 4.752432346343994, "learning_rate": 2.8294726277309815e-05, "loss": 0.1144, "num_input_tokens_seen": 428384, "step": 160 }, { "epoch": 0.4878048780487805, "grad_norm": 3.637117624282837, "learning_rate": 2.8079713493694024e-05, "loss": 0.0875, "num_input_tokens_seen": 454880, "step": 170 }, { "epoch": 0.5164992826398852, "grad_norm": 3.157396078109741, "learning_rate": 2.7852857642513838e-05, "loss": 0.151, "num_input_tokens_seen": 481856, "step": 180 }, { "epoch": 0.5451936872309899, "grad_norm": 2.444923162460327, "learning_rate": 2.7614364131110498e-05, "loss": 0.1153, "num_input_tokens_seen": 508192, "step": 190 }, { "epoch": 0.5738880918220947, "grad_norm": 3.9938931465148926, "learning_rate": 2.7364448904181152e-05, "loss": 0.0971, "num_input_tokens_seen": 534240, "step": 200 }, { "epoch": 0.6025824964131994, "grad_norm": 2.390864133834839, "learning_rate": 2.7103338248251055e-05, "loss": 0.1035, "num_input_tokens_seen": 561664, "step": 210 }, { "epoch": 0.6312769010043041, "grad_norm": 4.270871639251709, "learning_rate": 2.6831268586781746e-05, "loss": 0.0908, "num_input_tokens_seen": 588128, "step": 220 }, { "epoch": 0.6599713055954088, "grad_norm": 2.367370367050171, "learning_rate": 2.6548486266100645e-05, "loss": 0.1216, "num_input_tokens_seen": 614432, "step": 230 }, { "epoch": 0.6886657101865137, "grad_norm": 2.0463204383850098, "learning_rate": 2.6255247332346036e-05, "loss": 0.0952, "num_input_tokens_seen": 640832, "step": 240 }, { "epoch": 0.7173601147776184, "grad_norm": 3.1408205032348633, "learning_rate": 2.5951817299629266e-05, "loss": 0.1073, "num_input_tokens_seen": 668448, "step": 250 }, { "epoch": 0.7460545193687231, "grad_norm": 3.5036535263061523, "learning_rate": 2.5638470909624166e-05, "loss": 0.1048, "num_input_tokens_seen": 695008, "step": 260 }, { "epoch": 0.7747489239598279, "grad_norm": 2.891324996948242, "learning_rate": 2.531549188280135e-05, "loss": 0.1015, "num_input_tokens_seen": 722208, "step": 270 }, { "epoch": 0.8034433285509326, "grad_norm": 1.6332054138183594, "learning_rate": 2.498317266153262e-05, "loss": 0.0815, "num_input_tokens_seen": 749216, "step": 280 }, { "epoch": 0.8321377331420373, "grad_norm": 4.443371295928955, "learning_rate": 2.464181414529809e-05, "loss": 0.0843, "num_input_tokens_seen": 776000, "step": 290 }, { "epoch": 0.860832137733142, "grad_norm": 3.0447335243225098, "learning_rate": 2.4291725418235848e-05, "loss": 0.1321, "num_input_tokens_seen": 802560, "step": 300 }, { "epoch": 0.8895265423242468, "grad_norm": 3.728531837463379, "learning_rate": 2.3933223469280704e-05, "loss": 0.0868, "num_input_tokens_seen": 828704, "step": 310 }, { "epoch": 0.9182209469153515, "grad_norm": 3.703484296798706, "learning_rate": 2.3566632905145604e-05, "loss": 0.1012, "num_input_tokens_seen": 855616, "step": 320 }, { "epoch": 0.9469153515064562, "grad_norm": 3.728098154067993, "learning_rate": 2.3192285656405456e-05, "loss": 0.1139, "num_input_tokens_seen": 882400, "step": 330 }, { "epoch": 0.975609756097561, "grad_norm": 1.163082480430603, "learning_rate": 2.2810520676949537e-05, "loss": 0.1004, "num_input_tokens_seen": 909760, "step": 340 }, { "epoch": 1.0028694404591105, "grad_norm": 3.1779849529266357, "learning_rate": 2.2421683637074648e-05, "loss": 0.1131, "num_input_tokens_seen": 934112, "step": 350 }, { "epoch": 1.0315638450502151, "grad_norm": 1.9103726148605347, "learning_rate": 2.2026126610496852e-05, "loss": 0.0896, "num_input_tokens_seen": 960320, "step": 360 }, { "epoch": 1.06025824964132, "grad_norm": 2.4402358531951904, "learning_rate": 2.1624207755565232e-05, "loss": 0.0629, "num_input_tokens_seen": 986688, "step": 370 }, { "epoch": 1.0889526542324246, "grad_norm": 3.0046820640563965, "learning_rate": 2.121629099096628e-05, "loss": 0.0658, "num_input_tokens_seen": 1013088, "step": 380 }, { "epoch": 1.1176470588235294, "grad_norm": 2.693220376968384, "learning_rate": 2.0802745666212592e-05, "loss": 0.0777, "num_input_tokens_seen": 1039808, "step": 390 }, { "epoch": 1.146341463414634, "grad_norm": 2.451016664505005, "learning_rate": 2.0383946227214188e-05, "loss": 0.0908, "num_input_tokens_seen": 1066144, "step": 400 }, { "epoch": 1.175035868005739, "grad_norm": 0.8696689605712891, "learning_rate": 1.9960271877235306e-05, "loss": 0.0731, "num_input_tokens_seen": 1092576, "step": 410 }, { "epoch": 1.2037302725968435, "grad_norm": 2.7256174087524414, "learning_rate": 1.953210623354359e-05, "loss": 0.0661, "num_input_tokens_seen": 1119104, "step": 420 }, { "epoch": 1.2324246771879483, "grad_norm": 2.751459836959839, "learning_rate": 1.909983698006266e-05, "loss": 0.0391, "num_input_tokens_seen": 1145376, "step": 430 }, { "epoch": 1.2611190817790532, "grad_norm": 3.420569658279419, "learning_rate": 1.8663855516342468e-05, "loss": 0.0594, "num_input_tokens_seen": 1172736, "step": 440 }, { "epoch": 1.2898134863701578, "grad_norm": 3.2319304943084717, "learning_rate": 1.8224556603165363e-05, "loss": 0.0492, "num_input_tokens_seen": 1199296, "step": 450 }, { "epoch": 1.3185078909612624, "grad_norm": 6.221270561218262, "learning_rate": 1.7782338005108694e-05, "loss": 0.0465, "num_input_tokens_seen": 1226368, "step": 460 }, { "epoch": 1.3472022955523673, "grad_norm": 1.9661167860031128, "learning_rate": 1.733760013038765e-05, "loss": 0.1173, "num_input_tokens_seen": 1252352, "step": 470 }, { "epoch": 1.375896700143472, "grad_norm": 2.8439626693725586, "learning_rate": 1.689074566830434e-05, "loss": 0.0721, "num_input_tokens_seen": 1278912, "step": 480 }, { "epoch": 1.4045911047345767, "grad_norm": 2.0552496910095215, "learning_rate": 1.6442179224631558e-05, "loss": 0.061, "num_input_tokens_seen": 1306176, "step": 490 }, { "epoch": 1.4332855093256816, "grad_norm": 2.505808115005493, "learning_rate": 1.5992306955261175e-05, "loss": 0.0599, "num_input_tokens_seen": 1332608, "step": 500 }, { "epoch": 1.4619799139167862, "grad_norm": 1.2224159240722656, "learning_rate": 1.5541536198449044e-05, "loss": 0.0721, "num_input_tokens_seen": 1359520, "step": 510 }, { "epoch": 1.490674318507891, "grad_norm": 1.0898300409317017, "learning_rate": 1.5090275105989284e-05, "loss": 0.0595, "num_input_tokens_seen": 1386368, "step": 520 }, { "epoch": 1.5193687230989958, "grad_norm": 1.6561291217803955, "learning_rate": 1.463893227365195e-05, "loss": 0.0519, "num_input_tokens_seen": 1412992, "step": 530 }, { "epoch": 1.5480631276901005, "grad_norm": 3.09395694732666, "learning_rate": 1.4187916371218739e-05, "loss": 0.0752, "num_input_tokens_seen": 1439616, "step": 540 }, { "epoch": 1.576757532281205, "grad_norm": 3.0223190784454346, "learning_rate": 1.3737635772451642e-05, "loss": 0.0895, "num_input_tokens_seen": 1466464, "step": 550 }, { "epoch": 1.60545193687231, "grad_norm": 2.7507283687591553, "learning_rate": 1.328849818532963e-05, "loss": 0.1118, "num_input_tokens_seen": 1493440, "step": 560 }, { "epoch": 1.6341463414634148, "grad_norm": 1.4776769876480103, "learning_rate": 1.2840910282888211e-05, "loss": 0.0917, "num_input_tokens_seen": 1520256, "step": 570 }, { "epoch": 1.6628407460545194, "grad_norm": 0.9096710681915283, "learning_rate": 1.2395277334996045e-05, "loss": 0.0797, "num_input_tokens_seen": 1547712, "step": 580 }, { "epoch": 1.691535150645624, "grad_norm": 3.431025266647339, "learning_rate": 1.1952002841402057e-05, "loss": 0.0609, "num_input_tokens_seen": 1574528, "step": 590 }, { "epoch": 1.7202295552367288, "grad_norm": 5.7561726570129395, "learning_rate": 1.1511488166385349e-05, "loss": 0.0602, "num_input_tokens_seen": 1600768, "step": 600 }, { "epoch": 1.7489239598278337, "grad_norm": 2.8678972721099854, "learning_rate": 1.107413217533863e-05, "loss": 0.0714, "num_input_tokens_seen": 1627744, "step": 610 }, { "epoch": 1.7776183644189383, "grad_norm": 3.1199018955230713, "learning_rate": 1.0640330873614336e-05, "loss": 0.0559, "num_input_tokens_seen": 1654784, "step": 620 }, { "epoch": 1.806312769010043, "grad_norm": 4.480892181396484, "learning_rate": 1.0210477047960303e-05, "loss": 0.1277, "num_input_tokens_seen": 1680928, "step": 630 }, { "epoch": 1.8350071736011477, "grad_norm": 3.975166082382202, "learning_rate": 9.78495991086979e-06, "loss": 0.0817, "num_input_tokens_seen": 1707232, "step": 640 }, { "epoch": 1.8637015781922526, "grad_norm": 1.2731448411941528, "learning_rate": 9.364164748167806e-06, "loss": 0.073, "num_input_tokens_seen": 1733568, "step": 650 }, { "epoch": 1.8923959827833574, "grad_norm": 2.2199535369873047, "learning_rate": 8.948472570152874e-06, "loss": 0.0637, "num_input_tokens_seen": 1760608, "step": 660 }, { "epoch": 1.921090387374462, "grad_norm": 2.6249306201934814, "learning_rate": 8.538259766610019e-06, "loss": 0.0512, "num_input_tokens_seen": 1788064, "step": 670 }, { "epoch": 1.9497847919655666, "grad_norm": 4.65223503112793, "learning_rate": 8.133897766007499e-06, "loss": 0.0608, "num_input_tokens_seen": 1814688, "step": 680 }, { "epoch": 1.9784791965566715, "grad_norm": 6.477709770202637, "learning_rate": 7.735752699185711e-06, "loss": 0.0596, "num_input_tokens_seen": 1841056, "step": 690 }, { "epoch": 2.005738880918221, "grad_norm": 1.0149333477020264, "learning_rate": 7.344185067842878e-06, "loss": 0.059, "num_input_tokens_seen": 1865472, "step": 700 }, { "epoch": 2.034433285509326, "grad_norm": 1.824137806892395, "learning_rate": 6.959549418117669e-06, "loss": 0.0411, "num_input_tokens_seen": 1891968, "step": 710 }, { "epoch": 2.0631276901004303, "grad_norm": 1.539084553718567, "learning_rate": 6.582194019564266e-06, "loss": 0.0296, "num_input_tokens_seen": 1918912, "step": 720 }, { "epoch": 2.091822094691535, "grad_norm": 3.0269973278045654, "learning_rate": 6.2124605498106336e-06, "loss": 0.0433, "num_input_tokens_seen": 1945312, "step": 730 }, { "epoch": 2.12051649928264, "grad_norm": 1.6702455282211304, "learning_rate": 5.850683785185409e-06, "loss": 0.0283, "num_input_tokens_seen": 1971520, "step": 740 }, { "epoch": 2.149210903873745, "grad_norm": 2.5078916549682617, "learning_rate": 5.497191297593647e-06, "loss": 0.0419, "num_input_tokens_seen": 1998528, "step": 750 }, { "epoch": 2.177905308464849, "grad_norm": 0.1790330410003662, "learning_rate": 5.1523031579157994e-06, "loss": 0.0601, "num_input_tokens_seen": 2025088, "step": 760 }, { "epoch": 2.206599713055954, "grad_norm": 2.8430678844451904, "learning_rate": 4.816331646198556e-06, "loss": 0.0368, "num_input_tokens_seen": 2052032, "step": 770 }, { "epoch": 2.235294117647059, "grad_norm": 2.523798704147339, "learning_rate": 4.4895809688998655e-06, "loss": 0.0474, "num_input_tokens_seen": 2079104, "step": 780 }, { "epoch": 2.2639885222381637, "grad_norm": 4.507711410522461, "learning_rate": 4.172346983444269e-06, "loss": 0.0498, "num_input_tokens_seen": 2106496, "step": 790 }, { "epoch": 2.292682926829268, "grad_norm": 5.649550914764404, "learning_rate": 3.864916930337852e-06, "loss": 0.0389, "num_input_tokens_seen": 2133248, "step": 800 }, { "epoch": 2.321377331420373, "grad_norm": 2.364619255065918, "learning_rate": 3.567569173085455e-06, "loss": 0.0275, "num_input_tokens_seen": 2159840, "step": 810 }, { "epoch": 2.350071736011478, "grad_norm": 2.0380613803863525, "learning_rate": 3.2805729461455307e-06, "loss": 0.0269, "num_input_tokens_seen": 2186528, "step": 820 }, { "epoch": 2.3787661406025826, "grad_norm": 3.2729814052581787, "learning_rate": 3.0041881111509783e-06, "loss": 0.0586, "num_input_tokens_seen": 2213344, "step": 830 }, { "epoch": 2.407460545193687, "grad_norm": 4.258326053619385, "learning_rate": 2.7386649216166233e-06, "loss": 0.046, "num_input_tokens_seen": 2239648, "step": 840 }, { "epoch": 2.436154949784792, "grad_norm": 3.044335126876831, "learning_rate": 2.484243796346367e-06, "loss": 0.0414, "num_input_tokens_seen": 2267008, "step": 850 }, { "epoch": 2.4648493543758967, "grad_norm": 1.9635220766067505, "learning_rate": 2.241155101745242e-06, "loss": 0.0375, "num_input_tokens_seen": 2293664, "step": 860 }, { "epoch": 2.4935437589670015, "grad_norm": 0.2657749652862549, "learning_rate": 2.0096189432334194e-06, "loss": 0.0541, "num_input_tokens_seen": 2320032, "step": 870 }, { "epoch": 2.5222381635581064, "grad_norm": 2.3013086318969727, "learning_rate": 1.7898449659510841e-06, "loss": 0.0324, "num_input_tokens_seen": 2346848, "step": 880 }, { "epoch": 2.5509325681492108, "grad_norm": 3.4938905239105225, "learning_rate": 1.5820321649345582e-06, "loss": 0.0704, "num_input_tokens_seen": 2373408, "step": 890 }, { "epoch": 2.5796269727403156, "grad_norm": 1.1432446241378784, "learning_rate": 1.3863687049356465e-06, "loss": 0.0376, "num_input_tokens_seen": 2400192, "step": 900 }, { "epoch": 2.6083213773314204, "grad_norm": 3.4845967292785645, "learning_rate": 1.2030317500472572e-06, "loss": 0.0377, "num_input_tokens_seen": 2426720, "step": 910 }, { "epoch": 2.637015781922525, "grad_norm": 0.9145282506942749, "learning_rate": 1.0321873032896328e-06, "loss": 0.0459, "num_input_tokens_seen": 2453504, "step": 920 }, { "epoch": 2.6657101865136297, "grad_norm": 6.571470260620117, "learning_rate": 8.7399005630238e-07, "loss": 0.0352, "num_input_tokens_seen": 2480064, "step": 930 }, { "epoch": 2.6944045911047345, "grad_norm": 1.0509833097457886, "learning_rate": 7.285832492784456e-07, "loss": 0.0308, "num_input_tokens_seen": 2507904, "step": 940 }, { "epoch": 2.7230989956958394, "grad_norm": 1.271236777305603, "learning_rate": 5.960985412668457e-07, "loss": 0.0359, "num_input_tokens_seen": 2534624, "step": 950 }, { "epoch": 2.751793400286944, "grad_norm": 4.02854061126709, "learning_rate": 4.766558909615504e-07, "loss": 0.0251, "num_input_tokens_seen": 2560384, "step": 960 }, { "epoch": 2.7804878048780486, "grad_norm": 2.239231824874878, "learning_rate": 3.703634480845175e-07, "loss": 0.0278, "num_input_tokens_seen": 2586976, "step": 970 }, { "epoch": 2.8091822094691534, "grad_norm": 1.5159451961517334, "learning_rate": 2.7731745546118295e-07, "loss": 0.0337, "num_input_tokens_seen": 2613568, "step": 980 }, { "epoch": 2.8378766140602583, "grad_norm": 2.9302163124084473, "learning_rate": 1.9760216187710788e-07, "loss": 0.0418, "num_input_tokens_seen": 2640128, "step": 990 }, { "epoch": 2.866571018651363, "grad_norm": 3.014836311340332, "learning_rate": 1.3128974579462771e-07, "loss": 0.042, "num_input_tokens_seen": 2666880, "step": 1000 }, { "epoch": 2.895265423242468, "grad_norm": 4.583406448364258, "learning_rate": 7.844024999865806e-08, "loss": 0.043, "num_input_tokens_seen": 2694272, "step": 1010 }, { "epoch": 2.9239598278335723, "grad_norm": 0.8867899179458618, "learning_rate": 3.910152723075322e-08, "loss": 0.0282, "num_input_tokens_seen": 2721024, "step": 1020 }, { "epoch": 2.952654232424677, "grad_norm": 2.2085297107696533, "learning_rate": 1.3309196860701867e-08, "loss": 0.049, "num_input_tokens_seen": 2746976, "step": 1030 }, { "epoch": 2.981348637015782, "grad_norm": 1.866648554801941, "learning_rate": 1.0866126348507699e-09, "loss": 0.0376, "num_input_tokens_seen": 2773152, "step": 1040 }, { "epoch": 2.9928263988522237, "num_input_tokens_seen": 2783872, "step": 1044, "total_flos": 1.191738412451758e+17, "train_loss": 0.08308002222828938, "train_runtime": 963.6682, "train_samples_per_second": 8.673, "train_steps_per_second": 1.083 } ], "logging_steps": 10, "max_steps": 1044, "num_input_tokens_seen": 2783872, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.191738412451758e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }