| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 457, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01095290251916758, | |
| "grad_norm": 1.3321008682250977, | |
| "learning_rate": 1.0434782608695653e-06, | |
| "loss": 1.3073, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02190580503833516, | |
| "grad_norm": 1.1412581205368042, | |
| "learning_rate": 2.347826086956522e-06, | |
| "loss": 1.3358, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03285870755750274, | |
| "grad_norm": 0.9217167496681213, | |
| "learning_rate": 3.6521739130434787e-06, | |
| "loss": 1.2979, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04381161007667032, | |
| "grad_norm": 0.6123777627944946, | |
| "learning_rate": 4.956521739130435e-06, | |
| "loss": 1.2163, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.054764512595837894, | |
| "grad_norm": 0.9250121116638184, | |
| "learning_rate": 6.260869565217391e-06, | |
| "loss": 1.264, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.06571741511500548, | |
| "grad_norm": 0.5497902035713196, | |
| "learning_rate": 7.5652173913043475e-06, | |
| "loss": 1.225, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07667031763417305, | |
| "grad_norm": 0.602593183517456, | |
| "learning_rate": 8.869565217391306e-06, | |
| "loss": 1.1959, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.08762322015334063, | |
| "grad_norm": 0.5248430371284485, | |
| "learning_rate": 1.017391304347826e-05, | |
| "loss": 1.2186, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09857612267250822, | |
| "grad_norm": 0.5515073537826538, | |
| "learning_rate": 1.1478260869565218e-05, | |
| "loss": 1.1472, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.10952902519167579, | |
| "grad_norm": 1.2510318756103516, | |
| "learning_rate": 1.2782608695652173e-05, | |
| "loss": 1.1666, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.12048192771084337, | |
| "grad_norm": 0.49298274517059326, | |
| "learning_rate": 1.4086956521739131e-05, | |
| "loss": 1.2019, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.13143483023001096, | |
| "grad_norm": 0.4494762122631073, | |
| "learning_rate": 1.5391304347826088e-05, | |
| "loss": 1.173, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14238773274917854, | |
| "grad_norm": 0.6092376112937927, | |
| "learning_rate": 1.6695652173913044e-05, | |
| "loss": 1.1323, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.1533406352683461, | |
| "grad_norm": 0.4575347602367401, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.1164, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.16429353778751368, | |
| "grad_norm": 1.1347095966339111, | |
| "learning_rate": 1.9304347826086957e-05, | |
| "loss": 1.193, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.17524644030668127, | |
| "grad_norm": 0.45952585339546204, | |
| "learning_rate": 2.0608695652173913e-05, | |
| "loss": 1.1403, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.18619934282584885, | |
| "grad_norm": 0.49114152789115906, | |
| "learning_rate": 2.191304347826087e-05, | |
| "loss": 1.1535, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.19715224534501644, | |
| "grad_norm": 0.5684127807617188, | |
| "learning_rate": 2.3217391304347826e-05, | |
| "loss": 1.1398, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.20810514786418402, | |
| "grad_norm": 0.5454962849617004, | |
| "learning_rate": 2.4521739130434786e-05, | |
| "loss": 1.0928, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.21905805038335158, | |
| "grad_norm": 0.491813600063324, | |
| "learning_rate": 2.582608695652174e-05, | |
| "loss": 1.0858, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.23001095290251916, | |
| "grad_norm": 0.5732094645500183, | |
| "learning_rate": 2.7130434782608695e-05, | |
| "loss": 1.0765, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.24096385542168675, | |
| "grad_norm": 0.5950366258621216, | |
| "learning_rate": 2.8434782608695652e-05, | |
| "loss": 1.149, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.25191675794085433, | |
| "grad_norm": 0.536044180393219, | |
| "learning_rate": 2.973913043478261e-05, | |
| "loss": 1.0859, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2628696604600219, | |
| "grad_norm": 0.5769652724266052, | |
| "learning_rate": 2.999974848707377e-05, | |
| "loss": 1.0433, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2738225629791895, | |
| "grad_norm": 0.5500301718711853, | |
| "learning_rate": 2.9998726730266534e-05, | |
| "loss": 1.051, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2847754654983571, | |
| "grad_norm": 0.5901659727096558, | |
| "learning_rate": 2.9996919063517692e-05, | |
| "loss": 1.0166, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.29572836801752467, | |
| "grad_norm": 0.6402804851531982, | |
| "learning_rate": 2.999432558154617e-05, | |
| "loss": 1.078, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.3066812705366922, | |
| "grad_norm": 0.5208508372306824, | |
| "learning_rate": 2.9990946420246395e-05, | |
| "loss": 1.0167, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3176341730558598, | |
| "grad_norm": 0.6010607481002808, | |
| "learning_rate": 2.9986781756681146e-05, | |
| "loss": 1.0342, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.32858707557502737, | |
| "grad_norm": 0.5925593376159668, | |
| "learning_rate": 2.9981831809072318e-05, | |
| "loss": 0.9813, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.33953997809419495, | |
| "grad_norm": 0.7121065855026245, | |
| "learning_rate": 2.997609683678947e-05, | |
| "loss": 0.9768, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.35049288061336253, | |
| "grad_norm": 0.5926089882850647, | |
| "learning_rate": 2.996957714033622e-05, | |
| "loss": 1.0256, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3614457831325301, | |
| "grad_norm": 0.6746991276741028, | |
| "learning_rate": 2.9962273061334515e-05, | |
| "loss": 0.9901, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.3723986856516977, | |
| "grad_norm": 0.6542229056358337, | |
| "learning_rate": 2.9954184982506734e-05, | |
| "loss": 0.9707, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3833515881708653, | |
| "grad_norm": 0.7646006345748901, | |
| "learning_rate": 2.9945313327655622e-05, | |
| "loss": 0.9826, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.39430449069003287, | |
| "grad_norm": 0.6529242396354675, | |
| "learning_rate": 2.9935658561642083e-05, | |
| "loss": 0.9659, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.40525739320920046, | |
| "grad_norm": 0.6744837760925293, | |
| "learning_rate": 2.9925221190360836e-05, | |
| "loss": 0.9517, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.41621029572836804, | |
| "grad_norm": 0.6999198198318481, | |
| "learning_rate": 2.9914001760713898e-05, | |
| "loss": 1.0202, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.42716319824753557, | |
| "grad_norm": 0.7155759334564209, | |
| "learning_rate": 2.9902000860581916e-05, | |
| "loss": 0.9947, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.43811610076670315, | |
| "grad_norm": 0.6805759072303772, | |
| "learning_rate": 2.9889219118793393e-05, | |
| "loss": 0.9751, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.44906900328587074, | |
| "grad_norm": 0.7391874194145203, | |
| "learning_rate": 2.9875657205091703e-05, | |
| "loss": 1.0113, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.4600219058050383, | |
| "grad_norm": 0.7946105003356934, | |
| "learning_rate": 2.9861315830100023e-05, | |
| "loss": 0.952, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4709748083242059, | |
| "grad_norm": 0.6928731203079224, | |
| "learning_rate": 2.984619574528408e-05, | |
| "loss": 0.9207, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.4819277108433735, | |
| "grad_norm": 0.7348776459693909, | |
| "learning_rate": 2.9830297742912798e-05, | |
| "loss": 0.933, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4928806133625411, | |
| "grad_norm": 0.7400885820388794, | |
| "learning_rate": 2.981362265601676e-05, | |
| "loss": 0.8837, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.5038335158817087, | |
| "grad_norm": 0.79625004529953, | |
| "learning_rate": 2.9796171358344578e-05, | |
| "loss": 0.9132, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5147864184008762, | |
| "grad_norm": 0.8105800747871399, | |
| "learning_rate": 2.9777944764317082e-05, | |
| "loss": 0.9753, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5257393209200438, | |
| "grad_norm": 0.8556815981864929, | |
| "learning_rate": 2.9758943828979444e-05, | |
| "loss": 0.9439, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5366922234392114, | |
| "grad_norm": 0.8349775671958923, | |
| "learning_rate": 2.9739169547951116e-05, | |
| "loss": 0.897, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.547645125958379, | |
| "grad_norm": 0.6672224402427673, | |
| "learning_rate": 2.971862295737364e-05, | |
| "loss": 0.8832, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5585980284775466, | |
| "grad_norm": 0.7600496411323547, | |
| "learning_rate": 2.9697305133856394e-05, | |
| "loss": 0.8834, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.5695509309967142, | |
| "grad_norm": 0.8782379031181335, | |
| "learning_rate": 2.9675217194420174e-05, | |
| "loss": 0.906, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5805038335158818, | |
| "grad_norm": 0.7369331121444702, | |
| "learning_rate": 2.9652360296438612e-05, | |
| "loss": 0.8909, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.5914567360350493, | |
| "grad_norm": 0.7942291498184204, | |
| "learning_rate": 2.9628735637577612e-05, | |
| "loss": 0.8707, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6024096385542169, | |
| "grad_norm": 0.782035231590271, | |
| "learning_rate": 2.960434445573253e-05, | |
| "loss": 0.8819, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.6133625410733844, | |
| "grad_norm": 0.9603850841522217, | |
| "learning_rate": 2.957918802896334e-05, | |
| "loss": 0.8689, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.624315443592552, | |
| "grad_norm": 0.7741917967796326, | |
| "learning_rate": 2.9553267675427667e-05, | |
| "loss": 0.8543, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.6352683461117196, | |
| "grad_norm": 0.7536712884902954, | |
| "learning_rate": 2.952658475331169e-05, | |
| "loss": 0.8515, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6462212486308871, | |
| "grad_norm": 0.8260005712509155, | |
| "learning_rate": 2.9499140660759018e-05, | |
| "loss": 0.8181, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.6571741511500547, | |
| "grad_norm": 0.8287461400032043, | |
| "learning_rate": 2.9470936835797392e-05, | |
| "loss": 0.8527, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6681270536692223, | |
| "grad_norm": 0.8062161207199097, | |
| "learning_rate": 2.944197475626335e-05, | |
| "loss": 0.8389, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6790799561883899, | |
| "grad_norm": 0.8538839817047119, | |
| "learning_rate": 2.94122559397248e-05, | |
| "loss": 0.8177, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6900328587075575, | |
| "grad_norm": 0.7861185073852539, | |
| "learning_rate": 2.9381781943401477e-05, | |
| "loss": 0.839, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.7009857612267251, | |
| "grad_norm": 0.9167773723602295, | |
| "learning_rate": 2.935055436408337e-05, | |
| "loss": 0.7825, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7119386637458927, | |
| "grad_norm": 0.807101309299469, | |
| "learning_rate": 2.9318574838047045e-05, | |
| "loss": 0.7862, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.7228915662650602, | |
| "grad_norm": 0.8477482199668884, | |
| "learning_rate": 2.9285845040969906e-05, | |
| "loss": 0.8001, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7338444687842278, | |
| "grad_norm": 1.5396567583084106, | |
| "learning_rate": 2.9252366687842383e-05, | |
| "loss": 0.7666, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.7447973713033954, | |
| "grad_norm": 0.8536133170127869, | |
| "learning_rate": 2.921814153287809e-05, | |
| "loss": 0.767, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.755750273822563, | |
| "grad_norm": 0.8579494953155518, | |
| "learning_rate": 2.918317136942189e-05, | |
| "loss": 0.8055, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.7667031763417306, | |
| "grad_norm": 0.8801035284996033, | |
| "learning_rate": 2.9147458029855933e-05, | |
| "loss": 0.7404, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7776560788608982, | |
| "grad_norm": 0.8523620963096619, | |
| "learning_rate": 2.911100338550364e-05, | |
| "loss": 0.7868, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7886089813800657, | |
| "grad_norm": 0.8762844800949097, | |
| "learning_rate": 2.9073809346531645e-05, | |
| "loss": 0.786, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7995618838992333, | |
| "grad_norm": 0.8992785215377808, | |
| "learning_rate": 2.903587786184972e-05, | |
| "loss": 0.7726, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.8105147864184009, | |
| "grad_norm": 0.8587476015090942, | |
| "learning_rate": 2.899721091900863e-05, | |
| "loss": 0.7611, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8214676889375685, | |
| "grad_norm": 0.9412994980812073, | |
| "learning_rate": 2.895781054409602e-05, | |
| "loss": 0.7366, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8324205914567361, | |
| "grad_norm": 0.8805333375930786, | |
| "learning_rate": 2.891767880163022e-05, | |
| "loss": 0.7364, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8433734939759037, | |
| "grad_norm": 0.944273054599762, | |
| "learning_rate": 2.8876817794452087e-05, | |
| "loss": 0.7449, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.8543263964950711, | |
| "grad_norm": 0.8578236103057861, | |
| "learning_rate": 2.883522966361482e-05, | |
| "loss": 0.7288, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8652792990142387, | |
| "grad_norm": 0.8459666967391968, | |
| "learning_rate": 2.8792916588271762e-05, | |
| "loss": 0.7062, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.8762322015334063, | |
| "grad_norm": 0.9263506531715393, | |
| "learning_rate": 2.874988078556221e-05, | |
| "loss": 0.7203, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8871851040525739, | |
| "grad_norm": 0.9023187756538391, | |
| "learning_rate": 2.8706124510495263e-05, | |
| "loss": 0.6927, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.8981380065717415, | |
| "grad_norm": 0.9095348119735718, | |
| "learning_rate": 2.8661650055831645e-05, | |
| "loss": 0.7178, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 0.9606519937515259, | |
| "learning_rate": 2.8616459751963583e-05, | |
| "loss": 0.7349, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.9200438116100766, | |
| "grad_norm": 0.9363222122192383, | |
| "learning_rate": 2.8570555966792677e-05, | |
| "loss": 0.6886, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9309967141292442, | |
| "grad_norm": 0.8668004870414734, | |
| "learning_rate": 2.852394110560583e-05, | |
| "loss": 0.689, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.9419496166484118, | |
| "grad_norm": 0.8470218181610107, | |
| "learning_rate": 2.8476617610949246e-05, | |
| "loss": 0.7109, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9529025191675794, | |
| "grad_norm": 1.0149718523025513, | |
| "learning_rate": 2.84285879625004e-05, | |
| "loss": 0.6739, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.963855421686747, | |
| "grad_norm": 0.8750612139701843, | |
| "learning_rate": 2.8379854676938137e-05, | |
| "loss": 0.6981, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9748083242059146, | |
| "grad_norm": 0.9368709325790405, | |
| "learning_rate": 2.8330420307810784e-05, | |
| "loss": 0.6762, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.9857612267250822, | |
| "grad_norm": 1.0392800569534302, | |
| "learning_rate": 2.828028744540236e-05, | |
| "loss": 0.6816, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9967141292442497, | |
| "grad_norm": 1.062735915184021, | |
| "learning_rate": 2.822945871659685e-05, | |
| "loss": 0.6951, | |
| "step": 455 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2285, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.804863634510971e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |