{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 14090, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07097232079488999, "grad_norm": 4.760485649108887, "learning_rate": 1.2411284599006388e-05, "loss": 5.3493, "step": 100 }, { "epoch": 0.14194464158977999, "grad_norm": 3.876619577407837, "learning_rate": 1.2322569198012776e-05, "loss": 4.314, "step": 200 }, { "epoch": 0.21291696238466998, "grad_norm": 3.3549132347106934, "learning_rate": 1.2233853797019163e-05, "loss": 3.8419, "step": 300 }, { "epoch": 0.28388928317955997, "grad_norm": 2.582584857940674, "learning_rate": 1.214513839602555e-05, "loss": 3.6877, "step": 400 }, { "epoch": 0.35486160397445, "grad_norm": 3.2531728744506836, "learning_rate": 1.2056422995031938e-05, "loss": 3.5523, "step": 500 }, { "epoch": 0.42583392476933996, "grad_norm": 26.283876419067383, "learning_rate": 1.1967707594038326e-05, "loss": 3.5161, "step": 600 }, { "epoch": 0.49680624556423, "grad_norm": 3.0806777477264404, "learning_rate": 1.1878992193044713e-05, "loss": 3.496, "step": 700 }, { "epoch": 0.5677785663591199, "grad_norm": 2.6790292263031006, "learning_rate": 1.17902767920511e-05, "loss": 3.4429, "step": 800 }, { "epoch": 0.63875088715401, "grad_norm": 2.5105574131011963, "learning_rate": 1.1701561391057488e-05, "loss": 3.4733, "step": 900 }, { "epoch": 0.7097232079489, "grad_norm": 2.3946118354797363, "learning_rate": 1.1612845990063876e-05, "loss": 3.4635, "step": 1000 }, { "epoch": 0.7806955287437899, "grad_norm": 2.8345706462860107, "learning_rate": 1.1524130589070263e-05, "loss": 3.3713, "step": 1100 }, { "epoch": 0.8516678495386799, "grad_norm": 2.603573799133301, "learning_rate": 1.143541518807665e-05, "loss": 3.345, "step": 1200 }, { "epoch": 0.9226401703335699, "grad_norm": 2.7637786865234375, "learning_rate": 1.1346699787083038e-05, "loss": 3.4174, "step": 1300 }, { "epoch": 0.99361249112846, "grad_norm": 3.634453296661377, "learning_rate": 1.1257984386089426e-05, "loss": 3.3344, "step": 1400 }, { "epoch": 1.0645848119233499, "grad_norm": 3.2783639430999756, "learning_rate": 1.1169268985095813e-05, "loss": 3.2603, "step": 1500 }, { "epoch": 1.1355571327182399, "grad_norm": 12.153242111206055, "learning_rate": 1.1080553584102201e-05, "loss": 3.2665, "step": 1600 }, { "epoch": 1.20652945351313, "grad_norm": 3.7843754291534424, "learning_rate": 1.0991838183108588e-05, "loss": 3.2353, "step": 1700 }, { "epoch": 1.27750177430802, "grad_norm": 3.065302848815918, "learning_rate": 1.0903122782114976e-05, "loss": 3.2399, "step": 1800 }, { "epoch": 1.34847409510291, "grad_norm": 5.1582183837890625, "learning_rate": 1.0814407381121363e-05, "loss": 3.2087, "step": 1900 }, { "epoch": 1.4194464158978, "grad_norm": 3.7378036975860596, "learning_rate": 1.072569198012775e-05, "loss": 3.1919, "step": 2000 }, { "epoch": 1.49041873669269, "grad_norm": 3.0015852451324463, "learning_rate": 1.0636976579134138e-05, "loss": 3.2515, "step": 2100 }, { "epoch": 1.56139105748758, "grad_norm": 2.782742977142334, "learning_rate": 1.0548261178140526e-05, "loss": 3.182, "step": 2200 }, { "epoch": 1.6323633782824698, "grad_norm": 2.8899967670440674, "learning_rate": 1.0459545777146914e-05, "loss": 3.2035, "step": 2300 }, { "epoch": 1.7033356990773598, "grad_norm": 3.529968500137329, "learning_rate": 1.0370830376153301e-05, "loss": 3.2547, "step": 2400 }, { "epoch": 1.7743080198722498, "grad_norm": 3.1597557067871094, "learning_rate": 1.0282114975159689e-05, "loss": 3.2193, "step": 2500 }, { "epoch": 1.8452803406671399, "grad_norm": 2.7804019451141357, "learning_rate": 1.0193399574166074e-05, "loss": 3.1883, "step": 2600 }, { "epoch": 1.9162526614620297, "grad_norm": 3.4074594974517822, "learning_rate": 1.0104684173172464e-05, "loss": 3.1894, "step": 2700 }, { "epoch": 1.9872249822569197, "grad_norm": 3.2770557403564453, "learning_rate": 1.0015968772178851e-05, "loss": 3.1904, "step": 2800 }, { "epoch": 2.0581973030518097, "grad_norm": 3.2407679557800293, "learning_rate": 9.927253371185239e-06, "loss": 3.0811, "step": 2900 }, { "epoch": 2.1291696238466997, "grad_norm": 2.982088565826416, "learning_rate": 9.838537970191626e-06, "loss": 3.0856, "step": 3000 }, { "epoch": 2.2001419446415897, "grad_norm": 3.4891321659088135, "learning_rate": 9.749822569198014e-06, "loss": 3.0932, "step": 3100 }, { "epoch": 2.2711142654364798, "grad_norm": 2.988189220428467, "learning_rate": 9.6611071682044e-06, "loss": 3.0661, "step": 3200 }, { "epoch": 2.34208658623137, "grad_norm": 2.7912137508392334, "learning_rate": 9.572391767210789e-06, "loss": 3.0924, "step": 3300 }, { "epoch": 2.41305890702626, "grad_norm": 3.0504982471466064, "learning_rate": 9.483676366217176e-06, "loss": 3.0704, "step": 3400 }, { "epoch": 2.48403122782115, "grad_norm": 3.195739984512329, "learning_rate": 9.394960965223564e-06, "loss": 3.06, "step": 3500 }, { "epoch": 2.55500354861604, "grad_norm": 6.643301010131836, "learning_rate": 9.306245564229951e-06, "loss": 3.0857, "step": 3600 }, { "epoch": 2.62597586941093, "grad_norm": 2.8578591346740723, "learning_rate": 9.217530163236339e-06, "loss": 3.0512, "step": 3700 }, { "epoch": 2.69694819020582, "grad_norm": 3.1686370372772217, "learning_rate": 9.128814762242725e-06, "loss": 3.1005, "step": 3800 }, { "epoch": 2.7679205110007095, "grad_norm": 2.9187655448913574, "learning_rate": 9.040099361249114e-06, "loss": 3.0878, "step": 3900 }, { "epoch": 2.8388928317956, "grad_norm": 3.5484399795532227, "learning_rate": 8.951383960255501e-06, "loss": 3.0252, "step": 4000 }, { "epoch": 2.9098651525904895, "grad_norm": 3.374964714050293, "learning_rate": 8.862668559261889e-06, "loss": 3.0888, "step": 4100 }, { "epoch": 2.98083747338538, "grad_norm": 3.1180434226989746, "learning_rate": 8.773953158268276e-06, "loss": 3.0332, "step": 4200 }, { "epoch": 3.0518097941802695, "grad_norm": 3.4749999046325684, "learning_rate": 8.685237757274662e-06, "loss": 3.031, "step": 4300 }, { "epoch": 3.1227821149751596, "grad_norm": 3.021486282348633, "learning_rate": 8.59652235628105e-06, "loss": 3.0399, "step": 4400 }, { "epoch": 3.1937544357700496, "grad_norm": 3.392218589782715, "learning_rate": 8.507806955287439e-06, "loss": 2.9733, "step": 4500 }, { "epoch": 3.2647267565649396, "grad_norm": 2.912814140319824, "learning_rate": 8.419091554293826e-06, "loss": 2.9437, "step": 4600 }, { "epoch": 3.3356990773598296, "grad_norm": 4.143916606903076, "learning_rate": 8.330376153300214e-06, "loss": 2.9484, "step": 4700 }, { "epoch": 3.4066713981547196, "grad_norm": 12.112326622009277, "learning_rate": 8.241660752306602e-06, "loss": 2.9534, "step": 4800 }, { "epoch": 3.4776437189496097, "grad_norm": 3.6819448471069336, "learning_rate": 8.152945351312987e-06, "loss": 2.9621, "step": 4900 }, { "epoch": 3.5486160397444997, "grad_norm": 3.4722225666046143, "learning_rate": 8.064229950319375e-06, "loss": 2.9712, "step": 5000 }, { "epoch": 3.6195883605393897, "grad_norm": 3.237942934036255, "learning_rate": 7.975514549325764e-06, "loss": 3.0101, "step": 5100 }, { "epoch": 3.6905606813342797, "grad_norm": 2.8785219192504883, "learning_rate": 7.886799148332152e-06, "loss": 2.9564, "step": 5200 }, { "epoch": 3.7615330021291697, "grad_norm": 3.193901777267456, "learning_rate": 7.798083747338539e-06, "loss": 2.9435, "step": 5300 }, { "epoch": 3.8325053229240598, "grad_norm": 2.641021490097046, "learning_rate": 7.709368346344927e-06, "loss": 2.9404, "step": 5400 }, { "epoch": 3.90347764371895, "grad_norm": 2.879225492477417, "learning_rate": 7.6206529453513125e-06, "loss": 2.9642, "step": 5500 }, { "epoch": 3.9744499645138394, "grad_norm": 2.779848575592041, "learning_rate": 7.531937544357701e-06, "loss": 2.991, "step": 5600 }, { "epoch": 4.04542228530873, "grad_norm": 4.226039886474609, "learning_rate": 7.443222143364088e-06, "loss": 2.8704, "step": 5700 }, { "epoch": 4.116394606103619, "grad_norm": 3.5658841133117676, "learning_rate": 7.354506742370476e-06, "loss": 2.8641, "step": 5800 }, { "epoch": 4.18736692689851, "grad_norm": 2.6197798252105713, "learning_rate": 7.265791341376864e-06, "loss": 2.8462, "step": 5900 }, { "epoch": 4.258339247693399, "grad_norm": 2.7553303241729736, "learning_rate": 7.17707594038325e-06, "loss": 2.882, "step": 6000 }, { "epoch": 4.32931156848829, "grad_norm": 2.9696245193481445, "learning_rate": 7.0883605393896376e-06, "loss": 2.9033, "step": 6100 }, { "epoch": 4.4002838892831795, "grad_norm": 3.14622163772583, "learning_rate": 6.999645138396026e-06, "loss": 2.9003, "step": 6200 }, { "epoch": 4.47125621007807, "grad_norm": 3.744727849960327, "learning_rate": 6.9109297374024135e-06, "loss": 2.8839, "step": 6300 }, { "epoch": 4.5422285308729595, "grad_norm": 3.2394585609436035, "learning_rate": 6.822214336408801e-06, "loss": 2.9144, "step": 6400 }, { "epoch": 4.61320085166785, "grad_norm": 3.425605535507202, "learning_rate": 6.733498935415189e-06, "loss": 2.9345, "step": 6500 }, { "epoch": 4.68417317246274, "grad_norm": 2.960824489593506, "learning_rate": 6.644783534421575e-06, "loss": 2.8784, "step": 6600 }, { "epoch": 4.755145493257629, "grad_norm": 2.8103156089782715, "learning_rate": 6.556068133427963e-06, "loss": 2.9073, "step": 6700 }, { "epoch": 4.82611781405252, "grad_norm": 3.3681435585021973, "learning_rate": 6.467352732434351e-06, "loss": 2.8941, "step": 6800 }, { "epoch": 4.897090134847409, "grad_norm": 2.99110746383667, "learning_rate": 6.3786373314407386e-06, "loss": 2.8776, "step": 6900 }, { "epoch": 4.9680624556423, "grad_norm": 3.0673208236694336, "learning_rate": 6.289921930447126e-06, "loss": 2.8941, "step": 7000 }, { "epoch": 5.039034776437189, "grad_norm": 3.0405173301696777, "learning_rate": 6.201206529453514e-06, "loss": 2.8133, "step": 7100 }, { "epoch": 5.11000709723208, "grad_norm": 2.673586368560791, "learning_rate": 6.112491128459901e-06, "loss": 2.8009, "step": 7200 }, { "epoch": 5.180979418026969, "grad_norm": 2.86667799949646, "learning_rate": 6.023775727466288e-06, "loss": 2.8341, "step": 7300 }, { "epoch": 5.25195173882186, "grad_norm": 2.725980758666992, "learning_rate": 5.935060326472676e-06, "loss": 2.8206, "step": 7400 }, { "epoch": 5.322924059616749, "grad_norm": 9.25928783416748, "learning_rate": 5.846344925479064e-06, "loss": 2.8782, "step": 7500 }, { "epoch": 5.39389638041164, "grad_norm": 3.619668960571289, "learning_rate": 5.75762952448545e-06, "loss": 2.8083, "step": 7600 }, { "epoch": 5.464868701206529, "grad_norm": 4.820442199707031, "learning_rate": 5.668914123491839e-06, "loss": 2.8051, "step": 7700 }, { "epoch": 5.53584102200142, "grad_norm": 3.3483476638793945, "learning_rate": 5.580198722498226e-06, "loss": 2.7862, "step": 7800 }, { "epoch": 5.606813342796309, "grad_norm": 3.04085373878479, "learning_rate": 5.491483321504613e-06, "loss": 2.786, "step": 7900 }, { "epoch": 5.6777856635912, "grad_norm": 6.183100700378418, "learning_rate": 5.402767920511001e-06, "loss": 2.8541, "step": 8000 }, { "epoch": 5.748757984386089, "grad_norm": 3.20927357673645, "learning_rate": 5.314052519517389e-06, "loss": 2.8428, "step": 8100 }, { "epoch": 5.819730305180979, "grad_norm": 2.7963485717773438, "learning_rate": 5.2253371185237755e-06, "loss": 2.8391, "step": 8200 }, { "epoch": 5.8907026259758695, "grad_norm": 2.9603395462036133, "learning_rate": 5.136621717530164e-06, "loss": 2.8483, "step": 8300 }, { "epoch": 5.961674946770759, "grad_norm": 3.2761342525482178, "learning_rate": 5.047906316536551e-06, "loss": 2.8908, "step": 8400 }, { "epoch": 6.0326472675656495, "grad_norm": 2.543006181716919, "learning_rate": 4.959190915542938e-06, "loss": 2.7455, "step": 8500 }, { "epoch": 6.103619588360539, "grad_norm": 2.7663486003875732, "learning_rate": 4.870475514549326e-06, "loss": 2.781, "step": 8600 }, { "epoch": 6.1745919091554295, "grad_norm": 3.1819772720336914, "learning_rate": 4.781760113555713e-06, "loss": 2.8151, "step": 8700 }, { "epoch": 6.245564229950319, "grad_norm": 3.4800798892974854, "learning_rate": 4.693044712562101e-06, "loss": 2.742, "step": 8800 }, { "epoch": 6.31653655074521, "grad_norm": 2.997176170349121, "learning_rate": 4.604329311568489e-06, "loss": 2.7626, "step": 8900 }, { "epoch": 6.387508871540099, "grad_norm": 2.6546881198883057, "learning_rate": 4.515613910574876e-06, "loss": 2.7803, "step": 9000 }, { "epoch": 6.45848119233499, "grad_norm": 3.736813545227051, "learning_rate": 4.426898509581263e-06, "loss": 2.7491, "step": 9100 }, { "epoch": 6.529453513129879, "grad_norm": 2.904879331588745, "learning_rate": 4.3381831085876515e-06, "loss": 2.7707, "step": 9200 }, { "epoch": 6.60042583392477, "grad_norm": 3.1349782943725586, "learning_rate": 4.249467707594038e-06, "loss": 2.7715, "step": 9300 }, { "epoch": 6.671398154719659, "grad_norm": 3.8655664920806885, "learning_rate": 4.160752306600426e-06, "loss": 2.8137, "step": 9400 }, { "epoch": 6.74237047551455, "grad_norm": 3.0926291942596436, "learning_rate": 4.072036905606814e-06, "loss": 2.7606, "step": 9500 }, { "epoch": 6.813342796309439, "grad_norm": 2.9830901622772217, "learning_rate": 3.983321504613201e-06, "loss": 2.7696, "step": 9600 }, { "epoch": 6.884315117104329, "grad_norm": 2.9719297885894775, "learning_rate": 3.894606103619588e-06, "loss": 2.7477, "step": 9700 }, { "epoch": 6.955287437899219, "grad_norm": 4.136593818664551, "learning_rate": 3.805890702625976e-06, "loss": 2.7814, "step": 9800 }, { "epoch": 7.026259758694109, "grad_norm": 3.006192922592163, "learning_rate": 3.7171753016323633e-06, "loss": 2.7316, "step": 9900 }, { "epoch": 7.097232079488999, "grad_norm": 3.204758644104004, "learning_rate": 3.6284599006387512e-06, "loss": 2.7784, "step": 10000 }, { "epoch": 7.168204400283889, "grad_norm": 4.11678409576416, "learning_rate": 3.5397444996451388e-06, "loss": 2.7377, "step": 10100 }, { "epoch": 7.239176721078779, "grad_norm": 3.0145442485809326, "learning_rate": 3.451029098651526e-06, "loss": 2.7327, "step": 10200 }, { "epoch": 7.310149041873669, "grad_norm": 2.759552240371704, "learning_rate": 3.362313697657914e-06, "loss": 2.6663, "step": 10300 }, { "epoch": 7.3811213626685594, "grad_norm": 3.0694785118103027, "learning_rate": 3.2735982966643013e-06, "loss": 2.7523, "step": 10400 }, { "epoch": 7.452093683463449, "grad_norm": 2.6538655757904053, "learning_rate": 3.1848828956706884e-06, "loss": 2.749, "step": 10500 }, { "epoch": 7.5230660042583395, "grad_norm": 3.4055073261260986, "learning_rate": 3.0961674946770763e-06, "loss": 2.7119, "step": 10600 }, { "epoch": 7.594038325053229, "grad_norm": 2.686981439590454, "learning_rate": 3.0074520936834634e-06, "loss": 2.6696, "step": 10700 }, { "epoch": 7.6650106458481195, "grad_norm": 3.1230781078338623, "learning_rate": 2.918736692689851e-06, "loss": 2.7344, "step": 10800 }, { "epoch": 7.735982966643009, "grad_norm": 2.9182968139648438, "learning_rate": 2.830021291696239e-06, "loss": 2.7724, "step": 10900 }, { "epoch": 7.8069552874379, "grad_norm": 4.071280002593994, "learning_rate": 2.741305890702626e-06, "loss": 2.7562, "step": 11000 }, { "epoch": 7.877927608232789, "grad_norm": 2.9077835083007812, "learning_rate": 2.6525904897090135e-06, "loss": 2.7375, "step": 11100 }, { "epoch": 7.948899929027679, "grad_norm": 3.786334276199341, "learning_rate": 2.563875088715401e-06, "loss": 2.7451, "step": 11200 }, { "epoch": 8.01987224982257, "grad_norm": 3.843573808670044, "learning_rate": 2.4751596877217886e-06, "loss": 2.7024, "step": 11300 }, { "epoch": 8.09084457061746, "grad_norm": 3.3538873195648193, "learning_rate": 2.386444286728176e-06, "loss": 2.7569, "step": 11400 }, { "epoch": 8.161816891412348, "grad_norm": 3.028632879257202, "learning_rate": 2.2977288857345636e-06, "loss": 2.7473, "step": 11500 }, { "epoch": 8.232789212207239, "grad_norm": 3.1207797527313232, "learning_rate": 2.209013484740951e-06, "loss": 2.7304, "step": 11600 }, { "epoch": 8.30376153300213, "grad_norm": 2.9007532596588135, "learning_rate": 2.1202980837473386e-06, "loss": 2.6945, "step": 11700 }, { "epoch": 8.37473385379702, "grad_norm": 3.2682387828826904, "learning_rate": 2.031582682753726e-06, "loss": 2.6944, "step": 11800 }, { "epoch": 8.445706174591908, "grad_norm": 2.944650173187256, "learning_rate": 1.9428672817601137e-06, "loss": 2.6788, "step": 11900 }, { "epoch": 8.516678495386799, "grad_norm": 3.6891725063323975, "learning_rate": 1.8541518807665012e-06, "loss": 2.6576, "step": 12000 }, { "epoch": 8.58765081618169, "grad_norm": 3.5069870948791504, "learning_rate": 1.7654364797728887e-06, "loss": 2.6916, "step": 12100 }, { "epoch": 8.65862313697658, "grad_norm": 3.2562413215637207, "learning_rate": 1.676721078779276e-06, "loss": 2.709, "step": 12200 }, { "epoch": 8.729595457771469, "grad_norm": 2.882202386856079, "learning_rate": 1.5880056777856637e-06, "loss": 2.6932, "step": 12300 }, { "epoch": 8.800567778566359, "grad_norm": 3.014678955078125, "learning_rate": 1.499290276792051e-06, "loss": 2.7494, "step": 12400 }, { "epoch": 8.87154009936125, "grad_norm": 3.5019314289093018, "learning_rate": 1.4105748757984388e-06, "loss": 2.716, "step": 12500 }, { "epoch": 8.94251242015614, "grad_norm": 3.2683048248291016, "learning_rate": 1.321859474804826e-06, "loss": 2.7186, "step": 12600 }, { "epoch": 9.013484740951029, "grad_norm": 2.985898017883301, "learning_rate": 1.2331440738112136e-06, "loss": 2.7296, "step": 12700 }, { "epoch": 9.084457061745919, "grad_norm": 3.0097787380218506, "learning_rate": 1.1444286728176011e-06, "loss": 2.7134, "step": 12800 }, { "epoch": 9.15542938254081, "grad_norm": 3.375105142593384, "learning_rate": 1.0557132718239887e-06, "loss": 2.6976, "step": 12900 }, { "epoch": 9.2264017033357, "grad_norm": 18.599666595458984, "learning_rate": 9.669978708303762e-07, "loss": 2.6748, "step": 13000 }, { "epoch": 9.297374024130589, "grad_norm": 3.4463281631469727, "learning_rate": 8.782824698367637e-07, "loss": 2.6669, "step": 13100 }, { "epoch": 9.36834634492548, "grad_norm": 3.3725767135620117, "learning_rate": 7.895670688431512e-07, "loss": 2.6394, "step": 13200 }, { "epoch": 9.43931866572037, "grad_norm": 3.3101234436035156, "learning_rate": 7.008516678495387e-07, "loss": 2.7324, "step": 13300 }, { "epoch": 9.510290986515258, "grad_norm": 3.0802764892578125, "learning_rate": 6.121362668559262e-07, "loss": 2.6929, "step": 13400 }, { "epoch": 9.581263307310149, "grad_norm": 2.806544303894043, "learning_rate": 5.234208658623137e-07, "loss": 2.6382, "step": 13500 }, { "epoch": 9.65223562810504, "grad_norm": 3.4582176208496094, "learning_rate": 4.347054648687013e-07, "loss": 2.694, "step": 13600 }, { "epoch": 9.72320794889993, "grad_norm": 4.037984848022461, "learning_rate": 3.459900638750887e-07, "loss": 2.6497, "step": 13700 }, { "epoch": 9.794180269694818, "grad_norm": 3.1365439891815186, "learning_rate": 2.572746628814762e-07, "loss": 2.705, "step": 13800 }, { "epoch": 9.865152590489709, "grad_norm": 3.4496893882751465, "learning_rate": 1.6855926188786376e-07, "loss": 2.7123, "step": 13900 }, { "epoch": 9.9361249112846, "grad_norm": 3.1736724376678467, "learning_rate": 7.984386089425125e-08, "loss": 2.7009, "step": 14000 } ], "logging_steps": 100, "max_steps": 14090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.430320762257408e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }