| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 3756, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003993610223642172, | |
| "grad_norm": 124.79125135092546, | |
| "learning_rate": 9.308510638297872e-08, | |
| "loss": 1.1351, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.007987220447284345, | |
| "grad_norm": 53.854891738476, | |
| "learning_rate": 1.8617021276595745e-07, | |
| "loss": 1.1123, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.011980830670926517, | |
| "grad_norm": 35.55984338795738, | |
| "learning_rate": 2.7925531914893617e-07, | |
| "loss": 1.1318, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01597444089456869, | |
| "grad_norm": 53.424023941482716, | |
| "learning_rate": 3.723404255319149e-07, | |
| "loss": 1.1141, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.019968051118210862, | |
| "grad_norm": 61.65516106085094, | |
| "learning_rate": 4.654255319148936e-07, | |
| "loss": 1.0358, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.023961661341853034, | |
| "grad_norm": 24.644625180093044, | |
| "learning_rate": 5.585106382978723e-07, | |
| "loss": 0.9898, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.027955271565495207, | |
| "grad_norm": 35.18949474973098, | |
| "learning_rate": 6.515957446808511e-07, | |
| "loss": 0.7832, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.03194888178913738, | |
| "grad_norm": 21.730902642961105, | |
| "learning_rate": 7.446808510638298e-07, | |
| "loss": 0.7333, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.035942492012779555, | |
| "grad_norm": 9.326268068588167, | |
| "learning_rate": 8.377659574468085e-07, | |
| "loss": 0.5276, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.039936102236421724, | |
| "grad_norm": 4.219331242308076, | |
| "learning_rate": 9.308510638297872e-07, | |
| "loss": 0.443, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0439297124600639, | |
| "grad_norm": 3.828106728488943, | |
| "learning_rate": 1.023936170212766e-06, | |
| "loss": 0.3801, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.04792332268370607, | |
| "grad_norm": 26.95012737394801, | |
| "learning_rate": 1.1170212765957447e-06, | |
| "loss": 0.3434, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.051916932907348244, | |
| "grad_norm": 3.0269858271064702, | |
| "learning_rate": 1.2101063829787234e-06, | |
| "loss": 0.3256, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.05591054313099041, | |
| "grad_norm": 2.9924716110173173, | |
| "learning_rate": 1.3031914893617021e-06, | |
| "loss": 0.3081, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.05990415335463259, | |
| "grad_norm": 3.332606341722849, | |
| "learning_rate": 1.3962765957446809e-06, | |
| "loss": 0.2981, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.06389776357827476, | |
| "grad_norm": 2.9070407704071846, | |
| "learning_rate": 1.4893617021276596e-06, | |
| "loss": 0.2841, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.06789137380191693, | |
| "grad_norm": 2.8536909013493745, | |
| "learning_rate": 1.5824468085106383e-06, | |
| "loss": 0.2818, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.07188498402555911, | |
| "grad_norm": 3.0044229558990367, | |
| "learning_rate": 1.675531914893617e-06, | |
| "loss": 0.2738, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.07587859424920128, | |
| "grad_norm": 2.8334575842346887, | |
| "learning_rate": 1.7686170212765958e-06, | |
| "loss": 0.2648, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.07987220447284345, | |
| "grad_norm": 2.634618871564216, | |
| "learning_rate": 1.8617021276595745e-06, | |
| "loss": 0.2629, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.08386581469648563, | |
| "grad_norm": 2.6590626092313014, | |
| "learning_rate": 1.9547872340425528e-06, | |
| "loss": 0.2472, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.0878594249201278, | |
| "grad_norm": 2.90486484818493, | |
| "learning_rate": 2.047872340425532e-06, | |
| "loss": 0.2495, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.09185303514376997, | |
| "grad_norm": 2.8099143216926317, | |
| "learning_rate": 2.1409574468085107e-06, | |
| "loss": 0.2327, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.09584664536741214, | |
| "grad_norm": 2.8136050336335434, | |
| "learning_rate": 2.2340425531914894e-06, | |
| "loss": 0.2338, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.09984025559105432, | |
| "grad_norm": 2.4697542437553768, | |
| "learning_rate": 2.327127659574468e-06, | |
| "loss": 0.2339, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.10383386581469649, | |
| "grad_norm": 2.6351363837368136, | |
| "learning_rate": 2.420212765957447e-06, | |
| "loss": 0.2262, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.10782747603833866, | |
| "grad_norm": 2.613372563691564, | |
| "learning_rate": 2.5132978723404256e-06, | |
| "loss": 0.2206, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.11182108626198083, | |
| "grad_norm": 2.597752416642004, | |
| "learning_rate": 2.6063829787234043e-06, | |
| "loss": 0.2083, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.11581469648562301, | |
| "grad_norm": 2.4178777850084767, | |
| "learning_rate": 2.699468085106383e-06, | |
| "loss": 0.2039, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.11980830670926518, | |
| "grad_norm": 2.249342405853297, | |
| "learning_rate": 2.7925531914893617e-06, | |
| "loss": 0.2003, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.12380191693290735, | |
| "grad_norm": 2.050068983302969, | |
| "learning_rate": 2.8856382978723405e-06, | |
| "loss": 0.2024, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.12779552715654952, | |
| "grad_norm": 1.7186257113193864, | |
| "learning_rate": 2.978723404255319e-06, | |
| "loss": 0.1958, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.13178913738019168, | |
| "grad_norm": 1.6632638689827575, | |
| "learning_rate": 3.071808510638298e-06, | |
| "loss": 0.1961, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.13578274760383385, | |
| "grad_norm": 1.4976658741753586, | |
| "learning_rate": 3.1648936170212766e-06, | |
| "loss": 0.1881, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.13977635782747605, | |
| "grad_norm": 1.5262647756049854, | |
| "learning_rate": 3.2579787234042553e-06, | |
| "loss": 0.1882, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.14376996805111822, | |
| "grad_norm": 1.5576363273434715, | |
| "learning_rate": 3.351063829787234e-06, | |
| "loss": 0.1861, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.1477635782747604, | |
| "grad_norm": 1.4077910584213011, | |
| "learning_rate": 3.444148936170213e-06, | |
| "loss": 0.1888, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.15175718849840256, | |
| "grad_norm": 1.44303025282263, | |
| "learning_rate": 3.5372340425531915e-06, | |
| "loss": 0.1887, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.15575079872204473, | |
| "grad_norm": 1.3605098039485544, | |
| "learning_rate": 3.63031914893617e-06, | |
| "loss": 0.1809, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.1597444089456869, | |
| "grad_norm": 1.2997453068507787, | |
| "learning_rate": 3.723404255319149e-06, | |
| "loss": 0.1877, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.16373801916932906, | |
| "grad_norm": 1.2577351321884693, | |
| "learning_rate": 3.816489361702128e-06, | |
| "loss": 0.18, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.16773162939297126, | |
| "grad_norm": 1.2562249686118996, | |
| "learning_rate": 3.9095744680851056e-06, | |
| "loss": 0.1846, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.17172523961661343, | |
| "grad_norm": 1.3018013547612421, | |
| "learning_rate": 4.002659574468085e-06, | |
| "loss": 0.1853, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.1757188498402556, | |
| "grad_norm": 1.2860696034925374, | |
| "learning_rate": 4.095744680851064e-06, | |
| "loss": 0.183, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.17971246006389777, | |
| "grad_norm": 1.3940860735108558, | |
| "learning_rate": 4.188829787234043e-06, | |
| "loss": 0.1749, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.18370607028753994, | |
| "grad_norm": 1.266616659720171, | |
| "learning_rate": 4.281914893617021e-06, | |
| "loss": 0.1812, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1876996805111821, | |
| "grad_norm": 1.1679912165565878, | |
| "learning_rate": 4.375e-06, | |
| "loss": 0.1764, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.19169329073482427, | |
| "grad_norm": 1.2063141158373527, | |
| "learning_rate": 4.468085106382979e-06, | |
| "loss": 0.1725, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.19568690095846644, | |
| "grad_norm": 1.3549203536924732, | |
| "learning_rate": 4.561170212765957e-06, | |
| "loss": 0.1811, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.19968051118210864, | |
| "grad_norm": 1.3023445750500837, | |
| "learning_rate": 4.654255319148936e-06, | |
| "loss": 0.1732, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2036741214057508, | |
| "grad_norm": 1.1897447106164616, | |
| "learning_rate": 4.747340425531915e-06, | |
| "loss": 0.1768, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.20766773162939298, | |
| "grad_norm": 1.3217343761947535, | |
| "learning_rate": 4.840425531914894e-06, | |
| "loss": 0.1876, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.21166134185303515, | |
| "grad_norm": 1.1882978703516491, | |
| "learning_rate": 4.933510638297872e-06, | |
| "loss": 0.1849, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.21565495207667731, | |
| "grad_norm": 1.2988676577170748, | |
| "learning_rate": 5.026595744680851e-06, | |
| "loss": 0.1742, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.21964856230031948, | |
| "grad_norm": 1.3576611779609293, | |
| "learning_rate": 5.11968085106383e-06, | |
| "loss": 0.1915, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.22364217252396165, | |
| "grad_norm": 1.324286801402471, | |
| "learning_rate": 5.2127659574468086e-06, | |
| "loss": 0.1849, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.22763578274760382, | |
| "grad_norm": 1.1601790573503918, | |
| "learning_rate": 5.305851063829787e-06, | |
| "loss": 0.176, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.23162939297124602, | |
| "grad_norm": 1.2319656878269474, | |
| "learning_rate": 5.398936170212766e-06, | |
| "loss": 0.179, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.2356230031948882, | |
| "grad_norm": 1.1947505074640163, | |
| "learning_rate": 5.492021276595744e-06, | |
| "loss": 0.1805, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.23961661341853036, | |
| "grad_norm": 1.1865842816293095, | |
| "learning_rate": 5.5851063829787235e-06, | |
| "loss": 0.1798, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.24361022364217252, | |
| "grad_norm": 1.2294425800486448, | |
| "learning_rate": 5.678191489361702e-06, | |
| "loss": 0.176, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.2476038338658147, | |
| "grad_norm": 1.1750231455196776, | |
| "learning_rate": 5.771276595744681e-06, | |
| "loss": 0.1798, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.2515974440894569, | |
| "grad_norm": 1.2451767206261621, | |
| "learning_rate": 5.864361702127659e-06, | |
| "loss": 0.1791, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.25559105431309903, | |
| "grad_norm": 1.2675583808344273, | |
| "learning_rate": 5.957446808510638e-06, | |
| "loss": 0.1798, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2595846645367412, | |
| "grad_norm": 1.179067660563583, | |
| "learning_rate": 6.0505319148936175e-06, | |
| "loss": 0.1709, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.26357827476038337, | |
| "grad_norm": 1.2526557533302813, | |
| "learning_rate": 6.143617021276596e-06, | |
| "loss": 0.1751, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.26757188498402557, | |
| "grad_norm": 1.1681286446845103, | |
| "learning_rate": 6.236702127659574e-06, | |
| "loss": 0.1656, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.2715654952076677, | |
| "grad_norm": 1.1728046626126167, | |
| "learning_rate": 6.329787234042553e-06, | |
| "loss": 0.1798, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.2755591054313099, | |
| "grad_norm": 1.1964596076655767, | |
| "learning_rate": 6.4228723404255316e-06, | |
| "loss": 0.1775, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.2795527156549521, | |
| "grad_norm": 1.2208354176011962, | |
| "learning_rate": 6.515957446808511e-06, | |
| "loss": 0.1779, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.28354632587859424, | |
| "grad_norm": 1.196375387187927, | |
| "learning_rate": 6.609042553191489e-06, | |
| "loss": 0.1779, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.28753993610223644, | |
| "grad_norm": 1.299599153387323, | |
| "learning_rate": 6.702127659574468e-06, | |
| "loss": 0.1793, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.2915335463258786, | |
| "grad_norm": 1.2361399705105702, | |
| "learning_rate": 6.7952127659574464e-06, | |
| "loss": 0.1772, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.2955271565495208, | |
| "grad_norm": 1.1486851112889924, | |
| "learning_rate": 6.888297872340426e-06, | |
| "loss": 0.171, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.2995207667731629, | |
| "grad_norm": 1.2309787273113733, | |
| "learning_rate": 6.981382978723405e-06, | |
| "loss": 0.169, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.3035143769968051, | |
| "grad_norm": 1.1069081217064252, | |
| "learning_rate": 6.999975810667964e-06, | |
| "loss": 0.174, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3075079872204473, | |
| "grad_norm": 1.0934902660778836, | |
| "learning_rate": 6.999877542079611e-06, | |
| "loss": 0.1749, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.31150159744408945, | |
| "grad_norm": 1.1647975706264049, | |
| "learning_rate": 6.99970368452242e-06, | |
| "loss": 0.1682, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.31549520766773165, | |
| "grad_norm": 1.150865125937996, | |
| "learning_rate": 6.9994542417513e-06, | |
| "loss": 0.1797, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.3194888178913738, | |
| "grad_norm": 1.0698977727857675, | |
| "learning_rate": 6.999129219153615e-06, | |
| "loss": 0.1729, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.323482428115016, | |
| "grad_norm": 1.1500528795772351, | |
| "learning_rate": 6.998728623749078e-06, | |
| "loss": 0.1824, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.3274760383386581, | |
| "grad_norm": 1.1428662750727947, | |
| "learning_rate": 6.9982524641895845e-06, | |
| "loss": 0.1705, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3314696485623003, | |
| "grad_norm": 1.1575666664852504, | |
| "learning_rate": 6.997700750759044e-06, | |
| "loss": 0.1769, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.3354632587859425, | |
| "grad_norm": 1.115412562169602, | |
| "learning_rate": 6.997073495373145e-06, | |
| "loss": 0.1734, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.33945686900958466, | |
| "grad_norm": 1.151869893863153, | |
| "learning_rate": 6.996370711579098e-06, | |
| "loss": 0.1789, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.34345047923322686, | |
| "grad_norm": 1.1454117113132618, | |
| "learning_rate": 6.99559241455535e-06, | |
| "loss": 0.166, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.347444089456869, | |
| "grad_norm": 1.0873453028650606, | |
| "learning_rate": 6.994738621111253e-06, | |
| "loss": 0.1654, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.3514376996805112, | |
| "grad_norm": 0.99906893867618, | |
| "learning_rate": 6.993809349686699e-06, | |
| "loss": 0.1751, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.35543130990415334, | |
| "grad_norm": 1.157573643863577, | |
| "learning_rate": 6.992804620351724e-06, | |
| "loss": 0.1728, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.35942492012779553, | |
| "grad_norm": 1.0531396174518157, | |
| "learning_rate": 6.991724454806074e-06, | |
| "loss": 0.1765, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.3634185303514377, | |
| "grad_norm": 1.1386062653595457, | |
| "learning_rate": 6.990568876378738e-06, | |
| "loss": 0.1761, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.36741214057507987, | |
| "grad_norm": 1.2777140078161082, | |
| "learning_rate": 6.989337910027439e-06, | |
| "loss": 0.1772, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.37140575079872207, | |
| "grad_norm": 1.1495981738430336, | |
| "learning_rate": 6.988031582338101e-06, | |
| "loss": 0.1685, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.3753993610223642, | |
| "grad_norm": 1.6362808683762557, | |
| "learning_rate": 6.986649921524274e-06, | |
| "loss": 0.1717, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.3793929712460064, | |
| "grad_norm": 0.9691070079618013, | |
| "learning_rate": 6.9851929574265194e-06, | |
| "loss": 0.1682, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.38338658146964855, | |
| "grad_norm": 1.024317390117457, | |
| "learning_rate": 6.98366072151177e-06, | |
| "loss": 0.167, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.38738019169329074, | |
| "grad_norm": 1.2702847001741562, | |
| "learning_rate": 6.982053246872652e-06, | |
| "loss": 0.1703, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.3913738019169329, | |
| "grad_norm": 1.0391707960731285, | |
| "learning_rate": 6.9803705682267635e-06, | |
| "loss": 0.1637, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.3953674121405751, | |
| "grad_norm": 1.0070748510170011, | |
| "learning_rate": 6.978612721915935e-06, | |
| "loss": 0.1677, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.3993610223642173, | |
| "grad_norm": 1.0402855278956755, | |
| "learning_rate": 6.976779745905432e-06, | |
| "loss": 0.1654, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3993610223642173, | |
| "eval_loss": 0.15384411811828613, | |
| "eval_runtime": 389.0289, | |
| "eval_samples_per_second": 45.773, | |
| "eval_steps_per_second": 5.722, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4033546325878594, | |
| "grad_norm": 1.1163339009849595, | |
| "learning_rate": 6.974871679783144e-06, | |
| "loss": 0.17, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.4073482428115016, | |
| "grad_norm": 0.9859275646648643, | |
| "learning_rate": 6.972888564758729e-06, | |
| "loss": 0.1657, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.41134185303514376, | |
| "grad_norm": 0.9394276032165236, | |
| "learning_rate": 6.970830443662719e-06, | |
| "loss": 0.1685, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.41533546325878595, | |
| "grad_norm": 1.073675948253867, | |
| "learning_rate": 6.968697360945598e-06, | |
| "loss": 0.1759, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.4193290734824281, | |
| "grad_norm": 1.036048195729432, | |
| "learning_rate": 6.966489362676843e-06, | |
| "loss": 0.1631, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.4233226837060703, | |
| "grad_norm": 1.034694676356374, | |
| "learning_rate": 6.964206496543924e-06, | |
| "loss": 0.1693, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.4273162939297125, | |
| "grad_norm": 1.0185215828025804, | |
| "learning_rate": 6.96184881185128e-06, | |
| "loss": 0.1653, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.43130990415335463, | |
| "grad_norm": 0.9931981204110877, | |
| "learning_rate": 6.959416359519253e-06, | |
| "loss": 0.168, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.4353035143769968, | |
| "grad_norm": 1.0616121384749488, | |
| "learning_rate": 6.956909192082982e-06, | |
| "loss": 0.1709, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.43929712460063897, | |
| "grad_norm": 1.0045397760966588, | |
| "learning_rate": 6.954327363691278e-06, | |
| "loss": 0.1721, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.44329073482428116, | |
| "grad_norm": 1.0341210608686486, | |
| "learning_rate": 6.951670930105448e-06, | |
| "loss": 0.1637, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.4472843450479233, | |
| "grad_norm": 0.943453900900678, | |
| "learning_rate": 6.9489399486980925e-06, | |
| "loss": 0.1672, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4512779552715655, | |
| "grad_norm": 1.0198128200679288, | |
| "learning_rate": 6.946134478451864e-06, | |
| "loss": 0.1561, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.45527156549520764, | |
| "grad_norm": 0.981934871464691, | |
| "learning_rate": 6.943254579958201e-06, | |
| "loss": 0.1696, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.45926517571884984, | |
| "grad_norm": 0.9799305322175899, | |
| "learning_rate": 6.940300315416007e-06, | |
| "loss": 0.1716, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.46325878594249204, | |
| "grad_norm": 1.0222871844508339, | |
| "learning_rate": 6.93727174863032e-06, | |
| "loss": 0.1732, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.4672523961661342, | |
| "grad_norm": 1.0017118270280685, | |
| "learning_rate": 6.934168945010925e-06, | |
| "loss": 0.1697, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.4712460063897764, | |
| "grad_norm": 1.0155218802083956, | |
| "learning_rate": 6.930991971570945e-06, | |
| "loss": 0.1612, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.4752396166134185, | |
| "grad_norm": 0.968986765149799, | |
| "learning_rate": 6.9277408969253935e-06, | |
| "loss": 0.1624, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.4792332268370607, | |
| "grad_norm": 0.9049030479920512, | |
| "learning_rate": 6.924415791289696e-06, | |
| "loss": 0.1666, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.48322683706070285, | |
| "grad_norm": 1.0516179412211666, | |
| "learning_rate": 6.921016726478164e-06, | |
| "loss": 0.1711, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.48722044728434505, | |
| "grad_norm": 0.9668990395103417, | |
| "learning_rate": 6.917543775902452e-06, | |
| "loss": 0.1726, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.49121405750798725, | |
| "grad_norm": 1.106970562985427, | |
| "learning_rate": 6.913997014569974e-06, | |
| "loss": 0.1659, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.4952076677316294, | |
| "grad_norm": 0.995421768407361, | |
| "learning_rate": 6.910376519082274e-06, | |
| "loss": 0.1588, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.4992012779552716, | |
| "grad_norm": 1.0063189979179088, | |
| "learning_rate": 6.906682367633382e-06, | |
| "loss": 0.1733, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.5031948881789138, | |
| "grad_norm": 1.0076370540912887, | |
| "learning_rate": 6.902914640008114e-06, | |
| "loss": 0.1583, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5071884984025559, | |
| "grad_norm": 0.9196883887602973, | |
| "learning_rate": 6.899073417580362e-06, | |
| "loss": 0.167, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.5111821086261981, | |
| "grad_norm": 0.9516430809786864, | |
| "learning_rate": 6.895158783311325e-06, | |
| "loss": 0.1727, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5151757188498403, | |
| "grad_norm": 1.0089849026997129, | |
| "learning_rate": 6.891170821747724e-06, | |
| "loss": 0.1656, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.5191693290734825, | |
| "grad_norm": 0.9583023331989294, | |
| "learning_rate": 6.887109619019972e-06, | |
| "loss": 0.1665, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.5231629392971247, | |
| "grad_norm": 1.00161133810888, | |
| "learning_rate": 6.882975262840319e-06, | |
| "loss": 0.1644, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.5271565495207667, | |
| "grad_norm": 1.051049075871936, | |
| "learning_rate": 6.87876784250095e-06, | |
| "loss": 0.1668, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5311501597444089, | |
| "grad_norm": 0.8907899001628331, | |
| "learning_rate": 6.874487448872064e-06, | |
| "loss": 0.1642, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.5351437699680511, | |
| "grad_norm": 0.9259445853753279, | |
| "learning_rate": 6.870134174399907e-06, | |
| "loss": 0.1662, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.5391373801916933, | |
| "grad_norm": 0.8930200609325035, | |
| "learning_rate": 6.8657081131047766e-06, | |
| "loss": 0.1646, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.5431309904153354, | |
| "grad_norm": 0.9561210976739172, | |
| "learning_rate": 6.861209360578991e-06, | |
| "loss": 0.1627, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.5471246006389776, | |
| "grad_norm": 0.9117689744304602, | |
| "learning_rate": 6.856638013984827e-06, | |
| "loss": 0.1623, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.5511182108626198, | |
| "grad_norm": 0.9687731076699732, | |
| "learning_rate": 6.8519941720524155e-06, | |
| "loss": 0.1567, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.555111821086262, | |
| "grad_norm": 0.9600827029154959, | |
| "learning_rate": 6.847277935077615e-06, | |
| "loss": 0.1608, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.5591054313099042, | |
| "grad_norm": 0.9096771947825363, | |
| "learning_rate": 6.842489404919846e-06, | |
| "loss": 0.1523, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5630990415335463, | |
| "grad_norm": 0.9443224796506533, | |
| "learning_rate": 6.837628684999885e-06, | |
| "loss": 0.1566, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.5670926517571885, | |
| "grad_norm": 0.979707108806894, | |
| "learning_rate": 6.832695880297634e-06, | |
| "loss": 0.166, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.5710862619808307, | |
| "grad_norm": 1.0043374037070385, | |
| "learning_rate": 6.827691097349857e-06, | |
| "loss": 0.1627, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.5750798722044729, | |
| "grad_norm": 0.9464996778618299, | |
| "learning_rate": 6.8226144442478715e-06, | |
| "loss": 0.164, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.579073482428115, | |
| "grad_norm": 0.9416490321614909, | |
| "learning_rate": 6.817466030635222e-06, | |
| "loss": 0.1618, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.5830670926517572, | |
| "grad_norm": 0.8925338429207857, | |
| "learning_rate": 6.812245967705307e-06, | |
| "loss": 0.1619, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.5870607028753994, | |
| "grad_norm": 0.9194904859850123, | |
| "learning_rate": 6.8069543681989755e-06, | |
| "loss": 0.1553, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.5910543130990416, | |
| "grad_norm": 0.895691163374912, | |
| "learning_rate": 6.8015913464021e-06, | |
| "loss": 0.1606, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.5950479233226837, | |
| "grad_norm": 0.8442230894893846, | |
| "learning_rate": 6.796157018143101e-06, | |
| "loss": 0.1541, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.5990415335463258, | |
| "grad_norm": 0.911157960300542, | |
| "learning_rate": 6.790651500790446e-06, | |
| "loss": 0.158, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.603035143769968, | |
| "grad_norm": 0.9270097775375741, | |
| "learning_rate": 6.785074913250119e-06, | |
| "loss": 0.1578, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.6070287539936102, | |
| "grad_norm": 0.985999202515655, | |
| "learning_rate": 6.779427375963048e-06, | |
| "loss": 0.1605, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.6110223642172524, | |
| "grad_norm": 0.9816513779676883, | |
| "learning_rate": 6.773709010902506e-06, | |
| "loss": 0.1579, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.6150159744408946, | |
| "grad_norm": 0.9851452504449081, | |
| "learning_rate": 6.767919941571479e-06, | |
| "loss": 0.1643, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.6190095846645367, | |
| "grad_norm": 0.932019250350353, | |
| "learning_rate": 6.762060292999991e-06, | |
| "loss": 0.1641, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.6230031948881789, | |
| "grad_norm": 1.0112706975782446, | |
| "learning_rate": 6.756130191742413e-06, | |
| "loss": 0.1669, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.6269968051118211, | |
| "grad_norm": 1.0652365454569854, | |
| "learning_rate": 6.75012976587472e-06, | |
| "loss": 0.1553, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.6309904153354633, | |
| "grad_norm": 0.9293147453915607, | |
| "learning_rate": 6.7440591449917345e-06, | |
| "loss": 0.1629, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.6349840255591054, | |
| "grad_norm": 0.9124885849493852, | |
| "learning_rate": 6.737918460204323e-06, | |
| "loss": 0.1615, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.6389776357827476, | |
| "grad_norm": 0.9576719497775147, | |
| "learning_rate": 6.731707844136561e-06, | |
| "loss": 0.166, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6429712460063898, | |
| "grad_norm": 0.8872595723908449, | |
| "learning_rate": 6.725427430922875e-06, | |
| "loss": 0.1542, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.646964856230032, | |
| "grad_norm": 0.9935128432068615, | |
| "learning_rate": 6.719077356205143e-06, | |
| "loss": 0.1574, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.6509584664536742, | |
| "grad_norm": 0.931891327221199, | |
| "learning_rate": 6.712657757129762e-06, | |
| "loss": 0.1646, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.6549520766773163, | |
| "grad_norm": 0.8727423850294462, | |
| "learning_rate": 6.7061687723446925e-06, | |
| "loss": 0.1588, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.6589456869009584, | |
| "grad_norm": 0.9689019750184885, | |
| "learning_rate": 6.699610541996453e-06, | |
| "loss": 0.1544, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.6629392971246006, | |
| "grad_norm": 0.9135093758857515, | |
| "learning_rate": 6.69298320772711e-06, | |
| "loss": 0.1633, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.6669329073482428, | |
| "grad_norm": 0.8841844046649162, | |
| "learning_rate": 6.6862869126712006e-06, | |
| "loss": 0.1517, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.670926517571885, | |
| "grad_norm": 0.9197410885367548, | |
| "learning_rate": 6.6795218014526535e-06, | |
| "loss": 0.1582, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.6749201277955271, | |
| "grad_norm": 0.9071224596757496, | |
| "learning_rate": 6.672688020181662e-06, | |
| "loss": 0.1568, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.6789137380191693, | |
| "grad_norm": 0.8384791088112575, | |
| "learning_rate": 6.665785716451527e-06, | |
| "loss": 0.161, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.6829073482428115, | |
| "grad_norm": 0.9697839295867846, | |
| "learning_rate": 6.658815039335469e-06, | |
| "loss": 0.1663, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.6869009584664537, | |
| "grad_norm": 0.93519835830344, | |
| "learning_rate": 6.651776139383413e-06, | |
| "loss": 0.1635, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.6908945686900958, | |
| "grad_norm": 0.8980357502380838, | |
| "learning_rate": 6.644669168618731e-06, | |
| "loss": 0.1614, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.694888178913738, | |
| "grad_norm": 0.9335437765568465, | |
| "learning_rate": 6.637494280534963e-06, | |
| "loss": 0.1629, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.6988817891373802, | |
| "grad_norm": 0.8803593207070962, | |
| "learning_rate": 6.630251630092499e-06, | |
| "loss": 0.1477, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.7028753993610224, | |
| "grad_norm": 0.904997999578463, | |
| "learning_rate": 6.622941373715235e-06, | |
| "loss": 0.1575, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7068690095846646, | |
| "grad_norm": 0.9235638802056787, | |
| "learning_rate": 6.615563669287194e-06, | |
| "loss": 0.1584, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.7108626198083067, | |
| "grad_norm": 0.9076843251345146, | |
| "learning_rate": 6.6081186761491116e-06, | |
| "loss": 0.1557, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.7148562300319489, | |
| "grad_norm": 0.9740399647119267, | |
| "learning_rate": 6.600606555095002e-06, | |
| "loss": 0.1604, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.7188498402555911, | |
| "grad_norm": 0.928718969148393, | |
| "learning_rate": 6.593027468368679e-06, | |
| "loss": 0.1571, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7228434504792333, | |
| "grad_norm": 0.9098156003642778, | |
| "learning_rate": 6.585381579660256e-06, | |
| "loss": 0.1572, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.7268370607028753, | |
| "grad_norm": 0.9164438133952856, | |
| "learning_rate": 6.577669054102609e-06, | |
| "loss": 0.1549, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.7308306709265175, | |
| "grad_norm": 0.8693093471558418, | |
| "learning_rate": 6.56989005826781e-06, | |
| "loss": 0.1644, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.7348242811501597, | |
| "grad_norm": 0.8225661786689502, | |
| "learning_rate": 6.5620447601635305e-06, | |
| "loss": 0.1598, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.7388178913738019, | |
| "grad_norm": 0.9061170444615521, | |
| "learning_rate": 6.55413332922941e-06, | |
| "loss": 0.1507, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.7428115015974441, | |
| "grad_norm": 1.2155491539418426, | |
| "learning_rate": 6.546155936333401e-06, | |
| "loss": 0.1531, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.7468051118210862, | |
| "grad_norm": 0.8958306454880214, | |
| "learning_rate": 6.538112753768076e-06, | |
| "loss": 0.1543, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.7507987220447284, | |
| "grad_norm": 0.9538107994557221, | |
| "learning_rate": 6.530003955246904e-06, | |
| "loss": 0.1552, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.7547923322683706, | |
| "grad_norm": 0.9543004936787222, | |
| "learning_rate": 6.521829715900506e-06, | |
| "loss": 0.1579, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.7587859424920128, | |
| "grad_norm": 0.9325838144594478, | |
| "learning_rate": 6.5135902122728635e-06, | |
| "loss": 0.1556, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.762779552715655, | |
| "grad_norm": 0.9186355171261438, | |
| "learning_rate": 6.505285622317513e-06, | |
| "loss": 0.1589, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.7667731629392971, | |
| "grad_norm": 0.9099495360242699, | |
| "learning_rate": 6.496916125393698e-06, | |
| "loss": 0.1587, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.7707667731629393, | |
| "grad_norm": 0.8392658119656629, | |
| "learning_rate": 6.4884819022625e-06, | |
| "loss": 0.1619, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.7747603833865815, | |
| "grad_norm": 0.8457903416638893, | |
| "learning_rate": 6.479983135082927e-06, | |
| "loss": 0.1516, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.7787539936102237, | |
| "grad_norm": 0.9121023222975818, | |
| "learning_rate": 6.4714200074079885e-06, | |
| "loss": 0.157, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.7827476038338658, | |
| "grad_norm": 0.8553962057764459, | |
| "learning_rate": 6.462792704180721e-06, | |
| "loss": 0.1517, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.786741214057508, | |
| "grad_norm": 0.9373339203123983, | |
| "learning_rate": 6.454101411730205e-06, | |
| "loss": 0.1492, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.7907348242811502, | |
| "grad_norm": 0.8309793749606978, | |
| "learning_rate": 6.445346317767532e-06, | |
| "loss": 0.1475, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.7947284345047924, | |
| "grad_norm": 0.8968174110255692, | |
| "learning_rate": 6.436527611381752e-06, | |
| "loss": 0.1554, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.7987220447284346, | |
| "grad_norm": 0.8719592876952176, | |
| "learning_rate": 6.427645483035793e-06, | |
| "loss": 0.1531, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.7987220447284346, | |
| "eval_loss": 0.13947652280330658, | |
| "eval_runtime": 388.4879, | |
| "eval_samples_per_second": 45.837, | |
| "eval_steps_per_second": 5.73, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8027156549520766, | |
| "grad_norm": 0.8945078745681652, | |
| "learning_rate": 6.418700124562346e-06, | |
| "loss": 0.1511, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.8067092651757188, | |
| "grad_norm": 0.9037985750009097, | |
| "learning_rate": 6.409691729159718e-06, | |
| "loss": 0.158, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.810702875399361, | |
| "grad_norm": 0.9841558619803956, | |
| "learning_rate": 6.400620491387666e-06, | |
| "loss": 0.1572, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.8146964856230032, | |
| "grad_norm": 0.8856887705179193, | |
| "learning_rate": 6.39148660716319e-06, | |
| "loss": 0.1564, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.8186900958466453, | |
| "grad_norm": 0.8458094119326396, | |
| "learning_rate": 6.382290273756303e-06, | |
| "loss": 0.1564, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.8226837060702875, | |
| "grad_norm": 0.8951775862164378, | |
| "learning_rate": 6.373031689785771e-06, | |
| "loss": 0.1567, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.8266773162939297, | |
| "grad_norm": 0.8814016163914961, | |
| "learning_rate": 6.363711055214824e-06, | |
| "loss": 0.1559, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.8306709265175719, | |
| "grad_norm": 0.8434088161690633, | |
| "learning_rate": 6.354328571346836e-06, | |
| "loss": 0.154, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.8346645367412141, | |
| "grad_norm": 0.8465270748003091, | |
| "learning_rate": 6.344884440820976e-06, | |
| "loss": 0.1621, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.8386581469648562, | |
| "grad_norm": 0.8575168379781873, | |
| "learning_rate": 6.335378867607834e-06, | |
| "loss": 0.1494, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.8426517571884984, | |
| "grad_norm": 0.9030974173812433, | |
| "learning_rate": 6.325812057005017e-06, | |
| "loss": 0.1509, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.8466453674121406, | |
| "grad_norm": 0.9655594054454741, | |
| "learning_rate": 6.316184215632708e-06, | |
| "loss": 0.1574, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.8506389776357828, | |
| "grad_norm": 0.8882567467448087, | |
| "learning_rate": 6.306495551429215e-06, | |
| "loss": 0.1523, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.854632587859425, | |
| "grad_norm": 0.8714942442812957, | |
| "learning_rate": 6.296746273646466e-06, | |
| "loss": 0.1494, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.8586261980830671, | |
| "grad_norm": 0.9184869505642743, | |
| "learning_rate": 6.286936592845503e-06, | |
| "loss": 0.1553, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.8626198083067093, | |
| "grad_norm": 0.9132914637442158, | |
| "learning_rate": 6.2770667208919265e-06, | |
| "loss": 0.1507, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.8666134185303515, | |
| "grad_norm": 0.8713397978039784, | |
| "learning_rate": 6.26713687095132e-06, | |
| "loss": 0.1495, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.8706070287539937, | |
| "grad_norm": 0.8854147468102307, | |
| "learning_rate": 6.2571472574846515e-06, | |
| "loss": 0.1486, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.8746006389776357, | |
| "grad_norm": 0.9369163481137155, | |
| "learning_rate": 6.247098096243633e-06, | |
| "loss": 0.1506, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.8785942492012779, | |
| "grad_norm": 0.9543529445696081, | |
| "learning_rate": 6.23698960426607e-06, | |
| "loss": 0.1547, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.8825878594249201, | |
| "grad_norm": 0.9373088824897908, | |
| "learning_rate": 6.2268219998711676e-06, | |
| "loss": 0.1488, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.8865814696485623, | |
| "grad_norm": 0.9422023974896926, | |
| "learning_rate": 6.216595502654819e-06, | |
| "loss": 0.1573, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.8905750798722045, | |
| "grad_norm": 0.9696343156334155, | |
| "learning_rate": 6.206310333484859e-06, | |
| "loss": 0.1625, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.8945686900958466, | |
| "grad_norm": 0.8142960364081798, | |
| "learning_rate": 6.1959667144962984e-06, | |
| "loss": 0.1525, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.8985623003194888, | |
| "grad_norm": 0.8889141559955737, | |
| "learning_rate": 6.185564869086523e-06, | |
| "loss": 0.1493, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.902555910543131, | |
| "grad_norm": 0.9104469356832359, | |
| "learning_rate": 6.175105021910469e-06, | |
| "loss": 0.144, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.9065495207667732, | |
| "grad_norm": 0.8784613561636412, | |
| "learning_rate": 6.164587398875773e-06, | |
| "loss": 0.1553, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.9105431309904153, | |
| "grad_norm": 0.9170546731493038, | |
| "learning_rate": 6.1540122271378905e-06, | |
| "loss": 0.1501, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.9145367412140575, | |
| "grad_norm": 0.8596576185799807, | |
| "learning_rate": 6.143379735095194e-06, | |
| "loss": 0.1514, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.9185303514376997, | |
| "grad_norm": 0.8780099284965823, | |
| "learning_rate": 6.1326901523840325e-06, | |
| "loss": 0.1485, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.9225239616613419, | |
| "grad_norm": 0.8740379156570441, | |
| "learning_rate": 6.12194370987378e-06, | |
| "loss": 0.1528, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.9265175718849841, | |
| "grad_norm": 0.9196888261832091, | |
| "learning_rate": 6.111140639661846e-06, | |
| "loss": 0.149, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.9305111821086262, | |
| "grad_norm": 0.898041148881679, | |
| "learning_rate": 6.10028117506866e-06, | |
| "loss": 0.1588, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.9345047923322684, | |
| "grad_norm": 0.8638459142453613, | |
| "learning_rate": 6.089365550632639e-06, | |
| "loss": 0.1513, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.9384984025559105, | |
| "grad_norm": 0.9015055071062944, | |
| "learning_rate": 6.078394002105115e-06, | |
| "loss": 0.1561, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.9424920127795527, | |
| "grad_norm": 0.8542741624271732, | |
| "learning_rate": 6.067366766445247e-06, | |
| "loss": 0.1519, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.9464856230031949, | |
| "grad_norm": 0.9418131743815946, | |
| "learning_rate": 6.056284081814902e-06, | |
| "loss": 0.1445, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.950479233226837, | |
| "grad_norm": 0.8274533049204239, | |
| "learning_rate": 6.045146187573513e-06, | |
| "loss": 0.1508, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.9544728434504792, | |
| "grad_norm": 0.8788233365616475, | |
| "learning_rate": 6.0339533242729105e-06, | |
| "loss": 0.1457, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.9584664536741214, | |
| "grad_norm": 0.8978374665514917, | |
| "learning_rate": 6.022705733652118e-06, | |
| "loss": 0.1607, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.9624600638977636, | |
| "grad_norm": 0.9265956339818142, | |
| "learning_rate": 6.0114036586321476e-06, | |
| "loss": 0.1606, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.9664536741214057, | |
| "grad_norm": 0.8913889253563607, | |
| "learning_rate": 6.000047343310736e-06, | |
| "loss": 0.1446, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.9704472843450479, | |
| "grad_norm": 0.9162076451056782, | |
| "learning_rate": 5.988637032957083e-06, | |
| "loss": 0.1472, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.9744408945686901, | |
| "grad_norm": 0.9037423765093193, | |
| "learning_rate": 5.977172974006552e-06, | |
| "loss": 0.1495, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.9784345047923323, | |
| "grad_norm": 0.8965101197123303, | |
| "learning_rate": 5.965655414055348e-06, | |
| "loss": 0.1514, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.9824281150159745, | |
| "grad_norm": 0.8363882753686213, | |
| "learning_rate": 5.954084601855169e-06, | |
| "loss": 0.1517, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.9864217252396166, | |
| "grad_norm": 0.8508334278041144, | |
| "learning_rate": 5.942460787307833e-06, | |
| "loss": 0.1545, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.9904153354632588, | |
| "grad_norm": 0.8440202005698025, | |
| "learning_rate": 5.930784221459883e-06, | |
| "loss": 0.154, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.994408945686901, | |
| "grad_norm": 0.8641285063450203, | |
| "learning_rate": 5.919055156497162e-06, | |
| "loss": 0.1575, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.9984025559105432, | |
| "grad_norm": 0.8395654019211741, | |
| "learning_rate": 5.907273845739368e-06, | |
| "loss": 0.1433, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.0023961661341854, | |
| "grad_norm": 0.7574896989713115, | |
| "learning_rate": 5.895440543634588e-06, | |
| "loss": 0.123, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 1.0063897763578276, | |
| "grad_norm": 0.7001918457109648, | |
| "learning_rate": 5.88355550575379e-06, | |
| "loss": 0.0967, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.0103833865814698, | |
| "grad_norm": 0.8541947338770247, | |
| "learning_rate": 5.871618988785316e-06, | |
| "loss": 0.0961, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 1.0143769968051117, | |
| "grad_norm": 0.7524342512654641, | |
| "learning_rate": 5.8596312505293315e-06, | |
| "loss": 0.0947, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.018370607028754, | |
| "grad_norm": 0.7718835887830281, | |
| "learning_rate": 5.847592549892258e-06, | |
| "loss": 0.0945, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.0223642172523961, | |
| "grad_norm": 0.8408835100355743, | |
| "learning_rate": 5.835503146881185e-06, | |
| "loss": 0.1015, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.0263578274760383, | |
| "grad_norm": 0.8763612828817176, | |
| "learning_rate": 5.823363302598248e-06, | |
| "loss": 0.0968, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 1.0303514376996805, | |
| "grad_norm": 0.8178713771043465, | |
| "learning_rate": 5.811173279234996e-06, | |
| "loss": 0.0962, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.0343450479233227, | |
| "grad_norm": 0.9101366597977056, | |
| "learning_rate": 5.798933340066726e-06, | |
| "loss": 0.0983, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 1.038338658146965, | |
| "grad_norm": 0.753210589288919, | |
| "learning_rate": 5.786643749446795e-06, | |
| "loss": 0.0923, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.042332268370607, | |
| "grad_norm": 0.8263779547352994, | |
| "learning_rate": 5.774304772800912e-06, | |
| "loss": 0.0978, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.0463258785942493, | |
| "grad_norm": 0.835011482405387, | |
| "learning_rate": 5.76191667662141e-06, | |
| "loss": 0.0917, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.0503194888178913, | |
| "grad_norm": 0.8228387988406632, | |
| "learning_rate": 5.74947972846148e-06, | |
| "loss": 0.0977, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 1.0543130990415335, | |
| "grad_norm": 0.7510886619859262, | |
| "learning_rate": 5.736994196929405e-06, | |
| "loss": 0.0934, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.0583067092651757, | |
| "grad_norm": 0.8004708846108053, | |
| "learning_rate": 5.724460351682749e-06, | |
| "loss": 0.0946, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 1.0623003194888179, | |
| "grad_norm": 0.8382732441836329, | |
| "learning_rate": 5.711878463422534e-06, | |
| "loss": 0.0925, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.06629392971246, | |
| "grad_norm": 0.8616180876824737, | |
| "learning_rate": 5.699248803887402e-06, | |
| "loss": 0.0933, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 1.0702875399361023, | |
| "grad_norm": 0.8300175767051388, | |
| "learning_rate": 5.686571645847735e-06, | |
| "loss": 0.0964, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.0742811501597445, | |
| "grad_norm": 0.8522423266080669, | |
| "learning_rate": 5.673847263099774e-06, | |
| "loss": 0.0917, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 1.0782747603833867, | |
| "grad_norm": 0.7589979341067651, | |
| "learning_rate": 5.6610759304596946e-06, | |
| "loss": 0.0914, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.0822683706070289, | |
| "grad_norm": 0.8241214078415051, | |
| "learning_rate": 5.64825792375768e-06, | |
| "loss": 0.0972, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 1.0862619808306708, | |
| "grad_norm": 0.7525508184236472, | |
| "learning_rate": 5.6353935198319636e-06, | |
| "loss": 0.091, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.090255591054313, | |
| "grad_norm": 0.8304249863595534, | |
| "learning_rate": 5.622482996522844e-06, | |
| "loss": 0.094, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 1.0942492012779552, | |
| "grad_norm": 0.814379429479353, | |
| "learning_rate": 5.6095266326666915e-06, | |
| "loss": 0.0988, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.0982428115015974, | |
| "grad_norm": 0.7740762735894441, | |
| "learning_rate": 5.5965247080899184e-06, | |
| "loss": 0.0936, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 1.1022364217252396, | |
| "grad_norm": 0.7765444476341753, | |
| "learning_rate": 5.583477503602944e-06, | |
| "loss": 0.0949, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.1062300319488818, | |
| "grad_norm": 0.7960825199906291, | |
| "learning_rate": 5.570385300994121e-06, | |
| "loss": 0.0973, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 1.110223642172524, | |
| "grad_norm": 0.8211272733995488, | |
| "learning_rate": 5.557248383023656e-06, | |
| "loss": 0.0989, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.1142172523961662, | |
| "grad_norm": 0.8230061747958791, | |
| "learning_rate": 5.5440670334175e-06, | |
| "loss": 0.0959, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 1.1182108626198084, | |
| "grad_norm": 0.7930641006824678, | |
| "learning_rate": 5.5308415368612194e-06, | |
| "loss": 0.0941, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.1222044728434506, | |
| "grad_norm": 0.7986731856931012, | |
| "learning_rate": 5.5175721789938525e-06, | |
| "loss": 0.0964, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 1.1261980830670926, | |
| "grad_norm": 0.8180516432624779, | |
| "learning_rate": 5.504259246401733e-06, | |
| "loss": 0.0992, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.1301916932907348, | |
| "grad_norm": 0.8080517814845294, | |
| "learning_rate": 5.490903026612306e-06, | |
| "loss": 0.0954, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 1.134185303514377, | |
| "grad_norm": 0.7894397030245984, | |
| "learning_rate": 5.477503808087915e-06, | |
| "loss": 0.0939, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.1381789137380192, | |
| "grad_norm": 0.8266502827314096, | |
| "learning_rate": 5.464061880219575e-06, | |
| "loss": 0.0972, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 1.1421725239616614, | |
| "grad_norm": 0.7695060771844507, | |
| "learning_rate": 5.450577533320721e-06, | |
| "loss": 0.0889, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.1461661341853036, | |
| "grad_norm": 0.7975054651942407, | |
| "learning_rate": 5.437051058620934e-06, | |
| "loss": 0.0912, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 1.1501597444089458, | |
| "grad_norm": 0.835290771956647, | |
| "learning_rate": 5.423482748259657e-06, | |
| "loss": 0.1003, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.154153354632588, | |
| "grad_norm": 0.796837303035922, | |
| "learning_rate": 5.4098728952798835e-06, | |
| "loss": 0.0924, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 1.15814696485623, | |
| "grad_norm": 0.7979782248094183, | |
| "learning_rate": 5.396221793621827e-06, | |
| "loss": 0.097, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.1621405750798721, | |
| "grad_norm": 0.7733365430240714, | |
| "learning_rate": 5.382529738116575e-06, | |
| "loss": 0.0943, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 1.1661341853035143, | |
| "grad_norm": 0.792955069966389, | |
| "learning_rate": 5.3687970244797175e-06, | |
| "loss": 0.0907, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.1701277955271565, | |
| "grad_norm": 0.7955575959074169, | |
| "learning_rate": 5.355023949304962e-06, | |
| "loss": 0.0935, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 1.1741214057507987, | |
| "grad_norm": 0.773556306974029, | |
| "learning_rate": 5.3412108100577344e-06, | |
| "loss": 0.0961, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.178115015974441, | |
| "grad_norm": 0.7701308254197153, | |
| "learning_rate": 5.327357905068743e-06, | |
| "loss": 0.091, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 1.182108626198083, | |
| "grad_norm": 0.8555538690082194, | |
| "learning_rate": 5.3134655335275425e-06, | |
| "loss": 0.097, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.1861022364217253, | |
| "grad_norm": 0.8028090926659807, | |
| "learning_rate": 5.29953399547607e-06, | |
| "loss": 0.0991, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 1.1900958466453675, | |
| "grad_norm": 0.8257848244515429, | |
| "learning_rate": 5.285563591802168e-06, | |
| "loss": 0.0951, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.1940894568690097, | |
| "grad_norm": 0.7918972043683726, | |
| "learning_rate": 5.271554624233079e-06, | |
| "loss": 0.094, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 1.1980830670926517, | |
| "grad_norm": 0.8361622190630472, | |
| "learning_rate": 5.257507395328937e-06, | |
| "loss": 0.0938, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.1980830670926517, | |
| "eval_loss": 0.14057603478431702, | |
| "eval_runtime": 388.2062, | |
| "eval_samples_per_second": 45.87, | |
| "eval_steps_per_second": 5.734, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2020766773162939, | |
| "grad_norm": 0.835859137908053, | |
| "learning_rate": 5.243422208476228e-06, | |
| "loss": 0.0963, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 1.206070287539936, | |
| "grad_norm": 0.8373993609652842, | |
| "learning_rate": 5.229299367881236e-06, | |
| "loss": 0.0931, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.2100638977635783, | |
| "grad_norm": 0.8468046787504842, | |
| "learning_rate": 5.215139178563481e-06, | |
| "loss": 0.0983, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 1.2140575079872205, | |
| "grad_norm": 0.835364718161382, | |
| "learning_rate": 5.200941946349121e-06, | |
| "loss": 0.0964, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.2180511182108626, | |
| "grad_norm": 0.7759190731109368, | |
| "learning_rate": 5.186707977864355e-06, | |
| "loss": 0.0929, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 1.2220447284345048, | |
| "grad_norm": 0.8158943737288525, | |
| "learning_rate": 5.172437580528796e-06, | |
| "loss": 0.094, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.226038338658147, | |
| "grad_norm": 0.8729951582908195, | |
| "learning_rate": 5.158131062548833e-06, | |
| "loss": 0.096, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 1.230031948881789, | |
| "grad_norm": 0.8333545374654573, | |
| "learning_rate": 5.143788732910973e-06, | |
| "loss": 0.0952, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.2340255591054312, | |
| "grad_norm": 0.8612855433349985, | |
| "learning_rate": 5.129410901375172e-06, | |
| "loss": 0.0974, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 1.2380191693290734, | |
| "grad_norm": 0.8542415949428595, | |
| "learning_rate": 5.11499787846814e-06, | |
| "loss": 0.0947, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.2420127795527156, | |
| "grad_norm": 0.8391599714715479, | |
| "learning_rate": 5.100549975476637e-06, | |
| "loss": 0.0944, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 1.2460063897763578, | |
| "grad_norm": 0.8898214771674834, | |
| "learning_rate": 5.0860675044407495e-06, | |
| "loss": 0.0965, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 0.8779811650939291, | |
| "learning_rate": 5.071550778147149e-06, | |
| "loss": 0.098, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 1.2539936102236422, | |
| "grad_norm": 0.8272086627597742, | |
| "learning_rate": 5.05700011012234e-06, | |
| "loss": 0.0935, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.2579872204472844, | |
| "grad_norm": 0.810515606440639, | |
| "learning_rate": 5.042415814625887e-06, | |
| "loss": 0.0969, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 1.2619808306709266, | |
| "grad_norm": 0.8943157588820453, | |
| "learning_rate": 5.027798206643629e-06, | |
| "loss": 0.0985, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.2659744408945688, | |
| "grad_norm": 0.825143864442046, | |
| "learning_rate": 5.013147601880871e-06, | |
| "loss": 0.0944, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 1.269968051118211, | |
| "grad_norm": 0.8256699054355341, | |
| "learning_rate": 4.998464316755571e-06, | |
| "loss": 0.0932, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.273961661341853, | |
| "grad_norm": 0.8116571534197091, | |
| "learning_rate": 4.983748668391507e-06, | |
| "loss": 0.0936, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 1.2779552715654952, | |
| "grad_norm": 0.8161005230017339, | |
| "learning_rate": 4.969000974611423e-06, | |
| "loss": 0.0968, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.2819488817891374, | |
| "grad_norm": 0.8346647605386535, | |
| "learning_rate": 4.954221553930166e-06, | |
| "loss": 0.0942, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 1.2859424920127795, | |
| "grad_norm": 0.7827131547372781, | |
| "learning_rate": 4.939410725547812e-06, | |
| "loss": 0.0941, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.2899361022364217, | |
| "grad_norm": 0.8568068704330788, | |
| "learning_rate": 4.9245688093427655e-06, | |
| "loss": 0.0926, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 1.293929712460064, | |
| "grad_norm": 0.7805159905736859, | |
| "learning_rate": 4.909696125864851e-06, | |
| "loss": 0.0912, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.2979233226837061, | |
| "grad_norm": 0.7738061176599114, | |
| "learning_rate": 4.894792996328399e-06, | |
| "loss": 0.0922, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 1.3019169329073481, | |
| "grad_norm": 0.8184213649823732, | |
| "learning_rate": 4.879859742605294e-06, | |
| "loss": 0.0942, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.3059105431309903, | |
| "grad_norm": 0.8290453852769142, | |
| "learning_rate": 4.864896687218038e-06, | |
| "loss": 0.096, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 1.3099041533546325, | |
| "grad_norm": 0.8629463577885302, | |
| "learning_rate": 4.849904153332774e-06, | |
| "loss": 0.0951, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.3138977635782747, | |
| "grad_norm": 0.8123701678440974, | |
| "learning_rate": 4.834882464752308e-06, | |
| "loss": 0.0948, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 1.317891373801917, | |
| "grad_norm": 0.7528537020864617, | |
| "learning_rate": 4.8198319459091215e-06, | |
| "loss": 0.0939, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.321884984025559, | |
| "grad_norm": 0.7800246305597872, | |
| "learning_rate": 4.8047529218583586e-06, | |
| "loss": 0.0942, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 1.3258785942492013, | |
| "grad_norm": 0.8313222255456414, | |
| "learning_rate": 4.789645718270808e-06, | |
| "loss": 0.0948, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.3298722044728435, | |
| "grad_norm": 0.8179733784658576, | |
| "learning_rate": 4.774510661425869e-06, | |
| "loss": 0.0947, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 1.3338658146964857, | |
| "grad_norm": 0.7776969162457743, | |
| "learning_rate": 4.759348078204504e-06, | |
| "loss": 0.09, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.3378594249201279, | |
| "grad_norm": 0.841754427277134, | |
| "learning_rate": 4.744158296082179e-06, | |
| "loss": 0.0945, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 1.34185303514377, | |
| "grad_norm": 0.8370881837026815, | |
| "learning_rate": 4.7289416431217925e-06, | |
| "loss": 0.0904, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.3458466453674123, | |
| "grad_norm": 0.8562758183663967, | |
| "learning_rate": 4.713698447966586e-06, | |
| "loss": 0.0977, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 1.3498402555910542, | |
| "grad_norm": 0.7855201641830065, | |
| "learning_rate": 4.6984290398330524e-06, | |
| "loss": 0.0943, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.3538338658146964, | |
| "grad_norm": 0.8763494885051274, | |
| "learning_rate": 4.6831337485038196e-06, | |
| "loss": 0.0979, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 1.3578274760383386, | |
| "grad_norm": 0.8144070746777402, | |
| "learning_rate": 4.6678129043205315e-06, | |
| "loss": 0.0913, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.3618210862619808, | |
| "grad_norm": 0.7944782321390458, | |
| "learning_rate": 4.652466838176711e-06, | |
| "loss": 0.0938, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 1.365814696485623, | |
| "grad_norm": 0.8864834970350833, | |
| "learning_rate": 4.6370958815106175e-06, | |
| "loss": 0.0944, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.3698083067092652, | |
| "grad_norm": 0.8054616729912718, | |
| "learning_rate": 4.621700366298084e-06, | |
| "loss": 0.0939, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 1.3738019169329074, | |
| "grad_norm": 0.8454180008589398, | |
| "learning_rate": 4.60628062504535e-06, | |
| "loss": 0.093, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.3777955271565494, | |
| "grad_norm": 0.7990549534085654, | |
| "learning_rate": 4.590836990781877e-06, | |
| "loss": 0.0886, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 1.3817891373801916, | |
| "grad_norm": 0.8070045082712269, | |
| "learning_rate": 4.575369797053165e-06, | |
| "loss": 0.0882, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.3857827476038338, | |
| "grad_norm": 0.8612739828428202, | |
| "learning_rate": 4.559879377913534e-06, | |
| "loss": 0.0964, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 1.389776357827476, | |
| "grad_norm": 0.8180434067759751, | |
| "learning_rate": 4.544366067918922e-06, | |
| "loss": 0.0947, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.3937699680511182, | |
| "grad_norm": 0.7929033707791658, | |
| "learning_rate": 4.5288302021196525e-06, | |
| "loss": 0.0934, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 1.3977635782747604, | |
| "grad_norm": 0.8213908701484177, | |
| "learning_rate": 4.513272116053203e-06, | |
| "loss": 0.092, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.4017571884984026, | |
| "grad_norm": 0.8758208928982602, | |
| "learning_rate": 4.497692145736952e-06, | |
| "loss": 0.0984, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 1.4057507987220448, | |
| "grad_norm": 0.8498430639803995, | |
| "learning_rate": 4.482090627660928e-06, | |
| "loss": 0.0948, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.409744408945687, | |
| "grad_norm": 0.8357042132874987, | |
| "learning_rate": 4.466467898780538e-06, | |
| "loss": 0.0937, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 1.4137380191693292, | |
| "grad_norm": 0.8049671549685841, | |
| "learning_rate": 4.450824296509293e-06, | |
| "loss": 0.0962, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.4177316293929714, | |
| "grad_norm": 0.762029470780092, | |
| "learning_rate": 4.435160158711517e-06, | |
| "loss": 0.093, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.4217252396166133, | |
| "grad_norm": 0.7777991642289898, | |
| "learning_rate": 4.419475823695054e-06, | |
| "loss": 0.0881, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.4257188498402555, | |
| "grad_norm": 0.8172923019508063, | |
| "learning_rate": 4.40377163020396e-06, | |
| "loss": 0.0962, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 1.4297124600638977, | |
| "grad_norm": 0.8195226636883436, | |
| "learning_rate": 4.388047917411185e-06, | |
| "loss": 0.0938, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.43370607028754, | |
| "grad_norm": 0.8567775821515811, | |
| "learning_rate": 4.37230502491125e-06, | |
| "loss": 0.094, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 1.4376996805111821, | |
| "grad_norm": 0.8975281588105369, | |
| "learning_rate": 4.35654329271291e-06, | |
| "loss": 0.0931, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.4416932907348243, | |
| "grad_norm": 0.7980268710605869, | |
| "learning_rate": 4.340763061231815e-06, | |
| "loss": 0.094, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 1.4456869009584665, | |
| "grad_norm": 0.8423324417742822, | |
| "learning_rate": 4.3249646712831525e-06, | |
| "loss": 0.0934, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.4496805111821085, | |
| "grad_norm": 0.760524105440743, | |
| "learning_rate": 4.309148464074293e-06, | |
| "loss": 0.0929, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 1.4536741214057507, | |
| "grad_norm": 0.8316187906255309, | |
| "learning_rate": 4.293314781197414e-06, | |
| "loss": 0.0933, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.457667731629393, | |
| "grad_norm": 0.8371905859373513, | |
| "learning_rate": 4.277463964622125e-06, | |
| "loss": 0.0887, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 1.461661341853035, | |
| "grad_norm": 0.8364714730137837, | |
| "learning_rate": 4.261596356688086e-06, | |
| "loss": 0.0962, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.4656549520766773, | |
| "grad_norm": 0.8141222333202962, | |
| "learning_rate": 4.245712300097606e-06, | |
| "loss": 0.0925, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 1.4696485623003195, | |
| "grad_norm": 0.8478554496354959, | |
| "learning_rate": 4.229812137908251e-06, | |
| "loss": 0.0904, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.4736421725239617, | |
| "grad_norm": 0.7540954927526371, | |
| "learning_rate": 4.213896213525421e-06, | |
| "loss": 0.0892, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 1.4776357827476039, | |
| "grad_norm": 0.8171840914315421, | |
| "learning_rate": 4.197964870694949e-06, | |
| "loss": 0.0959, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.481629392971246, | |
| "grad_norm": 0.8254400596397985, | |
| "learning_rate": 4.182018453495667e-06, | |
| "loss": 0.0938, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.4856230031948883, | |
| "grad_norm": 0.8125736388997561, | |
| "learning_rate": 4.166057306331977e-06, | |
| "loss": 0.0909, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.4896166134185305, | |
| "grad_norm": 0.7640120620441644, | |
| "learning_rate": 4.150081773926414e-06, | |
| "loss": 0.0901, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 1.4936102236421724, | |
| "grad_norm": 0.799653826309561, | |
| "learning_rate": 4.134092201312196e-06, | |
| "loss": 0.0922, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.4976038338658146, | |
| "grad_norm": 0.8128276034935282, | |
| "learning_rate": 4.118088933825783e-06, | |
| "loss": 0.093, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.5015974440894568, | |
| "grad_norm": 0.8834498526452107, | |
| "learning_rate": 4.102072317099404e-06, | |
| "loss": 0.0953, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.505591054313099, | |
| "grad_norm": 0.8866139042397759, | |
| "learning_rate": 4.086042697053604e-06, | |
| "loss": 0.0959, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 1.5095846645367412, | |
| "grad_norm": 0.9024588302568654, | |
| "learning_rate": 4.0700004198897665e-06, | |
| "loss": 0.0929, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.5135782747603834, | |
| "grad_norm": 0.7631982111355976, | |
| "learning_rate": 4.053945832082642e-06, | |
| "loss": 0.0907, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 1.5175718849840254, | |
| "grad_norm": 0.752187865320078, | |
| "learning_rate": 4.037879280372855e-06, | |
| "loss": 0.0922, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.5215654952076676, | |
| "grad_norm": 0.8037283021019334, | |
| "learning_rate": 4.0218011117594265e-06, | |
| "loss": 0.0887, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 1.5255591054313098, | |
| "grad_norm": 0.7822698417920745, | |
| "learning_rate": 4.005711673492274e-06, | |
| "loss": 0.0874, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.529552715654952, | |
| "grad_norm": 0.8037410692104164, | |
| "learning_rate": 3.989611313064714e-06, | |
| "loss": 0.0909, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 1.5335463258785942, | |
| "grad_norm": 0.8877519379788956, | |
| "learning_rate": 3.97350037820595e-06, | |
| "loss": 0.0945, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.5375399361022364, | |
| "grad_norm": 0.7148728619247341, | |
| "learning_rate": 3.9573792168735735e-06, | |
| "loss": 0.091, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 1.5415335463258786, | |
| "grad_norm": 0.8136672941763351, | |
| "learning_rate": 3.9412481772460404e-06, | |
| "loss": 0.0917, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.5455271565495208, | |
| "grad_norm": 0.7546989024401581, | |
| "learning_rate": 3.925107607715156e-06, | |
| "loss": 0.0918, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 1.549520766773163, | |
| "grad_norm": 0.7557424472213525, | |
| "learning_rate": 3.908957856878548e-06, | |
| "loss": 0.0909, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.5535143769968052, | |
| "grad_norm": 0.8400496651270898, | |
| "learning_rate": 3.892799273532136e-06, | |
| "loss": 0.0951, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 1.5575079872204474, | |
| "grad_norm": 0.8463913720969456, | |
| "learning_rate": 3.876632206662605e-06, | |
| "loss": 0.0933, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.5615015974440896, | |
| "grad_norm": 0.8750388641162147, | |
| "learning_rate": 3.860457005439858e-06, | |
| "loss": 0.0936, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 1.5654952076677318, | |
| "grad_norm": 0.8208298877616164, | |
| "learning_rate": 3.844274019209488e-06, | |
| "loss": 0.0942, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.569488817891374, | |
| "grad_norm": 0.8289575426887811, | |
| "learning_rate": 3.828083597485219e-06, | |
| "loss": 0.0942, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 1.573482428115016, | |
| "grad_norm": 0.7742099605330947, | |
| "learning_rate": 3.811886089941368e-06, | |
| "loss": 0.0917, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.5774760383386581, | |
| "grad_norm": 0.851766269863927, | |
| "learning_rate": 3.7956818464052853e-06, | |
| "loss": 0.0922, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 1.5814696485623003, | |
| "grad_norm": 0.8425829506887897, | |
| "learning_rate": 3.779471216849804e-06, | |
| "loss": 0.0957, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.5854632587859425, | |
| "grad_norm": 0.8510499144275525, | |
| "learning_rate": 3.7632545513856812e-06, | |
| "loss": 0.0938, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 1.5894568690095847, | |
| "grad_norm": 0.7793249157088529, | |
| "learning_rate": 3.747032200254035e-06, | |
| "loss": 0.087, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.5934504792332267, | |
| "grad_norm": 0.7890445255133955, | |
| "learning_rate": 3.7308045138187764e-06, | |
| "loss": 0.0914, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 1.5974440894568689, | |
| "grad_norm": 0.8507435937119314, | |
| "learning_rate": 3.714571842559053e-06, | |
| "loss": 0.0948, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.5974440894568689, | |
| "eval_loss": 0.135970339179039, | |
| "eval_runtime": 388.3901, | |
| "eval_samples_per_second": 45.848, | |
| "eval_steps_per_second": 5.731, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.601437699680511, | |
| "grad_norm": 0.777341324033066, | |
| "learning_rate": 3.6983345370616666e-06, | |
| "loss": 0.0901, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 1.6054313099041533, | |
| "grad_norm": 0.8135499478146813, | |
| "learning_rate": 3.6820929480135086e-06, | |
| "loss": 0.093, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.6094249201277955, | |
| "grad_norm": 0.8719757100439282, | |
| "learning_rate": 3.6658474261939888e-06, | |
| "loss": 0.0961, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 1.6134185303514377, | |
| "grad_norm": 0.8303170119187012, | |
| "learning_rate": 3.6495983224674493e-06, | |
| "loss": 0.0964, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.6174121405750799, | |
| "grad_norm": 0.784594767811727, | |
| "learning_rate": 3.633345987775599e-06, | |
| "loss": 0.0954, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 1.621405750798722, | |
| "grad_norm": 0.7991817936852438, | |
| "learning_rate": 3.617090773129923e-06, | |
| "loss": 0.0899, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.6253993610223643, | |
| "grad_norm": 0.8182423808938526, | |
| "learning_rate": 3.6008330296041096e-06, | |
| "loss": 0.0883, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 1.6293929712460065, | |
| "grad_norm": 0.7858919236328515, | |
| "learning_rate": 3.584573108326463e-06, | |
| "loss": 0.0897, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.6333865814696487, | |
| "grad_norm": 0.8156791778123983, | |
| "learning_rate": 3.5683113604723233e-06, | |
| "loss": 0.0897, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 1.6373801916932909, | |
| "grad_norm": 0.8881831052271689, | |
| "learning_rate": 3.552048137256478e-06, | |
| "loss": 0.0935, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.641373801916933, | |
| "grad_norm": 0.8016008616346979, | |
| "learning_rate": 3.53578378992558e-06, | |
| "loss": 0.0917, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 1.645367412140575, | |
| "grad_norm": 0.8557576853411646, | |
| "learning_rate": 3.5195186697505616e-06, | |
| "loss": 0.0891, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.6493610223642172, | |
| "grad_norm": 0.8028621053717053, | |
| "learning_rate": 3.503253128019046e-06, | |
| "loss": 0.0895, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 1.6533546325878594, | |
| "grad_norm": 0.8130307630678764, | |
| "learning_rate": 3.48698751602776e-06, | |
| "loss": 0.0868, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.6573482428115016, | |
| "grad_norm": 0.7589933957894843, | |
| "learning_rate": 3.470722185074948e-06, | |
| "loss": 0.0836, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 1.6613418530351438, | |
| "grad_norm": 0.833019057283452, | |
| "learning_rate": 3.454457486452786e-06, | |
| "loss": 0.0853, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.6653354632587858, | |
| "grad_norm": 0.8485088182697132, | |
| "learning_rate": 3.4381937714397917e-06, | |
| "loss": 0.0922, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 1.669329073482428, | |
| "grad_norm": 0.8184010129649519, | |
| "learning_rate": 3.42193139129324e-06, | |
| "loss": 0.0901, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.6733226837060702, | |
| "grad_norm": 0.8164910293916446, | |
| "learning_rate": 3.4056706972415752e-06, | |
| "loss": 0.0902, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 1.6773162939297124, | |
| "grad_norm": 0.7799450306868917, | |
| "learning_rate": 3.3894120404768293e-06, | |
| "loss": 0.0879, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.6813099041533546, | |
| "grad_norm": 0.8213948220491234, | |
| "learning_rate": 3.373155772147028e-06, | |
| "loss": 0.0866, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 1.6853035143769968, | |
| "grad_norm": 0.8140459087876813, | |
| "learning_rate": 3.3569022433486152e-06, | |
| "loss": 0.0905, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.689297124600639, | |
| "grad_norm": 0.8090512899368958, | |
| "learning_rate": 3.3406518051188697e-06, | |
| "loss": 0.093, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 1.6932907348242812, | |
| "grad_norm": 0.786423302788487, | |
| "learning_rate": 3.3244048084283183e-06, | |
| "loss": 0.0886, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.6972843450479234, | |
| "grad_norm": 0.8010633066357388, | |
| "learning_rate": 3.3081616041731606e-06, | |
| "loss": 0.0871, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 1.7012779552715656, | |
| "grad_norm": 0.8562989387605436, | |
| "learning_rate": 3.2919225431676874e-06, | |
| "loss": 0.0909, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.7052715654952078, | |
| "grad_norm": 0.8137206318229443, | |
| "learning_rate": 3.2756879761367076e-06, | |
| "loss": 0.0877, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 1.70926517571885, | |
| "grad_norm": 0.8327531430876184, | |
| "learning_rate": 3.259458253707968e-06, | |
| "loss": 0.0893, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.7132587859424921, | |
| "grad_norm": 0.819012210220874, | |
| "learning_rate": 3.2432337264045843e-06, | |
| "loss": 0.0914, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 1.7172523961661343, | |
| "grad_norm": 0.8495772189306744, | |
| "learning_rate": 3.2270147446374722e-06, | |
| "loss": 0.092, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.7212460063897763, | |
| "grad_norm": 0.8018376512404187, | |
| "learning_rate": 3.210801658697774e-06, | |
| "loss": 0.0881, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 1.7252396166134185, | |
| "grad_norm": 0.7827860574369773, | |
| "learning_rate": 3.1945948187492992e-06, | |
| "loss": 0.0893, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.7292332268370607, | |
| "grad_norm": 0.7731065648493003, | |
| "learning_rate": 3.1783945748209557e-06, | |
| "loss": 0.0854, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 1.733226837060703, | |
| "grad_norm": 0.8628787752311644, | |
| "learning_rate": 3.1622012767991967e-06, | |
| "loss": 0.0946, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.7372204472843449, | |
| "grad_norm": 0.7757583751727869, | |
| "learning_rate": 3.1460152744204584e-06, | |
| "loss": 0.0898, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 1.741214057507987, | |
| "grad_norm": 0.773307933535615, | |
| "learning_rate": 3.129836917263607e-06, | |
| "loss": 0.0833, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.7452076677316293, | |
| "grad_norm": 0.8402186654533444, | |
| "learning_rate": 3.113666554742394e-06, | |
| "loss": 0.0885, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 1.7492012779552715, | |
| "grad_norm": 0.8286606926069269, | |
| "learning_rate": 3.0975045360979025e-06, | |
| "loss": 0.0902, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.7531948881789137, | |
| "grad_norm": 0.8317794334903522, | |
| "learning_rate": 3.0813512103910113e-06, | |
| "loss": 0.0882, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 1.7571884984025559, | |
| "grad_norm": 0.7846450982359997, | |
| "learning_rate": 3.065206926494848e-06, | |
| "loss": 0.0888, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.761182108626198, | |
| "grad_norm": 0.8623658744163902, | |
| "learning_rate": 3.049072033087264e-06, | |
| "loss": 0.0899, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 1.7651757188498403, | |
| "grad_norm": 0.7914568138762265, | |
| "learning_rate": 3.032946878643295e-06, | |
| "loss": 0.0906, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.7691693290734825, | |
| "grad_norm": 0.8007387802489633, | |
| "learning_rate": 3.0168318114276377e-06, | |
| "loss": 0.0916, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 1.7731629392971247, | |
| "grad_norm": 0.776240184386906, | |
| "learning_rate": 3.0007271794871337e-06, | |
| "loss": 0.0888, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.7771565495207668, | |
| "grad_norm": 0.7537760505089393, | |
| "learning_rate": 2.9846333306432412e-06, | |
| "loss": 0.0853, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 1.781150159744409, | |
| "grad_norm": 0.8216268561818008, | |
| "learning_rate": 2.968550612484536e-06, | |
| "loss": 0.0905, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.7851437699680512, | |
| "grad_norm": 0.8623008566979109, | |
| "learning_rate": 2.9524793723591913e-06, | |
| "loss": 0.0884, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 1.7891373801916934, | |
| "grad_norm": 0.8570248441650711, | |
| "learning_rate": 2.936419957367489e-06, | |
| "loss": 0.0955, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.7931309904153354, | |
| "grad_norm": 0.8090425637420421, | |
| "learning_rate": 2.92037271435431e-06, | |
| "loss": 0.0929, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 1.7971246006389776, | |
| "grad_norm": 0.8662160177063208, | |
| "learning_rate": 2.904337989901653e-06, | |
| "loss": 0.0872, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.8011182108626198, | |
| "grad_norm": 0.789286926224427, | |
| "learning_rate": 2.888316130321146e-06, | |
| "loss": 0.0889, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 1.805111821086262, | |
| "grad_norm": 0.8199288743264955, | |
| "learning_rate": 2.872307481646565e-06, | |
| "loss": 0.0918, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.8091054313099042, | |
| "grad_norm": 0.8451894181207451, | |
| "learning_rate": 2.856312389626365e-06, | |
| "loss": 0.0942, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 1.8130990415335462, | |
| "grad_norm": 0.831433941954139, | |
| "learning_rate": 2.840331199716205e-06, | |
| "loss": 0.0873, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.8170926517571884, | |
| "grad_norm": 0.7904106905610935, | |
| "learning_rate": 2.8243642570714958e-06, | |
| "loss": 0.0865, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 1.8210862619808306, | |
| "grad_norm": 0.8007696286288565, | |
| "learning_rate": 2.808411906539941e-06, | |
| "loss": 0.0908, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.8250798722044728, | |
| "grad_norm": 0.7398549977807148, | |
| "learning_rate": 2.7924744926540866e-06, | |
| "loss": 0.0828, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 1.829073482428115, | |
| "grad_norm": 0.8051951554405458, | |
| "learning_rate": 2.7765523596238862e-06, | |
| "loss": 0.0904, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.8330670926517572, | |
| "grad_norm": 0.8101567895702511, | |
| "learning_rate": 2.7606458513292615e-06, | |
| "loss": 0.0877, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 1.8370607028753994, | |
| "grad_norm": 0.810684787255368, | |
| "learning_rate": 2.744755311312679e-06, | |
| "loss": 0.0878, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.8410543130990416, | |
| "grad_norm": 0.836337456473946, | |
| "learning_rate": 2.7288810827717262e-06, | |
| "loss": 0.0882, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 1.8450479233226837, | |
| "grad_norm": 0.7870319338949235, | |
| "learning_rate": 2.713023508551704e-06, | |
| "loss": 0.0855, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.849041533546326, | |
| "grad_norm": 0.8711250048723298, | |
| "learning_rate": 2.6971829311382195e-06, | |
| "loss": 0.0903, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 1.8530351437699681, | |
| "grad_norm": 0.7898917076439477, | |
| "learning_rate": 2.6813596926497892e-06, | |
| "loss": 0.09, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.8570287539936103, | |
| "grad_norm": 0.8447633678009813, | |
| "learning_rate": 2.66555413483045e-06, | |
| "loss": 0.092, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 1.8610223642172525, | |
| "grad_norm": 0.8503814184125837, | |
| "learning_rate": 2.649766599042378e-06, | |
| "loss": 0.0855, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.8650159744408947, | |
| "grad_norm": 0.7976736872294655, | |
| "learning_rate": 2.6339974262585194e-06, | |
| "loss": 0.089, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 1.8690095846645367, | |
| "grad_norm": 0.7883072515843511, | |
| "learning_rate": 2.61824695705522e-06, | |
| "loss": 0.0879, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.873003194888179, | |
| "grad_norm": 0.7546676570625951, | |
| "learning_rate": 2.602515531604877e-06, | |
| "loss": 0.086, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 1.876996805111821, | |
| "grad_norm": 0.8281853119034305, | |
| "learning_rate": 2.586803489668584e-06, | |
| "loss": 0.0869, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.8809904153354633, | |
| "grad_norm": 0.8108187343259362, | |
| "learning_rate": 2.5711111705888005e-06, | |
| "loss": 0.0903, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 1.8849840255591053, | |
| "grad_norm": 0.7924520966861539, | |
| "learning_rate": 2.555438913282018e-06, | |
| "loss": 0.0924, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.8889776357827475, | |
| "grad_norm": 0.7259687130542957, | |
| "learning_rate": 2.5397870562314398e-06, | |
| "loss": 0.0852, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 1.8929712460063897, | |
| "grad_norm": 0.8020364993306531, | |
| "learning_rate": 2.5241559374796786e-06, | |
| "loss": 0.085, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.8969648562300319, | |
| "grad_norm": 0.8025355343637186, | |
| "learning_rate": 2.5085458946214414e-06, | |
| "loss": 0.082, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 1.900958466453674, | |
| "grad_norm": 0.8669003546070674, | |
| "learning_rate": 2.4929572647962543e-06, | |
| "loss": 0.0889, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.9049520766773163, | |
| "grad_norm": 0.8325372820796868, | |
| "learning_rate": 2.477390384681166e-06, | |
| "loss": 0.0863, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 1.9089456869009584, | |
| "grad_norm": 0.8158237511694976, | |
| "learning_rate": 2.4618455904834918e-06, | |
| "loss": 0.0821, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.9129392971246006, | |
| "grad_norm": 0.7873429759802417, | |
| "learning_rate": 2.446323217933536e-06, | |
| "loss": 0.0831, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 1.9169329073482428, | |
| "grad_norm": 0.8243924857344042, | |
| "learning_rate": 2.4308236022773507e-06, | |
| "loss": 0.0882, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.920926517571885, | |
| "grad_norm": 0.8133568397024189, | |
| "learning_rate": 2.415347078269501e-06, | |
| "loss": 0.0906, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 1.9249201277955272, | |
| "grad_norm": 0.8850847431900986, | |
| "learning_rate": 2.3998939801658167e-06, | |
| "loss": 0.0866, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.9289137380191694, | |
| "grad_norm": 0.8265464069503446, | |
| "learning_rate": 2.3844646417161936e-06, | |
| "loss": 0.0893, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 1.9329073482428116, | |
| "grad_norm": 0.7667422944606539, | |
| "learning_rate": 2.3690593961573685e-06, | |
| "loss": 0.0817, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.9369009584664538, | |
| "grad_norm": 0.7912825165955961, | |
| "learning_rate": 2.353678576205736e-06, | |
| "loss": 0.0892, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 1.9408945686900958, | |
| "grad_norm": 0.8354304573744004, | |
| "learning_rate": 2.3383225140501514e-06, | |
| "loss": 0.0887, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.944888178913738, | |
| "grad_norm": 0.8061340310847699, | |
| "learning_rate": 2.3229915413447616e-06, | |
| "loss": 0.0852, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 1.9488817891373802, | |
| "grad_norm": 0.8450228338142728, | |
| "learning_rate": 2.3076859892018444e-06, | |
| "loss": 0.0856, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.9528753993610224, | |
| "grad_norm": 0.8337145461940115, | |
| "learning_rate": 2.2924061881846505e-06, | |
| "loss": 0.0887, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 1.9568690095846646, | |
| "grad_norm": 0.803788343960733, | |
| "learning_rate": 2.277152468300272e-06, | |
| "loss": 0.0847, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.9608626198083066, | |
| "grad_norm": 0.8037804153473037, | |
| "learning_rate": 2.2619251589925065e-06, | |
| "loss": 0.0862, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 1.9648562300319488, | |
| "grad_norm": 0.7921163159021665, | |
| "learning_rate": 2.2467245891347513e-06, | |
| "loss": 0.0867, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.968849840255591, | |
| "grad_norm": 0.8492215680100746, | |
| "learning_rate": 2.2315510870228916e-06, | |
| "loss": 0.0923, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 1.9728434504792332, | |
| "grad_norm": 0.8033174401260257, | |
| "learning_rate": 2.2164049803682157e-06, | |
| "loss": 0.086, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.9768370607028753, | |
| "grad_norm": 0.8216004654984708, | |
| "learning_rate": 2.2012865962903367e-06, | |
| "loss": 0.0858, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 1.9808306709265175, | |
| "grad_norm": 0.7388468150996138, | |
| "learning_rate": 2.1861962613101236e-06, | |
| "loss": 0.0858, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.9848242811501597, | |
| "grad_norm": 0.7699033555586355, | |
| "learning_rate": 2.1711343013426562e-06, | |
| "loss": 0.0843, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 1.988817891373802, | |
| "grad_norm": 0.8060166372469154, | |
| "learning_rate": 2.156101041690178e-06, | |
| "loss": 0.0861, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.9928115015974441, | |
| "grad_norm": 0.7929032945553157, | |
| "learning_rate": 2.1410968070350774e-06, | |
| "loss": 0.0831, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 1.9968051118210863, | |
| "grad_norm": 0.7490472268146842, | |
| "learning_rate": 2.1261219214328716e-06, | |
| "loss": 0.0844, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.9968051118210863, | |
| "eval_loss": 0.13146671652793884, | |
| "eval_runtime": 388.8053, | |
| "eval_samples_per_second": 45.799, | |
| "eval_steps_per_second": 5.725, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.0007987220447285, | |
| "grad_norm": 0.5590440556683411, | |
| "learning_rate": 2.111176708305206e-06, | |
| "loss": 0.0823, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 2.0047923322683707, | |
| "grad_norm": 0.5779473730088169, | |
| "learning_rate": 2.096261490432876e-06, | |
| "loss": 0.0463, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.008785942492013, | |
| "grad_norm": 0.5866488405628578, | |
| "learning_rate": 2.0813765899488486e-06, | |
| "loss": 0.0451, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 2.012779552715655, | |
| "grad_norm": 0.6677470451681199, | |
| "learning_rate": 2.066522328331308e-06, | |
| "loss": 0.0446, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.0167731629392973, | |
| "grad_norm": 0.6989167945663965, | |
| "learning_rate": 2.051699026396713e-06, | |
| "loss": 0.0434, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 2.0207667731629395, | |
| "grad_norm": 0.7661180883841344, | |
| "learning_rate": 2.036907004292868e-06, | |
| "loss": 0.0432, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.0247603833865813, | |
| "grad_norm": 0.8024449864690092, | |
| "learning_rate": 2.0221465814920067e-06, | |
| "loss": 0.0424, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 2.0287539936102235, | |
| "grad_norm": 0.7501816123790741, | |
| "learning_rate": 2.007418076783894e-06, | |
| "loss": 0.0439, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.0327476038338657, | |
| "grad_norm": 0.6877745159716515, | |
| "learning_rate": 1.99272180826894e-06, | |
| "loss": 0.04, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 2.036741214057508, | |
| "grad_norm": 0.6608397420428562, | |
| "learning_rate": 1.9780580933513303e-06, | |
| "loss": 0.0422, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.04073482428115, | |
| "grad_norm": 0.6603272146294105, | |
| "learning_rate": 1.9634272487321734e-06, | |
| "loss": 0.043, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 2.0447284345047922, | |
| "grad_norm": 0.7257523609250439, | |
| "learning_rate": 1.9488295904026556e-06, | |
| "loss": 0.0439, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.0487220447284344, | |
| "grad_norm": 0.6619441686023483, | |
| "learning_rate": 1.9342654336372184e-06, | |
| "loss": 0.041, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 2.0527156549520766, | |
| "grad_norm": 0.775856056665606, | |
| "learning_rate": 1.9197350929867496e-06, | |
| "loss": 0.0422, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.056709265175719, | |
| "grad_norm": 0.71946656984334, | |
| "learning_rate": 1.905238882271794e-06, | |
| "loss": 0.0403, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 2.060702875399361, | |
| "grad_norm": 0.8286391951732616, | |
| "learning_rate": 1.8907771145757672e-06, | |
| "loss": 0.0453, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.0646964856230032, | |
| "grad_norm": 0.6701516033428961, | |
| "learning_rate": 1.8763501022381988e-06, | |
| "loss": 0.0432, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 2.0686900958466454, | |
| "grad_norm": 0.6951719440989161, | |
| "learning_rate": 1.861958156847988e-06, | |
| "loss": 0.0401, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.0726837060702876, | |
| "grad_norm": 0.7071648587283893, | |
| "learning_rate": 1.8476015892366678e-06, | |
| "loss": 0.0422, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 2.07667731629393, | |
| "grad_norm": 0.7088715440583857, | |
| "learning_rate": 1.8332807094717024e-06, | |
| "loss": 0.0427, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.080670926517572, | |
| "grad_norm": 0.6796726953293041, | |
| "learning_rate": 1.8189958268497788e-06, | |
| "loss": 0.0398, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 2.084664536741214, | |
| "grad_norm": 0.7275520062937834, | |
| "learning_rate": 1.8047472498901338e-06, | |
| "loss": 0.0437, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.0886581469648564, | |
| "grad_norm": 0.6983089859377336, | |
| "learning_rate": 1.7905352863278885e-06, | |
| "loss": 0.0423, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 2.0926517571884986, | |
| "grad_norm": 0.749372173179819, | |
| "learning_rate": 1.776360243107401e-06, | |
| "loss": 0.0426, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.0966453674121404, | |
| "grad_norm": 0.6497883844703632, | |
| "learning_rate": 1.7622224263756437e-06, | |
| "loss": 0.0414, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 2.1006389776357826, | |
| "grad_norm": 0.6784997898192929, | |
| "learning_rate": 1.7481221414755777e-06, | |
| "loss": 0.0424, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.1046325878594248, | |
| "grad_norm": 0.7187277256802926, | |
| "learning_rate": 1.7340596929395748e-06, | |
| "loss": 0.0433, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 2.108626198083067, | |
| "grad_norm": 0.7412210732229237, | |
| "learning_rate": 1.7200353844828254e-06, | |
| "loss": 0.0419, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.112619808306709, | |
| "grad_norm": 0.6639717542375545, | |
| "learning_rate": 1.706049518996793e-06, | |
| "loss": 0.0403, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 2.1166134185303513, | |
| "grad_norm": 0.8077179697601087, | |
| "learning_rate": 1.6921023985426556e-06, | |
| "loss": 0.0449, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.1206070287539935, | |
| "grad_norm": 0.7020946900796683, | |
| "learning_rate": 1.6781943243447969e-06, | |
| "loss": 0.043, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 2.1246006389776357, | |
| "grad_norm": 0.7157899672728614, | |
| "learning_rate": 1.664325596784297e-06, | |
| "loss": 0.0416, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.128594249201278, | |
| "grad_norm": 0.6802103945208559, | |
| "learning_rate": 1.650496515392439e-06, | |
| "loss": 0.0399, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 2.13258785942492, | |
| "grad_norm": 0.6278491083549733, | |
| "learning_rate": 1.6367073788442447e-06, | |
| "loss": 0.037, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.1365814696485623, | |
| "grad_norm": 0.7345746287621161, | |
| "learning_rate": 1.6229584849520226e-06, | |
| "loss": 0.0407, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 2.1405750798722045, | |
| "grad_norm": 0.7382168583442479, | |
| "learning_rate": 1.6092501306589388e-06, | |
| "loss": 0.0421, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.1445686900958467, | |
| "grad_norm": 1.0670796847742963, | |
| "learning_rate": 1.5955826120325985e-06, | |
| "loss": 0.0418, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 2.148562300319489, | |
| "grad_norm": 0.748357154888744, | |
| "learning_rate": 1.5819562242586558e-06, | |
| "loss": 0.042, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.152555910543131, | |
| "grad_norm": 0.6924925720927515, | |
| "learning_rate": 1.5683712616344354e-06, | |
| "loss": 0.0413, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 2.1565495207667733, | |
| "grad_norm": 0.7321556671315872, | |
| "learning_rate": 1.5548280175625781e-06, | |
| "loss": 0.0427, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.1605431309904155, | |
| "grad_norm": 0.7345504259937425, | |
| "learning_rate": 1.5413267845447073e-06, | |
| "loss": 0.0407, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 2.1645367412140577, | |
| "grad_norm": 0.7275376645175038, | |
| "learning_rate": 1.527867854175105e-06, | |
| "loss": 0.0404, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.1685303514377, | |
| "grad_norm": 0.7841865628062479, | |
| "learning_rate": 1.5144515171344174e-06, | |
| "loss": 0.0422, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 2.1725239616613417, | |
| "grad_norm": 0.7399131108932046, | |
| "learning_rate": 1.5010780631833782e-06, | |
| "loss": 0.0396, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.176517571884984, | |
| "grad_norm": 0.7364100768910417, | |
| "learning_rate": 1.4877477811565473e-06, | |
| "loss": 0.042, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 2.180511182108626, | |
| "grad_norm": 0.7009848783954564, | |
| "learning_rate": 1.474460958956078e-06, | |
| "loss": 0.0427, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.1845047923322682, | |
| "grad_norm": 0.6692563631050057, | |
| "learning_rate": 1.461217883545492e-06, | |
| "loss": 0.0386, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 2.1884984025559104, | |
| "grad_norm": 0.7198493387599846, | |
| "learning_rate": 1.4480188409434866e-06, | |
| "loss": 0.0405, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.1924920127795526, | |
| "grad_norm": 0.7159870701360656, | |
| "learning_rate": 1.4348641162177543e-06, | |
| "loss": 0.0421, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 2.196485623003195, | |
| "grad_norm": 0.6323328853392578, | |
| "learning_rate": 1.4217539934788313e-06, | |
| "loss": 0.0379, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.200479233226837, | |
| "grad_norm": 0.716152180482688, | |
| "learning_rate": 1.4086887558739545e-06, | |
| "loss": 0.0399, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 2.2044728434504792, | |
| "grad_norm": 0.7209751580895504, | |
| "learning_rate": 1.395668685580951e-06, | |
| "loss": 0.0399, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.2084664536741214, | |
| "grad_norm": 0.6976023824417826, | |
| "learning_rate": 1.3826940638021407e-06, | |
| "loss": 0.0405, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 2.2124600638977636, | |
| "grad_norm": 0.676003189556891, | |
| "learning_rate": 1.369765170758265e-06, | |
| "loss": 0.0405, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.216453674121406, | |
| "grad_norm": 0.6769501596871886, | |
| "learning_rate": 1.3568822856824374e-06, | |
| "loss": 0.04, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 2.220447284345048, | |
| "grad_norm": 0.757978868906908, | |
| "learning_rate": 1.3440456868141044e-06, | |
| "loss": 0.0427, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.22444089456869, | |
| "grad_norm": 0.7268819944292451, | |
| "learning_rate": 1.3312556513930447e-06, | |
| "loss": 0.0421, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 2.2284345047923324, | |
| "grad_norm": 0.7714098456385522, | |
| "learning_rate": 1.3185124556533755e-06, | |
| "loss": 0.0417, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.2324281150159746, | |
| "grad_norm": 0.7430135294254091, | |
| "learning_rate": 1.3058163748175932e-06, | |
| "loss": 0.0389, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 2.236421725239617, | |
| "grad_norm": 0.7214319576784142, | |
| "learning_rate": 1.2931676830906231e-06, | |
| "loss": 0.0386, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.2404153354632586, | |
| "grad_norm": 0.7127065991787688, | |
| "learning_rate": 1.280566653653895e-06, | |
| "loss": 0.0378, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 2.244408945686901, | |
| "grad_norm": 0.6839690662193432, | |
| "learning_rate": 1.2680135586594546e-06, | |
| "loss": 0.0406, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.248402555910543, | |
| "grad_norm": 0.7523843789217508, | |
| "learning_rate": 1.2555086692240736e-06, | |
| "loss": 0.0391, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 2.252396166134185, | |
| "grad_norm": 0.6885045467953661, | |
| "learning_rate": 1.2430522554234037e-06, | |
| "loss": 0.0416, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.2563897763578273, | |
| "grad_norm": 0.702918819197609, | |
| "learning_rate": 1.2306445862861316e-06, | |
| "loss": 0.0403, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 2.2603833865814695, | |
| "grad_norm": 0.7187953136824686, | |
| "learning_rate": 1.2182859297881844e-06, | |
| "loss": 0.0387, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.2643769968051117, | |
| "grad_norm": 0.7261350898101718, | |
| "learning_rate": 1.2059765528469286e-06, | |
| "loss": 0.0386, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 2.268370607028754, | |
| "grad_norm": 0.8144574975767674, | |
| "learning_rate": 1.193716721315411e-06, | |
| "loss": 0.0407, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.272364217252396, | |
| "grad_norm": 0.719536955678268, | |
| "learning_rate": 1.1815066999766174e-06, | |
| "loss": 0.0405, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 2.2763578274760383, | |
| "grad_norm": 0.7645179459629033, | |
| "learning_rate": 1.1693467525377513e-06, | |
| "loss": 0.0405, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.2803514376996805, | |
| "grad_norm": 0.7168125333161555, | |
| "learning_rate": 1.1572371416245424e-06, | |
| "loss": 0.041, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 2.2843450479233227, | |
| "grad_norm": 0.6809214841238408, | |
| "learning_rate": 1.14517812877557e-06, | |
| "loss": 0.0372, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.288338658146965, | |
| "grad_norm": 0.7113403532991136, | |
| "learning_rate": 1.1331699744366168e-06, | |
| "loss": 0.0419, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 2.292332268370607, | |
| "grad_norm": 0.7363573736512515, | |
| "learning_rate": 1.1212129379550432e-06, | |
| "loss": 0.0401, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.2963258785942493, | |
| "grad_norm": 0.7386118577478687, | |
| "learning_rate": 1.109307277574187e-06, | |
| "loss": 0.0413, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 2.3003194888178915, | |
| "grad_norm": 0.727683344048625, | |
| "learning_rate": 1.0974532504277863e-06, | |
| "loss": 0.042, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.3043130990415337, | |
| "grad_norm": 0.7386228312374142, | |
| "learning_rate": 1.0856511125344233e-06, | |
| "loss": 0.0407, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 2.308306709265176, | |
| "grad_norm": 0.7140159548965673, | |
| "learning_rate": 1.0739011187919976e-06, | |
| "loss": 0.0408, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.312300319488818, | |
| "grad_norm": 0.7497153823378125, | |
| "learning_rate": 1.0622035229722183e-06, | |
| "loss": 0.0418, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 2.31629392971246, | |
| "grad_norm": 0.7001040487899245, | |
| "learning_rate": 1.0505585777151287e-06, | |
| "loss": 0.042, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.3202875399361025, | |
| "grad_norm": 0.6894355300080932, | |
| "learning_rate": 1.038966534523643e-06, | |
| "loss": 0.0392, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 2.3242811501597442, | |
| "grad_norm": 0.6990435544277329, | |
| "learning_rate": 1.0274276437581183e-06, | |
| "loss": 0.0381, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.3282747603833864, | |
| "grad_norm": 0.6909814467001467, | |
| "learning_rate": 1.0159421546309467e-06, | |
| "loss": 0.0406, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 2.3322683706070286, | |
| "grad_norm": 0.7109244643358452, | |
| "learning_rate": 1.004510315201172e-06, | |
| "loss": 0.041, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.336261980830671, | |
| "grad_norm": 0.6878258243345245, | |
| "learning_rate": 9.931323723691355e-07, | |
| "loss": 0.0408, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 2.340255591054313, | |
| "grad_norm": 0.6756279917857075, | |
| "learning_rate": 9.81808571871139e-07, | |
| "loss": 0.0402, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.344249201277955, | |
| "grad_norm": 0.749467435614857, | |
| "learning_rate": 9.70539158274141e-07, | |
| "loss": 0.041, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 2.3482428115015974, | |
| "grad_norm": 0.752336563070383, | |
| "learning_rate": 9.593243749704702e-07, | |
| "loss": 0.0421, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.3522364217252396, | |
| "grad_norm": 0.7364168174436757, | |
| "learning_rate": 9.481644641725768e-07, | |
| "loss": 0.0413, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 2.356230031948882, | |
| "grad_norm": 0.7454044222548492, | |
| "learning_rate": 9.370596669077937e-07, | |
| "loss": 0.0401, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.360223642172524, | |
| "grad_norm": 0.6962574991902063, | |
| "learning_rate": 9.260102230131306e-07, | |
| "loss": 0.039, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 2.364217252396166, | |
| "grad_norm": 0.7110768770465447, | |
| "learning_rate": 9.150163711301031e-07, | |
| "loss": 0.0374, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.3682108626198084, | |
| "grad_norm": 0.7757468444639659, | |
| "learning_rate": 9.040783486995669e-07, | |
| "loss": 0.0386, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 2.3722044728434506, | |
| "grad_norm": 0.6850534935139829, | |
| "learning_rate": 8.931963919566015e-07, | |
| "loss": 0.0391, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.376198083067093, | |
| "grad_norm": 0.7275846393994331, | |
| "learning_rate": 8.823707359253941e-07, | |
| "loss": 0.0404, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 2.380191693290735, | |
| "grad_norm": 0.725401060190788, | |
| "learning_rate": 8.716016144141784e-07, | |
| "loss": 0.0392, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.384185303514377, | |
| "grad_norm": 0.719311654275402, | |
| "learning_rate": 8.608892600101741e-07, | |
| "loss": 0.0408, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 2.3881789137380194, | |
| "grad_norm": 0.6867331873092225, | |
| "learning_rate": 8.502339040745675e-07, | |
| "loss": 0.0385, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.392172523961661, | |
| "grad_norm": 0.7395697453495743, | |
| "learning_rate": 8.396357767375185e-07, | |
| "loss": 0.041, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 2.3961661341853033, | |
| "grad_norm": 0.7118596021755634, | |
| "learning_rate": 8.290951068931795e-07, | |
| "loss": 0.0418, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.3961661341853033, | |
| "eval_loss": 0.16109129786491394, | |
| "eval_runtime": 388.1385, | |
| "eval_samples_per_second": 45.878, | |
| "eval_steps_per_second": 5.735, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.4001597444089455, | |
| "grad_norm": 0.6579880169340647, | |
| "learning_rate": 8.186121221947648e-07, | |
| "loss": 0.0412, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 2.4041533546325877, | |
| "grad_norm": 0.6563989234205505, | |
| "learning_rate": 8.081870490496246e-07, | |
| "loss": 0.0392, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.40814696485623, | |
| "grad_norm": 0.6782118306828682, | |
| "learning_rate": 7.978201126143619e-07, | |
| "loss": 0.0405, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 2.412140575079872, | |
| "grad_norm": 0.7287775052219024, | |
| "learning_rate": 7.875115367899602e-07, | |
| "loss": 0.0404, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.4161341853035143, | |
| "grad_norm": 0.6939389857285485, | |
| "learning_rate": 7.772615442169569e-07, | |
| "loss": 0.0385, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 2.4201277955271565, | |
| "grad_norm": 0.6771579465197793, | |
| "learning_rate": 7.670703562706338e-07, | |
| "loss": 0.0391, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.4241214057507987, | |
| "grad_norm": 0.7805400199400504, | |
| "learning_rate": 7.569381930562301e-07, | |
| "loss": 0.0404, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 2.428115015974441, | |
| "grad_norm": 0.7430898082732748, | |
| "learning_rate": 7.468652734041938e-07, | |
| "loss": 0.0384, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.432108626198083, | |
| "grad_norm": 0.7030712092404906, | |
| "learning_rate": 7.368518148654524e-07, | |
| "loss": 0.0399, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 2.4361022364217253, | |
| "grad_norm": 0.6593873003197812, | |
| "learning_rate": 7.268980337067193e-07, | |
| "loss": 0.0387, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.4400958466453675, | |
| "grad_norm": 0.7180247154760864, | |
| "learning_rate": 7.17004144905816e-07, | |
| "loss": 0.04, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 2.4440894568690097, | |
| "grad_norm": 0.6917266349878818, | |
| "learning_rate": 7.07170362147034e-07, | |
| "loss": 0.0388, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.448083067092652, | |
| "grad_norm": 0.7191478188087486, | |
| "learning_rate": 6.973968978165176e-07, | |
| "loss": 0.0388, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 2.452076677316294, | |
| "grad_norm": 0.7285189143803087, | |
| "learning_rate": 6.876839629976778e-07, | |
| "loss": 0.0386, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.4560702875399363, | |
| "grad_norm": 0.6602961842830507, | |
| "learning_rate": 6.78031767466635e-07, | |
| "loss": 0.039, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 2.460063897763578, | |
| "grad_norm": 0.7397023360105911, | |
| "learning_rate": 6.684405196876843e-07, | |
| "loss": 0.0397, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.4640575079872207, | |
| "grad_norm": 0.6184869436420118, | |
| "learning_rate": 6.589104268087958e-07, | |
| "loss": 0.0365, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 2.4680511182108624, | |
| "grad_norm": 0.7393488490012594, | |
| "learning_rate": 6.494416946571408e-07, | |
| "loss": 0.0389, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.4720447284345046, | |
| "grad_norm": 0.7764250937534626, | |
| "learning_rate": 6.400345277346447e-07, | |
| "loss": 0.038, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 2.476038338658147, | |
| "grad_norm": 0.6463875407701625, | |
| "learning_rate": 6.306891292135743e-07, | |
| "loss": 0.0413, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.480031948881789, | |
| "grad_norm": 0.7320022575736014, | |
| "learning_rate": 6.21405700932144e-07, | |
| "loss": 0.0375, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 2.484025559105431, | |
| "grad_norm": 0.6818592089753043, | |
| "learning_rate": 6.121844433901612e-07, | |
| "loss": 0.0386, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.4880191693290734, | |
| "grad_norm": 0.6923239557377695, | |
| "learning_rate": 6.030255557446922e-07, | |
| "loss": 0.038, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 2.4920127795527156, | |
| "grad_norm": 0.7225836003409724, | |
| "learning_rate": 5.939292358057669e-07, | |
| "loss": 0.0401, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.496006389776358, | |
| "grad_norm": 0.7224726921229143, | |
| "learning_rate": 5.848956800320994e-07, | |
| "loss": 0.0396, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.7730383752980494, | |
| "learning_rate": 5.75925083526847e-07, | |
| "loss": 0.0403, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.503993610223642, | |
| "grad_norm": 0.7211592353668832, | |
| "learning_rate": 5.670176400334023e-07, | |
| "loss": 0.0381, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 2.5079872204472844, | |
| "grad_norm": 0.7472591071933815, | |
| "learning_rate": 5.581735419312e-07, | |
| "loss": 0.0401, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.5119808306709266, | |
| "grad_norm": 0.6705306160257385, | |
| "learning_rate": 5.493929802315701e-07, | |
| "loss": 0.0404, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 2.515974440894569, | |
| "grad_norm": 0.744340088134437, | |
| "learning_rate": 5.406761445736025e-07, | |
| "loss": 0.0372, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.519968051118211, | |
| "grad_norm": 0.7438732345302391, | |
| "learning_rate": 5.320232232200639e-07, | |
| "loss": 0.0411, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 2.523961661341853, | |
| "grad_norm": 0.7988995555141504, | |
| "learning_rate": 5.234344030533204e-07, | |
| "loss": 0.038, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.527955271565495, | |
| "grad_norm": 0.6711977319305636, | |
| "learning_rate": 5.149098695713102e-07, | |
| "loss": 0.0352, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 2.5319488817891376, | |
| "grad_norm": 0.6847061252244899, | |
| "learning_rate": 5.064498068835288e-07, | |
| "loss": 0.0388, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.5359424920127793, | |
| "grad_norm": 0.6841997385045966, | |
| "learning_rate": 4.980543977070587e-07, | |
| "loss": 0.0385, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 2.539936102236422, | |
| "grad_norm": 0.7373519760893469, | |
| "learning_rate": 4.897238233626237e-07, | |
| "loss": 0.0397, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.5439297124600637, | |
| "grad_norm": 0.7612391951526681, | |
| "learning_rate": 4.814582637706676e-07, | |
| "loss": 0.0406, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 2.547923322683706, | |
| "grad_norm": 0.6691221826001316, | |
| "learning_rate": 4.732578974474729e-07, | |
| "loss": 0.0373, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.551916932907348, | |
| "grad_norm": 0.7504326919914636, | |
| "learning_rate": 4.651229015013024e-07, | |
| "loss": 0.0395, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 2.5559105431309903, | |
| "grad_norm": 0.728197142532798, | |
| "learning_rate": 4.5705345162857707e-07, | |
| "loss": 0.0404, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.5599041533546325, | |
| "grad_norm": 0.7638750317365375, | |
| "learning_rate": 4.4904972211007877e-07, | |
| "loss": 0.0365, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 2.5638977635782747, | |
| "grad_norm": 0.6786482009733499, | |
| "learning_rate": 4.4111188580718737e-07, | |
| "loss": 0.0381, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.567891373801917, | |
| "grad_norm": 0.6750362855329872, | |
| "learning_rate": 4.332401141581471e-07, | |
| "loss": 0.0384, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 2.571884984025559, | |
| "grad_norm": 0.6434774711250648, | |
| "learning_rate": 4.2543457717436274e-07, | |
| "loss": 0.0389, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.5758785942492013, | |
| "grad_norm": 0.678657018918754, | |
| "learning_rate": 4.1769544343673265e-07, | |
| "loss": 0.0387, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 2.5798722044728435, | |
| "grad_norm": 0.7234126311376552, | |
| "learning_rate": 4.10022880092e-07, | |
| "loss": 0.0374, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.5838658146964857, | |
| "grad_norm": 0.7172155271619155, | |
| "learning_rate": 4.0241705284914966e-07, | |
| "loss": 0.0411, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 2.587859424920128, | |
| "grad_norm": 0.698249391590756, | |
| "learning_rate": 3.9487812597582513e-07, | |
| "loss": 0.0408, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.59185303514377, | |
| "grad_norm": 0.6637186291346941, | |
| "learning_rate": 3.874062622947825e-07, | |
| "loss": 0.0363, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 2.5958466453674123, | |
| "grad_norm": 0.6576549119431471, | |
| "learning_rate": 3.800016231803752e-07, | |
| "loss": 0.0391, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.5998402555910545, | |
| "grad_norm": 0.7376314683338078, | |
| "learning_rate": 3.7266436855506484e-07, | |
| "loss": 0.0373, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 2.6038338658146962, | |
| "grad_norm": 0.715005088433797, | |
| "learning_rate": 3.6539465688597107e-07, | |
| "loss": 0.0374, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.607827476038339, | |
| "grad_norm": 0.7126399642722202, | |
| "learning_rate": 3.5819264518144565e-07, | |
| "loss": 0.0399, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 2.6118210862619806, | |
| "grad_norm": 0.7304318779489742, | |
| "learning_rate": 3.5105848898768615e-07, | |
| "loss": 0.0373, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.6158146964856233, | |
| "grad_norm": 0.7507980457632606, | |
| "learning_rate": 3.4399234238537254e-07, | |
| "loss": 0.0393, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 2.619808306709265, | |
| "grad_norm": 0.6923767027594208, | |
| "learning_rate": 3.369943579863379e-07, | |
| "loss": 0.0371, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.623801916932907, | |
| "grad_norm": 0.6964206029100168, | |
| "learning_rate": 3.300646869302794e-07, | |
| "loss": 0.0362, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 2.6277955271565494, | |
| "grad_norm": 0.7352109790702203, | |
| "learning_rate": 3.2320347888148644e-07, | |
| "loss": 0.0377, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.6317891373801916, | |
| "grad_norm": 0.6628167533609087, | |
| "learning_rate": 3.1641088202561373e-07, | |
| "loss": 0.0379, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 2.635782747603834, | |
| "grad_norm": 0.6564933786076047, | |
| "learning_rate": 3.0968704306647714e-07, | |
| "loss": 0.0369, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.639776357827476, | |
| "grad_norm": 0.6308056550542149, | |
| "learning_rate": 3.0303210722288666e-07, | |
| "loss": 0.0346, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 2.643769968051118, | |
| "grad_norm": 0.6573474228547711, | |
| "learning_rate": 2.964462182255117e-07, | |
| "loss": 0.0368, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.6477635782747604, | |
| "grad_norm": 0.6896389035881784, | |
| "learning_rate": 2.899295183137723e-07, | |
| "loss": 0.0395, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 2.6517571884984026, | |
| "grad_norm": 0.7174278292512358, | |
| "learning_rate": 2.834821482327745e-07, | |
| "loss": 0.0366, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.655750798722045, | |
| "grad_norm": 0.7073005459759208, | |
| "learning_rate": 2.7710424723026073e-07, | |
| "loss": 0.0404, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 2.659744408945687, | |
| "grad_norm": 0.7369122238419571, | |
| "learning_rate": 2.7079595305361237e-07, | |
| "loss": 0.0361, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.663738019169329, | |
| "grad_norm": 0.703027115337837, | |
| "learning_rate": 2.645574019468673e-07, | |
| "loss": 0.0357, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 2.6677316293929714, | |
| "grad_norm": 0.769356516837401, | |
| "learning_rate": 2.5838872864778276e-07, | |
| "loss": 0.0398, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.6717252396166136, | |
| "grad_norm": 0.7144690912826553, | |
| "learning_rate": 2.522900663849195e-07, | |
| "loss": 0.035, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 2.6757188498402558, | |
| "grad_norm": 0.7538629864589558, | |
| "learning_rate": 2.4626154687476913e-07, | |
| "loss": 0.0392, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.6797124600638975, | |
| "grad_norm": 0.6680095163501195, | |
| "learning_rate": 2.403033003189097e-07, | |
| "loss": 0.0384, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 2.68370607028754, | |
| "grad_norm": 0.7091669937348918, | |
| "learning_rate": 2.3441545540118957e-07, | |
| "loss": 0.04, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.687699680511182, | |
| "grad_norm": 0.7387779458841153, | |
| "learning_rate": 2.285981392849507e-07, | |
| "loss": 0.0404, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 2.6916932907348246, | |
| "grad_norm": 0.716479182444788, | |
| "learning_rate": 2.2285147761028234e-07, | |
| "loss": 0.037, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.6956869009584663, | |
| "grad_norm": 0.736182154333335, | |
| "learning_rate": 2.1717559449130745e-07, | |
| "loss": 0.0374, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 2.6996805111821085, | |
| "grad_norm": 0.7195196251858598, | |
| "learning_rate": 2.115706125135016e-07, | |
| "loss": 0.0403, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.7036741214057507, | |
| "grad_norm": 0.795586487340967, | |
| "learning_rate": 2.0603665273104477e-07, | |
| "loss": 0.0383, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 2.707667731629393, | |
| "grad_norm": 0.6816457489515708, | |
| "learning_rate": 2.0057383466420887e-07, | |
| "loss": 0.0362, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.711661341853035, | |
| "grad_norm": 0.7211817535473453, | |
| "learning_rate": 1.9518227629677343e-07, | |
| "loss": 0.0393, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 2.7156549520766773, | |
| "grad_norm": 0.6621050649016567, | |
| "learning_rate": 1.898620940734823e-07, | |
| "loss": 0.0387, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.7196485623003195, | |
| "grad_norm": 0.7338961263391105, | |
| "learning_rate": 1.8461340289752259e-07, | |
| "loss": 0.0386, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 2.7236421725239617, | |
| "grad_norm": 0.7256126254117897, | |
| "learning_rate": 1.7943631612804777e-07, | |
| "loss": 0.0379, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.727635782747604, | |
| "grad_norm": 0.7117270166197374, | |
| "learning_rate": 1.7433094557772664e-07, | |
| "loss": 0.0384, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 2.731629392971246, | |
| "grad_norm": 0.6841486868214987, | |
| "learning_rate": 1.692974015103306e-07, | |
| "loss": 0.0363, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.7356230031948883, | |
| "grad_norm": 0.7446852668120804, | |
| "learning_rate": 1.643357926383505e-07, | |
| "loss": 0.0383, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 2.7396166134185305, | |
| "grad_norm": 0.6829132700695434, | |
| "learning_rate": 1.5944622612064964e-07, | |
| "loss": 0.0375, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.7436102236421727, | |
| "grad_norm": 0.6858172526376208, | |
| "learning_rate": 1.5462880756014852e-07, | |
| "loss": 0.0397, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 2.747603833865815, | |
| "grad_norm": 0.67374596113679, | |
| "learning_rate": 1.498836410015449e-07, | |
| "loss": 0.0382, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.751597444089457, | |
| "grad_norm": 0.6968820872384376, | |
| "learning_rate": 1.4521082892906716e-07, | |
| "loss": 0.0375, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 2.755591054313099, | |
| "grad_norm": 0.7350519767986172, | |
| "learning_rate": 1.4061047226425888e-07, | |
| "loss": 0.0401, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.7595846645367414, | |
| "grad_norm": 0.7301495139163381, | |
| "learning_rate": 1.3608267036380177e-07, | |
| "loss": 0.0381, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 2.763578274760383, | |
| "grad_norm": 0.706129469994641, | |
| "learning_rate": 1.3162752101736706e-07, | |
| "loss": 0.0403, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.7675718849840254, | |
| "grad_norm": 0.72296699028865, | |
| "learning_rate": 1.2724512044550578e-07, | |
| "loss": 0.0373, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 2.7715654952076676, | |
| "grad_norm": 0.7326657651133558, | |
| "learning_rate": 1.2293556329757026e-07, | |
| "loss": 0.039, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.77555910543131, | |
| "grad_norm": 0.7083978204265703, | |
| "learning_rate": 1.1869894264966757e-07, | |
| "loss": 0.0365, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 2.779552715654952, | |
| "grad_norm": 0.6541405696956989, | |
| "learning_rate": 1.1453535000265358e-07, | |
| "loss": 0.0363, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.783546325878594, | |
| "grad_norm": 0.6793468016189803, | |
| "learning_rate": 1.1044487528015168e-07, | |
| "loss": 0.0389, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 2.7875399361022364, | |
| "grad_norm": 0.6830412517472533, | |
| "learning_rate": 1.0642760682661645e-07, | |
| "loss": 0.0365, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.7915335463258786, | |
| "grad_norm": 0.7653650618048291, | |
| "learning_rate": 1.0248363140541955e-07, | |
| "loss": 0.0377, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 2.7955271565495208, | |
| "grad_norm": 0.6495511843544473, | |
| "learning_rate": 9.861303419697947e-08, | |
| "loss": 0.0379, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.7955271565495208, | |
| "eval_loss": 0.16211672127246857, | |
| "eval_runtime": 388.1552, | |
| "eval_samples_per_second": 45.876, | |
| "eval_steps_per_second": 5.735, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.799520766773163, | |
| "grad_norm": 0.7226650174013275, | |
| "learning_rate": 9.481589879692242e-08, | |
| "loss": 0.0378, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 2.803514376996805, | |
| "grad_norm": 0.7151599173267984, | |
| "learning_rate": 9.109230721427425e-08, | |
| "loss": 0.0403, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.8075079872204474, | |
| "grad_norm": 0.7251578235357717, | |
| "learning_rate": 8.744233986969163e-08, | |
| "loss": 0.0375, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 2.8115015974440896, | |
| "grad_norm": 0.7887886233607367, | |
| "learning_rate": 8.386607559372245e-08, | |
| "loss": 0.0396, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.8154952076677318, | |
| "grad_norm": 0.662483687379777, | |
| "learning_rate": 8.036359162510726e-08, | |
| "loss": 0.0381, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 2.819488817891374, | |
| "grad_norm": 0.7331222659123482, | |
| "learning_rate": 7.69349636091065e-08, | |
| "loss": 0.0384, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.8234824281150157, | |
| "grad_norm": 0.7383759987861803, | |
| "learning_rate": 7.358026559587044e-08, | |
| "loss": 0.0371, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 2.8274760383386583, | |
| "grad_norm": 0.7270481431745002, | |
| "learning_rate": 7.029957003883775e-08, | |
| "loss": 0.0393, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.8314696485623, | |
| "grad_norm": 0.6650945156242567, | |
| "learning_rate": 6.709294779317115e-08, | |
| "loss": 0.037, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 2.8354632587859427, | |
| "grad_norm": 0.7401242798225801, | |
| "learning_rate": 6.396046811422879e-08, | |
| "loss": 0.0366, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.8394568690095845, | |
| "grad_norm": 0.7233839042760212, | |
| "learning_rate": 6.0902198656065e-08, | |
| "loss": 0.0391, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 2.8434504792332267, | |
| "grad_norm": 0.6897214426187732, | |
| "learning_rate": 5.7918205469972835e-08, | |
| "loss": 0.0382, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.847444089456869, | |
| "grad_norm": 0.7453035813326291, | |
| "learning_rate": 5.500855300305485e-08, | |
| "loss": 0.0388, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 2.851437699680511, | |
| "grad_norm": 0.7866890040979969, | |
| "learning_rate": 5.217330409683235e-08, | |
| "loss": 0.0394, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.8554313099041533, | |
| "grad_norm": 0.7099023785681982, | |
| "learning_rate": 4.941251998588813e-08, | |
| "loss": 0.0391, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 2.8594249201277955, | |
| "grad_norm": 0.6644517725486436, | |
| "learning_rate": 4.672626029654453e-08, | |
| "loss": 0.0382, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 2.8634185303514377, | |
| "grad_norm": 0.7086094015075386, | |
| "learning_rate": 4.41145830455737e-08, | |
| "loss": 0.0393, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 2.86741214057508, | |
| "grad_norm": 0.6421742170261078, | |
| "learning_rate": 4.1577544638946026e-08, | |
| "loss": 0.0325, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 2.871405750798722, | |
| "grad_norm": 0.681264218176713, | |
| "learning_rate": 3.9115199870613124e-08, | |
| "loss": 0.0391, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 2.8753993610223643, | |
| "grad_norm": 0.6922943935506756, | |
| "learning_rate": 3.672760192132107e-08, | |
| "loss": 0.0379, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.8793929712460065, | |
| "grad_norm": 0.6533088211666771, | |
| "learning_rate": 3.44148023574653e-08, | |
| "loss": 0.0351, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 2.8833865814696487, | |
| "grad_norm": 0.719598133752619, | |
| "learning_rate": 3.21768511299742e-08, | |
| "loss": 0.0365, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 2.887380191693291, | |
| "grad_norm": 0.6712232835091643, | |
| "learning_rate": 3.001379657323161e-08, | |
| "loss": 0.0375, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 2.891373801916933, | |
| "grad_norm": 0.7793799932409174, | |
| "learning_rate": 2.7925685404034226e-08, | |
| "loss": 0.0397, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 2.8953674121405752, | |
| "grad_norm": 0.7349037778253329, | |
| "learning_rate": 2.5912562720578625e-08, | |
| "loss": 0.0378, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 2.899361022364217, | |
| "grad_norm": 0.7347196983541275, | |
| "learning_rate": 2.397447200149211e-08, | |
| "loss": 0.0372, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 2.9033546325878596, | |
| "grad_norm": 0.7399058877098672, | |
| "learning_rate": 2.2111455104889653e-08, | |
| "loss": 0.0376, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 2.9073482428115014, | |
| "grad_norm": 0.6460231812865741, | |
| "learning_rate": 2.0323552267473176e-08, | |
| "loss": 0.0379, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.911341853035144, | |
| "grad_norm": 0.7479929434238791, | |
| "learning_rate": 1.8610802103659174e-08, | |
| "loss": 0.0373, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 2.915335463258786, | |
| "grad_norm": 0.7817922135971772, | |
| "learning_rate": 1.697324160474678e-08, | |
| "loss": 0.0385, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.919329073482428, | |
| "grad_norm": 0.7824066395040583, | |
| "learning_rate": 1.5410906138120406e-08, | |
| "loss": 0.0371, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 2.92332268370607, | |
| "grad_norm": 0.7186999884806289, | |
| "learning_rate": 1.3923829446481128e-08, | |
| "loss": 0.0373, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.9273162939297124, | |
| "grad_norm": 0.6803186595275351, | |
| "learning_rate": 1.2512043647123937e-08, | |
| "loss": 0.0379, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 2.9313099041533546, | |
| "grad_norm": 0.7573331915873533, | |
| "learning_rate": 1.117557923123752e-08, | |
| "loss": 0.038, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.9353035143769968, | |
| "grad_norm": 0.6550603996023604, | |
| "learning_rate": 9.914465063252608e-09, | |
| "loss": 0.0367, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 2.939297124600639, | |
| "grad_norm": 0.6971503004155564, | |
| "learning_rate": 8.728728380212102e-09, | |
| "loss": 0.0382, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.943290734824281, | |
| "grad_norm": 0.8017190342088107, | |
| "learning_rate": 7.618394791188198e-09, | |
| "loss": 0.0393, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 2.9472843450479234, | |
| "grad_norm": 0.77895577170809, | |
| "learning_rate": 6.5834882767251686e-09, | |
| "loss": 0.039, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.9512779552715656, | |
| "grad_norm": 0.746442946729805, | |
| "learning_rate": 5.624031188324497e-09, | |
| "loss": 0.0387, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 2.9552715654952078, | |
| "grad_norm": 0.7104058466917486, | |
| "learning_rate": 4.740044247960318e-09, | |
| "loss": 0.0382, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.95926517571885, | |
| "grad_norm": 0.7440341760747408, | |
| "learning_rate": 3.931546547632558e-09, | |
| "loss": 0.0401, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 2.963258785942492, | |
| "grad_norm": 0.6642274503256519, | |
| "learning_rate": 3.1985555489534854e-09, | |
| "loss": 0.0381, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.9672523961661343, | |
| "grad_norm": 0.7495614794397316, | |
| "learning_rate": 2.541087082773119e-09, | |
| "loss": 0.0409, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 2.9712460063897765, | |
| "grad_norm": 0.6823205768784191, | |
| "learning_rate": 1.959155348834951e-09, | |
| "loss": 0.0372, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.9752396166134183, | |
| "grad_norm": 0.7640147746969532, | |
| "learning_rate": 1.4527729154697466e-09, | |
| "loss": 0.0373, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 2.979233226837061, | |
| "grad_norm": 0.7256837714457507, | |
| "learning_rate": 1.0219507193247045e-09, | |
| "loss": 0.0378, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.9832268370607027, | |
| "grad_norm": 0.6990359157973326, | |
| "learning_rate": 6.666980651272025e-10, | |
| "loss": 0.0389, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 2.987220447284345, | |
| "grad_norm": 0.6921384600587103, | |
| "learning_rate": 3.870226254831244e-10, | |
| "loss": 0.0386, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.991214057507987, | |
| "grad_norm": 0.7426276592936151, | |
| "learning_rate": 1.829304407121035e-10, | |
| "loss": 0.0395, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 2.9952076677316293, | |
| "grad_norm": 0.6789921703001532, | |
| "learning_rate": 5.44259187161833e-11, | |
| "loss": 0.0404, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.9992012779552715, | |
| "grad_norm": 0.7434455972283719, | |
| "learning_rate": 1.5118348850040597e-12, | |
| "loss": 0.0372, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 3756, | |
| "total_flos": 1168211181961216.0, | |
| "train_loss": 0.11109248143090163, | |
| "train_runtime": 45172.9315, | |
| "train_samples_per_second": 10.643, | |
| "train_steps_per_second": 0.083 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3756, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1168211181961216.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |