{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.199792602834428, "eval_steps": 500, "global_step": 1734, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003456619426201175, "grad_norm": 34.52766429032349, "learning_rate": 0.0, "loss": 1.7402, "num_tokens": 2348566.0, "step": 1 }, { "epoch": 0.000691323885240235, "grad_norm": 34.916246724175195, "learning_rate": 2.7586206896551726e-07, "loss": 1.7643, "num_tokens": 4766393.0, "step": 2 }, { "epoch": 0.0010369858278603526, "grad_norm": 37.60414874561209, "learning_rate": 5.517241379310345e-07, "loss": 1.7635, "num_tokens": 7060606.0, "step": 3 }, { "epoch": 0.00138264777048047, "grad_norm": 28.067921013815816, "learning_rate": 8.275862068965518e-07, "loss": 1.7722, "num_tokens": 9393241.0, "step": 4 }, { "epoch": 0.0017283097131005876, "grad_norm": 30.411817473811606, "learning_rate": 1.103448275862069e-06, "loss": 1.7679, "num_tokens": 11829752.0, "step": 5 }, { "epoch": 0.002073971655720705, "grad_norm": 34.72349566651572, "learning_rate": 1.3793103448275862e-06, "loss": 1.6802, "num_tokens": 14068246.0, "step": 6 }, { "epoch": 0.0024196335983408227, "grad_norm": 34.17638397070201, "learning_rate": 1.6551724137931037e-06, "loss": 1.6733, "num_tokens": 16427486.0, "step": 7 }, { "epoch": 0.00276529554096094, "grad_norm": 240.23501794389531, "learning_rate": 1.9310344827586207e-06, "loss": 1.6469, "num_tokens": 18725943.0, "step": 8 }, { "epoch": 0.0031109574835810577, "grad_norm": 199.16629728896086, "learning_rate": 2.206896551724138e-06, "loss": 1.6185, "num_tokens": 20959251.0, "step": 9 }, { "epoch": 0.0034566194262011752, "grad_norm": 127.5413424915123, "learning_rate": 2.4827586206896555e-06, "loss": 1.4787, "num_tokens": 23221296.0, "step": 10 }, { "epoch": 0.0038022813688212928, "grad_norm": 20.172387932030606, "learning_rate": 2.7586206896551725e-06, "loss": 1.4814, "num_tokens": 25623468.0, "step": 11 }, { "epoch": 0.00414794331144141, "grad_norm": 24.996893860630973, "learning_rate": 3.03448275862069e-06, "loss": 1.4958, "num_tokens": 28101258.0, "step": 12 }, { "epoch": 0.004493605254061528, "grad_norm": 15.978924269182224, "learning_rate": 3.3103448275862073e-06, "loss": 1.3222, "num_tokens": 30303835.0, "step": 13 }, { "epoch": 0.004839267196681645, "grad_norm": 5.829201156403896, "learning_rate": 3.5862068965517243e-06, "loss": 1.2836, "num_tokens": 32612096.0, "step": 14 }, { "epoch": 0.005184929139301763, "grad_norm": 103.68722318826303, "learning_rate": 3.862068965517241e-06, "loss": 1.2761, "num_tokens": 34930500.0, "step": 15 }, { "epoch": 0.00553059108192188, "grad_norm": 124.0907542896587, "learning_rate": 4.137931034482759e-06, "loss": 1.2774, "num_tokens": 37351465.0, "step": 16 }, { "epoch": 0.005876253024541998, "grad_norm": 97.23488726579467, "learning_rate": 4.413793103448276e-06, "loss": 1.2742, "num_tokens": 39747971.0, "step": 17 }, { "epoch": 0.006221914967162115, "grad_norm": 26.087299275986652, "learning_rate": 4.689655172413793e-06, "loss": 1.1917, "num_tokens": 42055613.0, "step": 18 }, { "epoch": 0.006567576909782233, "grad_norm": 16.581625382992545, "learning_rate": 4.965517241379311e-06, "loss": 1.1738, "num_tokens": 44530067.0, "step": 19 }, { "epoch": 0.0069132388524023505, "grad_norm": 9.370695208101724, "learning_rate": 5.241379310344829e-06, "loss": 1.1508, "num_tokens": 46829520.0, "step": 20 }, { "epoch": 0.007258900795022468, "grad_norm": 2.7546835662098337, "learning_rate": 5.517241379310345e-06, "loss": 1.1012, "num_tokens": 49068724.0, "step": 21 }, { "epoch": 0.0076045627376425855, "grad_norm": 2.3825001318118812, "learning_rate": 5.793103448275863e-06, "loss": 1.0944, "num_tokens": 51426882.0, "step": 22 }, { "epoch": 0.007950224680262703, "grad_norm": 2.035556006407366, "learning_rate": 6.06896551724138e-06, "loss": 1.0721, "num_tokens": 53744600.0, "step": 23 }, { "epoch": 0.00829588662288282, "grad_norm": 1.463922532189421, "learning_rate": 6.344827586206898e-06, "loss": 1.0609, "num_tokens": 56061789.0, "step": 24 }, { "epoch": 0.008641548565502939, "grad_norm": 1.5095068946017134, "learning_rate": 6.620689655172415e-06, "loss": 1.0261, "num_tokens": 58426474.0, "step": 25 }, { "epoch": 0.008987210508123056, "grad_norm": 1.7812545085036195, "learning_rate": 6.896551724137932e-06, "loss": 1.0006, "num_tokens": 60698346.0, "step": 26 }, { "epoch": 0.009332872450743173, "grad_norm": 1.471823589913617, "learning_rate": 7.172413793103449e-06, "loss": 0.98, "num_tokens": 63058045.0, "step": 27 }, { "epoch": 0.00967853439336329, "grad_norm": 1.131991682045555, "learning_rate": 7.4482758620689665e-06, "loss": 0.9489, "num_tokens": 65393200.0, "step": 28 }, { "epoch": 0.010024196335983409, "grad_norm": 0.9832756068048666, "learning_rate": 7.724137931034483e-06, "loss": 0.9484, "num_tokens": 67739098.0, "step": 29 }, { "epoch": 0.010369858278603527, "grad_norm": 1.1765295545080312, "learning_rate": 8.000000000000001e-06, "loss": 0.9475, "num_tokens": 70086884.0, "step": 30 }, { "epoch": 0.010715520221223643, "grad_norm": 1.2194966845549524, "learning_rate": 8.275862068965518e-06, "loss": 0.9175, "num_tokens": 72378842.0, "step": 31 }, { "epoch": 0.01106118216384376, "grad_norm": 0.7236272581153216, "learning_rate": 8.551724137931035e-06, "loss": 0.9239, "num_tokens": 74782017.0, "step": 32 }, { "epoch": 0.011406844106463879, "grad_norm": 0.63227886221642, "learning_rate": 8.827586206896552e-06, "loss": 0.9039, "num_tokens": 77143359.0, "step": 33 }, { "epoch": 0.011752506049083997, "grad_norm": 0.4895603891264605, "learning_rate": 9.10344827586207e-06, "loss": 0.9001, "num_tokens": 79474781.0, "step": 34 }, { "epoch": 0.012098167991704113, "grad_norm": 0.6296333140665148, "learning_rate": 9.379310344827586e-06, "loss": 0.8816, "num_tokens": 81801167.0, "step": 35 }, { "epoch": 0.01244382993432423, "grad_norm": 0.5574668525090068, "learning_rate": 9.655172413793105e-06, "loss": 0.8644, "num_tokens": 84103942.0, "step": 36 }, { "epoch": 0.012789491876944349, "grad_norm": 0.6251940562087835, "learning_rate": 9.931034482758622e-06, "loss": 0.8805, "num_tokens": 86412978.0, "step": 37 }, { "epoch": 0.013135153819564467, "grad_norm": 0.4609208716458492, "learning_rate": 1.0206896551724139e-05, "loss": 0.8375, "num_tokens": 88723803.0, "step": 38 }, { "epoch": 0.013480815762184583, "grad_norm": 0.5437599793260961, "learning_rate": 1.0482758620689658e-05, "loss": 0.83, "num_tokens": 91014493.0, "step": 39 }, { "epoch": 0.013826477704804701, "grad_norm": 0.618952678909033, "learning_rate": 1.0758620689655173e-05, "loss": 0.8354, "num_tokens": 93228218.0, "step": 40 }, { "epoch": 0.014172139647424819, "grad_norm": 0.48878986093827087, "learning_rate": 1.103448275862069e-05, "loss": 0.8456, "num_tokens": 95593743.0, "step": 41 }, { "epoch": 0.014517801590044937, "grad_norm": 0.5522829608269526, "learning_rate": 1.1310344827586209e-05, "loss": 0.8281, "num_tokens": 97815035.0, "step": 42 }, { "epoch": 0.014863463532665053, "grad_norm": 0.49672837733164593, "learning_rate": 1.1586206896551726e-05, "loss": 0.8335, "num_tokens": 100149758.0, "step": 43 }, { "epoch": 0.015209125475285171, "grad_norm": 0.5362244996387824, "learning_rate": 1.1862068965517241e-05, "loss": 0.8195, "num_tokens": 102481479.0, "step": 44 }, { "epoch": 0.015554787417905289, "grad_norm": 0.4699636631130888, "learning_rate": 1.213793103448276e-05, "loss": 0.8245, "num_tokens": 104848287.0, "step": 45 }, { "epoch": 0.015900449360525405, "grad_norm": 0.352102114661968, "learning_rate": 1.2413793103448277e-05, "loss": 0.819, "num_tokens": 107126442.0, "step": 46 }, { "epoch": 0.016246111303145523, "grad_norm": 0.5888506749748358, "learning_rate": 1.2689655172413795e-05, "loss": 0.8087, "num_tokens": 109618031.0, "step": 47 }, { "epoch": 0.01659177324576564, "grad_norm": 0.49382888158547594, "learning_rate": 1.296551724137931e-05, "loss": 0.8189, "num_tokens": 111884700.0, "step": 48 }, { "epoch": 0.01693743518838576, "grad_norm": 0.593880802739918, "learning_rate": 1.324137931034483e-05, "loss": 0.8023, "num_tokens": 114172995.0, "step": 49 }, { "epoch": 0.017283097131005877, "grad_norm": 0.6031087194755608, "learning_rate": 1.3517241379310346e-05, "loss": 0.814, "num_tokens": 116497090.0, "step": 50 }, { "epoch": 0.017628759073625995, "grad_norm": 0.40468651934265243, "learning_rate": 1.3793103448275863e-05, "loss": 0.7937, "num_tokens": 118861185.0, "step": 51 }, { "epoch": 0.017974421016246113, "grad_norm": 0.6833857006865406, "learning_rate": 1.406896551724138e-05, "loss": 0.7932, "num_tokens": 121094526.0, "step": 52 }, { "epoch": 0.018320082958866227, "grad_norm": 0.4664454198315879, "learning_rate": 1.4344827586206897e-05, "loss": 0.7926, "num_tokens": 123518108.0, "step": 53 }, { "epoch": 0.018665744901486345, "grad_norm": 0.4700018167854046, "learning_rate": 1.4620689655172416e-05, "loss": 0.7997, "num_tokens": 125954145.0, "step": 54 }, { "epoch": 0.019011406844106463, "grad_norm": 0.4845178396389685, "learning_rate": 1.4896551724137933e-05, "loss": 0.7979, "num_tokens": 128361749.0, "step": 55 }, { "epoch": 0.01935706878672658, "grad_norm": 0.33585679598083323, "learning_rate": 1.5172413793103448e-05, "loss": 0.7995, "num_tokens": 130884149.0, "step": 56 }, { "epoch": 0.0197027307293467, "grad_norm": 0.5946698053058292, "learning_rate": 1.5448275862068965e-05, "loss": 0.7851, "num_tokens": 133314593.0, "step": 57 }, { "epoch": 0.020048392671966817, "grad_norm": 0.44935201498855654, "learning_rate": 1.5724137931034484e-05, "loss": 0.7891, "num_tokens": 135786548.0, "step": 58 }, { "epoch": 0.020394054614586935, "grad_norm": 0.6016698684545718, "learning_rate": 1.6000000000000003e-05, "loss": 0.7945, "num_tokens": 138090347.0, "step": 59 }, { "epoch": 0.020739716557207053, "grad_norm": 0.5149935440294358, "learning_rate": 1.6275862068965518e-05, "loss": 0.7856, "num_tokens": 140229744.0, "step": 60 }, { "epoch": 0.021085378499827168, "grad_norm": 0.42442139251406474, "learning_rate": 1.6551724137931037e-05, "loss": 0.7638, "num_tokens": 142547274.0, "step": 61 }, { "epoch": 0.021431040442447286, "grad_norm": 0.6165584439482034, "learning_rate": 1.6827586206896552e-05, "loss": 0.7839, "num_tokens": 144867908.0, "step": 62 }, { "epoch": 0.021776702385067404, "grad_norm": 0.615695106401857, "learning_rate": 1.710344827586207e-05, "loss": 0.7789, "num_tokens": 147137485.0, "step": 63 }, { "epoch": 0.02212236432768752, "grad_norm": 0.5447303795295104, "learning_rate": 1.7379310344827586e-05, "loss": 0.7905, "num_tokens": 149486776.0, "step": 64 }, { "epoch": 0.02246802627030764, "grad_norm": 0.7075502076257513, "learning_rate": 1.7655172413793105e-05, "loss": 0.7792, "num_tokens": 151813267.0, "step": 65 }, { "epoch": 0.022813688212927757, "grad_norm": 0.5470719758055983, "learning_rate": 1.7931034482758623e-05, "loss": 0.7651, "num_tokens": 154166281.0, "step": 66 }, { "epoch": 0.023159350155547875, "grad_norm": 0.4628479897033203, "learning_rate": 1.820689655172414e-05, "loss": 0.7731, "num_tokens": 156434581.0, "step": 67 }, { "epoch": 0.023505012098167993, "grad_norm": 0.7961906721005769, "learning_rate": 1.8482758620689657e-05, "loss": 0.7699, "num_tokens": 158708953.0, "step": 68 }, { "epoch": 0.023850674040788108, "grad_norm": 0.4658600860763261, "learning_rate": 1.8758620689655173e-05, "loss": 0.7683, "num_tokens": 161030802.0, "step": 69 }, { "epoch": 0.024196335983408226, "grad_norm": 0.9531225369625607, "learning_rate": 1.903448275862069e-05, "loss": 0.7758, "num_tokens": 163400511.0, "step": 70 }, { "epoch": 0.024541997926028344, "grad_norm": 0.7489302770832681, "learning_rate": 1.931034482758621e-05, "loss": 0.7671, "num_tokens": 165666850.0, "step": 71 }, { "epoch": 0.02488765986864846, "grad_norm": 0.9597008407747405, "learning_rate": 1.9586206896551725e-05, "loss": 0.7429, "num_tokens": 167881526.0, "step": 72 }, { "epoch": 0.02523332181126858, "grad_norm": 0.9313302737360822, "learning_rate": 1.9862068965517244e-05, "loss": 0.7706, "num_tokens": 170177217.0, "step": 73 }, { "epoch": 0.025578983753888698, "grad_norm": 0.5602528199834497, "learning_rate": 2.013793103448276e-05, "loss": 0.7507, "num_tokens": 172470635.0, "step": 74 }, { "epoch": 0.025924645696508816, "grad_norm": 0.8676148008868482, "learning_rate": 2.0413793103448278e-05, "loss": 0.7661, "num_tokens": 174778857.0, "step": 75 }, { "epoch": 0.026270307639128933, "grad_norm": 0.6668635259628354, "learning_rate": 2.0689655172413797e-05, "loss": 0.7661, "num_tokens": 177130569.0, "step": 76 }, { "epoch": 0.026615969581749048, "grad_norm": 0.8038788640178728, "learning_rate": 2.0965517241379315e-05, "loss": 0.7649, "num_tokens": 179549534.0, "step": 77 }, { "epoch": 0.026961631524369166, "grad_norm": 0.8880977133991036, "learning_rate": 2.1241379310344827e-05, "loss": 0.7634, "num_tokens": 181919658.0, "step": 78 }, { "epoch": 0.027307293466989284, "grad_norm": 0.7417404170940146, "learning_rate": 2.1517241379310346e-05, "loss": 0.762, "num_tokens": 184313977.0, "step": 79 }, { "epoch": 0.027652955409609402, "grad_norm": 0.7639055493485868, "learning_rate": 2.1793103448275865e-05, "loss": 0.7583, "num_tokens": 186596058.0, "step": 80 }, { "epoch": 0.02799861735222952, "grad_norm": 0.8099691607022619, "learning_rate": 2.206896551724138e-05, "loss": 0.7512, "num_tokens": 188933869.0, "step": 81 }, { "epoch": 0.028344279294849638, "grad_norm": 0.6860198325986622, "learning_rate": 2.23448275862069e-05, "loss": 0.7521, "num_tokens": 191245733.0, "step": 82 }, { "epoch": 0.028689941237469756, "grad_norm": 0.6846981754633444, "learning_rate": 2.2620689655172417e-05, "loss": 0.7551, "num_tokens": 193553255.0, "step": 83 }, { "epoch": 0.029035603180089874, "grad_norm": 0.6501174117629096, "learning_rate": 2.2896551724137933e-05, "loss": 0.7473, "num_tokens": 195883429.0, "step": 84 }, { "epoch": 0.029381265122709988, "grad_norm": 0.6914354541215395, "learning_rate": 2.317241379310345e-05, "loss": 0.7447, "num_tokens": 198268087.0, "step": 85 }, { "epoch": 0.029726927065330106, "grad_norm": 0.4755320816841892, "learning_rate": 2.3448275862068967e-05, "loss": 0.7341, "num_tokens": 200618015.0, "step": 86 }, { "epoch": 0.030072589007950224, "grad_norm": 0.9557281222429161, "learning_rate": 2.3724137931034482e-05, "loss": 0.7603, "num_tokens": 203016946.0, "step": 87 }, { "epoch": 0.030418250950570342, "grad_norm": 0.7349019646066728, "learning_rate": 2.4e-05, "loss": 0.7622, "num_tokens": 205214919.0, "step": 88 }, { "epoch": 0.03076391289319046, "grad_norm": 0.9116469040257529, "learning_rate": 2.427586206896552e-05, "loss": 0.7433, "num_tokens": 207620550.0, "step": 89 }, { "epoch": 0.031109574835810578, "grad_norm": 0.8510261221064249, "learning_rate": 2.4551724137931038e-05, "loss": 0.7444, "num_tokens": 209994861.0, "step": 90 }, { "epoch": 0.03145523677843069, "grad_norm": 0.6153876226538251, "learning_rate": 2.4827586206896553e-05, "loss": 0.7406, "num_tokens": 212350825.0, "step": 91 }, { "epoch": 0.03180089872105081, "grad_norm": 0.810675759688931, "learning_rate": 2.5103448275862072e-05, "loss": 0.7456, "num_tokens": 214629936.0, "step": 92 }, { "epoch": 0.03214656066367093, "grad_norm": 0.4884049293244507, "learning_rate": 2.537931034482759e-05, "loss": 0.7481, "num_tokens": 216969950.0, "step": 93 }, { "epoch": 0.032492222606291046, "grad_norm": 0.8446277273383646, "learning_rate": 2.5655172413793103e-05, "loss": 0.7458, "num_tokens": 219405588.0, "step": 94 }, { "epoch": 0.032837884548911164, "grad_norm": 0.5289015523887819, "learning_rate": 2.593103448275862e-05, "loss": 0.7386, "num_tokens": 221720594.0, "step": 95 }, { "epoch": 0.03318354649153128, "grad_norm": 0.6889357864803288, "learning_rate": 2.620689655172414e-05, "loss": 0.7356, "num_tokens": 224073935.0, "step": 96 }, { "epoch": 0.0335292084341514, "grad_norm": 0.6007223645180073, "learning_rate": 2.648275862068966e-05, "loss": 0.7472, "num_tokens": 226445040.0, "step": 97 }, { "epoch": 0.03387487037677152, "grad_norm": 0.7387047295293775, "learning_rate": 2.6758620689655174e-05, "loss": 0.7612, "num_tokens": 228769431.0, "step": 98 }, { "epoch": 0.034220532319391636, "grad_norm": 0.7764160324273132, "learning_rate": 2.7034482758620693e-05, "loss": 0.75, "num_tokens": 231089769.0, "step": 99 }, { "epoch": 0.034566194262011754, "grad_norm": 0.5933258036073403, "learning_rate": 2.731034482758621e-05, "loss": 0.7489, "num_tokens": 233430572.0, "step": 100 }, { "epoch": 0.03491185620463187, "grad_norm": 0.7297206869591315, "learning_rate": 2.7586206896551727e-05, "loss": 0.7472, "num_tokens": 235617596.0, "step": 101 }, { "epoch": 0.03525751814725199, "grad_norm": 0.7684655927441242, "learning_rate": 2.7862068965517242e-05, "loss": 0.7367, "num_tokens": 238005342.0, "step": 102 }, { "epoch": 0.03560318008987211, "grad_norm": 0.7579682420634565, "learning_rate": 2.813793103448276e-05, "loss": 0.7439, "num_tokens": 240334625.0, "step": 103 }, { "epoch": 0.035948842032492226, "grad_norm": 0.9043719301469854, "learning_rate": 2.8413793103448276e-05, "loss": 0.7428, "num_tokens": 242624550.0, "step": 104 }, { "epoch": 0.03629450397511234, "grad_norm": 0.471809452106959, "learning_rate": 2.8689655172413795e-05, "loss": 0.7391, "num_tokens": 244932212.0, "step": 105 }, { "epoch": 0.036640165917732455, "grad_norm": 0.888134097580562, "learning_rate": 2.8965517241379313e-05, "loss": 0.7456, "num_tokens": 247331234.0, "step": 106 }, { "epoch": 0.03698582786035257, "grad_norm": 0.4435173245087347, "learning_rate": 2.9241379310344832e-05, "loss": 0.7504, "num_tokens": 249690250.0, "step": 107 }, { "epoch": 0.03733148980297269, "grad_norm": 1.1172848964096243, "learning_rate": 2.9517241379310347e-05, "loss": 0.733, "num_tokens": 251948004.0, "step": 108 }, { "epoch": 0.03767715174559281, "grad_norm": 0.922205477104342, "learning_rate": 2.9793103448275866e-05, "loss": 0.7505, "num_tokens": 254350208.0, "step": 109 }, { "epoch": 0.03802281368821293, "grad_norm": 0.9360098315240186, "learning_rate": 3.006896551724138e-05, "loss": 0.7312, "num_tokens": 256738015.0, "step": 110 }, { "epoch": 0.038368475630833045, "grad_norm": 0.9493746052640399, "learning_rate": 3.0344827586206897e-05, "loss": 0.7453, "num_tokens": 258993218.0, "step": 111 }, { "epoch": 0.03871413757345316, "grad_norm": 0.6772582936432366, "learning_rate": 3.0620689655172415e-05, "loss": 0.7197, "num_tokens": 261281901.0, "step": 112 }, { "epoch": 0.03905979951607328, "grad_norm": 0.9116093959201665, "learning_rate": 3.089655172413793e-05, "loss": 0.7446, "num_tokens": 263674484.0, "step": 113 }, { "epoch": 0.0394054614586934, "grad_norm": 0.7495578225130799, "learning_rate": 3.117241379310345e-05, "loss": 0.7256, "num_tokens": 266017440.0, "step": 114 }, { "epoch": 0.039751123401313516, "grad_norm": 0.8068183580979901, "learning_rate": 3.144827586206897e-05, "loss": 0.7423, "num_tokens": 268334238.0, "step": 115 }, { "epoch": 0.040096785343933634, "grad_norm": 0.7124730040409233, "learning_rate": 3.172413793103448e-05, "loss": 0.7334, "num_tokens": 270763388.0, "step": 116 }, { "epoch": 0.04044244728655375, "grad_norm": 0.8089459947320503, "learning_rate": 3.2000000000000005e-05, "loss": 0.7299, "num_tokens": 272941034.0, "step": 117 }, { "epoch": 0.04078810922917387, "grad_norm": 0.7259513784231448, "learning_rate": 3.227586206896552e-05, "loss": 0.7329, "num_tokens": 275378629.0, "step": 118 }, { "epoch": 0.04113377117179399, "grad_norm": 0.7332446808409854, "learning_rate": 3.2551724137931036e-05, "loss": 0.7249, "num_tokens": 277652371.0, "step": 119 }, { "epoch": 0.041479433114414106, "grad_norm": 0.6421931090818569, "learning_rate": 3.282758620689655e-05, "loss": 0.7278, "num_tokens": 280024124.0, "step": 120 }, { "epoch": 0.04182509505703422, "grad_norm": 0.569024220548962, "learning_rate": 3.310344827586207e-05, "loss": 0.7489, "num_tokens": 282322925.0, "step": 121 }, { "epoch": 0.042170756999654335, "grad_norm": 0.5543917946977904, "learning_rate": 3.337931034482759e-05, "loss": 0.7497, "num_tokens": 284780949.0, "step": 122 }, { "epoch": 0.04251641894227445, "grad_norm": 0.6745056035927581, "learning_rate": 3.3655172413793104e-05, "loss": 0.7376, "num_tokens": 287070053.0, "step": 123 }, { "epoch": 0.04286208088489457, "grad_norm": 0.8057112613121699, "learning_rate": 3.3931034482758626e-05, "loss": 0.7461, "num_tokens": 289398119.0, "step": 124 }, { "epoch": 0.04320774282751469, "grad_norm": 0.7435738834081497, "learning_rate": 3.420689655172414e-05, "loss": 0.7451, "num_tokens": 291806287.0, "step": 125 }, { "epoch": 0.04355340477013481, "grad_norm": 0.607350592322234, "learning_rate": 3.4482758620689657e-05, "loss": 0.738, "num_tokens": 294150810.0, "step": 126 }, { "epoch": 0.043899066712754925, "grad_norm": 0.7157013623524531, "learning_rate": 3.475862068965517e-05, "loss": 0.7336, "num_tokens": 296551922.0, "step": 127 }, { "epoch": 0.04424472865537504, "grad_norm": 0.8341954990905753, "learning_rate": 3.5034482758620694e-05, "loss": 0.7249, "num_tokens": 298832126.0, "step": 128 }, { "epoch": 0.04459039059799516, "grad_norm": 0.817494247455751, "learning_rate": 3.531034482758621e-05, "loss": 0.7219, "num_tokens": 301128017.0, "step": 129 }, { "epoch": 0.04493605254061528, "grad_norm": 0.7585436099018116, "learning_rate": 3.5586206896551725e-05, "loss": 0.7321, "num_tokens": 303506036.0, "step": 130 }, { "epoch": 0.0452817144832354, "grad_norm": 0.9678755528109797, "learning_rate": 3.586206896551725e-05, "loss": 0.7276, "num_tokens": 305918873.0, "step": 131 }, { "epoch": 0.045627376425855515, "grad_norm": 0.718133726568121, "learning_rate": 3.613793103448276e-05, "loss": 0.7251, "num_tokens": 308241872.0, "step": 132 }, { "epoch": 0.04597303836847563, "grad_norm": 1.0973484475823698, "learning_rate": 3.641379310344828e-05, "loss": 0.7379, "num_tokens": 310727507.0, "step": 133 }, { "epoch": 0.04631870031109575, "grad_norm": 0.7416516400117827, "learning_rate": 3.668965517241379e-05, "loss": 0.7346, "num_tokens": 313021983.0, "step": 134 }, { "epoch": 0.04666436225371587, "grad_norm": 1.3284478552185184, "learning_rate": 3.6965517241379315e-05, "loss": 0.7346, "num_tokens": 315493468.0, "step": 135 }, { "epoch": 0.04701002419633599, "grad_norm": 1.0152349598840011, "learning_rate": 3.724137931034483e-05, "loss": 0.7208, "num_tokens": 317840417.0, "step": 136 }, { "epoch": 0.0473556861389561, "grad_norm": 1.064480784927161, "learning_rate": 3.7517241379310345e-05, "loss": 0.7145, "num_tokens": 320098389.0, "step": 137 }, { "epoch": 0.047701348081576216, "grad_norm": 0.9745304292393031, "learning_rate": 3.779310344827587e-05, "loss": 0.7346, "num_tokens": 322339251.0, "step": 138 }, { "epoch": 0.048047010024196334, "grad_norm": 1.0646363969058728, "learning_rate": 3.806896551724138e-05, "loss": 0.7034, "num_tokens": 324697225.0, "step": 139 }, { "epoch": 0.04839267196681645, "grad_norm": 0.8084137804059474, "learning_rate": 3.83448275862069e-05, "loss": 0.7181, "num_tokens": 326976615.0, "step": 140 }, { "epoch": 0.04873833390943657, "grad_norm": 1.0998528107081083, "learning_rate": 3.862068965517242e-05, "loss": 0.7248, "num_tokens": 329339438.0, "step": 141 }, { "epoch": 0.04908399585205669, "grad_norm": 1.0307723947265541, "learning_rate": 3.8896551724137935e-05, "loss": 0.7212, "num_tokens": 331716104.0, "step": 142 }, { "epoch": 0.049429657794676805, "grad_norm": 1.0104830173773098, "learning_rate": 3.917241379310345e-05, "loss": 0.7343, "num_tokens": 334159866.0, "step": 143 }, { "epoch": 0.04977531973729692, "grad_norm": 1.2284184845848667, "learning_rate": 3.9448275862068966e-05, "loss": 0.7398, "num_tokens": 336467657.0, "step": 144 }, { "epoch": 0.05012098167991704, "grad_norm": 0.8918997830916775, "learning_rate": 3.972413793103449e-05, "loss": 0.7313, "num_tokens": 338801323.0, "step": 145 }, { "epoch": 0.05046664362253716, "grad_norm": 1.082726102460824, "learning_rate": 4e-05, "loss": 0.7263, "num_tokens": 341162307.0, "step": 146 }, { "epoch": 0.05081230556515728, "grad_norm": 0.8971107129336452, "learning_rate": 4.027586206896552e-05, "loss": 0.7235, "num_tokens": 343412060.0, "step": 147 }, { "epoch": 0.051157967507777395, "grad_norm": 0.9412664497744425, "learning_rate": 4.055172413793104e-05, "loss": 0.7048, "num_tokens": 345675733.0, "step": 148 }, { "epoch": 0.05150362945039751, "grad_norm": 0.8661134234474723, "learning_rate": 4.0827586206896556e-05, "loss": 0.7193, "num_tokens": 348009471.0, "step": 149 }, { "epoch": 0.05184929139301763, "grad_norm": 0.7772855074917321, "learning_rate": 4.110344827586207e-05, "loss": 0.7195, "num_tokens": 350251681.0, "step": 150 }, { "epoch": 0.05219495333563775, "grad_norm": 0.7497371938056758, "learning_rate": 4.137931034482759e-05, "loss": 0.6977, "num_tokens": 352485472.0, "step": 151 }, { "epoch": 0.05254061527825787, "grad_norm": 0.7819238079711345, "learning_rate": 4.165517241379311e-05, "loss": 0.7238, "num_tokens": 354839391.0, "step": 152 }, { "epoch": 0.05288627722087798, "grad_norm": 0.8592104884503061, "learning_rate": 4.193103448275863e-05, "loss": 0.7108, "num_tokens": 357043632.0, "step": 153 }, { "epoch": 0.053231939163498096, "grad_norm": 0.7706232047174564, "learning_rate": 4.2206896551724146e-05, "loss": 0.7029, "num_tokens": 359361396.0, "step": 154 }, { "epoch": 0.053577601106118214, "grad_norm": 0.9870020438536392, "learning_rate": 4.2482758620689655e-05, "loss": 0.7304, "num_tokens": 361740885.0, "step": 155 }, { "epoch": 0.05392326304873833, "grad_norm": 1.0554218992326692, "learning_rate": 4.275862068965517e-05, "loss": 0.7145, "num_tokens": 363957526.0, "step": 156 }, { "epoch": 0.05426892499135845, "grad_norm": 0.582139071481871, "learning_rate": 4.303448275862069e-05, "loss": 0.7146, "num_tokens": 366227335.0, "step": 157 }, { "epoch": 0.05461458693397857, "grad_norm": 1.068448965677722, "learning_rate": 4.331034482758621e-05, "loss": 0.7232, "num_tokens": 368527770.0, "step": 158 }, { "epoch": 0.054960248876598686, "grad_norm": 0.5329171240099785, "learning_rate": 4.358620689655173e-05, "loss": 0.7035, "num_tokens": 370790507.0, "step": 159 }, { "epoch": 0.055305910819218804, "grad_norm": 1.2586026783810411, "learning_rate": 4.3862068965517245e-05, "loss": 0.7326, "num_tokens": 373067015.0, "step": 160 }, { "epoch": 0.05565157276183892, "grad_norm": 0.9837880569753787, "learning_rate": 4.413793103448276e-05, "loss": 0.7167, "num_tokens": 375501172.0, "step": 161 }, { "epoch": 0.05599723470445904, "grad_norm": 0.9384633099741303, "learning_rate": 4.441379310344828e-05, "loss": 0.7226, "num_tokens": 377854677.0, "step": 162 }, { "epoch": 0.05634289664707916, "grad_norm": 1.1519738556824626, "learning_rate": 4.46896551724138e-05, "loss": 0.7362, "num_tokens": 380287197.0, "step": 163 }, { "epoch": 0.056688558589699276, "grad_norm": 1.043350430252056, "learning_rate": 4.496551724137931e-05, "loss": 0.7272, "num_tokens": 382556344.0, "step": 164 }, { "epoch": 0.057034220532319393, "grad_norm": 1.1247382477517336, "learning_rate": 4.5241379310344835e-05, "loss": 0.7083, "num_tokens": 384903188.0, "step": 165 }, { "epoch": 0.05737988247493951, "grad_norm": 0.8855937671393632, "learning_rate": 4.551724137931035e-05, "loss": 0.7303, "num_tokens": 387251917.0, "step": 166 }, { "epoch": 0.05772554441755963, "grad_norm": 1.1411647243629852, "learning_rate": 4.5793103448275865e-05, "loss": 0.6884, "num_tokens": 389461686.0, "step": 167 }, { "epoch": 0.05807120636017975, "grad_norm": 0.8973573737935918, "learning_rate": 4.606896551724139e-05, "loss": 0.7232, "num_tokens": 391811582.0, "step": 168 }, { "epoch": 0.05841686830279986, "grad_norm": 1.1526902094508222, "learning_rate": 4.63448275862069e-05, "loss": 0.7206, "num_tokens": 394148853.0, "step": 169 }, { "epoch": 0.058762530245419976, "grad_norm": 1.0746984364432095, "learning_rate": 4.6620689655172425e-05, "loss": 0.7144, "num_tokens": 396355036.0, "step": 170 }, { "epoch": 0.059108192188040094, "grad_norm": 0.9518903963086779, "learning_rate": 4.689655172413793e-05, "loss": 0.703, "num_tokens": 398661260.0, "step": 171 }, { "epoch": 0.05945385413066021, "grad_norm": 0.9995903536249783, "learning_rate": 4.717241379310345e-05, "loss": 0.7163, "num_tokens": 400957583.0, "step": 172 }, { "epoch": 0.05979951607328033, "grad_norm": 0.8203958785857348, "learning_rate": 4.7448275862068964e-05, "loss": 0.7223, "num_tokens": 403369717.0, "step": 173 }, { "epoch": 0.06014517801590045, "grad_norm": 1.2147731990281987, "learning_rate": 4.7724137931034486e-05, "loss": 0.7222, "num_tokens": 405780049.0, "step": 174 }, { "epoch": 0.060490839958520566, "grad_norm": 0.9022809717301816, "learning_rate": 4.8e-05, "loss": 0.7013, "num_tokens": 408140167.0, "step": 175 }, { "epoch": 0.060836501901140684, "grad_norm": 1.050648714230654, "learning_rate": 4.827586206896552e-05, "loss": 0.7138, "num_tokens": 410429807.0, "step": 176 }, { "epoch": 0.0611821638437608, "grad_norm": 0.8145480570320937, "learning_rate": 4.855172413793104e-05, "loss": 0.7149, "num_tokens": 412753090.0, "step": 177 }, { "epoch": 0.06152782578638092, "grad_norm": 1.0054753624866357, "learning_rate": 4.8827586206896554e-05, "loss": 0.7104, "num_tokens": 415053865.0, "step": 178 }, { "epoch": 0.06187348772900104, "grad_norm": 0.8548827349446674, "learning_rate": 4.9103448275862076e-05, "loss": 0.7128, "num_tokens": 417284862.0, "step": 179 }, { "epoch": 0.062219149671621156, "grad_norm": 0.6002105552755639, "learning_rate": 4.937931034482759e-05, "loss": 0.7118, "num_tokens": 419578881.0, "step": 180 }, { "epoch": 0.06256481161424127, "grad_norm": 1.107683107673729, "learning_rate": 4.9655172413793107e-05, "loss": 0.7282, "num_tokens": 422066367.0, "step": 181 }, { "epoch": 0.06291047355686138, "grad_norm": 0.8850477934882971, "learning_rate": 4.993103448275863e-05, "loss": 0.7062, "num_tokens": 424541442.0, "step": 182 }, { "epoch": 0.0632561354994815, "grad_norm": 1.0418647998717137, "learning_rate": 5.0206896551724144e-05, "loss": 0.7164, "num_tokens": 426799789.0, "step": 183 }, { "epoch": 0.06360179744210162, "grad_norm": 0.9866322854571545, "learning_rate": 5.048275862068966e-05, "loss": 0.7095, "num_tokens": 429109812.0, "step": 184 }, { "epoch": 0.06394745938472174, "grad_norm": 0.8220500322939559, "learning_rate": 5.075862068965518e-05, "loss": 0.711, "num_tokens": 431319178.0, "step": 185 }, { "epoch": 0.06429312132734186, "grad_norm": 1.1643380349268293, "learning_rate": 5.10344827586207e-05, "loss": 0.7034, "num_tokens": 433591378.0, "step": 186 }, { "epoch": 0.06463878326996197, "grad_norm": 0.7953856479900898, "learning_rate": 5.1310344827586205e-05, "loss": 0.7172, "num_tokens": 436047642.0, "step": 187 }, { "epoch": 0.06498444521258209, "grad_norm": 1.3416422670812653, "learning_rate": 5.158620689655173e-05, "loss": 0.7138, "num_tokens": 438434044.0, "step": 188 }, { "epoch": 0.06533010715520221, "grad_norm": 0.9030922035012002, "learning_rate": 5.186206896551724e-05, "loss": 0.7141, "num_tokens": 440839385.0, "step": 189 }, { "epoch": 0.06567576909782233, "grad_norm": 1.4125631245405847, "learning_rate": 5.213793103448276e-05, "loss": 0.7134, "num_tokens": 443090381.0, "step": 190 }, { "epoch": 0.06602143104044245, "grad_norm": 1.078421315455212, "learning_rate": 5.241379310344828e-05, "loss": 0.71, "num_tokens": 445465216.0, "step": 191 }, { "epoch": 0.06636709298306256, "grad_norm": 1.1339002364196766, "learning_rate": 5.2689655172413795e-05, "loss": 0.71, "num_tokens": 447860655.0, "step": 192 }, { "epoch": 0.06671275492568268, "grad_norm": 0.8596178566442743, "learning_rate": 5.296551724137932e-05, "loss": 0.7057, "num_tokens": 450184646.0, "step": 193 }, { "epoch": 0.0670584168683028, "grad_norm": 1.3790678568270909, "learning_rate": 5.324137931034483e-05, "loss": 0.7035, "num_tokens": 452569957.0, "step": 194 }, { "epoch": 0.06740407881092292, "grad_norm": 1.0390787748036805, "learning_rate": 5.351724137931035e-05, "loss": 0.7319, "num_tokens": 455024875.0, "step": 195 }, { "epoch": 0.06774974075354304, "grad_norm": 1.309925274269295, "learning_rate": 5.379310344827587e-05, "loss": 0.7166, "num_tokens": 457536389.0, "step": 196 }, { "epoch": 0.06809540269616315, "grad_norm": 1.0203712559081182, "learning_rate": 5.4068965517241385e-05, "loss": 0.7138, "num_tokens": 459901229.0, "step": 197 }, { "epoch": 0.06844106463878327, "grad_norm": 1.0971769212525957, "learning_rate": 5.43448275862069e-05, "loss": 0.7201, "num_tokens": 462218756.0, "step": 198 }, { "epoch": 0.06878672658140339, "grad_norm": 0.876460559693676, "learning_rate": 5.462068965517242e-05, "loss": 0.6966, "num_tokens": 464594880.0, "step": 199 }, { "epoch": 0.06913238852402351, "grad_norm": 1.1316941943804402, "learning_rate": 5.489655172413794e-05, "loss": 0.7152, "num_tokens": 466958545.0, "step": 200 }, { "epoch": 0.06947805046664363, "grad_norm": 0.855185780085789, "learning_rate": 5.517241379310345e-05, "loss": 0.7089, "num_tokens": 469247028.0, "step": 201 }, { "epoch": 0.06982371240926374, "grad_norm": 1.1055725145143143, "learning_rate": 5.5448275862068975e-05, "loss": 0.6987, "num_tokens": 471483053.0, "step": 202 }, { "epoch": 0.07016937435188386, "grad_norm": 0.9170336897065865, "learning_rate": 5.5724137931034484e-05, "loss": 0.7236, "num_tokens": 473819939.0, "step": 203 }, { "epoch": 0.07051503629450398, "grad_norm": 0.9278362049523065, "learning_rate": 5.6e-05, "loss": 0.7055, "num_tokens": 476112202.0, "step": 204 }, { "epoch": 0.0708606982371241, "grad_norm": 0.9449160154937349, "learning_rate": 5.627586206896552e-05, "loss": 0.7072, "num_tokens": 478404403.0, "step": 205 }, { "epoch": 0.07120636017974422, "grad_norm": 0.6526098622181347, "learning_rate": 5.6551724137931037e-05, "loss": 0.7235, "num_tokens": 480796853.0, "step": 206 }, { "epoch": 0.07155202212236433, "grad_norm": 1.31179922805551, "learning_rate": 5.682758620689655e-05, "loss": 0.7034, "num_tokens": 483199501.0, "step": 207 }, { "epoch": 0.07189768406498445, "grad_norm": 0.8958524832278518, "learning_rate": 5.7103448275862074e-05, "loss": 0.7189, "num_tokens": 485614054.0, "step": 208 }, { "epoch": 0.07224334600760456, "grad_norm": 1.2912439498883648, "learning_rate": 5.737931034482759e-05, "loss": 0.7083, "num_tokens": 487827562.0, "step": 209 }, { "epoch": 0.07258900795022467, "grad_norm": 1.0421251696269234, "learning_rate": 5.765517241379311e-05, "loss": 0.7031, "num_tokens": 490059153.0, "step": 210 }, { "epoch": 0.07293466989284479, "grad_norm": 1.0197574543609216, "learning_rate": 5.7931034482758627e-05, "loss": 0.7186, "num_tokens": 492473358.0, "step": 211 }, { "epoch": 0.07328033183546491, "grad_norm": 0.8318654597079473, "learning_rate": 5.820689655172414e-05, "loss": 0.7182, "num_tokens": 494860369.0, "step": 212 }, { "epoch": 0.07362599377808503, "grad_norm": 1.030706449769229, "learning_rate": 5.8482758620689664e-05, "loss": 0.7, "num_tokens": 497170523.0, "step": 213 }, { "epoch": 0.07397165572070515, "grad_norm": 0.7476341956398788, "learning_rate": 5.875862068965518e-05, "loss": 0.7001, "num_tokens": 499548947.0, "step": 214 }, { "epoch": 0.07431731766332526, "grad_norm": 0.8966754075924577, "learning_rate": 5.9034482758620695e-05, "loss": 0.7171, "num_tokens": 502025053.0, "step": 215 }, { "epoch": 0.07466297960594538, "grad_norm": 0.7105787426449255, "learning_rate": 5.931034482758622e-05, "loss": 0.694, "num_tokens": 504282189.0, "step": 216 }, { "epoch": 0.0750086415485655, "grad_norm": 0.8647591899831232, "learning_rate": 5.958620689655173e-05, "loss": 0.7126, "num_tokens": 506598189.0, "step": 217 }, { "epoch": 0.07535430349118562, "grad_norm": 0.6779689764303697, "learning_rate": 5.986206896551725e-05, "loss": 0.6988, "num_tokens": 508892435.0, "step": 218 }, { "epoch": 0.07569996543380574, "grad_norm": 0.7049539628997222, "learning_rate": 6.013793103448276e-05, "loss": 0.6986, "num_tokens": 511163860.0, "step": 219 }, { "epoch": 0.07604562737642585, "grad_norm": 0.9144778322979157, "learning_rate": 6.041379310344828e-05, "loss": 0.6948, "num_tokens": 513524515.0, "step": 220 }, { "epoch": 0.07639128931904597, "grad_norm": 0.9507080106150033, "learning_rate": 6.068965517241379e-05, "loss": 0.704, "num_tokens": 515990605.0, "step": 221 }, { "epoch": 0.07673695126166609, "grad_norm": 0.9946450620958396, "learning_rate": 6.0965517241379315e-05, "loss": 0.7101, "num_tokens": 518218266.0, "step": 222 }, { "epoch": 0.07708261320428621, "grad_norm": 0.5988102158537464, "learning_rate": 6.124137931034483e-05, "loss": 0.7101, "num_tokens": 520506349.0, "step": 223 }, { "epoch": 0.07742827514690633, "grad_norm": 1.7159339172962982, "learning_rate": 6.151724137931035e-05, "loss": 0.7036, "num_tokens": 522937983.0, "step": 224 }, { "epoch": 0.07777393708952644, "grad_norm": 1.1605650590574434, "learning_rate": 6.179310344827586e-05, "loss": 0.7114, "num_tokens": 525288490.0, "step": 225 }, { "epoch": 0.07811959903214656, "grad_norm": 1.984813505030011, "learning_rate": 6.206896551724138e-05, "loss": 0.7215, "num_tokens": 527601511.0, "step": 226 }, { "epoch": 0.07846526097476668, "grad_norm": 1.9410175057220138, "learning_rate": 6.23448275862069e-05, "loss": 0.7365, "num_tokens": 530003533.0, "step": 227 }, { "epoch": 0.0788109229173868, "grad_norm": 1.2043718596372153, "learning_rate": 6.262068965517241e-05, "loss": 0.7026, "num_tokens": 532317231.0, "step": 228 }, { "epoch": 0.07915658486000691, "grad_norm": 1.548664678240791, "learning_rate": 6.289655172413794e-05, "loss": 0.6977, "num_tokens": 534636808.0, "step": 229 }, { "epoch": 0.07950224680262703, "grad_norm": 1.1504835175618584, "learning_rate": 6.317241379310346e-05, "loss": 0.7124, "num_tokens": 536967339.0, "step": 230 }, { "epoch": 0.07984790874524715, "grad_norm": 1.5900417245686578, "learning_rate": 6.344827586206897e-05, "loss": 0.6941, "num_tokens": 539157144.0, "step": 231 }, { "epoch": 0.08019357068786727, "grad_norm": 1.4941892722346075, "learning_rate": 6.372413793103449e-05, "loss": 0.7118, "num_tokens": 541448568.0, "step": 232 }, { "epoch": 0.08053923263048739, "grad_norm": 1.1506457919531141, "learning_rate": 6.400000000000001e-05, "loss": 0.7173, "num_tokens": 543757589.0, "step": 233 }, { "epoch": 0.0808848945731075, "grad_norm": 1.3591672059610354, "learning_rate": 6.427586206896553e-05, "loss": 0.709, "num_tokens": 546055736.0, "step": 234 }, { "epoch": 0.08123055651572762, "grad_norm": 1.099852732427473, "learning_rate": 6.455172413793104e-05, "loss": 0.7136, "num_tokens": 548490878.0, "step": 235 }, { "epoch": 0.08157621845834774, "grad_norm": 1.1866711082429422, "learning_rate": 6.482758620689655e-05, "loss": 0.6989, "num_tokens": 550795757.0, "step": 236 }, { "epoch": 0.08192188040096786, "grad_norm": 1.0683161397356664, "learning_rate": 6.510344827586207e-05, "loss": 0.6915, "num_tokens": 553237843.0, "step": 237 }, { "epoch": 0.08226754234358798, "grad_norm": 1.0014269061191867, "learning_rate": 6.53793103448276e-05, "loss": 0.7067, "num_tokens": 555609459.0, "step": 238 }, { "epoch": 0.0826132042862081, "grad_norm": 0.7434483096314644, "learning_rate": 6.56551724137931e-05, "loss": 0.7027, "num_tokens": 557980720.0, "step": 239 }, { "epoch": 0.08295886622882821, "grad_norm": 1.0347156189570355, "learning_rate": 6.593103448275862e-05, "loss": 0.6948, "num_tokens": 560292519.0, "step": 240 }, { "epoch": 0.08330452817144833, "grad_norm": 1.1930974097925942, "learning_rate": 6.620689655172415e-05, "loss": 0.6944, "num_tokens": 562637251.0, "step": 241 }, { "epoch": 0.08365019011406843, "grad_norm": 0.6666553380523713, "learning_rate": 6.648275862068966e-05, "loss": 0.6945, "num_tokens": 564999649.0, "step": 242 }, { "epoch": 0.08399585205668855, "grad_norm": 1.9243727581703054, "learning_rate": 6.675862068965518e-05, "loss": 0.6772, "num_tokens": 567304230.0, "step": 243 }, { "epoch": 0.08434151399930867, "grad_norm": 1.3840601785141495, "learning_rate": 6.70344827586207e-05, "loss": 0.7033, "num_tokens": 569629888.0, "step": 244 }, { "epoch": 0.08468717594192879, "grad_norm": 1.9433935671131355, "learning_rate": 6.731034482758621e-05, "loss": 0.7017, "num_tokens": 571856744.0, "step": 245 }, { "epoch": 0.0850328378845489, "grad_norm": 1.7727674338865027, "learning_rate": 6.758620689655173e-05, "loss": 0.7033, "num_tokens": 574162249.0, "step": 246 }, { "epoch": 0.08537849982716902, "grad_norm": 1.3722317873887826, "learning_rate": 6.786206896551725e-05, "loss": 0.7165, "num_tokens": 576538644.0, "step": 247 }, { "epoch": 0.08572416176978914, "grad_norm": 1.1680037272703208, "learning_rate": 6.813793103448276e-05, "loss": 0.7074, "num_tokens": 579013717.0, "step": 248 }, { "epoch": 0.08606982371240926, "grad_norm": 1.5957327301656017, "learning_rate": 6.841379310344828e-05, "loss": 0.6988, "num_tokens": 581286691.0, "step": 249 }, { "epoch": 0.08641548565502938, "grad_norm": 1.2046009460189067, "learning_rate": 6.86896551724138e-05, "loss": 0.7019, "num_tokens": 583646190.0, "step": 250 }, { "epoch": 0.0867611475976495, "grad_norm": 1.7449366521877085, "learning_rate": 6.896551724137931e-05, "loss": 0.7005, "num_tokens": 585917794.0, "step": 251 }, { "epoch": 0.08710680954026961, "grad_norm": 1.4306064719383027, "learning_rate": 6.924137931034484e-05, "loss": 0.6924, "num_tokens": 588138109.0, "step": 252 }, { "epoch": 0.08745247148288973, "grad_norm": 1.5456545611198949, "learning_rate": 6.951724137931034e-05, "loss": 0.6958, "num_tokens": 590434348.0, "step": 253 }, { "epoch": 0.08779813342550985, "grad_norm": 1.3933380155584882, "learning_rate": 6.979310344827587e-05, "loss": 0.7071, "num_tokens": 592866511.0, "step": 254 }, { "epoch": 0.08814379536812997, "grad_norm": 1.487962732840059, "learning_rate": 7.006896551724139e-05, "loss": 0.7021, "num_tokens": 595105372.0, "step": 255 }, { "epoch": 0.08848945731075009, "grad_norm": 1.2753789290830655, "learning_rate": 7.03448275862069e-05, "loss": 0.6908, "num_tokens": 597453436.0, "step": 256 }, { "epoch": 0.0888351192533702, "grad_norm": 1.298149849085115, "learning_rate": 7.062068965517242e-05, "loss": 0.7023, "num_tokens": 599772241.0, "step": 257 }, { "epoch": 0.08918078119599032, "grad_norm": 1.217125692018453, "learning_rate": 7.089655172413794e-05, "loss": 0.7012, "num_tokens": 602185701.0, "step": 258 }, { "epoch": 0.08952644313861044, "grad_norm": 1.1883730812217992, "learning_rate": 7.117241379310345e-05, "loss": 0.6959, "num_tokens": 604567543.0, "step": 259 }, { "epoch": 0.08987210508123056, "grad_norm": 1.0831663735834525, "learning_rate": 7.144827586206897e-05, "loss": 0.6934, "num_tokens": 606913192.0, "step": 260 }, { "epoch": 0.09021776702385068, "grad_norm": 0.9746730620076356, "learning_rate": 7.17241379310345e-05, "loss": 0.7008, "num_tokens": 609282261.0, "step": 261 }, { "epoch": 0.0905634289664708, "grad_norm": 0.7402597608567184, "learning_rate": 7.2e-05, "loss": 0.6883, "num_tokens": 611640132.0, "step": 262 }, { "epoch": 0.09090909090909091, "grad_norm": 1.0055165785713271, "learning_rate": 7.227586206896552e-05, "loss": 0.7139, "num_tokens": 614045141.0, "step": 263 }, { "epoch": 0.09125475285171103, "grad_norm": 0.5464502547515364, "learning_rate": 7.255172413793105e-05, "loss": 0.7089, "num_tokens": 616404508.0, "step": 264 }, { "epoch": 0.09160041479433115, "grad_norm": 1.135509676557934, "learning_rate": 7.282758620689655e-05, "loss": 0.6976, "num_tokens": 618757455.0, "step": 265 }, { "epoch": 0.09194607673695127, "grad_norm": 0.7458841157543672, "learning_rate": 7.310344827586208e-05, "loss": 0.6889, "num_tokens": 621052622.0, "step": 266 }, { "epoch": 0.09229173867957138, "grad_norm": 1.2455439399749555, "learning_rate": 7.337931034482759e-05, "loss": 0.7043, "num_tokens": 623339412.0, "step": 267 }, { "epoch": 0.0926374006221915, "grad_norm": 1.138316413962965, "learning_rate": 7.365517241379311e-05, "loss": 0.6958, "num_tokens": 625647172.0, "step": 268 }, { "epoch": 0.09298306256481162, "grad_norm": 0.8726454283956959, "learning_rate": 7.393103448275863e-05, "loss": 0.6806, "num_tokens": 627913561.0, "step": 269 }, { "epoch": 0.09332872450743174, "grad_norm": 1.0876201492366608, "learning_rate": 7.420689655172414e-05, "loss": 0.7119, "num_tokens": 630197340.0, "step": 270 }, { "epoch": 0.09367438645005186, "grad_norm": 0.7733630637075924, "learning_rate": 7.448275862068966e-05, "loss": 0.6931, "num_tokens": 632499021.0, "step": 271 }, { "epoch": 0.09402004839267197, "grad_norm": 1.111664877807677, "learning_rate": 7.475862068965518e-05, "loss": 0.6797, "num_tokens": 634802444.0, "step": 272 }, { "epoch": 0.09436571033529209, "grad_norm": 0.9122161783683577, "learning_rate": 7.503448275862069e-05, "loss": 0.6848, "num_tokens": 637024078.0, "step": 273 }, { "epoch": 0.0947113722779122, "grad_norm": 0.9000420275340484, "learning_rate": 7.531034482758621e-05, "loss": 0.7097, "num_tokens": 639278981.0, "step": 274 }, { "epoch": 0.09505703422053231, "grad_norm": 0.9927997257677583, "learning_rate": 7.558620689655173e-05, "loss": 0.7084, "num_tokens": 641559187.0, "step": 275 }, { "epoch": 0.09540269616315243, "grad_norm": 0.595089510074961, "learning_rate": 7.586206896551724e-05, "loss": 0.6981, "num_tokens": 643824036.0, "step": 276 }, { "epoch": 0.09574835810577255, "grad_norm": 1.454341628047847, "learning_rate": 7.613793103448277e-05, "loss": 0.7075, "num_tokens": 646193569.0, "step": 277 }, { "epoch": 0.09609402004839267, "grad_norm": 1.2806872879871047, "learning_rate": 7.641379310344829e-05, "loss": 0.7046, "num_tokens": 648530275.0, "step": 278 }, { "epoch": 0.09643968199101279, "grad_norm": 0.8952511336846225, "learning_rate": 7.66896551724138e-05, "loss": 0.6971, "num_tokens": 650859888.0, "step": 279 }, { "epoch": 0.0967853439336329, "grad_norm": 1.3874720133983718, "learning_rate": 7.696551724137932e-05, "loss": 0.6986, "num_tokens": 653237461.0, "step": 280 }, { "epoch": 0.09713100587625302, "grad_norm": 0.8975662650525933, "learning_rate": 7.724137931034484e-05, "loss": 0.697, "num_tokens": 655501633.0, "step": 281 }, { "epoch": 0.09747666781887314, "grad_norm": 1.2127287102857875, "learning_rate": 7.751724137931035e-05, "loss": 0.698, "num_tokens": 657892341.0, "step": 282 }, { "epoch": 0.09782232976149326, "grad_norm": 1.0684794860622633, "learning_rate": 7.779310344827587e-05, "loss": 0.7027, "num_tokens": 660211835.0, "step": 283 }, { "epoch": 0.09816799170411337, "grad_norm": 0.9724390611007337, "learning_rate": 7.806896551724138e-05, "loss": 0.7086, "num_tokens": 662495958.0, "step": 284 }, { "epoch": 0.09851365364673349, "grad_norm": 0.9890746582625942, "learning_rate": 7.83448275862069e-05, "loss": 0.682, "num_tokens": 664825607.0, "step": 285 }, { "epoch": 0.09885931558935361, "grad_norm": 0.9521052208344568, "learning_rate": 7.862068965517242e-05, "loss": 0.6967, "num_tokens": 667185621.0, "step": 286 }, { "epoch": 0.09920497753197373, "grad_norm": 0.5872551608920765, "learning_rate": 7.889655172413793e-05, "loss": 0.7138, "num_tokens": 669519354.0, "step": 287 }, { "epoch": 0.09955063947459385, "grad_norm": 1.0873675042837323, "learning_rate": 7.917241379310345e-05, "loss": 0.6948, "num_tokens": 671846829.0, "step": 288 }, { "epoch": 0.09989630141721396, "grad_norm": 0.8648169705503227, "learning_rate": 7.944827586206898e-05, "loss": 0.6924, "num_tokens": 674148172.0, "step": 289 }, { "epoch": 0.10024196335983408, "grad_norm": 0.9216484637326665, "learning_rate": 7.972413793103448e-05, "loss": 0.7007, "num_tokens": 676439026.0, "step": 290 }, { "epoch": 0.1005876253024542, "grad_norm": 0.8906713041792911, "learning_rate": 8e-05, "loss": 0.7089, "num_tokens": 678787516.0, "step": 291 }, { "epoch": 0.10093328724507432, "grad_norm": 0.8857322378121782, "learning_rate": 7.999997086755005e-05, "loss": 0.6986, "num_tokens": 681216767.0, "step": 292 }, { "epoch": 0.10127894918769444, "grad_norm": 0.9887428852571887, "learning_rate": 7.99998834702426e-05, "loss": 0.6984, "num_tokens": 683618739.0, "step": 293 }, { "epoch": 0.10162461113031455, "grad_norm": 0.7687099383073623, "learning_rate": 7.999973780820498e-05, "loss": 0.6858, "num_tokens": 686091351.0, "step": 294 }, { "epoch": 0.10197027307293467, "grad_norm": 0.8070721314266507, "learning_rate": 7.999953388164936e-05, "loss": 0.6963, "num_tokens": 688492964.0, "step": 295 }, { "epoch": 0.10231593501555479, "grad_norm": 0.8901939076819861, "learning_rate": 7.999927169087277e-05, "loss": 0.7001, "num_tokens": 690889590.0, "step": 296 }, { "epoch": 0.10266159695817491, "grad_norm": 0.8617005315161973, "learning_rate": 7.999895123625716e-05, "loss": 0.6967, "num_tokens": 693225926.0, "step": 297 }, { "epoch": 0.10300725890079503, "grad_norm": 0.596840269904407, "learning_rate": 7.99985725182693e-05, "loss": 0.6885, "num_tokens": 695524308.0, "step": 298 }, { "epoch": 0.10335292084341514, "grad_norm": 1.0173827901463235, "learning_rate": 7.999813553746083e-05, "loss": 0.7039, "num_tokens": 697824611.0, "step": 299 }, { "epoch": 0.10369858278603526, "grad_norm": 0.8874306379940305, "learning_rate": 7.99976402944683e-05, "loss": 0.6888, "num_tokens": 700176190.0, "step": 300 }, { "epoch": 0.10404424472865538, "grad_norm": 0.7489150598188141, "learning_rate": 7.999708679001307e-05, "loss": 0.6942, "num_tokens": 702468839.0, "step": 301 }, { "epoch": 0.1043899066712755, "grad_norm": 1.157959309996441, "learning_rate": 7.999647502490141e-05, "loss": 0.6909, "num_tokens": 704785933.0, "step": 302 }, { "epoch": 0.10473556861389562, "grad_norm": 0.7119099167867494, "learning_rate": 7.999580500002444e-05, "loss": 0.708, "num_tokens": 707097800.0, "step": 303 }, { "epoch": 0.10508123055651573, "grad_norm": 1.1233987008921111, "learning_rate": 7.999507671635814e-05, "loss": 0.6944, "num_tokens": 709356858.0, "step": 304 }, { "epoch": 0.10542689249913585, "grad_norm": 0.7726501218242386, "learning_rate": 7.999429017496337e-05, "loss": 0.7018, "num_tokens": 711739782.0, "step": 305 }, { "epoch": 0.10577255444175596, "grad_norm": 1.3481051637224188, "learning_rate": 7.999344537698581e-05, "loss": 0.7111, "num_tokens": 714073801.0, "step": 306 }, { "epoch": 0.10611821638437607, "grad_norm": 1.0783453329483956, "learning_rate": 7.999254232365605e-05, "loss": 0.7191, "num_tokens": 716402217.0, "step": 307 }, { "epoch": 0.10646387832699619, "grad_norm": 1.0700151008928058, "learning_rate": 7.99915810162895e-05, "loss": 0.7191, "num_tokens": 718858280.0, "step": 308 }, { "epoch": 0.10680954026961631, "grad_norm": 1.0968425201547842, "learning_rate": 7.999056145628642e-05, "loss": 0.7117, "num_tokens": 721206186.0, "step": 309 }, { "epoch": 0.10715520221223643, "grad_norm": 0.8334145125925164, "learning_rate": 7.998948364513197e-05, "loss": 0.7115, "num_tokens": 723597671.0, "step": 310 }, { "epoch": 0.10750086415485655, "grad_norm": 1.554717653070663, "learning_rate": 7.99883475843961e-05, "loss": 0.7149, "num_tokens": 726083099.0, "step": 311 }, { "epoch": 0.10784652609747666, "grad_norm": 1.3739057268202843, "learning_rate": 7.998715327573365e-05, "loss": 0.7023, "num_tokens": 728498279.0, "step": 312 }, { "epoch": 0.10819218804009678, "grad_norm": 0.8360408141229195, "learning_rate": 7.99859007208843e-05, "loss": 0.6845, "num_tokens": 730825899.0, "step": 313 }, { "epoch": 0.1085378499827169, "grad_norm": 1.0524586634295543, "learning_rate": 7.998458992167255e-05, "loss": 0.6874, "num_tokens": 733034402.0, "step": 314 }, { "epoch": 0.10888351192533702, "grad_norm": 0.696929349143732, "learning_rate": 7.998322088000779e-05, "loss": 0.7127, "num_tokens": 735506694.0, "step": 315 }, { "epoch": 0.10922917386795714, "grad_norm": 1.0044519332658897, "learning_rate": 7.998179359788418e-05, "loss": 0.6951, "num_tokens": 737838781.0, "step": 316 }, { "epoch": 0.10957483581057725, "grad_norm": 0.7708743421728953, "learning_rate": 7.998030807738079e-05, "loss": 0.7129, "num_tokens": 740180538.0, "step": 317 }, { "epoch": 0.10992049775319737, "grad_norm": 0.8077441390505878, "learning_rate": 7.997876432066145e-05, "loss": 0.7047, "num_tokens": 742515800.0, "step": 318 }, { "epoch": 0.11026615969581749, "grad_norm": 0.7676928527488951, "learning_rate": 7.997716232997487e-05, "loss": 0.7054, "num_tokens": 744909576.0, "step": 319 }, { "epoch": 0.11061182163843761, "grad_norm": 0.8499848951090085, "learning_rate": 7.997550210765457e-05, "loss": 0.682, "num_tokens": 747289640.0, "step": 320 }, { "epoch": 0.11095748358105773, "grad_norm": 0.8416097531462226, "learning_rate": 7.997378365611888e-05, "loss": 0.6867, "num_tokens": 749562061.0, "step": 321 }, { "epoch": 0.11130314552367784, "grad_norm": 0.7444937632171702, "learning_rate": 7.997200697787098e-05, "loss": 0.6921, "num_tokens": 751840397.0, "step": 322 }, { "epoch": 0.11164880746629796, "grad_norm": 0.8015095549688365, "learning_rate": 7.997017207549884e-05, "loss": 0.6978, "num_tokens": 754179786.0, "step": 323 }, { "epoch": 0.11199446940891808, "grad_norm": 0.6871825143542051, "learning_rate": 7.996827895167523e-05, "loss": 0.6874, "num_tokens": 756503904.0, "step": 324 }, { "epoch": 0.1123401313515382, "grad_norm": 0.958497866215431, "learning_rate": 7.996632760915775e-05, "loss": 0.6911, "num_tokens": 758897268.0, "step": 325 }, { "epoch": 0.11268579329415832, "grad_norm": 0.6873164249150165, "learning_rate": 7.996431805078881e-05, "loss": 0.6883, "num_tokens": 761221450.0, "step": 326 }, { "epoch": 0.11303145523677843, "grad_norm": 1.0817764647800767, "learning_rate": 7.996225027949559e-05, "loss": 0.695, "num_tokens": 763515479.0, "step": 327 }, { "epoch": 0.11337711717939855, "grad_norm": 0.8611981023591838, "learning_rate": 7.996012429829011e-05, "loss": 0.6959, "num_tokens": 765899703.0, "step": 328 }, { "epoch": 0.11372277912201867, "grad_norm": 0.9859302359610935, "learning_rate": 7.995794011026911e-05, "loss": 0.6899, "num_tokens": 768199256.0, "step": 329 }, { "epoch": 0.11406844106463879, "grad_norm": 0.6293751886043986, "learning_rate": 7.995569771861421e-05, "loss": 0.693, "num_tokens": 770586189.0, "step": 330 }, { "epoch": 0.1144141030072589, "grad_norm": 1.041080382816587, "learning_rate": 7.995339712659173e-05, "loss": 0.6927, "num_tokens": 772987673.0, "step": 331 }, { "epoch": 0.11475976494987902, "grad_norm": 1.0792671151467503, "learning_rate": 7.995103833755279e-05, "loss": 0.6871, "num_tokens": 775411739.0, "step": 332 }, { "epoch": 0.11510542689249914, "grad_norm": 0.6136864084317608, "learning_rate": 7.99486213549333e-05, "loss": 0.6952, "num_tokens": 777772924.0, "step": 333 }, { "epoch": 0.11545108883511926, "grad_norm": 1.515540323199239, "learning_rate": 7.994614618225395e-05, "loss": 0.6921, "num_tokens": 779957935.0, "step": 334 }, { "epoch": 0.11579675077773938, "grad_norm": 0.8138618474246415, "learning_rate": 7.994361282312013e-05, "loss": 0.7008, "num_tokens": 782356219.0, "step": 335 }, { "epoch": 0.1161424127203595, "grad_norm": 1.6282351949417546, "learning_rate": 7.994102128122205e-05, "loss": 0.6846, "num_tokens": 784672696.0, "step": 336 }, { "epoch": 0.11648807466297961, "grad_norm": 0.9385244196948598, "learning_rate": 7.993837156033463e-05, "loss": 0.6934, "num_tokens": 787004666.0, "step": 337 }, { "epoch": 0.11683373660559972, "grad_norm": 2.0223773205509956, "learning_rate": 7.993566366431757e-05, "loss": 0.6999, "num_tokens": 789343205.0, "step": 338 }, { "epoch": 0.11717939854821983, "grad_norm": 1.6593409472481893, "learning_rate": 7.993289759711528e-05, "loss": 0.6956, "num_tokens": 791561573.0, "step": 339 }, { "epoch": 0.11752506049083995, "grad_norm": 1.4144489016925788, "learning_rate": 7.993007336275693e-05, "loss": 0.6733, "num_tokens": 793904206.0, "step": 340 }, { "epoch": 0.11787072243346007, "grad_norm": 1.442725495786414, "learning_rate": 7.992719096535636e-05, "loss": 0.6999, "num_tokens": 796192163.0, "step": 341 }, { "epoch": 0.11821638437608019, "grad_norm": 1.1210163277532932, "learning_rate": 7.992425040911223e-05, "loss": 0.7081, "num_tokens": 798562267.0, "step": 342 }, { "epoch": 0.1185620463187003, "grad_norm": 0.9521213158543966, "learning_rate": 7.992125169830782e-05, "loss": 0.6864, "num_tokens": 800897797.0, "step": 343 }, { "epoch": 0.11890770826132042, "grad_norm": 1.3346389933713596, "learning_rate": 7.99181948373112e-05, "loss": 0.6735, "num_tokens": 803216393.0, "step": 344 }, { "epoch": 0.11925337020394054, "grad_norm": 0.9460110716948854, "learning_rate": 7.991507983057512e-05, "loss": 0.6994, "num_tokens": 805426107.0, "step": 345 }, { "epoch": 0.11959903214656066, "grad_norm": 1.636088119332799, "learning_rate": 7.991190668263696e-05, "loss": 0.6859, "num_tokens": 807690662.0, "step": 346 }, { "epoch": 0.11994469408918078, "grad_norm": 1.4743393864626981, "learning_rate": 7.990867539811886e-05, "loss": 0.6857, "num_tokens": 810069568.0, "step": 347 }, { "epoch": 0.1202903560318009, "grad_norm": 1.144471301810938, "learning_rate": 7.990538598172766e-05, "loss": 0.6786, "num_tokens": 812403544.0, "step": 348 }, { "epoch": 0.12063601797442101, "grad_norm": 0.9041477229403952, "learning_rate": 7.990203843825481e-05, "loss": 0.7015, "num_tokens": 814712006.0, "step": 349 }, { "epoch": 0.12098167991704113, "grad_norm": 1.4901426257473107, "learning_rate": 7.98986327725765e-05, "loss": 0.7064, "num_tokens": 817152265.0, "step": 350 }, { "epoch": 0.12132734185966125, "grad_norm": 1.1130971912265126, "learning_rate": 7.989516898965351e-05, "loss": 0.6852, "num_tokens": 819439940.0, "step": 351 }, { "epoch": 0.12167300380228137, "grad_norm": 1.3889429435407803, "learning_rate": 7.989164709453134e-05, "loss": 0.6804, "num_tokens": 821782315.0, "step": 352 }, { "epoch": 0.12201866574490149, "grad_norm": 1.0771950560624082, "learning_rate": 7.988806709234012e-05, "loss": 0.6767, "num_tokens": 824217291.0, "step": 353 }, { "epoch": 0.1223643276875216, "grad_norm": 1.333269216833232, "learning_rate": 7.988442898829459e-05, "loss": 0.6942, "num_tokens": 826557598.0, "step": 354 }, { "epoch": 0.12270998963014172, "grad_norm": 1.146515196854587, "learning_rate": 7.988073278769417e-05, "loss": 0.6761, "num_tokens": 828871958.0, "step": 355 }, { "epoch": 0.12305565157276184, "grad_norm": 1.2418382185081331, "learning_rate": 7.987697849592287e-05, "loss": 0.6724, "num_tokens": 831102404.0, "step": 356 }, { "epoch": 0.12340131351538196, "grad_norm": 1.1675477808642603, "learning_rate": 7.987316611844933e-05, "loss": 0.6951, "num_tokens": 833571788.0, "step": 357 }, { "epoch": 0.12374697545800208, "grad_norm": 1.1212649136823032, "learning_rate": 7.986929566082681e-05, "loss": 0.6843, "num_tokens": 835768991.0, "step": 358 }, { "epoch": 0.1240926374006222, "grad_norm": 1.1001762048921309, "learning_rate": 7.986536712869315e-05, "loss": 0.68, "num_tokens": 838124224.0, "step": 359 }, { "epoch": 0.12443829934324231, "grad_norm": 1.230695121465166, "learning_rate": 7.986138052777079e-05, "loss": 0.6884, "num_tokens": 840391159.0, "step": 360 }, { "epoch": 0.12478396128586243, "grad_norm": 1.0495160483194945, "learning_rate": 7.985733586386678e-05, "loss": 0.6862, "num_tokens": 842783534.0, "step": 361 }, { "epoch": 0.12512962322848253, "grad_norm": 1.2386372164603723, "learning_rate": 7.985323314287272e-05, "loss": 0.69, "num_tokens": 845092329.0, "step": 362 }, { "epoch": 0.12547528517110265, "grad_norm": 1.209782625725921, "learning_rate": 7.984907237076479e-05, "loss": 0.6924, "num_tokens": 847347363.0, "step": 363 }, { "epoch": 0.12582094711372277, "grad_norm": 1.0125950689075163, "learning_rate": 7.984485355360371e-05, "loss": 0.6806, "num_tokens": 849680487.0, "step": 364 }, { "epoch": 0.1261666090563429, "grad_norm": 0.79931824340975, "learning_rate": 7.984057669753477e-05, "loss": 0.6876, "num_tokens": 851980604.0, "step": 365 }, { "epoch": 0.126512270998963, "grad_norm": 1.489545534073644, "learning_rate": 7.983624180878782e-05, "loss": 0.6825, "num_tokens": 854344038.0, "step": 366 }, { "epoch": 0.12685793294158312, "grad_norm": 1.25142217259151, "learning_rate": 7.983184889367718e-05, "loss": 0.6875, "num_tokens": 856723062.0, "step": 367 }, { "epoch": 0.12720359488420324, "grad_norm": 1.1365016908699426, "learning_rate": 7.982739795860177e-05, "loss": 0.6782, "num_tokens": 859055963.0, "step": 368 }, { "epoch": 0.12754925682682336, "grad_norm": 1.0574870548419908, "learning_rate": 7.982288901004496e-05, "loss": 0.6665, "num_tokens": 861363812.0, "step": 369 }, { "epoch": 0.12789491876944348, "grad_norm": 1.0996762728867564, "learning_rate": 7.981832205457467e-05, "loss": 0.69, "num_tokens": 863821983.0, "step": 370 }, { "epoch": 0.1282405807120636, "grad_norm": 1.0464576250091726, "learning_rate": 7.981369709884329e-05, "loss": 0.6844, "num_tokens": 866103060.0, "step": 371 }, { "epoch": 0.1285862426546837, "grad_norm": 1.1173050819432968, "learning_rate": 7.980901414958769e-05, "loss": 0.6753, "num_tokens": 868438447.0, "step": 372 }, { "epoch": 0.12893190459730383, "grad_norm": 0.9322098736898108, "learning_rate": 7.980427321362925e-05, "loss": 0.6734, "num_tokens": 870700785.0, "step": 373 }, { "epoch": 0.12927756653992395, "grad_norm": 1.3037928514085664, "learning_rate": 7.979947429787378e-05, "loss": 0.7043, "num_tokens": 873140472.0, "step": 374 }, { "epoch": 0.12962322848254407, "grad_norm": 1.0995241694419917, "learning_rate": 7.979461740931156e-05, "loss": 0.6937, "num_tokens": 875527359.0, "step": 375 }, { "epoch": 0.12996889042516419, "grad_norm": 1.1447417908456636, "learning_rate": 7.97897025550173e-05, "loss": 0.6875, "num_tokens": 877958509.0, "step": 376 }, { "epoch": 0.1303145523677843, "grad_norm": 0.9761571723754631, "learning_rate": 7.978472974215019e-05, "loss": 0.6559, "num_tokens": 880386334.0, "step": 377 }, { "epoch": 0.13066021431040442, "grad_norm": 1.1843923554722389, "learning_rate": 7.97796989779538e-05, "loss": 0.692, "num_tokens": 882691702.0, "step": 378 }, { "epoch": 0.13100587625302454, "grad_norm": 0.9786774014037657, "learning_rate": 7.977461026975612e-05, "loss": 0.675, "num_tokens": 884900181.0, "step": 379 }, { "epoch": 0.13135153819564466, "grad_norm": 1.2325916494671205, "learning_rate": 7.976946362496955e-05, "loss": 0.6795, "num_tokens": 887200224.0, "step": 380 }, { "epoch": 0.13169720013826478, "grad_norm": 1.1587378094828298, "learning_rate": 7.976425905109089e-05, "loss": 0.6735, "num_tokens": 889537696.0, "step": 381 }, { "epoch": 0.1320428620808849, "grad_norm": 0.9681318246124302, "learning_rate": 7.975899655570131e-05, "loss": 0.7018, "num_tokens": 891843780.0, "step": 382 }, { "epoch": 0.132388524023505, "grad_norm": 0.9417073841478321, "learning_rate": 7.975367614646637e-05, "loss": 0.6736, "num_tokens": 894232051.0, "step": 383 }, { "epoch": 0.13273418596612513, "grad_norm": 1.1778735880407256, "learning_rate": 7.974829783113594e-05, "loss": 0.6925, "num_tokens": 896630686.0, "step": 384 }, { "epoch": 0.13307984790874525, "grad_norm": 1.0275387052157854, "learning_rate": 7.974286161754433e-05, "loss": 0.6989, "num_tokens": 899019185.0, "step": 385 }, { "epoch": 0.13342550985136536, "grad_norm": 1.1894463153596653, "learning_rate": 7.973736751361008e-05, "loss": 0.6976, "num_tokens": 901319970.0, "step": 386 }, { "epoch": 0.13377117179398548, "grad_norm": 1.062230542849221, "learning_rate": 7.973181552733613e-05, "loss": 0.6849, "num_tokens": 903607467.0, "step": 387 }, { "epoch": 0.1341168337366056, "grad_norm": 1.1234192776019418, "learning_rate": 7.972620566680972e-05, "loss": 0.6935, "num_tokens": 906070989.0, "step": 388 }, { "epoch": 0.13446249567922572, "grad_norm": 0.9277483816737434, "learning_rate": 7.972053794020234e-05, "loss": 0.6837, "num_tokens": 908401611.0, "step": 389 }, { "epoch": 0.13480815762184584, "grad_norm": 1.2627915267734544, "learning_rate": 7.971481235576984e-05, "loss": 0.6668, "num_tokens": 910551071.0, "step": 390 }, { "epoch": 0.13515381956446595, "grad_norm": 1.1495363608257083, "learning_rate": 7.970902892185232e-05, "loss": 0.673, "num_tokens": 912960973.0, "step": 391 }, { "epoch": 0.13549948150708607, "grad_norm": 1.0304687786805438, "learning_rate": 7.970318764687412e-05, "loss": 0.6735, "num_tokens": 915252388.0, "step": 392 }, { "epoch": 0.1358451434497062, "grad_norm": 1.096791445079324, "learning_rate": 7.96972885393439e-05, "loss": 0.6733, "num_tokens": 917577646.0, "step": 393 }, { "epoch": 0.1361908053923263, "grad_norm": 0.8600198478534011, "learning_rate": 7.969133160785448e-05, "loss": 0.6648, "num_tokens": 919912390.0, "step": 394 }, { "epoch": 0.13653646733494643, "grad_norm": 0.699489837547432, "learning_rate": 7.968531686108295e-05, "loss": 0.677, "num_tokens": 922228131.0, "step": 395 }, { "epoch": 0.13688212927756654, "grad_norm": 1.1046038398651836, "learning_rate": 7.967924430779064e-05, "loss": 0.6762, "num_tokens": 924494616.0, "step": 396 }, { "epoch": 0.13722779122018666, "grad_norm": 0.6919660405873397, "learning_rate": 7.967311395682304e-05, "loss": 0.6808, "num_tokens": 926895580.0, "step": 397 }, { "epoch": 0.13757345316280678, "grad_norm": 1.4317507548793176, "learning_rate": 7.966692581710984e-05, "loss": 0.675, "num_tokens": 929189971.0, "step": 398 }, { "epoch": 0.1379191151054269, "grad_norm": 1.2476788183063163, "learning_rate": 7.966067989766492e-05, "loss": 0.6732, "num_tokens": 931590512.0, "step": 399 }, { "epoch": 0.13826477704804702, "grad_norm": 0.8410229284623401, "learning_rate": 7.965437620758632e-05, "loss": 0.6818, "num_tokens": 933845239.0, "step": 400 }, { "epoch": 0.13861043899066713, "grad_norm": 0.9060763725603179, "learning_rate": 7.964801475605622e-05, "loss": 0.6876, "num_tokens": 936162988.0, "step": 401 }, { "epoch": 0.13895610093328725, "grad_norm": 0.9462088723512216, "learning_rate": 7.964159555234097e-05, "loss": 0.6855, "num_tokens": 938488429.0, "step": 402 }, { "epoch": 0.13930176287590737, "grad_norm": 0.6655687384076501, "learning_rate": 7.963511860579099e-05, "loss": 0.677, "num_tokens": 940790506.0, "step": 403 }, { "epoch": 0.1396474248185275, "grad_norm": 1.0083914414843063, "learning_rate": 7.962858392584084e-05, "loss": 0.6835, "num_tokens": 943119813.0, "step": 404 }, { "epoch": 0.1399930867611476, "grad_norm": 0.6008615479710875, "learning_rate": 7.962199152200922e-05, "loss": 0.6779, "num_tokens": 945454900.0, "step": 405 }, { "epoch": 0.14033874870376772, "grad_norm": 1.0299508450761872, "learning_rate": 7.961534140389882e-05, "loss": 0.6818, "num_tokens": 947796607.0, "step": 406 }, { "epoch": 0.14068441064638784, "grad_norm": 0.9121016239330308, "learning_rate": 7.960863358119647e-05, "loss": 0.6804, "num_tokens": 950158087.0, "step": 407 }, { "epoch": 0.14103007258900796, "grad_norm": 1.0707388282989765, "learning_rate": 7.960186806367304e-05, "loss": 0.6799, "num_tokens": 952485997.0, "step": 408 }, { "epoch": 0.14137573453162808, "grad_norm": 1.0978738875444578, "learning_rate": 7.959504486118343e-05, "loss": 0.6886, "num_tokens": 954826839.0, "step": 409 }, { "epoch": 0.1417213964742482, "grad_norm": 0.8018759058420323, "learning_rate": 7.958816398366654e-05, "loss": 0.6824, "num_tokens": 957082957.0, "step": 410 }, { "epoch": 0.1420670584168683, "grad_norm": 0.7716008658301883, "learning_rate": 7.958122544114536e-05, "loss": 0.6657, "num_tokens": 959360496.0, "step": 411 }, { "epoch": 0.14241272035948843, "grad_norm": 0.7593118913119389, "learning_rate": 7.957422924372679e-05, "loss": 0.6726, "num_tokens": 961734966.0, "step": 412 }, { "epoch": 0.14275838230210855, "grad_norm": 0.6802341786838565, "learning_rate": 7.956717540160176e-05, "loss": 0.6825, "num_tokens": 964040647.0, "step": 413 }, { "epoch": 0.14310404424472867, "grad_norm": 0.6992548603440866, "learning_rate": 7.956006392504518e-05, "loss": 0.6612, "num_tokens": 966278004.0, "step": 414 }, { "epoch": 0.14344970618734879, "grad_norm": 0.6525582814227087, "learning_rate": 7.955289482441587e-05, "loss": 0.6786, "num_tokens": 968736836.0, "step": 415 }, { "epoch": 0.1437953681299689, "grad_norm": 0.8040583434216484, "learning_rate": 7.95456681101566e-05, "loss": 0.6862, "num_tokens": 971120246.0, "step": 416 }, { "epoch": 0.14414103007258902, "grad_norm": 0.7531792973959063, "learning_rate": 7.953838379279409e-05, "loss": 0.6784, "num_tokens": 973439021.0, "step": 417 }, { "epoch": 0.1444866920152091, "grad_norm": 1.0098266689438589, "learning_rate": 7.953104188293892e-05, "loss": 0.681, "num_tokens": 975792179.0, "step": 418 }, { "epoch": 0.14483235395782923, "grad_norm": 0.5490089234056582, "learning_rate": 7.952364239128564e-05, "loss": 0.6614, "num_tokens": 978085595.0, "step": 419 }, { "epoch": 0.14517801590044935, "grad_norm": 1.1613951853625943, "learning_rate": 7.951618532861257e-05, "loss": 0.6878, "num_tokens": 980513903.0, "step": 420 }, { "epoch": 0.14552367784306947, "grad_norm": 0.8329947426192129, "learning_rate": 7.950867070578195e-05, "loss": 0.676, "num_tokens": 982886030.0, "step": 421 }, { "epoch": 0.14586933978568958, "grad_norm": 1.190100036888434, "learning_rate": 7.950109853373988e-05, "loss": 0.665, "num_tokens": 985210567.0, "step": 422 }, { "epoch": 0.1462150017283097, "grad_norm": 0.8935123344795749, "learning_rate": 7.949346882351627e-05, "loss": 0.6721, "num_tokens": 987612333.0, "step": 423 }, { "epoch": 0.14656066367092982, "grad_norm": 1.0971269133187929, "learning_rate": 7.94857815862248e-05, "loss": 0.6882, "num_tokens": 989941853.0, "step": 424 }, { "epoch": 0.14690632561354994, "grad_norm": 0.9269661639824344, "learning_rate": 7.947803683306303e-05, "loss": 0.6944, "num_tokens": 992161232.0, "step": 425 }, { "epoch": 0.14725198755617006, "grad_norm": 1.1080505270489063, "learning_rate": 7.947023457531223e-05, "loss": 0.6747, "num_tokens": 994520765.0, "step": 426 }, { "epoch": 0.14759764949879017, "grad_norm": 0.9310890523912794, "learning_rate": 7.946237482433747e-05, "loss": 0.6777, "num_tokens": 996792700.0, "step": 427 }, { "epoch": 0.1479433114414103, "grad_norm": 0.8982719874466853, "learning_rate": 7.945445759158753e-05, "loss": 0.6719, "num_tokens": 999136838.0, "step": 428 }, { "epoch": 0.1482889733840304, "grad_norm": 0.7354535611353327, "learning_rate": 7.944648288859498e-05, "loss": 0.6746, "num_tokens": 1001553143.0, "step": 429 }, { "epoch": 0.14863463532665053, "grad_norm": 1.1992702747213542, "learning_rate": 7.943845072697605e-05, "loss": 0.6714, "num_tokens": 1003923247.0, "step": 430 }, { "epoch": 0.14898029726927065, "grad_norm": 0.8315287546815002, "learning_rate": 7.943036111843067e-05, "loss": 0.6793, "num_tokens": 1006261422.0, "step": 431 }, { "epoch": 0.14932595921189076, "grad_norm": 1.218025167995165, "learning_rate": 7.94222140747425e-05, "loss": 0.682, "num_tokens": 1008548652.0, "step": 432 }, { "epoch": 0.14967162115451088, "grad_norm": 1.052629611601623, "learning_rate": 7.941400960777879e-05, "loss": 0.6555, "num_tokens": 1010878607.0, "step": 433 }, { "epoch": 0.150017283097131, "grad_norm": 0.9383825053952549, "learning_rate": 7.940574772949048e-05, "loss": 0.6736, "num_tokens": 1013140608.0, "step": 434 }, { "epoch": 0.15036294503975112, "grad_norm": 0.8027960200380267, "learning_rate": 7.939742845191215e-05, "loss": 0.6813, "num_tokens": 1015491965.0, "step": 435 }, { "epoch": 0.15070860698237123, "grad_norm": 1.0650836491529845, "learning_rate": 7.938905178716194e-05, "loss": 0.6847, "num_tokens": 1017824887.0, "step": 436 }, { "epoch": 0.15105426892499135, "grad_norm": 0.9543048286901269, "learning_rate": 7.938061774744162e-05, "loss": 0.6855, "num_tokens": 1020197919.0, "step": 437 }, { "epoch": 0.15139993086761147, "grad_norm": 0.817908007481354, "learning_rate": 7.937212634503652e-05, "loss": 0.6721, "num_tokens": 1022497457.0, "step": 438 }, { "epoch": 0.1517455928102316, "grad_norm": 0.7267433998540934, "learning_rate": 7.936357759231555e-05, "loss": 0.6658, "num_tokens": 1024894188.0, "step": 439 }, { "epoch": 0.1520912547528517, "grad_norm": 1.2551259308776896, "learning_rate": 7.935497150173113e-05, "loss": 0.6753, "num_tokens": 1027318082.0, "step": 440 }, { "epoch": 0.15243691669547182, "grad_norm": 1.0274370888599909, "learning_rate": 7.93463080858192e-05, "loss": 0.6741, "num_tokens": 1029645403.0, "step": 441 }, { "epoch": 0.15278257863809194, "grad_norm": 1.091659933559066, "learning_rate": 7.933758735719923e-05, "loss": 0.6665, "num_tokens": 1032027076.0, "step": 442 }, { "epoch": 0.15312824058071206, "grad_norm": 0.9876493246318193, "learning_rate": 7.932880932857413e-05, "loss": 0.6581, "num_tokens": 1034316710.0, "step": 443 }, { "epoch": 0.15347390252333218, "grad_norm": 0.8648602252980715, "learning_rate": 7.931997401273034e-05, "loss": 0.6916, "num_tokens": 1036655119.0, "step": 444 }, { "epoch": 0.1538195644659523, "grad_norm": 0.8849497074968393, "learning_rate": 7.931108142253767e-05, "loss": 0.6815, "num_tokens": 1038940628.0, "step": 445 }, { "epoch": 0.15416522640857241, "grad_norm": 0.7717310075577019, "learning_rate": 7.930213157094943e-05, "loss": 0.6853, "num_tokens": 1041252573.0, "step": 446 }, { "epoch": 0.15451088835119253, "grad_norm": 0.6163082225472927, "learning_rate": 7.929312447100229e-05, "loss": 0.6708, "num_tokens": 1043647714.0, "step": 447 }, { "epoch": 0.15485655029381265, "grad_norm": 0.9150930137138809, "learning_rate": 7.928406013581631e-05, "loss": 0.6646, "num_tokens": 1046014911.0, "step": 448 }, { "epoch": 0.15520221223643277, "grad_norm": 0.7211587607974769, "learning_rate": 7.927493857859496e-05, "loss": 0.6787, "num_tokens": 1048389762.0, "step": 449 }, { "epoch": 0.1555478741790529, "grad_norm": 1.2928863484162245, "learning_rate": 7.926575981262503e-05, "loss": 0.6902, "num_tokens": 1050686961.0, "step": 450 }, { "epoch": 0.155893536121673, "grad_norm": 1.1210231001273558, "learning_rate": 7.925652385127665e-05, "loss": 0.6712, "num_tokens": 1052986653.0, "step": 451 }, { "epoch": 0.15623919806429312, "grad_norm": 1.0286890019270256, "learning_rate": 7.924723070800327e-05, "loss": 0.6677, "num_tokens": 1055204993.0, "step": 452 }, { "epoch": 0.15658486000691324, "grad_norm": 0.9430942323978379, "learning_rate": 7.923788039634162e-05, "loss": 0.6558, "num_tokens": 1057512682.0, "step": 453 }, { "epoch": 0.15693052194953336, "grad_norm": 0.8444811867150017, "learning_rate": 7.922847292991171e-05, "loss": 0.6901, "num_tokens": 1059946207.0, "step": 454 }, { "epoch": 0.15727618389215348, "grad_norm": 0.5828469372887928, "learning_rate": 7.92190083224168e-05, "loss": 0.6747, "num_tokens": 1062266510.0, "step": 455 }, { "epoch": 0.1576218458347736, "grad_norm": 1.0082901340857904, "learning_rate": 7.920948658764342e-05, "loss": 0.6737, "num_tokens": 1064606852.0, "step": 456 }, { "epoch": 0.1579675077773937, "grad_norm": 0.8237080604879339, "learning_rate": 7.919990773946123e-05, "loss": 0.6872, "num_tokens": 1067013715.0, "step": 457 }, { "epoch": 0.15831316972001383, "grad_norm": 0.8386534497432427, "learning_rate": 7.919027179182317e-05, "loss": 0.6627, "num_tokens": 1069333643.0, "step": 458 }, { "epoch": 0.15865883166263395, "grad_norm": 1.0835608654140119, "learning_rate": 7.918057875876532e-05, "loss": 0.6623, "num_tokens": 1071658910.0, "step": 459 }, { "epoch": 0.15900449360525407, "grad_norm": 0.6694495830725079, "learning_rate": 7.917082865440688e-05, "loss": 0.6738, "num_tokens": 1073951800.0, "step": 460 }, { "epoch": 0.15935015554787418, "grad_norm": 1.10456055169353, "learning_rate": 7.916102149295025e-05, "loss": 0.6713, "num_tokens": 1076353638.0, "step": 461 }, { "epoch": 0.1596958174904943, "grad_norm": 0.8130739395781956, "learning_rate": 7.915115728868089e-05, "loss": 0.6694, "num_tokens": 1078553778.0, "step": 462 }, { "epoch": 0.16004147943311442, "grad_norm": 0.7826133063248987, "learning_rate": 7.914123605596737e-05, "loss": 0.6934, "num_tokens": 1080863873.0, "step": 463 }, { "epoch": 0.16038714137573454, "grad_norm": 1.2920416455139134, "learning_rate": 7.913125780926131e-05, "loss": 0.7044, "num_tokens": 1083193288.0, "step": 464 }, { "epoch": 0.16073280331835466, "grad_norm": 0.8252255936972961, "learning_rate": 7.912122256309742e-05, "loss": 0.6844, "num_tokens": 1085534713.0, "step": 465 }, { "epoch": 0.16107846526097477, "grad_norm": 1.5618718094935777, "learning_rate": 7.911113033209337e-05, "loss": 0.6779, "num_tokens": 1087903931.0, "step": 466 }, { "epoch": 0.1614241272035949, "grad_norm": 1.505487500370681, "learning_rate": 7.910098113094992e-05, "loss": 0.6662, "num_tokens": 1090162566.0, "step": 467 }, { "epoch": 0.161769789146215, "grad_norm": 0.6953409699459869, "learning_rate": 7.909077497445076e-05, "loss": 0.6573, "num_tokens": 1092553627.0, "step": 468 }, { "epoch": 0.16211545108883513, "grad_norm": 0.9654921052884959, "learning_rate": 7.908051187746254e-05, "loss": 0.6911, "num_tokens": 1094972060.0, "step": 469 }, { "epoch": 0.16246111303145525, "grad_norm": 0.6278143521364654, "learning_rate": 7.90701918549349e-05, "loss": 0.6567, "num_tokens": 1097173981.0, "step": 470 }, { "epoch": 0.16280677497407536, "grad_norm": 0.9249160263967247, "learning_rate": 7.905981492190031e-05, "loss": 0.6823, "num_tokens": 1099533821.0, "step": 471 }, { "epoch": 0.16315243691669548, "grad_norm": 0.6192907922843272, "learning_rate": 7.904938109347426e-05, "loss": 0.6641, "num_tokens": 1101980155.0, "step": 472 }, { "epoch": 0.1634980988593156, "grad_norm": 0.8155912226414017, "learning_rate": 7.903889038485502e-05, "loss": 0.6725, "num_tokens": 1104319434.0, "step": 473 }, { "epoch": 0.16384376080193572, "grad_norm": 0.6876666371102291, "learning_rate": 7.902834281132375e-05, "loss": 0.6841, "num_tokens": 1106613946.0, "step": 474 }, { "epoch": 0.16418942274455584, "grad_norm": 0.5926398033709877, "learning_rate": 7.901773838824445e-05, "loss": 0.6801, "num_tokens": 1108927897.0, "step": 475 }, { "epoch": 0.16453508468717595, "grad_norm": 0.7476375239223323, "learning_rate": 7.900707713106386e-05, "loss": 0.6644, "num_tokens": 1111252456.0, "step": 476 }, { "epoch": 0.16488074662979607, "grad_norm": 0.7480269918567899, "learning_rate": 7.899635905531163e-05, "loss": 0.6717, "num_tokens": 1113546667.0, "step": 477 }, { "epoch": 0.1652264085724162, "grad_norm": 0.655120894018493, "learning_rate": 7.898558417660008e-05, "loss": 0.683, "num_tokens": 1115926258.0, "step": 478 }, { "epoch": 0.1655720705150363, "grad_norm": 1.060339861534569, "learning_rate": 7.89747525106243e-05, "loss": 0.6702, "num_tokens": 1118169535.0, "step": 479 }, { "epoch": 0.16591773245765642, "grad_norm": 0.8083687453887654, "learning_rate": 7.896386407316208e-05, "loss": 0.6543, "num_tokens": 1120401008.0, "step": 480 }, { "epoch": 0.16626339440027654, "grad_norm": 1.0787398612002346, "learning_rate": 7.895291888007394e-05, "loss": 0.6621, "num_tokens": 1122638007.0, "step": 481 }, { "epoch": 0.16660905634289666, "grad_norm": 0.8944554890516988, "learning_rate": 7.894191694730306e-05, "loss": 0.6805, "num_tokens": 1125046096.0, "step": 482 }, { "epoch": 0.16695471828551675, "grad_norm": 0.8487740075876676, "learning_rate": 7.893085829087524e-05, "loss": 0.6761, "num_tokens": 1127360376.0, "step": 483 }, { "epoch": 0.16730038022813687, "grad_norm": 0.7858720558284045, "learning_rate": 7.891974292689895e-05, "loss": 0.6531, "num_tokens": 1129542203.0, "step": 484 }, { "epoch": 0.167646042170757, "grad_norm": 0.8222752831081003, "learning_rate": 7.890857087156523e-05, "loss": 0.6746, "num_tokens": 1131920871.0, "step": 485 }, { "epoch": 0.1679917041133771, "grad_norm": 0.7046265083128342, "learning_rate": 7.88973421411477e-05, "loss": 0.6614, "num_tokens": 1134259630.0, "step": 486 }, { "epoch": 0.16833736605599722, "grad_norm": 0.7102915836053934, "learning_rate": 7.888605675200256e-05, "loss": 0.6596, "num_tokens": 1136468765.0, "step": 487 }, { "epoch": 0.16868302799861734, "grad_norm": 0.6433044756393974, "learning_rate": 7.887471472056853e-05, "loss": 0.6676, "num_tokens": 1138819148.0, "step": 488 }, { "epoch": 0.16902868994123746, "grad_norm": 0.7785247513497286, "learning_rate": 7.886331606336681e-05, "loss": 0.6829, "num_tokens": 1141267248.0, "step": 489 }, { "epoch": 0.16937435188385758, "grad_norm": 0.40877372901756215, "learning_rate": 7.885186079700114e-05, "loss": 0.6861, "num_tokens": 1143522301.0, "step": 490 }, { "epoch": 0.1697200138264777, "grad_norm": 1.1849688808923968, "learning_rate": 7.884034893815766e-05, "loss": 0.6856, "num_tokens": 1145901897.0, "step": 491 }, { "epoch": 0.1700656757690978, "grad_norm": 0.9648441292139598, "learning_rate": 7.8828780503605e-05, "loss": 0.6905, "num_tokens": 1148190579.0, "step": 492 }, { "epoch": 0.17041133771171793, "grad_norm": 1.0573858450199947, "learning_rate": 7.881715551019413e-05, "loss": 0.682, "num_tokens": 1150464745.0, "step": 493 }, { "epoch": 0.17075699965433805, "grad_norm": 0.9662953298133525, "learning_rate": 7.880547397485845e-05, "loss": 0.6518, "num_tokens": 1152752631.0, "step": 494 }, { "epoch": 0.17110266159695817, "grad_norm": 0.8972158687927207, "learning_rate": 7.879373591461376e-05, "loss": 0.6773, "num_tokens": 1155093513.0, "step": 495 }, { "epoch": 0.17144832353957828, "grad_norm": 0.8317367232125701, "learning_rate": 7.878194134655811e-05, "loss": 0.671, "num_tokens": 1157485784.0, "step": 496 }, { "epoch": 0.1717939854821984, "grad_norm": 0.8276925000128146, "learning_rate": 7.877009028787193e-05, "loss": 0.6676, "num_tokens": 1159927096.0, "step": 497 }, { "epoch": 0.17213964742481852, "grad_norm": 0.6585278048213279, "learning_rate": 7.875818275581789e-05, "loss": 0.6764, "num_tokens": 1162209982.0, "step": 498 }, { "epoch": 0.17248530936743864, "grad_norm": 0.7902236865020444, "learning_rate": 7.874621876774098e-05, "loss": 0.6585, "num_tokens": 1164485817.0, "step": 499 }, { "epoch": 0.17283097131005876, "grad_norm": 0.5377156888057961, "learning_rate": 7.873419834106834e-05, "loss": 0.6755, "num_tokens": 1166739745.0, "step": 500 }, { "epoch": 0.17317663325267887, "grad_norm": 0.8325757930919285, "learning_rate": 7.87221214933094e-05, "loss": 0.6733, "num_tokens": 1168965108.0, "step": 501 }, { "epoch": 0.173522295195299, "grad_norm": 0.5209795529066652, "learning_rate": 7.870998824205574e-05, "loss": 0.6763, "num_tokens": 1171329495.0, "step": 502 }, { "epoch": 0.1738679571379191, "grad_norm": 0.871350196893921, "learning_rate": 7.86977986049811e-05, "loss": 0.6905, "num_tokens": 1173693882.0, "step": 503 }, { "epoch": 0.17421361908053923, "grad_norm": 0.6806218449613248, "learning_rate": 7.868555259984136e-05, "loss": 0.6775, "num_tokens": 1175938352.0, "step": 504 }, { "epoch": 0.17455928102315935, "grad_norm": 0.8212712061291401, "learning_rate": 7.86732502444745e-05, "loss": 0.6826, "num_tokens": 1178243894.0, "step": 505 }, { "epoch": 0.17490494296577946, "grad_norm": 0.7765756131504727, "learning_rate": 7.866089155680059e-05, "loss": 0.6677, "num_tokens": 1180573839.0, "step": 506 }, { "epoch": 0.17525060490839958, "grad_norm": 0.7285054193287712, "learning_rate": 7.864847655482174e-05, "loss": 0.6907, "num_tokens": 1182903559.0, "step": 507 }, { "epoch": 0.1755962668510197, "grad_norm": 0.6267126802608064, "learning_rate": 7.863600525662213e-05, "loss": 0.6598, "num_tokens": 1185245321.0, "step": 508 }, { "epoch": 0.17594192879363982, "grad_norm": 0.7333527420740727, "learning_rate": 7.862347768036789e-05, "loss": 0.6639, "num_tokens": 1187510893.0, "step": 509 }, { "epoch": 0.17628759073625994, "grad_norm": 0.7171956079690914, "learning_rate": 7.861089384430716e-05, "loss": 0.6856, "num_tokens": 1189799084.0, "step": 510 }, { "epoch": 0.17663325267888005, "grad_norm": 0.5582524277149231, "learning_rate": 7.859825376677003e-05, "loss": 0.6778, "num_tokens": 1192100161.0, "step": 511 }, { "epoch": 0.17697891462150017, "grad_norm": 0.6331823404553163, "learning_rate": 7.858555746616849e-05, "loss": 0.6943, "num_tokens": 1194576087.0, "step": 512 }, { "epoch": 0.1773245765641203, "grad_norm": 0.6178396677336605, "learning_rate": 7.857280496099645e-05, "loss": 0.673, "num_tokens": 1196857673.0, "step": 513 }, { "epoch": 0.1776702385067404, "grad_norm": 0.511850960357818, "learning_rate": 7.855999626982969e-05, "loss": 0.6776, "num_tokens": 1199290389.0, "step": 514 }, { "epoch": 0.17801590044936053, "grad_norm": 0.6488623819045878, "learning_rate": 7.854713141132582e-05, "loss": 0.6791, "num_tokens": 1201669512.0, "step": 515 }, { "epoch": 0.17836156239198064, "grad_norm": 0.6244876293998658, "learning_rate": 7.853421040422425e-05, "loss": 0.6635, "num_tokens": 1203882546.0, "step": 516 }, { "epoch": 0.17870722433460076, "grad_norm": 0.6677352096577931, "learning_rate": 7.852123326734622e-05, "loss": 0.6816, "num_tokens": 1206223886.0, "step": 517 }, { "epoch": 0.17905288627722088, "grad_norm": 0.5031659879400073, "learning_rate": 7.85082000195947e-05, "loss": 0.678, "num_tokens": 1208610470.0, "step": 518 }, { "epoch": 0.179398548219841, "grad_norm": 0.5923629631419254, "learning_rate": 7.849511067995442e-05, "loss": 0.6775, "num_tokens": 1211009112.0, "step": 519 }, { "epoch": 0.17974421016246112, "grad_norm": 1.1653871353142766, "learning_rate": 7.848196526749177e-05, "loss": 0.6673, "num_tokens": 1213308458.0, "step": 520 }, { "epoch": 0.18008987210508123, "grad_norm": 0.5410225868283237, "learning_rate": 7.846876380135487e-05, "loss": 0.6839, "num_tokens": 1215688984.0, "step": 521 }, { "epoch": 0.18043553404770135, "grad_norm": 1.5214356912711706, "learning_rate": 7.845550630077344e-05, "loss": 0.6839, "num_tokens": 1217980640.0, "step": 522 }, { "epoch": 0.18078119599032147, "grad_norm": 1.2033703081470895, "learning_rate": 7.844219278505885e-05, "loss": 0.6866, "num_tokens": 1220428360.0, "step": 523 }, { "epoch": 0.1811268579329416, "grad_norm": 1.4132729530968586, "learning_rate": 7.842882327360408e-05, "loss": 0.6811, "num_tokens": 1222681431.0, "step": 524 }, { "epoch": 0.1814725198755617, "grad_norm": 1.4301108376708938, "learning_rate": 7.841539778588363e-05, "loss": 0.6789, "num_tokens": 1224952258.0, "step": 525 }, { "epoch": 0.18181818181818182, "grad_norm": 0.9368165413612812, "learning_rate": 7.840191634145359e-05, "loss": 0.68, "num_tokens": 1227342610.0, "step": 526 }, { "epoch": 0.18216384376080194, "grad_norm": 1.0912150879874103, "learning_rate": 7.838837895995152e-05, "loss": 0.6731, "num_tokens": 1229664962.0, "step": 527 }, { "epoch": 0.18250950570342206, "grad_norm": 0.7021981369200088, "learning_rate": 7.837478566109646e-05, "loss": 0.7008, "num_tokens": 1232051653.0, "step": 528 }, { "epoch": 0.18285516764604218, "grad_norm": 0.8469380222389131, "learning_rate": 7.836113646468895e-05, "loss": 0.6772, "num_tokens": 1234351403.0, "step": 529 }, { "epoch": 0.1832008295886623, "grad_norm": 0.6776079201803205, "learning_rate": 7.834743139061087e-05, "loss": 0.6881, "num_tokens": 1236773795.0, "step": 530 }, { "epoch": 0.1835464915312824, "grad_norm": 0.5809153316766027, "learning_rate": 7.833367045882555e-05, "loss": 0.6711, "num_tokens": 1239131431.0, "step": 531 }, { "epoch": 0.18389215347390253, "grad_norm": 0.7855283862086249, "learning_rate": 7.83198536893777e-05, "loss": 0.6749, "num_tokens": 1241481597.0, "step": 532 }, { "epoch": 0.18423781541652265, "grad_norm": 0.5047940805183389, "learning_rate": 7.83059811023933e-05, "loss": 0.6915, "num_tokens": 1243858177.0, "step": 533 }, { "epoch": 0.18458347735914277, "grad_norm": 0.9191573296170649, "learning_rate": 7.829205271807971e-05, "loss": 0.6766, "num_tokens": 1246344707.0, "step": 534 }, { "epoch": 0.18492913930176288, "grad_norm": 0.666795099542455, "learning_rate": 7.827806855672551e-05, "loss": 0.6774, "num_tokens": 1248752486.0, "step": 535 }, { "epoch": 0.185274801244383, "grad_norm": 1.0774963584795936, "learning_rate": 7.826402863870054e-05, "loss": 0.6654, "num_tokens": 1251205201.0, "step": 536 }, { "epoch": 0.18562046318700312, "grad_norm": 1.038314745443513, "learning_rate": 7.82499329844559e-05, "loss": 0.6682, "num_tokens": 1253605228.0, "step": 537 }, { "epoch": 0.18596612512962324, "grad_norm": 0.9180557168921101, "learning_rate": 7.823578161452381e-05, "loss": 0.6651, "num_tokens": 1255892235.0, "step": 538 }, { "epoch": 0.18631178707224336, "grad_norm": 0.9925632131953006, "learning_rate": 7.822157454951769e-05, "loss": 0.6647, "num_tokens": 1258184990.0, "step": 539 }, { "epoch": 0.18665744901486347, "grad_norm": 0.8383349696801654, "learning_rate": 7.820731181013207e-05, "loss": 0.6728, "num_tokens": 1260465450.0, "step": 540 }, { "epoch": 0.1870031109574836, "grad_norm": 0.9762719269307527, "learning_rate": 7.819299341714261e-05, "loss": 0.6869, "num_tokens": 1262826201.0, "step": 541 }, { "epoch": 0.1873487729001037, "grad_norm": 0.9367760544885605, "learning_rate": 7.817861939140599e-05, "loss": 0.6667, "num_tokens": 1265226768.0, "step": 542 }, { "epoch": 0.18769443484272383, "grad_norm": 0.9220242492008244, "learning_rate": 7.816418975385994e-05, "loss": 0.6694, "num_tokens": 1267598455.0, "step": 543 }, { "epoch": 0.18804009678534395, "grad_norm": 0.6819054463315197, "learning_rate": 7.814970452552323e-05, "loss": 0.6651, "num_tokens": 1269852164.0, "step": 544 }, { "epoch": 0.18838575872796406, "grad_norm": 0.9081684293803073, "learning_rate": 7.813516372749556e-05, "loss": 0.6744, "num_tokens": 1272102642.0, "step": 545 }, { "epoch": 0.18873142067058418, "grad_norm": 0.6787871397214766, "learning_rate": 7.812056738095761e-05, "loss": 0.6829, "num_tokens": 1274495986.0, "step": 546 }, { "epoch": 0.18907708261320427, "grad_norm": 0.9023949680304635, "learning_rate": 7.810591550717094e-05, "loss": 0.6697, "num_tokens": 1276835725.0, "step": 547 }, { "epoch": 0.1894227445558244, "grad_norm": 0.8285672328337838, "learning_rate": 7.809120812747804e-05, "loss": 0.6957, "num_tokens": 1279226496.0, "step": 548 }, { "epoch": 0.1897684064984445, "grad_norm": 1.0394614062239516, "learning_rate": 7.807644526330221e-05, "loss": 0.6888, "num_tokens": 1281679517.0, "step": 549 }, { "epoch": 0.19011406844106463, "grad_norm": 0.672644873647182, "learning_rate": 7.806162693614758e-05, "loss": 0.6687, "num_tokens": 1284089139.0, "step": 550 }, { "epoch": 0.19045973038368474, "grad_norm": 0.9719066237721203, "learning_rate": 7.804675316759908e-05, "loss": 0.6483, "num_tokens": 1286313128.0, "step": 551 }, { "epoch": 0.19080539232630486, "grad_norm": 0.8531831087346086, "learning_rate": 7.803182397932238e-05, "loss": 0.6823, "num_tokens": 1288682040.0, "step": 552 }, { "epoch": 0.19115105426892498, "grad_norm": 0.9558185039898183, "learning_rate": 7.801683939306392e-05, "loss": 0.6562, "num_tokens": 1291057661.0, "step": 553 }, { "epoch": 0.1914967162115451, "grad_norm": 0.77773027782198, "learning_rate": 7.800179943065079e-05, "loss": 0.6736, "num_tokens": 1293314858.0, "step": 554 }, { "epoch": 0.19184237815416522, "grad_norm": 0.999974470709146, "learning_rate": 7.798670411399073e-05, "loss": 0.6587, "num_tokens": 1295499037.0, "step": 555 }, { "epoch": 0.19218804009678533, "grad_norm": 0.6478375977079255, "learning_rate": 7.797155346507217e-05, "loss": 0.6802, "num_tokens": 1297909878.0, "step": 556 }, { "epoch": 0.19253370203940545, "grad_norm": 1.0916868446338521, "learning_rate": 7.795634750596409e-05, "loss": 0.6797, "num_tokens": 1300278130.0, "step": 557 }, { "epoch": 0.19287936398202557, "grad_norm": 0.7546860168829089, "learning_rate": 7.794108625881607e-05, "loss": 0.6575, "num_tokens": 1302533942.0, "step": 558 }, { "epoch": 0.1932250259246457, "grad_norm": 1.3434612262311425, "learning_rate": 7.79257697458582e-05, "loss": 0.682, "num_tokens": 1304791330.0, "step": 559 }, { "epoch": 0.1935706878672658, "grad_norm": 1.0425810918243734, "learning_rate": 7.791039798940107e-05, "loss": 0.6716, "num_tokens": 1307140180.0, "step": 560 }, { "epoch": 0.19391634980988592, "grad_norm": 1.0906088111340324, "learning_rate": 7.789497101183575e-05, "loss": 0.6853, "num_tokens": 1309518780.0, "step": 561 }, { "epoch": 0.19426201175250604, "grad_norm": 0.9435489855538719, "learning_rate": 7.787948883563377e-05, "loss": 0.6519, "num_tokens": 1311859896.0, "step": 562 }, { "epoch": 0.19460767369512616, "grad_norm": 0.9643748878302044, "learning_rate": 7.786395148334702e-05, "loss": 0.6663, "num_tokens": 1314085833.0, "step": 563 }, { "epoch": 0.19495333563774628, "grad_norm": 0.7297840357431394, "learning_rate": 7.784835897760779e-05, "loss": 0.6551, "num_tokens": 1316412163.0, "step": 564 }, { "epoch": 0.1952989975803664, "grad_norm": 1.2098275099270528, "learning_rate": 7.783271134112871e-05, "loss": 0.6637, "num_tokens": 1318759757.0, "step": 565 }, { "epoch": 0.1956446595229865, "grad_norm": 0.9537223424552935, "learning_rate": 7.78170085967027e-05, "loss": 0.6619, "num_tokens": 1321064377.0, "step": 566 }, { "epoch": 0.19599032146560663, "grad_norm": 0.9629544273357176, "learning_rate": 7.780125076720295e-05, "loss": 0.665, "num_tokens": 1323334965.0, "step": 567 }, { "epoch": 0.19633598340822675, "grad_norm": 0.8301967484227063, "learning_rate": 7.778543787558292e-05, "loss": 0.6684, "num_tokens": 1325669197.0, "step": 568 }, { "epoch": 0.19668164535084687, "grad_norm": 0.9722655568473246, "learning_rate": 7.776956994487624e-05, "loss": 0.6553, "num_tokens": 1328069958.0, "step": 569 }, { "epoch": 0.19702730729346699, "grad_norm": 0.9022977588977088, "learning_rate": 7.775364699819672e-05, "loss": 0.653, "num_tokens": 1330275271.0, "step": 570 }, { "epoch": 0.1973729692360871, "grad_norm": 1.1122557436678455, "learning_rate": 7.773766905873832e-05, "loss": 0.6797, "num_tokens": 1332686670.0, "step": 571 }, { "epoch": 0.19771863117870722, "grad_norm": 1.0125111788960826, "learning_rate": 7.77216361497751e-05, "loss": 0.6542, "num_tokens": 1335096177.0, "step": 572 }, { "epoch": 0.19806429312132734, "grad_norm": 0.9219632068442798, "learning_rate": 7.770554829466121e-05, "loss": 0.6706, "num_tokens": 1337337014.0, "step": 573 }, { "epoch": 0.19840995506394746, "grad_norm": 0.9012051143389893, "learning_rate": 7.768940551683079e-05, "loss": 0.6875, "num_tokens": 1339675556.0, "step": 574 }, { "epoch": 0.19875561700656758, "grad_norm": 0.8489487220322667, "learning_rate": 7.767320783979801e-05, "loss": 0.6506, "num_tokens": 1342040139.0, "step": 575 }, { "epoch": 0.1991012789491877, "grad_norm": 0.731314279133667, "learning_rate": 7.765695528715701e-05, "loss": 0.6629, "num_tokens": 1344384460.0, "step": 576 }, { "epoch": 0.1994469408918078, "grad_norm": 0.9249952296907413, "learning_rate": 7.764064788258185e-05, "loss": 0.6805, "num_tokens": 1346807158.0, "step": 577 }, { "epoch": 0.19979260283442793, "grad_norm": 0.8180799509386987, "learning_rate": 7.762428564982653e-05, "loss": 0.6752, "num_tokens": 1349140081.0, "step": 578 }, { "epoch": 1.00034566194262, "grad_norm": 2.1142211852665396, "learning_rate": 7.760786861272486e-05, "loss": 1.3232, "num_tokens": 1352945518.0, "step": 579 }, { "epoch": 1.0006913238852402, "grad_norm": 0.8740528567855538, "learning_rate": 7.759139679519052e-05, "loss": 0.663, "num_tokens": 1355363345.0, "step": 580 }, { "epoch": 1.0010369858278603, "grad_norm": 0.9346383668192436, "learning_rate": 7.757487022121696e-05, "loss": 0.6632, "num_tokens": 1357657558.0, "step": 581 }, { "epoch": 1.0013826477704804, "grad_norm": 0.5707352247994628, "learning_rate": 7.755828891487739e-05, "loss": 0.6759, "num_tokens": 1359990193.0, "step": 582 }, { "epoch": 1.0017283097131007, "grad_norm": 0.8777230050047098, "learning_rate": 7.754165290032477e-05, "loss": 0.6772, "num_tokens": 1362426704.0, "step": 583 }, { "epoch": 1.0020739716557208, "grad_norm": 0.7456478205951602, "learning_rate": 7.752496220179175e-05, "loss": 0.6631, "num_tokens": 1364665198.0, "step": 584 }, { "epoch": 1.0024196335983409, "grad_norm": 1.1036043922339784, "learning_rate": 7.750821684359059e-05, "loss": 0.6725, "num_tokens": 1367024438.0, "step": 585 }, { "epoch": 1.002765295540961, "grad_norm": 1.1358749685551512, "learning_rate": 7.74914168501132e-05, "loss": 0.6646, "num_tokens": 1369322895.0, "step": 586 }, { "epoch": 1.003110957483581, "grad_norm": 0.7777353302105828, "learning_rate": 7.74745622458311e-05, "loss": 0.6761, "num_tokens": 1371556203.0, "step": 587 }, { "epoch": 1.0034566194262011, "grad_norm": 0.9061793479058321, "learning_rate": 7.745765305529532e-05, "loss": 0.6487, "num_tokens": 1373818248.0, "step": 588 }, { "epoch": 1.0038022813688212, "grad_norm": 0.6363269674236014, "learning_rate": 7.744068930313641e-05, "loss": 0.663, "num_tokens": 1376220420.0, "step": 589 }, { "epoch": 1.0041479433114413, "grad_norm": 1.1400955700651099, "learning_rate": 7.742367101406439e-05, "loss": 0.6966, "num_tokens": 1378698210.0, "step": 590 }, { "epoch": 1.0044936052540616, "grad_norm": 0.6931490878384134, "learning_rate": 7.740659821286875e-05, "loss": 0.662, "num_tokens": 1380900787.0, "step": 591 }, { "epoch": 1.0048392671966817, "grad_norm": 0.7438861467017077, "learning_rate": 7.738947092441834e-05, "loss": 0.6537, "num_tokens": 1383209048.0, "step": 592 }, { "epoch": 1.0051849291393018, "grad_norm": 0.6822502682454515, "learning_rate": 7.737228917366145e-05, "loss": 0.6525, "num_tokens": 1385527452.0, "step": 593 }, { "epoch": 1.0055305910819219, "grad_norm": 0.5169468185232555, "learning_rate": 7.73550529856256e-05, "loss": 0.6754, "num_tokens": 1387948417.0, "step": 594 }, { "epoch": 1.005876253024542, "grad_norm": 0.7060860786596713, "learning_rate": 7.73377623854177e-05, "loss": 0.6822, "num_tokens": 1390344923.0, "step": 595 }, { "epoch": 1.006221914967162, "grad_norm": 0.6111485872093079, "learning_rate": 7.732041739822384e-05, "loss": 0.6585, "num_tokens": 1392652565.0, "step": 596 }, { "epoch": 1.0065675769097822, "grad_norm": 0.7018961454326624, "learning_rate": 7.730301804930942e-05, "loss": 0.6706, "num_tokens": 1395127019.0, "step": 597 }, { "epoch": 1.0069132388524022, "grad_norm": 0.5014693464006867, "learning_rate": 7.728556436401894e-05, "loss": 0.6698, "num_tokens": 1397426472.0, "step": 598 }, { "epoch": 1.0072589007950226, "grad_norm": 0.9465512205353946, "learning_rate": 7.72680563677761e-05, "loss": 0.6437, "num_tokens": 1399665676.0, "step": 599 }, { "epoch": 1.0076045627376427, "grad_norm": 0.596902876231457, "learning_rate": 7.725049408608367e-05, "loss": 0.6598, "num_tokens": 1402023834.0, "step": 600 }, { "epoch": 1.0079502246802627, "grad_norm": 0.9792241758241436, "learning_rate": 7.723287754452357e-05, "loss": 0.6617, "num_tokens": 1404341552.0, "step": 601 }, { "epoch": 1.0082958866228828, "grad_norm": 0.8572224392994607, "learning_rate": 7.721520676875667e-05, "loss": 0.6727, "num_tokens": 1406658741.0, "step": 602 }, { "epoch": 1.008641548565503, "grad_norm": 0.8019852222089114, "learning_rate": 7.719748178452291e-05, "loss": 0.6623, "num_tokens": 1409023426.0, "step": 603 }, { "epoch": 1.008987210508123, "grad_norm": 1.147902468568786, "learning_rate": 7.717970261764111e-05, "loss": 0.6705, "num_tokens": 1411295298.0, "step": 604 }, { "epoch": 1.009332872450743, "grad_norm": 0.8975050298448578, "learning_rate": 7.716186929400909e-05, "loss": 0.673, "num_tokens": 1413654997.0, "step": 605 }, { "epoch": 1.0096785343933632, "grad_norm": 0.8818019982825493, "learning_rate": 7.714398183960353e-05, "loss": 0.6545, "num_tokens": 1415990152.0, "step": 606 }, { "epoch": 1.0100241963359835, "grad_norm": 0.8515555098083399, "learning_rate": 7.712604028047998e-05, "loss": 0.6674, "num_tokens": 1418336050.0, "step": 607 }, { "epoch": 1.0103698582786036, "grad_norm": 0.6813012204492377, "learning_rate": 7.710804464277275e-05, "loss": 0.6805, "num_tokens": 1420683836.0, "step": 608 }, { "epoch": 1.0107155202212237, "grad_norm": 0.790969572407373, "learning_rate": 7.708999495269496e-05, "loss": 0.66, "num_tokens": 1422975794.0, "step": 609 }, { "epoch": 1.0110611821638438, "grad_norm": 0.6093040695520983, "learning_rate": 7.707189123653845e-05, "loss": 0.6799, "num_tokens": 1425378969.0, "step": 610 }, { "epoch": 1.0114068441064639, "grad_norm": 0.7038104402467534, "learning_rate": 7.705373352067378e-05, "loss": 0.6669, "num_tokens": 1427740311.0, "step": 611 }, { "epoch": 1.011752506049084, "grad_norm": 0.789700190060106, "learning_rate": 7.703552183155015e-05, "loss": 0.6677, "num_tokens": 1430071733.0, "step": 612 }, { "epoch": 1.012098167991704, "grad_norm": 0.5631211448705197, "learning_rate": 7.701725619569535e-05, "loss": 0.6644, "num_tokens": 1432398119.0, "step": 613 }, { "epoch": 1.0124438299343241, "grad_norm": 0.5728940307152167, "learning_rate": 7.699893663971584e-05, "loss": 0.6536, "num_tokens": 1434700894.0, "step": 614 }, { "epoch": 1.0127894918769444, "grad_norm": 0.6833111994010771, "learning_rate": 7.698056319029653e-05, "loss": 0.6753, "num_tokens": 1437009930.0, "step": 615 }, { "epoch": 1.0131351538195645, "grad_norm": 0.48898801558173766, "learning_rate": 7.696213587420087e-05, "loss": 0.6389, "num_tokens": 1439320755.0, "step": 616 }, { "epoch": 1.0134808157621846, "grad_norm": 0.7247165377476263, "learning_rate": 7.694365471827077e-05, "loss": 0.6367, "num_tokens": 1441611445.0, "step": 617 }, { "epoch": 1.0138264777048047, "grad_norm": 0.5533567676626411, "learning_rate": 7.692511974942656e-05, "loss": 0.6495, "num_tokens": 1443825170.0, "step": 618 }, { "epoch": 1.0141721396474248, "grad_norm": 0.5441193139815216, "learning_rate": 7.690653099466699e-05, "loss": 0.6673, "num_tokens": 1446190695.0, "step": 619 }, { "epoch": 1.014517801590045, "grad_norm": 0.5913180366540489, "learning_rate": 7.68878884810691e-05, "loss": 0.6536, "num_tokens": 1448411987.0, "step": 620 }, { "epoch": 1.014863463532665, "grad_norm": 0.5526213731310409, "learning_rate": 7.686919223578827e-05, "loss": 0.6622, "num_tokens": 1450746710.0, "step": 621 }, { "epoch": 1.015209125475285, "grad_norm": 0.5661299027415501, "learning_rate": 7.685044228605816e-05, "loss": 0.654, "num_tokens": 1453078431.0, "step": 622 }, { "epoch": 1.0155547874179054, "grad_norm": 0.9323005467571946, "learning_rate": 7.683163865919063e-05, "loss": 0.6656, "num_tokens": 1455445239.0, "step": 623 }, { "epoch": 1.0159004493605255, "grad_norm": 0.6680306010717404, "learning_rate": 7.681278138257574e-05, "loss": 0.6591, "num_tokens": 1457723394.0, "step": 624 }, { "epoch": 1.0162461113031456, "grad_norm": 1.1434659857406162, "learning_rate": 7.679387048368171e-05, "loss": 0.6588, "num_tokens": 1460214983.0, "step": 625 }, { "epoch": 1.0165917732457657, "grad_norm": 1.0012988697589547, "learning_rate": 7.677490599005484e-05, "loss": 0.671, "num_tokens": 1462481652.0, "step": 626 }, { "epoch": 1.0169374351883858, "grad_norm": 1.2261873867858468, "learning_rate": 7.675588792931951e-05, "loss": 0.6589, "num_tokens": 1464769947.0, "step": 627 }, { "epoch": 1.0172830971310058, "grad_norm": 1.0217974991617935, "learning_rate": 7.673681632917817e-05, "loss": 0.6725, "num_tokens": 1467094042.0, "step": 628 }, { "epoch": 1.017628759073626, "grad_norm": 1.11399376740334, "learning_rate": 7.671769121741117e-05, "loss": 0.6515, "num_tokens": 1469458137.0, "step": 629 }, { "epoch": 1.017974421016246, "grad_norm": 0.8987001634998655, "learning_rate": 7.669851262187688e-05, "loss": 0.6549, "num_tokens": 1471691478.0, "step": 630 }, { "epoch": 1.0183200829588663, "grad_norm": 1.2179402114743725, "learning_rate": 7.667928057051157e-05, "loss": 0.6559, "num_tokens": 1474115060.0, "step": 631 }, { "epoch": 1.0186657449014864, "grad_norm": 0.9968381361277421, "learning_rate": 7.665999509132932e-05, "loss": 0.6653, "num_tokens": 1476551097.0, "step": 632 }, { "epoch": 1.0190114068441065, "grad_norm": 1.134854009966637, "learning_rate": 7.664065621242211e-05, "loss": 0.667, "num_tokens": 1478958701.0, "step": 633 }, { "epoch": 1.0193570687867266, "grad_norm": 1.2239518100956406, "learning_rate": 7.662126396195964e-05, "loss": 0.6697, "num_tokens": 1481481101.0, "step": 634 }, { "epoch": 1.0197027307293467, "grad_norm": 0.6181330954809839, "learning_rate": 7.660181836818941e-05, "loss": 0.6564, "num_tokens": 1483911545.0, "step": 635 }, { "epoch": 1.0200483926719668, "grad_norm": 1.0680644611640846, "learning_rate": 7.658231945943657e-05, "loss": 0.6664, "num_tokens": 1486383500.0, "step": 636 }, { "epoch": 1.0203940546145869, "grad_norm": 0.7730549075716262, "learning_rate": 7.656276726410397e-05, "loss": 0.6735, "num_tokens": 1488687299.0, "step": 637 }, { "epoch": 1.020739716557207, "grad_norm": 1.0976599516446086, "learning_rate": 7.654316181067203e-05, "loss": 0.6645, "num_tokens": 1490826696.0, "step": 638 }, { "epoch": 1.0210853784998273, "grad_norm": 1.1945816374119282, "learning_rate": 7.652350312769883e-05, "loss": 0.6482, "num_tokens": 1493144226.0, "step": 639 }, { "epoch": 1.0214310404424474, "grad_norm": 0.6691192026146077, "learning_rate": 7.650379124381988e-05, "loss": 0.6634, "num_tokens": 1495464860.0, "step": 640 }, { "epoch": 1.0217767023850675, "grad_norm": 1.1196536032510578, "learning_rate": 7.648402618774828e-05, "loss": 0.6654, "num_tokens": 1497734437.0, "step": 641 }, { "epoch": 1.0221223643276875, "grad_norm": 0.9040819601814349, "learning_rate": 7.646420798827453e-05, "loss": 0.6806, "num_tokens": 1500083728.0, "step": 642 }, { "epoch": 1.0224680262703076, "grad_norm": 0.7950902913298484, "learning_rate": 7.644433667426657e-05, "loss": 0.6666, "num_tokens": 1502410219.0, "step": 643 }, { "epoch": 1.0228136882129277, "grad_norm": 0.7992043464604883, "learning_rate": 7.642441227466967e-05, "loss": 0.6559, "num_tokens": 1504763233.0, "step": 644 }, { "epoch": 1.0231593501555478, "grad_norm": 0.7218935596775387, "learning_rate": 7.640443481850646e-05, "loss": 0.6567, "num_tokens": 1507031533.0, "step": 645 }, { "epoch": 1.023505012098168, "grad_norm": 0.728114072202148, "learning_rate": 7.638440433487684e-05, "loss": 0.6598, "num_tokens": 1509305905.0, "step": 646 }, { "epoch": 1.023850674040788, "grad_norm": 0.5741261548647706, "learning_rate": 7.636432085295795e-05, "loss": 0.6555, "num_tokens": 1511627754.0, "step": 647 }, { "epoch": 1.0241963359834083, "grad_norm": 0.8008762284547368, "learning_rate": 7.634418440200414e-05, "loss": 0.6697, "num_tokens": 1513997463.0, "step": 648 }, { "epoch": 1.0245419979260284, "grad_norm": 0.8399321071744058, "learning_rate": 7.632399501134692e-05, "loss": 0.6615, "num_tokens": 1516263802.0, "step": 649 }, { "epoch": 1.0248876598686485, "grad_norm": 0.8072936105247144, "learning_rate": 7.630375271039489e-05, "loss": 0.6359, "num_tokens": 1518478478.0, "step": 650 }, { "epoch": 1.0252333218112686, "grad_norm": 0.6932729539087006, "learning_rate": 7.628345752863374e-05, "loss": 0.667, "num_tokens": 1520774169.0, "step": 651 }, { "epoch": 1.0255789837538887, "grad_norm": 0.5840938727847315, "learning_rate": 7.626310949562619e-05, "loss": 0.644, "num_tokens": 1523067587.0, "step": 652 }, { "epoch": 1.0259246456965088, "grad_norm": 0.543434169191495, "learning_rate": 7.624270864101193e-05, "loss": 0.6608, "num_tokens": 1525375809.0, "step": 653 }, { "epoch": 1.0262703076391289, "grad_norm": 0.7933480253455424, "learning_rate": 7.622225499450761e-05, "loss": 0.6627, "num_tokens": 1527727521.0, "step": 654 }, { "epoch": 1.026615969581749, "grad_norm": 0.4965360215465145, "learning_rate": 7.620174858590675e-05, "loss": 0.6632, "num_tokens": 1530146486.0, "step": 655 }, { "epoch": 1.0269616315243693, "grad_norm": 0.5876542604728499, "learning_rate": 7.618118944507978e-05, "loss": 0.6623, "num_tokens": 1532516610.0, "step": 656 }, { "epoch": 1.0273072934669893, "grad_norm": 0.5680896434244347, "learning_rate": 7.616057760197388e-05, "loss": 0.6615, "num_tokens": 1534910929.0, "step": 657 }, { "epoch": 1.0276529554096094, "grad_norm": 0.5644813651143472, "learning_rate": 7.613991308661302e-05, "loss": 0.6584, "num_tokens": 1537193010.0, "step": 658 }, { "epoch": 1.0279986173522295, "grad_norm": 0.557082880787213, "learning_rate": 7.611919592909792e-05, "loss": 0.6512, "num_tokens": 1539530821.0, "step": 659 }, { "epoch": 1.0283442792948496, "grad_norm": 0.5163226932167695, "learning_rate": 7.609842615960595e-05, "loss": 0.6527, "num_tokens": 1541842685.0, "step": 660 }, { "epoch": 1.0286899412374697, "grad_norm": 0.6494316991627198, "learning_rate": 7.607760380839111e-05, "loss": 0.6599, "num_tokens": 1544150207.0, "step": 661 }, { "epoch": 1.0290356031800898, "grad_norm": 0.7169128975167005, "learning_rate": 7.605672890578404e-05, "loss": 0.6505, "num_tokens": 1546480381.0, "step": 662 }, { "epoch": 1.0293812651227099, "grad_norm": 0.45838851670608066, "learning_rate": 7.60358014821919e-05, "loss": 0.6455, "num_tokens": 1548865039.0, "step": 663 }, { "epoch": 1.0297269270653302, "grad_norm": 0.5637559218656475, "learning_rate": 7.60148215680983e-05, "loss": 0.6351, "num_tokens": 1551214967.0, "step": 664 }, { "epoch": 1.0300725890079503, "grad_norm": 0.6933103396811869, "learning_rate": 7.59937891940634e-05, "loss": 0.6665, "num_tokens": 1553613898.0, "step": 665 }, { "epoch": 1.0304182509505704, "grad_norm": 0.5474278756097691, "learning_rate": 7.597270439072372e-05, "loss": 0.6688, "num_tokens": 1555811871.0, "step": 666 }, { "epoch": 1.0307639128931905, "grad_norm": 0.585996743924975, "learning_rate": 7.59515671887922e-05, "loss": 0.6524, "num_tokens": 1558217502.0, "step": 667 }, { "epoch": 1.0311095748358106, "grad_norm": 0.590909112214886, "learning_rate": 7.593037761905803e-05, "loss": 0.6528, "num_tokens": 1560591813.0, "step": 668 }, { "epoch": 1.0314552367784307, "grad_norm": 0.5120051549837813, "learning_rate": 7.590913571238673e-05, "loss": 0.6467, "num_tokens": 1562947777.0, "step": 669 }, { "epoch": 1.0318008987210507, "grad_norm": 0.5374538070518096, "learning_rate": 7.588784149972007e-05, "loss": 0.6529, "num_tokens": 1565226888.0, "step": 670 }, { "epoch": 1.0321465606636708, "grad_norm": 0.6514603483818726, "learning_rate": 7.586649501207596e-05, "loss": 0.6555, "num_tokens": 1567566902.0, "step": 671 }, { "epoch": 1.0324922226062911, "grad_norm": 0.5880500827656041, "learning_rate": 7.58450962805485e-05, "loss": 0.6552, "num_tokens": 1570002540.0, "step": 672 }, { "epoch": 1.0328378845489112, "grad_norm": 0.40116340005740103, "learning_rate": 7.58236453363079e-05, "loss": 0.644, "num_tokens": 1572317546.0, "step": 673 }, { "epoch": 1.0331835464915313, "grad_norm": 0.8968079763045643, "learning_rate": 7.580214221060037e-05, "loss": 0.6468, "num_tokens": 1574670887.0, "step": 674 }, { "epoch": 1.0335292084341514, "grad_norm": 0.5400714101142915, "learning_rate": 7.578058693474817e-05, "loss": 0.6583, "num_tokens": 1577041992.0, "step": 675 }, { "epoch": 1.0338748703767715, "grad_norm": 1.167715203666084, "learning_rate": 7.57589795401495e-05, "loss": 0.6759, "num_tokens": 1579366383.0, "step": 676 }, { "epoch": 1.0342205323193916, "grad_norm": 0.8548395155995829, "learning_rate": 7.57373200582785e-05, "loss": 0.6687, "num_tokens": 1581686721.0, "step": 677 }, { "epoch": 1.0345661942620117, "grad_norm": 0.737877349090235, "learning_rate": 7.571560852068518e-05, "loss": 0.6601, "num_tokens": 1584027524.0, "step": 678 }, { "epoch": 1.0349118562046318, "grad_norm": 0.7724750351761767, "learning_rate": 7.569384495899537e-05, "loss": 0.6643, "num_tokens": 1586214548.0, "step": 679 }, { "epoch": 1.035257518147252, "grad_norm": 0.6427434281966125, "learning_rate": 7.567202940491068e-05, "loss": 0.6529, "num_tokens": 1588602294.0, "step": 680 }, { "epoch": 1.0356031800898722, "grad_norm": 0.7060933592824846, "learning_rate": 7.565016189020846e-05, "loss": 0.6602, "num_tokens": 1590931577.0, "step": 681 }, { "epoch": 1.0359488420324923, "grad_norm": 0.5749061643754176, "learning_rate": 7.562824244674171e-05, "loss": 0.6586, "num_tokens": 1593221502.0, "step": 682 }, { "epoch": 1.0362945039751124, "grad_norm": 0.9674203250964503, "learning_rate": 7.560627110643914e-05, "loss": 0.652, "num_tokens": 1595529164.0, "step": 683 }, { "epoch": 1.0366401659177324, "grad_norm": 0.636148254062618, "learning_rate": 7.558424790130502e-05, "loss": 0.6649, "num_tokens": 1597928186.0, "step": 684 }, { "epoch": 1.0369858278603525, "grad_norm": 1.2858714510085154, "learning_rate": 7.556217286341914e-05, "loss": 0.6637, "num_tokens": 1600287202.0, "step": 685 }, { "epoch": 1.0373314898029726, "grad_norm": 1.1981294794756567, "learning_rate": 7.554004602493684e-05, "loss": 0.6578, "num_tokens": 1602544956.0, "step": 686 }, { "epoch": 1.0376771517455927, "grad_norm": 0.5485746693126576, "learning_rate": 7.551786741808888e-05, "loss": 0.667, "num_tokens": 1604947160.0, "step": 687 }, { "epoch": 1.038022813688213, "grad_norm": 0.9849890404623954, "learning_rate": 7.549563707518146e-05, "loss": 0.6575, "num_tokens": 1607334967.0, "step": 688 }, { "epoch": 1.0383684756308331, "grad_norm": 0.7416291684802955, "learning_rate": 7.54733550285961e-05, "loss": 0.6699, "num_tokens": 1609590170.0, "step": 689 }, { "epoch": 1.0387141375734532, "grad_norm": 0.9466627868323391, "learning_rate": 7.545102131078966e-05, "loss": 0.6364, "num_tokens": 1611878853.0, "step": 690 }, { "epoch": 1.0390597995160733, "grad_norm": 0.6625787569147549, "learning_rate": 7.542863595429427e-05, "loss": 0.6701, "num_tokens": 1614271436.0, "step": 691 }, { "epoch": 1.0394054614586934, "grad_norm": 0.8851057627269082, "learning_rate": 7.540619899171724e-05, "loss": 0.6434, "num_tokens": 1616614392.0, "step": 692 }, { "epoch": 1.0397511234013135, "grad_norm": 0.6865229050413474, "learning_rate": 7.538371045574113e-05, "loss": 0.6642, "num_tokens": 1618931190.0, "step": 693 }, { "epoch": 1.0400967853439336, "grad_norm": 0.9536804293374974, "learning_rate": 7.536117037912354e-05, "loss": 0.6541, "num_tokens": 1621360340.0, "step": 694 }, { "epoch": 1.0404424472865537, "grad_norm": 0.9243922530318902, "learning_rate": 7.53385787946972e-05, "loss": 0.6518, "num_tokens": 1623537986.0, "step": 695 }, { "epoch": 1.040788109229174, "grad_norm": 0.6489092556853862, "learning_rate": 7.531593573536985e-05, "loss": 0.6542, "num_tokens": 1625975581.0, "step": 696 }, { "epoch": 1.041133771171794, "grad_norm": 0.7070904565887195, "learning_rate": 7.529324123412417e-05, "loss": 0.6446, "num_tokens": 1628249323.0, "step": 697 }, { "epoch": 1.0414794331144142, "grad_norm": 0.5605860382938546, "learning_rate": 7.527049532401786e-05, "loss": 0.6494, "num_tokens": 1630621076.0, "step": 698 }, { "epoch": 1.0418250950570342, "grad_norm": 0.5440822681356616, "learning_rate": 7.524769803818344e-05, "loss": 0.6657, "num_tokens": 1632919877.0, "step": 699 }, { "epoch": 1.0421707569996543, "grad_norm": 0.6879646283882763, "learning_rate": 7.522484940982828e-05, "loss": 0.6683, "num_tokens": 1635377901.0, "step": 700 }, { "epoch": 1.0425164189422744, "grad_norm": 0.7656100461663494, "learning_rate": 7.520194947223452e-05, "loss": 0.6579, "num_tokens": 1637667005.0, "step": 701 }, { "epoch": 1.0428620808848945, "grad_norm": 0.5020779204302981, "learning_rate": 7.517899825875908e-05, "loss": 0.6673, "num_tokens": 1639995071.0, "step": 702 }, { "epoch": 1.0432077428275146, "grad_norm": 1.0248831881344556, "learning_rate": 7.515599580283355e-05, "loss": 0.6676, "num_tokens": 1642403239.0, "step": 703 }, { "epoch": 1.043553404770135, "grad_norm": 0.7934296382607025, "learning_rate": 7.513294213796416e-05, "loss": 0.6595, "num_tokens": 1644747762.0, "step": 704 }, { "epoch": 1.043899066712755, "grad_norm": 0.785876435755504, "learning_rate": 7.510983729773172e-05, "loss": 0.6544, "num_tokens": 1647148874.0, "step": 705 }, { "epoch": 1.044244728655375, "grad_norm": 0.6653142046134285, "learning_rate": 7.50866813157916e-05, "loss": 0.6476, "num_tokens": 1649429078.0, "step": 706 }, { "epoch": 1.0445903905979952, "grad_norm": 0.5805927347944138, "learning_rate": 7.506347422587367e-05, "loss": 0.6459, "num_tokens": 1651724969.0, "step": 707 }, { "epoch": 1.0449360525406153, "grad_norm": 0.4995620326946016, "learning_rate": 7.504021606178223e-05, "loss": 0.652, "num_tokens": 1654102988.0, "step": 708 }, { "epoch": 1.0452817144832354, "grad_norm": 1.0024879397761142, "learning_rate": 7.5016906857396e-05, "loss": 0.6537, "num_tokens": 1656515825.0, "step": 709 }, { "epoch": 1.0456273764258555, "grad_norm": 0.7784919709657042, "learning_rate": 7.499354664666799e-05, "loss": 0.6468, "num_tokens": 1658838824.0, "step": 710 }, { "epoch": 1.0459730383684755, "grad_norm": 0.9938225644209574, "learning_rate": 7.49701354636256e-05, "loss": 0.6632, "num_tokens": 1661324459.0, "step": 711 }, { "epoch": 1.0463187003110956, "grad_norm": 0.8426641245749431, "learning_rate": 7.494667334237038e-05, "loss": 0.6584, "num_tokens": 1663618935.0, "step": 712 }, { "epoch": 1.046664362253716, "grad_norm": 1.0918729539916363, "learning_rate": 7.492316031707816e-05, "loss": 0.6582, "num_tokens": 1666090420.0, "step": 713 }, { "epoch": 1.047010024196336, "grad_norm": 0.7066020239858003, "learning_rate": 7.489959642199887e-05, "loss": 0.6445, "num_tokens": 1668437369.0, "step": 714 }, { "epoch": 1.0473556861389561, "grad_norm": 1.0404264832931147, "learning_rate": 7.487598169145655e-05, "loss": 0.6375, "num_tokens": 1670695341.0, "step": 715 }, { "epoch": 1.0477013480815762, "grad_norm": 0.7302789837282395, "learning_rate": 7.485231615984931e-05, "loss": 0.6578, "num_tokens": 1672936203.0, "step": 716 }, { "epoch": 1.0480470100241963, "grad_norm": 0.99046289830908, "learning_rate": 7.482859986164923e-05, "loss": 0.6295, "num_tokens": 1675294177.0, "step": 717 }, { "epoch": 1.0483926719668164, "grad_norm": 0.8661096408910032, "learning_rate": 7.480483283140234e-05, "loss": 0.6394, "num_tokens": 1677573567.0, "step": 718 }, { "epoch": 1.0487383339094365, "grad_norm": 0.8909145637324993, "learning_rate": 7.478101510372859e-05, "loss": 0.6531, "num_tokens": 1679936390.0, "step": 719 }, { "epoch": 1.0490839958520566, "grad_norm": 0.8656962894943192, "learning_rate": 7.475714671332174e-05, "loss": 0.6437, "num_tokens": 1682313056.0, "step": 720 }, { "epoch": 1.049429657794677, "grad_norm": 0.7777445530331969, "learning_rate": 7.473322769494939e-05, "loss": 0.6629, "num_tokens": 1684756818.0, "step": 721 }, { "epoch": 1.049775319737297, "grad_norm": 0.7722760692911367, "learning_rate": 7.470925808345288e-05, "loss": 0.667, "num_tokens": 1687064609.0, "step": 722 }, { "epoch": 1.050120981679917, "grad_norm": 0.5156006867367259, "learning_rate": 7.468523791374722e-05, "loss": 0.656, "num_tokens": 1689398275.0, "step": 723 }, { "epoch": 1.0504666436225372, "grad_norm": 1.2427373802965145, "learning_rate": 7.466116722082109e-05, "loss": 0.6556, "num_tokens": 1691759259.0, "step": 724 }, { "epoch": 1.0508123055651573, "grad_norm": 0.9602253747856707, "learning_rate": 7.463704603973674e-05, "loss": 0.651, "num_tokens": 1694009012.0, "step": 725 }, { "epoch": 1.0511579675077773, "grad_norm": 1.2299356232976921, "learning_rate": 7.461287440562998e-05, "loss": 0.6354, "num_tokens": 1696272685.0, "step": 726 }, { "epoch": 1.0515036294503974, "grad_norm": 1.2578384724260376, "learning_rate": 7.458865235371014e-05, "loss": 0.6485, "num_tokens": 1698606423.0, "step": 727 }, { "epoch": 1.0518492913930175, "grad_norm": 0.8398174663359371, "learning_rate": 7.45643799192599e-05, "loss": 0.6457, "num_tokens": 1700848633.0, "step": 728 }, { "epoch": 1.0521949533356378, "grad_norm": 0.8690683195161508, "learning_rate": 7.454005713763542e-05, "loss": 0.6229, "num_tokens": 1703082424.0, "step": 729 }, { "epoch": 1.052540615278258, "grad_norm": 0.8994921927069949, "learning_rate": 7.451568404426616e-05, "loss": 0.6531, "num_tokens": 1705436343.0, "step": 730 }, { "epoch": 1.052886277220878, "grad_norm": 0.6372820534131493, "learning_rate": 7.449126067465489e-05, "loss": 0.6384, "num_tokens": 1707640584.0, "step": 731 }, { "epoch": 1.053231939163498, "grad_norm": 1.0529172440016912, "learning_rate": 7.446678706437757e-05, "loss": 0.6301, "num_tokens": 1709958348.0, "step": 732 }, { "epoch": 1.0535776011061182, "grad_norm": 0.8723977841586724, "learning_rate": 7.444226324908337e-05, "loss": 0.6614, "num_tokens": 1712337837.0, "step": 733 }, { "epoch": 1.0539232630487383, "grad_norm": 0.9742638243269647, "learning_rate": 7.441768926449462e-05, "loss": 0.6454, "num_tokens": 1714554478.0, "step": 734 }, { "epoch": 1.0542689249913584, "grad_norm": 0.8825078820815238, "learning_rate": 7.439306514640664e-05, "loss": 0.6373, "num_tokens": 1716824287.0, "step": 735 }, { "epoch": 1.0546145869339785, "grad_norm": 0.8084516685896288, "learning_rate": 7.436839093068789e-05, "loss": 0.6554, "num_tokens": 1719124722.0, "step": 736 }, { "epoch": 1.0549602488765988, "grad_norm": 0.6476128238630405, "learning_rate": 7.434366665327972e-05, "loss": 0.6243, "num_tokens": 1721387459.0, "step": 737 }, { "epoch": 1.0553059108192189, "grad_norm": 0.9501488634030735, "learning_rate": 7.431889235019642e-05, "loss": 0.6607, "num_tokens": 1723663967.0, "step": 738 }, { "epoch": 1.055651572761839, "grad_norm": 0.6065977588322126, "learning_rate": 7.429406805752517e-05, "loss": 0.6405, "num_tokens": 1726098124.0, "step": 739 }, { "epoch": 1.055997234704459, "grad_norm": 1.2323642179986805, "learning_rate": 7.426919381142596e-05, "loss": 0.6534, "num_tokens": 1728451629.0, "step": 740 }, { "epoch": 1.0563428966470791, "grad_norm": 1.2010325979432563, "learning_rate": 7.424426964813154e-05, "loss": 0.6679, "num_tokens": 1730884149.0, "step": 741 }, { "epoch": 1.0566885585896992, "grad_norm": 0.6975154125871768, "learning_rate": 7.421929560394736e-05, "loss": 0.6545, "num_tokens": 1733153296.0, "step": 742 }, { "epoch": 1.0570342205323193, "grad_norm": 0.78449847823142, "learning_rate": 7.419427171525152e-05, "loss": 0.6414, "num_tokens": 1735500140.0, "step": 743 }, { "epoch": 1.0573798824749394, "grad_norm": 0.6423226250173539, "learning_rate": 7.416919801849479e-05, "loss": 0.6577, "num_tokens": 1737848869.0, "step": 744 }, { "epoch": 1.0577255444175597, "grad_norm": 0.6105721895654297, "learning_rate": 7.414407455020042e-05, "loss": 0.6192, "num_tokens": 1740058638.0, "step": 745 }, { "epoch": 1.0580712063601798, "grad_norm": 0.5677178819217232, "learning_rate": 7.411890134696417e-05, "loss": 0.6526, "num_tokens": 1742408534.0, "step": 746 }, { "epoch": 1.0584168683028, "grad_norm": 0.5482402190242479, "learning_rate": 7.40936784454543e-05, "loss": 0.6536, "num_tokens": 1744745805.0, "step": 747 }, { "epoch": 1.05876253024542, "grad_norm": 0.547107451115583, "learning_rate": 7.406840588241138e-05, "loss": 0.6441, "num_tokens": 1746951988.0, "step": 748 }, { "epoch": 1.05910819218804, "grad_norm": 0.5905695130265809, "learning_rate": 7.404308369464839e-05, "loss": 0.6341, "num_tokens": 1749258212.0, "step": 749 }, { "epoch": 1.0594538541306602, "grad_norm": 0.5030246960359617, "learning_rate": 7.401771191905056e-05, "loss": 0.6479, "num_tokens": 1751554535.0, "step": 750 }, { "epoch": 1.0597995160732803, "grad_norm": 0.5832871767615319, "learning_rate": 7.399229059257537e-05, "loss": 0.652, "num_tokens": 1753966669.0, "step": 751 }, { "epoch": 1.0601451780159004, "grad_norm": 0.5121125935844938, "learning_rate": 7.396681975225244e-05, "loss": 0.6569, "num_tokens": 1756377001.0, "step": 752 }, { "epoch": 1.0604908399585207, "grad_norm": 0.4503017435065586, "learning_rate": 7.394129943518356e-05, "loss": 0.6311, "num_tokens": 1758737119.0, "step": 753 }, { "epoch": 1.0608365019011408, "grad_norm": 0.7949061846548016, "learning_rate": 7.391572967854258e-05, "loss": 0.648, "num_tokens": 1761026759.0, "step": 754 }, { "epoch": 1.0611821638437609, "grad_norm": 0.4560064351545243, "learning_rate": 7.389011051957532e-05, "loss": 0.6451, "num_tokens": 1763350042.0, "step": 755 }, { "epoch": 1.061527825786381, "grad_norm": 1.2347077769242014, "learning_rate": 7.386444199559961e-05, "loss": 0.6487, "num_tokens": 1765650817.0, "step": 756 }, { "epoch": 1.061873487729001, "grad_norm": 1.158583874700133, "learning_rate": 7.38387241440052e-05, "loss": 0.6504, "num_tokens": 1767881814.0, "step": 757 }, { "epoch": 1.0622191496716211, "grad_norm": 0.6448011263007218, "learning_rate": 7.381295700225364e-05, "loss": 0.641, "num_tokens": 1770175833.0, "step": 758 }, { "epoch": 1.0625648116142412, "grad_norm": 0.8917307508923848, "learning_rate": 7.37871406078783e-05, "loss": 0.667, "num_tokens": 1772663319.0, "step": 759 }, { "epoch": 1.0629104735568613, "grad_norm": 0.6104621491368218, "learning_rate": 7.37612749984843e-05, "loss": 0.6436, "num_tokens": 1775138394.0, "step": 760 }, { "epoch": 1.0632561354994814, "grad_norm": 0.8124160430251444, "learning_rate": 7.373536021174847e-05, "loss": 0.6536, "num_tokens": 1777396741.0, "step": 761 }, { "epoch": 1.0636017974421017, "grad_norm": 0.7039889909540522, "learning_rate": 7.370939628541924e-05, "loss": 0.6474, "num_tokens": 1779706764.0, "step": 762 }, { "epoch": 1.0639474593847218, "grad_norm": 0.689193292786864, "learning_rate": 7.368338325731661e-05, "loss": 0.6439, "num_tokens": 1781916130.0, "step": 763 }, { "epoch": 1.0642931213273419, "grad_norm": 0.6390137507725259, "learning_rate": 7.365732116533211e-05, "loss": 0.6419, "num_tokens": 1784188330.0, "step": 764 }, { "epoch": 1.064638783269962, "grad_norm": 0.6554849571391587, "learning_rate": 7.363121004742878e-05, "loss": 0.6519, "num_tokens": 1786644594.0, "step": 765 }, { "epoch": 1.064984445212582, "grad_norm": 0.6312688213053872, "learning_rate": 7.360504994164103e-05, "loss": 0.6507, "num_tokens": 1789030996.0, "step": 766 }, { "epoch": 1.0653301071552022, "grad_norm": 0.6189738003958034, "learning_rate": 7.357884088607464e-05, "loss": 0.6488, "num_tokens": 1791436337.0, "step": 767 }, { "epoch": 1.0656757690978222, "grad_norm": 0.6977938063772677, "learning_rate": 7.355258291890668e-05, "loss": 0.6499, "num_tokens": 1793687333.0, "step": 768 }, { "epoch": 1.0660214310404426, "grad_norm": 0.619630514606608, "learning_rate": 7.352627607838552e-05, "loss": 0.6462, "num_tokens": 1796062168.0, "step": 769 }, { "epoch": 1.0663670929830626, "grad_norm": 0.5828130974841995, "learning_rate": 7.349992040283063e-05, "loss": 0.645, "num_tokens": 1798457607.0, "step": 770 }, { "epoch": 1.0667127549256827, "grad_norm": 0.6199761505982225, "learning_rate": 7.347351593063268e-05, "loss": 0.6372, "num_tokens": 1800781598.0, "step": 771 }, { "epoch": 1.0670584168683028, "grad_norm": 0.5266254097971709, "learning_rate": 7.344706270025341e-05, "loss": 0.6395, "num_tokens": 1803166909.0, "step": 772 }, { "epoch": 1.067404078810923, "grad_norm": 0.6359171321541497, "learning_rate": 7.342056075022558e-05, "loss": 0.6643, "num_tokens": 1805621827.0, "step": 773 }, { "epoch": 1.067749740753543, "grad_norm": 0.5002793760377864, "learning_rate": 7.339401011915288e-05, "loss": 0.6553, "num_tokens": 1808133341.0, "step": 774 }, { "epoch": 1.068095402696163, "grad_norm": 0.45445502906255464, "learning_rate": 7.336741084571e-05, "loss": 0.6436, "num_tokens": 1810498181.0, "step": 775 }, { "epoch": 1.0684410646387832, "grad_norm": 0.5993541693473192, "learning_rate": 7.334076296864237e-05, "loss": 0.6566, "num_tokens": 1812815708.0, "step": 776 }, { "epoch": 1.0687867265814033, "grad_norm": 0.39715208242474315, "learning_rate": 7.331406652676631e-05, "loss": 0.6257, "num_tokens": 1815191832.0, "step": 777 }, { "epoch": 1.0691323885240236, "grad_norm": 0.5807411421752339, "learning_rate": 7.328732155896883e-05, "loss": 0.6512, "num_tokens": 1817555497.0, "step": 778 }, { "epoch": 1.0694780504666437, "grad_norm": 0.536870812872521, "learning_rate": 7.326052810420765e-05, "loss": 0.6386, "num_tokens": 1819843980.0, "step": 779 }, { "epoch": 1.0698237124092638, "grad_norm": 0.4966890078545703, "learning_rate": 7.323368620151112e-05, "loss": 0.6371, "num_tokens": 1822080005.0, "step": 780 }, { "epoch": 1.0701693743518839, "grad_norm": 0.5976501128568695, "learning_rate": 7.320679588997813e-05, "loss": 0.6588, "num_tokens": 1824416891.0, "step": 781 }, { "epoch": 1.070515036294504, "grad_norm": 0.46247030445853615, "learning_rate": 7.317985720877812e-05, "loss": 0.6402, "num_tokens": 1826709154.0, "step": 782 }, { "epoch": 1.070860698237124, "grad_norm": 0.7536096132054806, "learning_rate": 7.315287019715096e-05, "loss": 0.6437, "num_tokens": 1829001355.0, "step": 783 }, { "epoch": 1.0712063601797441, "grad_norm": 0.38994420778625705, "learning_rate": 7.312583489440692e-05, "loss": 0.6538, "num_tokens": 1831393805.0, "step": 784 }, { "epoch": 1.0715520221223644, "grad_norm": 0.7109690429009989, "learning_rate": 7.309875133992666e-05, "loss": 0.6455, "num_tokens": 1833796453.0, "step": 785 }, { "epoch": 1.0718976840649845, "grad_norm": 0.4852179051637593, "learning_rate": 7.307161957316106e-05, "loss": 0.6527, "num_tokens": 1836211006.0, "step": 786 }, { "epoch": 1.0722433460076046, "grad_norm": 0.7083451989057316, "learning_rate": 7.304443963363126e-05, "loss": 0.6522, "num_tokens": 1838424514.0, "step": 787 }, { "epoch": 1.0725890079502247, "grad_norm": 0.7573885040764021, "learning_rate": 7.301721156092858e-05, "loss": 0.6398, "num_tokens": 1840656105.0, "step": 788 }, { "epoch": 1.0729346698928448, "grad_norm": 0.6114814000393922, "learning_rate": 7.298993539471443e-05, "loss": 0.6598, "num_tokens": 1843070310.0, "step": 789 }, { "epoch": 1.073280331835465, "grad_norm": 0.6934632391747559, "learning_rate": 7.29626111747203e-05, "loss": 0.656, "num_tokens": 1845457321.0, "step": 790 }, { "epoch": 1.073625993778085, "grad_norm": 0.5843166128213365, "learning_rate": 7.293523894074763e-05, "loss": 0.641, "num_tokens": 1847767475.0, "step": 791 }, { "epoch": 1.073971655720705, "grad_norm": 0.667579078831484, "learning_rate": 7.290781873266787e-05, "loss": 0.6349, "num_tokens": 1850145899.0, "step": 792 }, { "epoch": 1.0743173176633252, "grad_norm": 0.6197358398924984, "learning_rate": 7.288035059042229e-05, "loss": 0.6564, "num_tokens": 1852622005.0, "step": 793 }, { "epoch": 1.0746629796059455, "grad_norm": 0.7001967420866541, "learning_rate": 7.285283455402199e-05, "loss": 0.6277, "num_tokens": 1854879141.0, "step": 794 }, { "epoch": 1.0750086415485656, "grad_norm": 0.5182855704796356, "learning_rate": 7.282527066354787e-05, "loss": 0.6497, "num_tokens": 1857195141.0, "step": 795 }, { "epoch": 1.0753543034911857, "grad_norm": 0.5523797722375886, "learning_rate": 7.279765895915051e-05, "loss": 0.63, "num_tokens": 1859489387.0, "step": 796 }, { "epoch": 1.0756999654338057, "grad_norm": 0.760152374250873, "learning_rate": 7.276999948105014e-05, "loss": 0.6309, "num_tokens": 1861760812.0, "step": 797 }, { "epoch": 1.0760456273764258, "grad_norm": 0.4682843426946511, "learning_rate": 7.274229226953658e-05, "loss": 0.631, "num_tokens": 1864121467.0, "step": 798 }, { "epoch": 1.076391289319046, "grad_norm": 0.8337644900637217, "learning_rate": 7.271453736496918e-05, "loss": 0.6447, "num_tokens": 1866587557.0, "step": 799 }, { "epoch": 1.076736951261666, "grad_norm": 0.7879071626071465, "learning_rate": 7.268673480777676e-05, "loss": 0.6505, "num_tokens": 1868815218.0, "step": 800 }, { "epoch": 1.077082613204286, "grad_norm": 0.8061431677035321, "learning_rate": 7.265888463845758e-05, "loss": 0.6389, "num_tokens": 1871103301.0, "step": 801 }, { "epoch": 1.0774282751469064, "grad_norm": 0.6987493343066828, "learning_rate": 7.26309868975792e-05, "loss": 0.6489, "num_tokens": 1873534935.0, "step": 802 }, { "epoch": 1.0777739370895265, "grad_norm": 0.6400362720975927, "learning_rate": 7.260304162577852e-05, "loss": 0.6493, "num_tokens": 1875885442.0, "step": 803 }, { "epoch": 1.0781195990321466, "grad_norm": 0.6266223091351598, "learning_rate": 7.257504886376164e-05, "loss": 0.6655, "num_tokens": 1878198463.0, "step": 804 }, { "epoch": 1.0784652609747667, "grad_norm": 0.6739942656077761, "learning_rate": 7.254700865230387e-05, "loss": 0.6768, "num_tokens": 1880600485.0, "step": 805 }, { "epoch": 1.0788109229173868, "grad_norm": 0.5902245697276366, "learning_rate": 7.251892103224961e-05, "loss": 0.6397, "num_tokens": 1882914183.0, "step": 806 }, { "epoch": 1.0791565848600069, "grad_norm": 0.7020689395643801, "learning_rate": 7.249078604451235e-05, "loss": 0.6395, "num_tokens": 1885233760.0, "step": 807 }, { "epoch": 1.079502246802627, "grad_norm": 0.6115206656323819, "learning_rate": 7.246260373007453e-05, "loss": 0.6506, "num_tokens": 1887564291.0, "step": 808 }, { "epoch": 1.079847908745247, "grad_norm": 0.6114811908663207, "learning_rate": 7.243437412998757e-05, "loss": 0.6366, "num_tokens": 1889754096.0, "step": 809 }, { "epoch": 1.0801935706878674, "grad_norm": 0.46225512013206405, "learning_rate": 7.240609728537177e-05, "loss": 0.6512, "num_tokens": 1892045520.0, "step": 810 }, { "epoch": 1.0805392326304875, "grad_norm": 0.6560348649529575, "learning_rate": 7.237777323741618e-05, "loss": 0.6557, "num_tokens": 1894354541.0, "step": 811 }, { "epoch": 1.0808848945731075, "grad_norm": 0.6453921405854075, "learning_rate": 7.23494020273787e-05, "loss": 0.6525, "num_tokens": 1896652688.0, "step": 812 }, { "epoch": 1.0812305565157276, "grad_norm": 0.5369860393744597, "learning_rate": 7.232098369658586e-05, "loss": 0.6526, "num_tokens": 1899087830.0, "step": 813 }, { "epoch": 1.0815762184583477, "grad_norm": 0.4107974206461175, "learning_rate": 7.229251828643286e-05, "loss": 0.6424, "num_tokens": 1901392709.0, "step": 814 }, { "epoch": 1.0819218804009678, "grad_norm": 0.4578877114563842, "learning_rate": 7.226400583838349e-05, "loss": 0.6304, "num_tokens": 1903834795.0, "step": 815 }, { "epoch": 1.082267542343588, "grad_norm": 0.5944889433338477, "learning_rate": 7.223544639397004e-05, "loss": 0.6458, "num_tokens": 1906206411.0, "step": 816 }, { "epoch": 1.082613204286208, "grad_norm": 0.6184931999931358, "learning_rate": 7.220683999479321e-05, "loss": 0.6361, "num_tokens": 1908577672.0, "step": 817 }, { "epoch": 1.0829588662288283, "grad_norm": 0.45795276862694984, "learning_rate": 7.217818668252218e-05, "loss": 0.6331, "num_tokens": 1910889471.0, "step": 818 }, { "epoch": 1.0833045281714484, "grad_norm": 0.5132892740499396, "learning_rate": 7.214948649889444e-05, "loss": 0.6377, "num_tokens": 1913234203.0, "step": 819 }, { "epoch": 1.0836501901140685, "grad_norm": 0.3448583302603749, "learning_rate": 7.212073948571568e-05, "loss": 0.628, "num_tokens": 1915596601.0, "step": 820 }, { "epoch": 1.0839958520566886, "grad_norm": 0.5513522868846035, "learning_rate": 7.209194568485995e-05, "loss": 0.6233, "num_tokens": 1917901182.0, "step": 821 }, { "epoch": 1.0843415139993087, "grad_norm": 0.5866413038599926, "learning_rate": 7.20631051382693e-05, "loss": 0.6458, "num_tokens": 1920226840.0, "step": 822 }, { "epoch": 1.0846871759419288, "grad_norm": 0.4045684168469693, "learning_rate": 7.203421788795396e-05, "loss": 0.6471, "num_tokens": 1922453696.0, "step": 823 }, { "epoch": 1.0850328378845489, "grad_norm": 0.7869103482008692, "learning_rate": 7.200528397599219e-05, "loss": 0.6494, "num_tokens": 1924759201.0, "step": 824 }, { "epoch": 1.085378499827169, "grad_norm": 0.5410056370259639, "learning_rate": 7.197630344453017e-05, "loss": 0.6602, "num_tokens": 1927135596.0, "step": 825 }, { "epoch": 1.085724161769789, "grad_norm": 0.7958783353763306, "learning_rate": 7.194727633578201e-05, "loss": 0.6549, "num_tokens": 1929610669.0, "step": 826 }, { "epoch": 1.0860698237124093, "grad_norm": 0.6402841646465279, "learning_rate": 7.19182026920297e-05, "loss": 0.6459, "num_tokens": 1931883643.0, "step": 827 }, { "epoch": 1.0864154856550294, "grad_norm": 0.818857544167672, "learning_rate": 7.188908255562297e-05, "loss": 0.6481, "num_tokens": 1934243142.0, "step": 828 }, { "epoch": 1.0867611475976495, "grad_norm": 0.5363775378787705, "learning_rate": 7.18599159689793e-05, "loss": 0.6493, "num_tokens": 1936514746.0, "step": 829 }, { "epoch": 1.0871068095402696, "grad_norm": 0.864852068872538, "learning_rate": 7.183070297458383e-05, "loss": 0.6391, "num_tokens": 1938735061.0, "step": 830 }, { "epoch": 1.0874524714828897, "grad_norm": 0.8102716523037938, "learning_rate": 7.180144361498927e-05, "loss": 0.6468, "num_tokens": 1941031300.0, "step": 831 }, { "epoch": 1.0877981334255098, "grad_norm": 0.5484483948735375, "learning_rate": 7.177213793281587e-05, "loss": 0.6548, "num_tokens": 1943463463.0, "step": 832 }, { "epoch": 1.0881437953681299, "grad_norm": 0.7593067290211563, "learning_rate": 7.174278597075143e-05, "loss": 0.6518, "num_tokens": 1945702324.0, "step": 833 }, { "epoch": 1.0884894573107502, "grad_norm": 0.6533774156522734, "learning_rate": 7.171338777155107e-05, "loss": 0.6374, "num_tokens": 1948050388.0, "step": 834 }, { "epoch": 1.0888351192533703, "grad_norm": 0.5686216185926073, "learning_rate": 7.16839433780373e-05, "loss": 0.6499, "num_tokens": 1950369193.0, "step": 835 }, { "epoch": 1.0891807811959904, "grad_norm": 0.706657891467921, "learning_rate": 7.165445283309989e-05, "loss": 0.6464, "num_tokens": 1952782653.0, "step": 836 }, { "epoch": 1.0895264431386105, "grad_norm": 0.572975938004015, "learning_rate": 7.162491617969592e-05, "loss": 0.6411, "num_tokens": 1955164495.0, "step": 837 }, { "epoch": 1.0898721050812306, "grad_norm": 0.4864844513645345, "learning_rate": 7.159533346084952e-05, "loss": 0.6362, "num_tokens": 1957510144.0, "step": 838 }, { "epoch": 1.0902177670238506, "grad_norm": 0.5932761701971104, "learning_rate": 7.156570471965199e-05, "loss": 0.6434, "num_tokens": 1959879213.0, "step": 839 }, { "epoch": 1.0905634289664707, "grad_norm": 0.4599648868842703, "learning_rate": 7.153602999926166e-05, "loss": 0.6264, "num_tokens": 1962237084.0, "step": 840 }, { "epoch": 1.0909090909090908, "grad_norm": 0.6961549336858718, "learning_rate": 7.150630934290383e-05, "loss": 0.6564, "num_tokens": 1964642093.0, "step": 841 }, { "epoch": 1.091254752851711, "grad_norm": 0.5532430573581417, "learning_rate": 7.147654279387071e-05, "loss": 0.6357, "num_tokens": 1967001460.0, "step": 842 }, { "epoch": 1.0916004147943312, "grad_norm": 0.5394878914126686, "learning_rate": 7.144673039552135e-05, "loss": 0.6411, "num_tokens": 1969354407.0, "step": 843 }, { "epoch": 1.0919460767369513, "grad_norm": 0.6909141430882717, "learning_rate": 7.14168721912816e-05, "loss": 0.623, "num_tokens": 1971649574.0, "step": 844 }, { "epoch": 1.0922917386795714, "grad_norm": 0.5137859737101764, "learning_rate": 7.138696822464401e-05, "loss": 0.6486, "num_tokens": 1973936364.0, "step": 845 }, { "epoch": 1.0926374006221915, "grad_norm": 0.832982418739012, "learning_rate": 7.135701853916784e-05, "loss": 0.6382, "num_tokens": 1976244124.0, "step": 846 }, { "epoch": 1.0929830625648116, "grad_norm": 0.7229837251766013, "learning_rate": 7.132702317847889e-05, "loss": 0.6198, "num_tokens": 1978510513.0, "step": 847 }, { "epoch": 1.0933287245074317, "grad_norm": 0.7518776950417697, "learning_rate": 7.129698218626951e-05, "loss": 0.6523, "num_tokens": 1980794292.0, "step": 848 }, { "epoch": 1.0936743864500518, "grad_norm": 0.7267343038511015, "learning_rate": 7.126689560629852e-05, "loss": 0.629, "num_tokens": 1983095973.0, "step": 849 }, { "epoch": 1.094020048392672, "grad_norm": 0.5623206201756308, "learning_rate": 7.123676348239117e-05, "loss": 0.6211, "num_tokens": 1985399396.0, "step": 850 }, { "epoch": 1.0943657103352922, "grad_norm": 0.6394717281881187, "learning_rate": 7.120658585843901e-05, "loss": 0.6193, "num_tokens": 1987621030.0, "step": 851 }, { "epoch": 1.0947113722779123, "grad_norm": 0.4674349248748675, "learning_rate": 7.117636277839989e-05, "loss": 0.6468, "num_tokens": 1989875933.0, "step": 852 }, { "epoch": 1.0950570342205324, "grad_norm": 0.7775480942695264, "learning_rate": 7.114609428629787e-05, "loss": 0.6482, "num_tokens": 1992156139.0, "step": 853 }, { "epoch": 1.0954026961631524, "grad_norm": 0.5758101275193842, "learning_rate": 7.111578042622317e-05, "loss": 0.6259, "num_tokens": 1994420988.0, "step": 854 }, { "epoch": 1.0957483581057725, "grad_norm": 0.8718137102717207, "learning_rate": 7.108542124233206e-05, "loss": 0.6515, "num_tokens": 1996790521.0, "step": 855 }, { "epoch": 1.0960940200483926, "grad_norm": 0.7422005655497319, "learning_rate": 7.105501677884686e-05, "loss": 0.6457, "num_tokens": 1999127227.0, "step": 856 }, { "epoch": 1.0964396819910127, "grad_norm": 0.7689179759714174, "learning_rate": 7.102456708005585e-05, "loss": 0.6349, "num_tokens": 2001456840.0, "step": 857 }, { "epoch": 1.0967853439336328, "grad_norm": 0.6696473629178211, "learning_rate": 7.099407219031317e-05, "loss": 0.643, "num_tokens": 2003834413.0, "step": 858 }, { "epoch": 1.0971310058762531, "grad_norm": 0.7196473711071472, "learning_rate": 7.096353215403882e-05, "loss": 0.6297, "num_tokens": 2006098585.0, "step": 859 }, { "epoch": 1.0974766678188732, "grad_norm": 0.5928074700274769, "learning_rate": 7.093294701571853e-05, "loss": 0.6409, "num_tokens": 2008489293.0, "step": 860 }, { "epoch": 1.0978223297614933, "grad_norm": 0.7056246259237353, "learning_rate": 7.090231681990379e-05, "loss": 0.6414, "num_tokens": 2010808787.0, "step": 861 }, { "epoch": 1.0981679917041134, "grad_norm": 0.5486511854819776, "learning_rate": 7.087164161121162e-05, "loss": 0.6478, "num_tokens": 2013092910.0, "step": 862 }, { "epoch": 1.0985136536467335, "grad_norm": 0.7275998401947965, "learning_rate": 7.084092143432472e-05, "loss": 0.6219, "num_tokens": 2015422559.0, "step": 863 }, { "epoch": 1.0988593155893536, "grad_norm": 0.5902939257926976, "learning_rate": 7.08101563339912e-05, "loss": 0.635, "num_tokens": 2017782573.0, "step": 864 }, { "epoch": 1.0992049775319737, "grad_norm": 0.6373096265415296, "learning_rate": 7.077934635502467e-05, "loss": 0.6401, "num_tokens": 2020116306.0, "step": 865 }, { "epoch": 1.0995506394745937, "grad_norm": 0.5207584379771001, "learning_rate": 7.074849154230407e-05, "loss": 0.6322, "num_tokens": 2022443781.0, "step": 866 }, { "epoch": 1.099896301417214, "grad_norm": 0.6445001637743131, "learning_rate": 7.071759194077368e-05, "loss": 0.6243, "num_tokens": 2024745124.0, "step": 867 }, { "epoch": 1.1002419633598342, "grad_norm": 0.6259526617735324, "learning_rate": 7.068664759544299e-05, "loss": 0.6365, "num_tokens": 2027035978.0, "step": 868 }, { "epoch": 1.1005876253024542, "grad_norm": 0.6851587462113808, "learning_rate": 7.065565855138669e-05, "loss": 0.6415, "num_tokens": 2029384468.0, "step": 869 }, { "epoch": 1.1009332872450743, "grad_norm": 0.7118355424982044, "learning_rate": 7.062462485374456e-05, "loss": 0.6365, "num_tokens": 2031813719.0, "step": 870 }, { "epoch": 1.1012789491876944, "grad_norm": 0.5718826597746939, "learning_rate": 7.059354654772145e-05, "loss": 0.6371, "num_tokens": 2034215691.0, "step": 871 }, { "epoch": 1.1016246111303145, "grad_norm": 0.669027918816834, "learning_rate": 7.056242367858716e-05, "loss": 0.617, "num_tokens": 2036688303.0, "step": 872 }, { "epoch": 1.1019702730729346, "grad_norm": 0.4390920202449945, "learning_rate": 7.05312562916764e-05, "loss": 0.631, "num_tokens": 2039089916.0, "step": 873 }, { "epoch": 1.1023159350155547, "grad_norm": 0.7705947538110972, "learning_rate": 7.050004443238879e-05, "loss": 0.6365, "num_tokens": 2041486542.0, "step": 874 }, { "epoch": 1.102661596958175, "grad_norm": 0.5244246251307207, "learning_rate": 7.046878814618862e-05, "loss": 0.6338, "num_tokens": 2043822878.0, "step": 875 }, { "epoch": 1.103007258900795, "grad_norm": 0.6855156107505581, "learning_rate": 7.0437487478605e-05, "loss": 0.6157, "num_tokens": 2046121260.0, "step": 876 }, { "epoch": 1.1033529208434152, "grad_norm": 0.5914213555415903, "learning_rate": 7.040614247523163e-05, "loss": 0.6408, "num_tokens": 2048421563.0, "step": 877 }, { "epoch": 1.1036985827860353, "grad_norm": 0.6966852099859985, "learning_rate": 7.037475318172679e-05, "loss": 0.6263, "num_tokens": 2050773142.0, "step": 878 }, { "epoch": 1.1040442447286554, "grad_norm": 0.6057019635316307, "learning_rate": 7.03433196438133e-05, "loss": 0.6264, "num_tokens": 2053065791.0, "step": 879 }, { "epoch": 1.1043899066712755, "grad_norm": 0.6243191805663824, "learning_rate": 7.031184190727843e-05, "loss": 0.6318, "num_tokens": 2055382885.0, "step": 880 }, { "epoch": 1.1047355686138955, "grad_norm": 0.6471641175860846, "learning_rate": 7.028032001797379e-05, "loss": 0.6401, "num_tokens": 2057694752.0, "step": 881 }, { "epoch": 1.1050812305565156, "grad_norm": 0.4717662214072129, "learning_rate": 7.024875402181535e-05, "loss": 0.6279, "num_tokens": 2059953810.0, "step": 882 }, { "epoch": 1.105426892499136, "grad_norm": 0.6993358214802385, "learning_rate": 7.02171439647833e-05, "loss": 0.6328, "num_tokens": 2062336734.0, "step": 883 }, { "epoch": 1.105772554441756, "grad_norm": 0.6930172186519157, "learning_rate": 7.018548989292204e-05, "loss": 0.6507, "num_tokens": 2064670753.0, "step": 884 }, { "epoch": 1.1061182163843761, "grad_norm": 0.48513162283116845, "learning_rate": 7.015379185234004e-05, "loss": 0.654, "num_tokens": 2066999169.0, "step": 885 }, { "epoch": 1.1064638783269962, "grad_norm": 0.79984066658087, "learning_rate": 7.012204988920986e-05, "loss": 0.6583, "num_tokens": 2069455232.0, "step": 886 }, { "epoch": 1.1068095402696163, "grad_norm": 0.5615265223757545, "learning_rate": 7.0090264049768e-05, "loss": 0.6457, "num_tokens": 2071803138.0, "step": 887 }, { "epoch": 1.1071552022122364, "grad_norm": 0.7388769745641661, "learning_rate": 7.00584343803149e-05, "loss": 0.6471, "num_tokens": 2074194623.0, "step": 888 }, { "epoch": 1.1075008641548565, "grad_norm": 0.6596766010589992, "learning_rate": 7.002656092721486e-05, "loss": 0.6602, "num_tokens": 2076680051.0, "step": 889 }, { "epoch": 1.1078465260974766, "grad_norm": 0.6825923823675335, "learning_rate": 6.99946437368959e-05, "loss": 0.6425, "num_tokens": 2079095231.0, "step": 890 }, { "epoch": 1.1081921880400967, "grad_norm": 0.5054575681810504, "learning_rate": 6.99626828558498e-05, "loss": 0.6208, "num_tokens": 2081422851.0, "step": 891 }, { "epoch": 1.108537849982717, "grad_norm": 0.7101454920931178, "learning_rate": 6.993067833063194e-05, "loss": 0.6257, "num_tokens": 2083631354.0, "step": 892 }, { "epoch": 1.108883511925337, "grad_norm": 0.6813611778539539, "learning_rate": 6.989863020786133e-05, "loss": 0.6441, "num_tokens": 2086103646.0, "step": 893 }, { "epoch": 1.1092291738679572, "grad_norm": 0.5808563119676818, "learning_rate": 6.986653853422046e-05, "loss": 0.635, "num_tokens": 2088435733.0, "step": 894 }, { "epoch": 1.1095748358105773, "grad_norm": 0.6763093451632597, "learning_rate": 6.983440335645522e-05, "loss": 0.6454, "num_tokens": 2090777490.0, "step": 895 }, { "epoch": 1.1099204977531973, "grad_norm": 0.4916894744134045, "learning_rate": 6.98022247213749e-05, "loss": 0.6426, "num_tokens": 2093112752.0, "step": 896 }, { "epoch": 1.1102661596958174, "grad_norm": 0.5976052169547356, "learning_rate": 6.977000267585211e-05, "loss": 0.6368, "num_tokens": 2095506528.0, "step": 897 }, { "epoch": 1.1106118216384375, "grad_norm": 0.48367407629573994, "learning_rate": 6.973773726682268e-05, "loss": 0.6215, "num_tokens": 2097886592.0, "step": 898 }, { "epoch": 1.1109574835810578, "grad_norm": 0.4999548515884936, "learning_rate": 6.970542854128557e-05, "loss": 0.6227, "num_tokens": 2100159013.0, "step": 899 }, { "epoch": 1.111303145523678, "grad_norm": 0.8232656973391344, "learning_rate": 6.967307654630291e-05, "loss": 0.6276, "num_tokens": 2102437349.0, "step": 900 }, { "epoch": 1.111648807466298, "grad_norm": 0.7566580413902171, "learning_rate": 6.964068132899979e-05, "loss": 0.6366, "num_tokens": 2104776738.0, "step": 901 }, { "epoch": 1.111994469408918, "grad_norm": 0.73199001613691, "learning_rate": 6.960824293656429e-05, "loss": 0.6221, "num_tokens": 2107100856.0, "step": 902 }, { "epoch": 1.1123401313515382, "grad_norm": 0.8697615015805004, "learning_rate": 6.957576141624736e-05, "loss": 0.6346, "num_tokens": 2109494220.0, "step": 903 }, { "epoch": 1.1126857932941583, "grad_norm": 0.5199833480250547, "learning_rate": 6.95432368153628e-05, "loss": 0.6236, "num_tokens": 2111818402.0, "step": 904 }, { "epoch": 1.1130314552367784, "grad_norm": 0.6901009839572165, "learning_rate": 6.951066918128716e-05, "loss": 0.6405, "num_tokens": 2114112431.0, "step": 905 }, { "epoch": 1.1133771171793985, "grad_norm": 0.5531430275448762, "learning_rate": 6.947805856145965e-05, "loss": 0.6358, "num_tokens": 2116496655.0, "step": 906 }, { "epoch": 1.1137227791220186, "grad_norm": 0.555268446488502, "learning_rate": 6.944540500338212e-05, "loss": 0.634, "num_tokens": 2118796208.0, "step": 907 }, { "epoch": 1.1140684410646389, "grad_norm": 0.5692476125999714, "learning_rate": 6.941270855461891e-05, "loss": 0.6246, "num_tokens": 2121183141.0, "step": 908 }, { "epoch": 1.114414103007259, "grad_norm": 0.6671198843025686, "learning_rate": 6.937996926279694e-05, "loss": 0.6383, "num_tokens": 2123584625.0, "step": 909 }, { "epoch": 1.114759764949879, "grad_norm": 0.5598019317327989, "learning_rate": 6.934718717560543e-05, "loss": 0.633, "num_tokens": 2126008691.0, "step": 910 }, { "epoch": 1.1151054268924991, "grad_norm": 0.7795964951726142, "learning_rate": 6.9314362340796e-05, "loss": 0.6258, "num_tokens": 2128369876.0, "step": 911 }, { "epoch": 1.1154510888351192, "grad_norm": 0.753000224616792, "learning_rate": 6.928149480618252e-05, "loss": 0.6441, "num_tokens": 2130554887.0, "step": 912 }, { "epoch": 1.1157967507777393, "grad_norm": 0.6037643159868399, "learning_rate": 6.924858461964108e-05, "loss": 0.6383, "num_tokens": 2132953171.0, "step": 913 }, { "epoch": 1.1161424127203594, "grad_norm": 0.5991831241506346, "learning_rate": 6.921563182910983e-05, "loss": 0.6339, "num_tokens": 2135269648.0, "step": 914 }, { "epoch": 1.1164880746629797, "grad_norm": 0.7047653439113356, "learning_rate": 6.918263648258906e-05, "loss": 0.632, "num_tokens": 2137601618.0, "step": 915 }, { "epoch": 1.1168337366055998, "grad_norm": 0.47454492484100547, "learning_rate": 6.914959862814103e-05, "loss": 0.651, "num_tokens": 2139940157.0, "step": 916 }, { "epoch": 1.11717939854822, "grad_norm": 0.6793734322739983, "learning_rate": 6.911651831388986e-05, "loss": 0.6407, "num_tokens": 2142158525.0, "step": 917 }, { "epoch": 1.11752506049084, "grad_norm": 0.6206771689296277, "learning_rate": 6.908339558802158e-05, "loss": 0.6197, "num_tokens": 2144501158.0, "step": 918 }, { "epoch": 1.11787072243346, "grad_norm": 0.4976655024049246, "learning_rate": 6.905023049878401e-05, "loss": 0.6465, "num_tokens": 2146789115.0, "step": 919 }, { "epoch": 1.1182163843760802, "grad_norm": 0.7230013861607477, "learning_rate": 6.901702309448659e-05, "loss": 0.6503, "num_tokens": 2149159219.0, "step": 920 }, { "epoch": 1.1185620463187003, "grad_norm": 0.5678791087220733, "learning_rate": 6.898377342350051e-05, "loss": 0.6286, "num_tokens": 2151494749.0, "step": 921 }, { "epoch": 1.1189077082613204, "grad_norm": 0.5911803287324875, "learning_rate": 6.895048153425845e-05, "loss": 0.6209, "num_tokens": 2153813345.0, "step": 922 }, { "epoch": 1.1192533702039404, "grad_norm": 0.46345346327761283, "learning_rate": 6.89171474752546e-05, "loss": 0.638, "num_tokens": 2156023059.0, "step": 923 }, { "epoch": 1.1195990321465608, "grad_norm": 0.6110386431097571, "learning_rate": 6.888377129504461e-05, "loss": 0.6367, "num_tokens": 2158287614.0, "step": 924 }, { "epoch": 1.1199446940891808, "grad_norm": 0.4368089387289657, "learning_rate": 6.885035304224543e-05, "loss": 0.635, "num_tokens": 2160666520.0, "step": 925 }, { "epoch": 1.120290356031801, "grad_norm": 0.5372899511278134, "learning_rate": 6.881689276553535e-05, "loss": 0.6246, "num_tokens": 2163000496.0, "step": 926 }, { "epoch": 1.120636017974421, "grad_norm": 0.6538615604219101, "learning_rate": 6.878339051365385e-05, "loss": 0.6427, "num_tokens": 2165308958.0, "step": 927 }, { "epoch": 1.1209816799170411, "grad_norm": 0.4243914914199368, "learning_rate": 6.874984633540154e-05, "loss": 0.6577, "num_tokens": 2167749217.0, "step": 928 }, { "epoch": 1.1213273418596612, "grad_norm": 0.3893441256768904, "learning_rate": 6.871626027964012e-05, "loss": 0.6298, "num_tokens": 2170036892.0, "step": 929 }, { "epoch": 1.1216730038022813, "grad_norm": 0.5810904356526593, "learning_rate": 6.868263239529226e-05, "loss": 0.6304, "num_tokens": 2172379267.0, "step": 930 }, { "epoch": 1.1220186657449014, "grad_norm": 0.5814926105052469, "learning_rate": 6.864896273134165e-05, "loss": 0.6206, "num_tokens": 2174814243.0, "step": 931 }, { "epoch": 1.1223643276875217, "grad_norm": 0.3669268932228981, "learning_rate": 6.861525133683269e-05, "loss": 0.6445, "num_tokens": 2177154550.0, "step": 932 }, { "epoch": 1.1227099896301418, "grad_norm": 0.3750619980799489, "learning_rate": 6.858149826087069e-05, "loss": 0.6238, "num_tokens": 2179468910.0, "step": 933 }, { "epoch": 1.1230556515727619, "grad_norm": 0.3960254726510145, "learning_rate": 6.854770355262162e-05, "loss": 0.623, "num_tokens": 2181699356.0, "step": 934 }, { "epoch": 1.123401313515382, "grad_norm": 0.7447647609741658, "learning_rate": 6.851386726131211e-05, "loss": 0.6463, "num_tokens": 2184168740.0, "step": 935 }, { "epoch": 1.123746975458002, "grad_norm": 0.5180778483368027, "learning_rate": 6.847998943622935e-05, "loss": 0.6332, "num_tokens": 2186365943.0, "step": 936 }, { "epoch": 1.1240926374006222, "grad_norm": 0.6425397415246044, "learning_rate": 6.844607012672104e-05, "loss": 0.6317, "num_tokens": 2188721176.0, "step": 937 }, { "epoch": 1.1244382993432422, "grad_norm": 0.7044000930496582, "learning_rate": 6.841210938219531e-05, "loss": 0.6422, "num_tokens": 2190988111.0, "step": 938 }, { "epoch": 1.1247839612858623, "grad_norm": 0.49193188806039756, "learning_rate": 6.837810725212062e-05, "loss": 0.6372, "num_tokens": 2193380486.0, "step": 939 }, { "epoch": 1.1251296232284824, "grad_norm": 0.6704740327256191, "learning_rate": 6.834406378602576e-05, "loss": 0.6433, "num_tokens": 2195689281.0, "step": 940 }, { "epoch": 1.1254752851711027, "grad_norm": 0.5593393104513635, "learning_rate": 6.830997903349968e-05, "loss": 0.6456, "num_tokens": 2197944315.0, "step": 941 }, { "epoch": 1.1258209471137228, "grad_norm": 0.5875245810645537, "learning_rate": 6.827585304419152e-05, "loss": 0.6329, "num_tokens": 2200277439.0, "step": 942 }, { "epoch": 1.126166609056343, "grad_norm": 0.4498250279337379, "learning_rate": 6.824168586781042e-05, "loss": 0.6328, "num_tokens": 2202577556.0, "step": 943 }, { "epoch": 1.126512270998963, "grad_norm": 0.5893339159235107, "learning_rate": 6.820747755412559e-05, "loss": 0.6393, "num_tokens": 2204940990.0, "step": 944 }, { "epoch": 1.126857932941583, "grad_norm": 0.49392041985893986, "learning_rate": 6.817322815296612e-05, "loss": 0.6416, "num_tokens": 2207320014.0, "step": 945 }, { "epoch": 1.1272035948842032, "grad_norm": 0.46509019182473127, "learning_rate": 6.813893771422095e-05, "loss": 0.6315, "num_tokens": 2209652915.0, "step": 946 }, { "epoch": 1.1275492568268233, "grad_norm": 0.6723719169966719, "learning_rate": 6.81046062878388e-05, "loss": 0.6179, "num_tokens": 2211960764.0, "step": 947 }, { "epoch": 1.1278949187694436, "grad_norm": 0.4621703070283164, "learning_rate": 6.807023392382812e-05, "loss": 0.6438, "num_tokens": 2214418935.0, "step": 948 }, { "epoch": 1.1282405807120637, "grad_norm": 0.7218407737679211, "learning_rate": 6.803582067225695e-05, "loss": 0.6363, "num_tokens": 2216700012.0, "step": 949 }, { "epoch": 1.1285862426546838, "grad_norm": 0.5884391351103374, "learning_rate": 6.800136658325292e-05, "loss": 0.6304, "num_tokens": 2219035399.0, "step": 950 }, { "epoch": 1.1289319045973039, "grad_norm": 0.689816889563364, "learning_rate": 6.796687170700312e-05, "loss": 0.6236, "num_tokens": 2221297737.0, "step": 951 }, { "epoch": 1.129277566539924, "grad_norm": 0.6085290817918191, "learning_rate": 6.793233609375408e-05, "loss": 0.6618, "num_tokens": 2223737424.0, "step": 952 }, { "epoch": 1.129623228482544, "grad_norm": 0.637760361595735, "learning_rate": 6.789775979381162e-05, "loss": 0.6475, "num_tokens": 2226124311.0, "step": 953 }, { "epoch": 1.1299688904251641, "grad_norm": 0.5487522464458742, "learning_rate": 6.786314285754091e-05, "loss": 0.6422, "num_tokens": 2228555461.0, "step": 954 }, { "epoch": 1.1303145523677842, "grad_norm": 0.7340495531293658, "learning_rate": 6.782848533536624e-05, "loss": 0.6085, "num_tokens": 2230983286.0, "step": 955 }, { "epoch": 1.1306602143104043, "grad_norm": 0.6664998247310985, "learning_rate": 6.779378727777103e-05, "loss": 0.6479, "num_tokens": 2233288654.0, "step": 956 }, { "epoch": 1.1310058762530246, "grad_norm": 0.67988864669748, "learning_rate": 6.775904873529778e-05, "loss": 0.6235, "num_tokens": 2235497133.0, "step": 957 }, { "epoch": 1.1313515381956447, "grad_norm": 0.6796758006477891, "learning_rate": 6.772426975854791e-05, "loss": 0.6357, "num_tokens": 2237797176.0, "step": 958 }, { "epoch": 1.1316972001382648, "grad_norm": 0.48626440584935404, "learning_rate": 6.76894503981818e-05, "loss": 0.6259, "num_tokens": 2240134648.0, "step": 959 }, { "epoch": 1.132042862080885, "grad_norm": 0.5740460824237259, "learning_rate": 6.765459070491859e-05, "loss": 0.6521, "num_tokens": 2242440732.0, "step": 960 }, { "epoch": 1.132388524023505, "grad_norm": 0.5988155984820678, "learning_rate": 6.761969072953624e-05, "loss": 0.6227, "num_tokens": 2244829003.0, "step": 961 }, { "epoch": 1.132734185966125, "grad_norm": 0.5067837814332167, "learning_rate": 6.758475052287126e-05, "loss": 0.6461, "num_tokens": 2247227638.0, "step": 962 }, { "epoch": 1.1330798479087452, "grad_norm": 0.7304351732023366, "learning_rate": 6.754977013581897e-05, "loss": 0.6498, "num_tokens": 2249616137.0, "step": 963 }, { "epoch": 1.1334255098513655, "grad_norm": 0.6477905996842682, "learning_rate": 6.751474961933303e-05, "loss": 0.6509, "num_tokens": 2251916922.0, "step": 964 }, { "epoch": 1.1337711717939856, "grad_norm": 0.6482806465104399, "learning_rate": 6.747968902442562e-05, "loss": 0.6373, "num_tokens": 2254204419.0, "step": 965 }, { "epoch": 1.1341168337366057, "grad_norm": 0.6006819045968405, "learning_rate": 6.744458840216731e-05, "loss": 0.6461, "num_tokens": 2256667941.0, "step": 966 }, { "epoch": 1.1344624956792257, "grad_norm": 0.5337562138038692, "learning_rate": 6.740944780368699e-05, "loss": 0.634, "num_tokens": 2258998563.0, "step": 967 }, { "epoch": 1.1348081576218458, "grad_norm": 0.4223695542827033, "learning_rate": 6.737426728017173e-05, "loss": 0.62, "num_tokens": 2261148023.0, "step": 968 }, { "epoch": 1.135153819564466, "grad_norm": 0.5750274402311729, "learning_rate": 6.733904688286678e-05, "loss": 0.6249, "num_tokens": 2263557925.0, "step": 969 }, { "epoch": 1.135499481507086, "grad_norm": 0.5316012645950183, "learning_rate": 6.73037866630755e-05, "loss": 0.6228, "num_tokens": 2265849340.0, "step": 970 }, { "epoch": 1.135845143449706, "grad_norm": 0.7189913183050987, "learning_rate": 6.726848667215923e-05, "loss": 0.6266, "num_tokens": 2268174598.0, "step": 971 }, { "epoch": 1.1361908053923262, "grad_norm": 0.5800634854128125, "learning_rate": 6.723314696153724e-05, "loss": 0.6109, "num_tokens": 2270509342.0, "step": 972 }, { "epoch": 1.1365364673349465, "grad_norm": 0.6258730712234569, "learning_rate": 6.719776758268666e-05, "loss": 0.6162, "num_tokens": 2272825083.0, "step": 973 }, { "epoch": 1.1368821292775666, "grad_norm": 0.5329175998226954, "learning_rate": 6.716234858714242e-05, "loss": 0.6289, "num_tokens": 2275091568.0, "step": 974 }, { "epoch": 1.1372277912201867, "grad_norm": 0.5231537995187835, "learning_rate": 6.71268900264971e-05, "loss": 0.6236, "num_tokens": 2277492532.0, "step": 975 }, { "epoch": 1.1375734531628068, "grad_norm": 0.6601276082266523, "learning_rate": 6.709139195240101e-05, "loss": 0.6296, "num_tokens": 2279786923.0, "step": 976 }, { "epoch": 1.1379191151054269, "grad_norm": 0.4937968105353521, "learning_rate": 6.70558544165619e-05, "loss": 0.6264, "num_tokens": 2282187464.0, "step": 977 }, { "epoch": 1.138264777048047, "grad_norm": 0.5146181314844022, "learning_rate": 6.702027747074512e-05, "loss": 0.6268, "num_tokens": 2284442191.0, "step": 978 }, { "epoch": 1.138610438990667, "grad_norm": 0.4663909067350701, "learning_rate": 6.698466116677332e-05, "loss": 0.636, "num_tokens": 2286759940.0, "step": 979 }, { "epoch": 1.1389561009332874, "grad_norm": 0.7581406371080717, "learning_rate": 6.694900555652656e-05, "loss": 0.6342, "num_tokens": 2289085381.0, "step": 980 }, { "epoch": 1.1393017628759075, "grad_norm": 0.41439161112940287, "learning_rate": 6.691331069194212e-05, "loss": 0.615, "num_tokens": 2291387458.0, "step": 981 }, { "epoch": 1.1396474248185275, "grad_norm": 0.5545072363697835, "learning_rate": 6.687757662501445e-05, "loss": 0.6356, "num_tokens": 2293716765.0, "step": 982 }, { "epoch": 1.1399930867611476, "grad_norm": 0.5554150497991966, "learning_rate": 6.684180340779512e-05, "loss": 0.6111, "num_tokens": 2296051852.0, "step": 983 }, { "epoch": 1.1403387487037677, "grad_norm": 0.5447425206600714, "learning_rate": 6.680599109239275e-05, "loss": 0.6297, "num_tokens": 2298393559.0, "step": 984 }, { "epoch": 1.1406844106463878, "grad_norm": 0.4510165923624746, "learning_rate": 6.677013973097283e-05, "loss": 0.6252, "num_tokens": 2300755039.0, "step": 985 }, { "epoch": 1.141030072589008, "grad_norm": 0.510964276685836, "learning_rate": 6.673424937575782e-05, "loss": 0.63, "num_tokens": 2303082949.0, "step": 986 }, { "epoch": 1.141375734531628, "grad_norm": 0.48323428192819035, "learning_rate": 6.669832007902694e-05, "loss": 0.6395, "num_tokens": 2305423791.0, "step": 987 }, { "epoch": 1.141721396474248, "grad_norm": 0.6048317819158896, "learning_rate": 6.666235189311613e-05, "loss": 0.6237, "num_tokens": 2307679909.0, "step": 988 }, { "epoch": 1.1420670584168684, "grad_norm": 0.5291077952484509, "learning_rate": 6.6626344870418e-05, "loss": 0.6072, "num_tokens": 2309957448.0, "step": 989 }, { "epoch": 1.1424127203594885, "grad_norm": 0.6168406657051422, "learning_rate": 6.65902990633817e-05, "loss": 0.6155, "num_tokens": 2312331918.0, "step": 990 }, { "epoch": 1.1427583823021086, "grad_norm": 0.6559005577027325, "learning_rate": 6.655421452451286e-05, "loss": 0.6235, "num_tokens": 2314637599.0, "step": 991 }, { "epoch": 1.1431040442447287, "grad_norm": 0.628192778755691, "learning_rate": 6.65180913063736e-05, "loss": 0.6007, "num_tokens": 2316874956.0, "step": 992 }, { "epoch": 1.1434497061873488, "grad_norm": 0.5658899861342798, "learning_rate": 6.64819294615823e-05, "loss": 0.6197, "num_tokens": 2319333788.0, "step": 993 }, { "epoch": 1.1437953681299688, "grad_norm": 0.5068984419164501, "learning_rate": 6.64457290428137e-05, "loss": 0.6281, "num_tokens": 2321717198.0, "step": 994 }, { "epoch": 1.144141030072589, "grad_norm": 0.3814550398423423, "learning_rate": 6.640949010279862e-05, "loss": 0.6207, "num_tokens": 2324035973.0, "step": 995 }, { "epoch": 1.144486692015209, "grad_norm": 0.5236360790300648, "learning_rate": 6.637321269432405e-05, "loss": 0.6321, "num_tokens": 2326389131.0, "step": 996 }, { "epoch": 1.1448323539578293, "grad_norm": 0.4836360462587859, "learning_rate": 6.633689687023302e-05, "loss": 0.5903, "num_tokens": 2328682547.0, "step": 997 }, { "epoch": 1.1451780159004494, "grad_norm": 0.5848324586628182, "learning_rate": 6.630054268342452e-05, "loss": 0.6409, "num_tokens": 2331110855.0, "step": 998 }, { "epoch": 1.1455236778430695, "grad_norm": 0.4099450482794086, "learning_rate": 6.626415018685338e-05, "loss": 0.6192, "num_tokens": 2333482982.0, "step": 999 }, { "epoch": 1.1458693397856896, "grad_norm": 0.7408602110844928, "learning_rate": 6.622771943353026e-05, "loss": 0.6179, "num_tokens": 2335807519.0, "step": 1000 }, { "epoch": 1.1462150017283097, "grad_norm": 0.6012612937482089, "learning_rate": 6.619125047652157e-05, "loss": 0.619, "num_tokens": 2338209285.0, "step": 1001 }, { "epoch": 1.1465606636709298, "grad_norm": 0.6493454542590236, "learning_rate": 6.615474336894931e-05, "loss": 0.6402, "num_tokens": 2340538805.0, "step": 1002 }, { "epoch": 1.1469063256135499, "grad_norm": 0.6267501046711663, "learning_rate": 6.611819816399114e-05, "loss": 0.6433, "num_tokens": 2342758184.0, "step": 1003 }, { "epoch": 1.14725198755617, "grad_norm": 0.5923752473356027, "learning_rate": 6.608161491488008e-05, "loss": 0.6275, "num_tokens": 2345117717.0, "step": 1004 }, { "epoch": 1.14759764949879, "grad_norm": 0.5284346039261694, "learning_rate": 6.604499367490472e-05, "loss": 0.625, "num_tokens": 2347389652.0, "step": 1005 }, { "epoch": 1.1479433114414104, "grad_norm": 0.44923193031135483, "learning_rate": 6.600833449740888e-05, "loss": 0.6199, "num_tokens": 2349733790.0, "step": 1006 }, { "epoch": 1.1482889733840305, "grad_norm": 0.6672607541090939, "learning_rate": 6.597163743579169e-05, "loss": 0.6184, "num_tokens": 2352150095.0, "step": 1007 }, { "epoch": 1.1486346353266506, "grad_norm": 0.5203279901567358, "learning_rate": 6.593490254350743e-05, "loss": 0.6282, "num_tokens": 2354520199.0, "step": 1008 }, { "epoch": 1.1489802972692706, "grad_norm": 0.5390672046219768, "learning_rate": 6.589812987406553e-05, "loss": 0.6255, "num_tokens": 2356858374.0, "step": 1009 }, { "epoch": 1.1493259592118907, "grad_norm": 0.5219983962268486, "learning_rate": 6.58613194810304e-05, "loss": 0.6376, "num_tokens": 2359145604.0, "step": 1010 }, { "epoch": 1.1496716211545108, "grad_norm": 0.5189856868184768, "learning_rate": 6.582447141802145e-05, "loss": 0.607, "num_tokens": 2361475559.0, "step": 1011 }, { "epoch": 1.150017283097131, "grad_norm": 0.46311222356315457, "learning_rate": 6.578758573871292e-05, "loss": 0.6239, "num_tokens": 2363737560.0, "step": 1012 }, { "epoch": 1.1503629450397512, "grad_norm": 0.7127067671023645, "learning_rate": 6.575066249683384e-05, "loss": 0.6283, "num_tokens": 2366088917.0, "step": 1013 }, { "epoch": 1.1507086069823713, "grad_norm": 0.504798450260592, "learning_rate": 6.5713701746168e-05, "loss": 0.6385, "num_tokens": 2368421839.0, "step": 1014 }, { "epoch": 1.1510542689249914, "grad_norm": 0.6885582654012001, "learning_rate": 6.567670354055379e-05, "loss": 0.6367, "num_tokens": 2370794871.0, "step": 1015 }, { "epoch": 1.1513999308676115, "grad_norm": 0.6123015826183623, "learning_rate": 6.563966793388416e-05, "loss": 0.6187, "num_tokens": 2373094409.0, "step": 1016 }, { "epoch": 1.1517455928102316, "grad_norm": 0.5780681901246453, "learning_rate": 6.560259498010656e-05, "loss": 0.6117, "num_tokens": 2375491140.0, "step": 1017 }, { "epoch": 1.1520912547528517, "grad_norm": 0.5259881800886433, "learning_rate": 6.556548473322283e-05, "loss": 0.6335, "num_tokens": 2377915034.0, "step": 1018 }, { "epoch": 1.1524369166954718, "grad_norm": 0.49751243175953724, "learning_rate": 6.552833724728911e-05, "loss": 0.628, "num_tokens": 2380242355.0, "step": 1019 }, { "epoch": 1.1527825786380919, "grad_norm": 0.5436034825045782, "learning_rate": 6.549115257641583e-05, "loss": 0.6223, "num_tokens": 2382624028.0, "step": 1020 }, { "epoch": 1.153128240580712, "grad_norm": 0.44158626232659887, "learning_rate": 6.545393077476755e-05, "loss": 0.6077, "num_tokens": 2384913662.0, "step": 1021 }, { "epoch": 1.1534739025233323, "grad_norm": 0.5844912134081436, "learning_rate": 6.541667189656292e-05, "loss": 0.6401, "num_tokens": 2387252071.0, "step": 1022 }, { "epoch": 1.1538195644659524, "grad_norm": 0.48878099388322455, "learning_rate": 6.537937599607459e-05, "loss": 0.6298, "num_tokens": 2389537580.0, "step": 1023 }, { "epoch": 1.1541652264085724, "grad_norm": 0.6726480816242691, "learning_rate": 6.53420431276292e-05, "loss": 0.6321, "num_tokens": 2391849525.0, "step": 1024 }, { "epoch": 1.1545108883511925, "grad_norm": 0.4667586163082233, "learning_rate": 6.530467334560713e-05, "loss": 0.609, "num_tokens": 2394244666.0, "step": 1025 }, { "epoch": 1.1548565502938126, "grad_norm": 0.5946137675797538, "learning_rate": 6.526726670444264e-05, "loss": 0.6165, "num_tokens": 2396611863.0, "step": 1026 }, { "epoch": 1.1552022122364327, "grad_norm": 0.5617144788546772, "learning_rate": 6.52298232586236e-05, "loss": 0.6235, "num_tokens": 2398986714.0, "step": 1027 }, { "epoch": 1.1555478741790528, "grad_norm": 0.5669655200572348, "learning_rate": 6.519234306269153e-05, "loss": 0.6486, "num_tokens": 2401283913.0, "step": 1028 }, { "epoch": 1.1558935361216731, "grad_norm": 0.6117264014054326, "learning_rate": 6.515482617124147e-05, "loss": 0.6274, "num_tokens": 2403583605.0, "step": 1029 }, { "epoch": 1.1562391980642932, "grad_norm": 0.5556547485056625, "learning_rate": 6.511727263892192e-05, "loss": 0.6212, "num_tokens": 2405801945.0, "step": 1030 }, { "epoch": 1.1565848600069133, "grad_norm": 0.5911936235918205, "learning_rate": 6.507968252043473e-05, "loss": 0.6079, "num_tokens": 2408109634.0, "step": 1031 }, { "epoch": 1.1569305219495334, "grad_norm": 0.4503003052254557, "learning_rate": 6.504205587053508e-05, "loss": 0.6379, "num_tokens": 2410543159.0, "step": 1032 }, { "epoch": 1.1572761838921535, "grad_norm": 0.5267984089254603, "learning_rate": 6.500439274403134e-05, "loss": 0.6103, "num_tokens": 2412863462.0, "step": 1033 }, { "epoch": 1.1576218458347736, "grad_norm": 0.46879755191818, "learning_rate": 6.496669319578502e-05, "loss": 0.6254, "num_tokens": 2415203804.0, "step": 1034 }, { "epoch": 1.1579675077773937, "grad_norm": 0.6381091314475303, "learning_rate": 6.492895728071065e-05, "loss": 0.637, "num_tokens": 2417610667.0, "step": 1035 }, { "epoch": 1.1583131697200137, "grad_norm": 0.6094039963064175, "learning_rate": 6.48911850537758e-05, "loss": 0.6113, "num_tokens": 2419930595.0, "step": 1036 }, { "epoch": 1.1586588316626338, "grad_norm": 0.42868201291590824, "learning_rate": 6.485337657000086e-05, "loss": 0.6174, "num_tokens": 2422255862.0, "step": 1037 }, { "epoch": 1.1590044936052541, "grad_norm": 0.3945134222441782, "learning_rate": 6.481553188445912e-05, "loss": 0.6148, "num_tokens": 2424548752.0, "step": 1038 }, { "epoch": 1.1593501555478742, "grad_norm": 0.6144174418937509, "learning_rate": 6.47776510522765e-05, "loss": 0.6245, "num_tokens": 2426950590.0, "step": 1039 }, { "epoch": 1.1596958174904943, "grad_norm": 0.5967950663607829, "learning_rate": 6.473973412863164e-05, "loss": 0.6132, "num_tokens": 2429150730.0, "step": 1040 }, { "epoch": 1.1600414794331144, "grad_norm": 0.6114240026615225, "learning_rate": 6.470178116875574e-05, "loss": 0.6396, "num_tokens": 2431460825.0, "step": 1041 }, { "epoch": 1.1603871413757345, "grad_norm": 0.5204705855257178, "learning_rate": 6.466379222793251e-05, "loss": 0.6627, "num_tokens": 2433790240.0, "step": 1042 }, { "epoch": 1.1607328033183546, "grad_norm": 0.8581440433088964, "learning_rate": 6.4625767361498e-05, "loss": 0.6333, "num_tokens": 2436131665.0, "step": 1043 }, { "epoch": 1.1610784652609747, "grad_norm": 0.4973254034934891, "learning_rate": 6.458770662484068e-05, "loss": 0.6384, "num_tokens": 2438500883.0, "step": 1044 }, { "epoch": 1.161424127203595, "grad_norm": 1.0637891925815905, "learning_rate": 6.454961007340122e-05, "loss": 0.6244, "num_tokens": 2440759518.0, "step": 1045 }, { "epoch": 1.161769789146215, "grad_norm": 0.7984849801833085, "learning_rate": 6.451147776267246e-05, "loss": 0.6045, "num_tokens": 2443150579.0, "step": 1046 }, { "epoch": 1.1621154510888352, "grad_norm": 0.9159144404348797, "learning_rate": 6.447330974819937e-05, "loss": 0.6435, "num_tokens": 2445569012.0, "step": 1047 }, { "epoch": 1.1624611130314553, "grad_norm": 1.1611006690908596, "learning_rate": 6.443510608557883e-05, "loss": 0.5997, "num_tokens": 2447770933.0, "step": 1048 }, { "epoch": 1.1628067749740754, "grad_norm": 0.523598039649553, "learning_rate": 6.439686683045977e-05, "loss": 0.6353, "num_tokens": 2450130773.0, "step": 1049 }, { "epoch": 1.1631524369166955, "grad_norm": 1.2567326385582667, "learning_rate": 6.435859203854287e-05, "loss": 0.6071, "num_tokens": 2452577107.0, "step": 1050 }, { "epoch": 1.1634980988593155, "grad_norm": 1.241276400190654, "learning_rate": 6.432028176558064e-05, "loss": 0.6297, "num_tokens": 2454916386.0, "step": 1051 }, { "epoch": 1.1638437608019356, "grad_norm": 0.7572514942284563, "learning_rate": 6.428193606737723e-05, "loss": 0.6288, "num_tokens": 2457210898.0, "step": 1052 }, { "epoch": 1.1641894227445557, "grad_norm": 1.107044968013011, "learning_rate": 6.42435549997884e-05, "loss": 0.6209, "num_tokens": 2459524849.0, "step": 1053 }, { "epoch": 1.164535084687176, "grad_norm": 1.073624324225753, "learning_rate": 6.420513861872144e-05, "loss": 0.6202, "num_tokens": 2461849408.0, "step": 1054 }, { "epoch": 1.1648807466297961, "grad_norm": 0.9268276420827404, "learning_rate": 6.416668698013507e-05, "loss": 0.6195, "num_tokens": 2464143619.0, "step": 1055 }, { "epoch": 1.1652264085724162, "grad_norm": 0.9676910918328933, "learning_rate": 6.412820014003938e-05, "loss": 0.6253, "num_tokens": 2466523210.0, "step": 1056 }, { "epoch": 1.1655720705150363, "grad_norm": 0.8123373044427525, "learning_rate": 6.408967815449572e-05, "loss": 0.6303, "num_tokens": 2468766487.0, "step": 1057 }, { "epoch": 1.1659177324576564, "grad_norm": 0.9045271199461954, "learning_rate": 6.405112107961664e-05, "loss": 0.5998, "num_tokens": 2470997960.0, "step": 1058 }, { "epoch": 1.1662633944002765, "grad_norm": 0.9580608118315684, "learning_rate": 6.401252897156583e-05, "loss": 0.6196, "num_tokens": 2473234959.0, "step": 1059 }, { "epoch": 1.1666090563428966, "grad_norm": 0.6787626987069121, "learning_rate": 6.397390188655797e-05, "loss": 0.6342, "num_tokens": 2475643048.0, "step": 1060 }, { "epoch": 1.1669547182855167, "grad_norm": 0.8399074698635285, "learning_rate": 6.393523988085868e-05, "loss": 0.6241, "num_tokens": 2477957328.0, "step": 1061 }, { "epoch": 1.167300380228137, "grad_norm": 0.7876972855558251, "learning_rate": 6.38965430107845e-05, "loss": 0.6017, "num_tokens": 2480139155.0, "step": 1062 }, { "epoch": 1.167646042170757, "grad_norm": 0.5783847796837571, "learning_rate": 6.38578113327027e-05, "loss": 0.6223, "num_tokens": 2482517823.0, "step": 1063 }, { "epoch": 1.1679917041133772, "grad_norm": 0.6759839194351326, "learning_rate": 6.381904490303132e-05, "loss": 0.606, "num_tokens": 2484856582.0, "step": 1064 }, { "epoch": 1.1683373660559973, "grad_norm": 0.6068465246249596, "learning_rate": 6.378024377823893e-05, "loss": 0.6027, "num_tokens": 2487065717.0, "step": 1065 }, { "epoch": 1.1686830279986173, "grad_norm": 0.6928264910232758, "learning_rate": 6.374140801484471e-05, "loss": 0.6069, "num_tokens": 2489416100.0, "step": 1066 }, { "epoch": 1.1690286899412374, "grad_norm": 0.599808957540365, "learning_rate": 6.370253766941829e-05, "loss": 0.6342, "num_tokens": 2491864200.0, "step": 1067 }, { "epoch": 1.1693743518838575, "grad_norm": 0.6966398788529901, "learning_rate": 6.36636327985796e-05, "loss": 0.605, "num_tokens": 2494119253.0, "step": 1068 }, { "epoch": 1.1697200138264776, "grad_norm": 0.6340293147435363, "learning_rate": 6.362469345899899e-05, "loss": 0.6418, "num_tokens": 2496498849.0, "step": 1069 }, { "epoch": 1.1700656757690977, "grad_norm": 0.7845818371431056, "learning_rate": 6.358571970739688e-05, "loss": 0.6382, "num_tokens": 2498787531.0, "step": 1070 }, { "epoch": 1.170411337711718, "grad_norm": 0.6280136625834155, "learning_rate": 6.354671160054393e-05, "loss": 0.6329, "num_tokens": 2501061697.0, "step": 1071 }, { "epoch": 1.170756999654338, "grad_norm": 0.528329470195529, "learning_rate": 6.350766919526076e-05, "loss": 0.6004, "num_tokens": 2503349583.0, "step": 1072 }, { "epoch": 1.1711026615969582, "grad_norm": 0.5635540560003458, "learning_rate": 6.3468592548418e-05, "loss": 0.6229, "num_tokens": 2505690465.0, "step": 1073 }, { "epoch": 1.1714483235395783, "grad_norm": 0.42868148598577804, "learning_rate": 6.342948171693615e-05, "loss": 0.6167, "num_tokens": 2508082736.0, "step": 1074 }, { "epoch": 1.1717939854821984, "grad_norm": 0.6741695282043885, "learning_rate": 6.339033675778548e-05, "loss": 0.6138, "num_tokens": 2510524048.0, "step": 1075 }, { "epoch": 1.1721396474248185, "grad_norm": 0.538082711701654, "learning_rate": 6.3351157727986e-05, "loss": 0.6154, "num_tokens": 2512806934.0, "step": 1076 }, { "epoch": 1.1724853093674386, "grad_norm": 0.3757701154562901, "learning_rate": 6.331194468460732e-05, "loss": 0.6008, "num_tokens": 2515082769.0, "step": 1077 }, { "epoch": 1.1728309713100589, "grad_norm": 0.48310444096695987, "learning_rate": 6.327269768476863e-05, "loss": 0.6067, "num_tokens": 2517336697.0, "step": 1078 }, { "epoch": 1.173176633252679, "grad_norm": 0.5872091587422075, "learning_rate": 6.323341678563855e-05, "loss": 0.6199, "num_tokens": 2519562060.0, "step": 1079 }, { "epoch": 1.173522295195299, "grad_norm": 0.516393769174304, "learning_rate": 6.319410204443512e-05, "loss": 0.6068, "num_tokens": 2521926447.0, "step": 1080 }, { "epoch": 1.1738679571379191, "grad_norm": 0.5208911786116173, "learning_rate": 6.315475351842563e-05, "loss": 0.6398, "num_tokens": 2524290834.0, "step": 1081 }, { "epoch": 1.1742136190805392, "grad_norm": 0.47934323416880825, "learning_rate": 6.311537126492659e-05, "loss": 0.6139, "num_tokens": 2526535304.0, "step": 1082 }, { "epoch": 1.1745592810231593, "grad_norm": 0.4464631066895134, "learning_rate": 6.307595534130368e-05, "loss": 0.6302, "num_tokens": 2528840846.0, "step": 1083 }, { "epoch": 1.1749049429657794, "grad_norm": 0.44714748063814214, "learning_rate": 6.303650580497158e-05, "loss": 0.6127, "num_tokens": 2531170791.0, "step": 1084 }, { "epoch": 1.1752506049083995, "grad_norm": 0.5989513388106248, "learning_rate": 6.299702271339393e-05, "loss": 0.6333, "num_tokens": 2533500511.0, "step": 1085 }, { "epoch": 1.1755962668510196, "grad_norm": 0.43365235289137755, "learning_rate": 6.29575061240833e-05, "loss": 0.5995, "num_tokens": 2535842273.0, "step": 1086 }, { "epoch": 1.17594192879364, "grad_norm": 0.5173045270615718, "learning_rate": 6.291795609460097e-05, "loss": 0.6075, "num_tokens": 2538107845.0, "step": 1087 }, { "epoch": 1.17628759073626, "grad_norm": 0.5395280912616487, "learning_rate": 6.287837268255705e-05, "loss": 0.6298, "num_tokens": 2540396036.0, "step": 1088 }, { "epoch": 1.17663325267888, "grad_norm": 0.6559481496907851, "learning_rate": 6.283875594561013e-05, "loss": 0.6132, "num_tokens": 2542697113.0, "step": 1089 }, { "epoch": 1.1769789146215002, "grad_norm": 0.48535320926206027, "learning_rate": 6.279910594146746e-05, "loss": 0.6348, "num_tokens": 2545173039.0, "step": 1090 }, { "epoch": 1.1773245765641203, "grad_norm": 0.679245336559069, "learning_rate": 6.275942272788469e-05, "loss": 0.6106, "num_tokens": 2547454625.0, "step": 1091 }, { "epoch": 1.1776702385067404, "grad_norm": 0.6108275365498225, "learning_rate": 6.271970636266588e-05, "loss": 0.614, "num_tokens": 2549887341.0, "step": 1092 }, { "epoch": 1.1780159004493604, "grad_norm": 0.4108169662251059, "learning_rate": 6.267995690366334e-05, "loss": 0.6208, "num_tokens": 2552266464.0, "step": 1093 }, { "epoch": 1.1783615623919808, "grad_norm": 0.7010143528190987, "learning_rate": 6.26401744087776e-05, "loss": 0.6012, "num_tokens": 2554479498.0, "step": 1094 }, { "epoch": 1.1787072243346008, "grad_norm": 0.4813767164582428, "learning_rate": 6.260035893595734e-05, "loss": 0.6253, "num_tokens": 2556820838.0, "step": 1095 }, { "epoch": 1.179052886277221, "grad_norm": 0.5620214362860045, "learning_rate": 6.256051054319924e-05, "loss": 0.6073, "num_tokens": 2559207422.0, "step": 1096 }, { "epoch": 1.179398548219841, "grad_norm": 0.6506138277564101, "learning_rate": 6.252062928854794e-05, "loss": 0.6192, "num_tokens": 2561606064.0, "step": 1097 }, { "epoch": 1.1797442101624611, "grad_norm": 0.3888366779250104, "learning_rate": 6.248071523009596e-05, "loss": 0.6248, "num_tokens": 2563905410.0, "step": 1098 }, { "epoch": 1.1800898721050812, "grad_norm": 0.8996762071189687, "learning_rate": 6.24407684259836e-05, "loss": 0.6208, "num_tokens": 2566285936.0, "step": 1099 }, { "epoch": 1.1804355340477013, "grad_norm": 0.5789649137721405, "learning_rate": 6.240078893439886e-05, "loss": 0.642, "num_tokens": 2568577592.0, "step": 1100 }, { "epoch": 1.1807811959903214, "grad_norm": 0.9378808154139731, "learning_rate": 6.236077681357731e-05, "loss": 0.6387, "num_tokens": 2571025312.0, "step": 1101 }, { "epoch": 1.1811268579329415, "grad_norm": 0.5188144845407305, "learning_rate": 6.232073212180217e-05, "loss": 0.6378, "num_tokens": 2573278383.0, "step": 1102 }, { "epoch": 1.1814725198755618, "grad_norm": 1.129598569577214, "learning_rate": 6.228065491740394e-05, "loss": 0.6332, "num_tokens": 2575549210.0, "step": 1103 }, { "epoch": 1.1818181818181819, "grad_norm": 0.9901018669772305, "learning_rate": 6.224054525876059e-05, "loss": 0.6335, "num_tokens": 2577939562.0, "step": 1104 }, { "epoch": 1.182163843760802, "grad_norm": 0.9756545906779439, "learning_rate": 6.220040320429736e-05, "loss": 0.6273, "num_tokens": 2580261914.0, "step": 1105 }, { "epoch": 1.182509505703422, "grad_norm": 1.1311448720134305, "learning_rate": 6.216022881248663e-05, "loss": 0.6454, "num_tokens": 2582648605.0, "step": 1106 }, { "epoch": 1.1828551676460421, "grad_norm": 0.6024389344222203, "learning_rate": 6.212002214184789e-05, "loss": 0.6277, "num_tokens": 2584948355.0, "step": 1107 }, { "epoch": 1.1832008295886622, "grad_norm": 1.05897859012828, "learning_rate": 6.207978325094772e-05, "loss": 0.6342, "num_tokens": 2587370747.0, "step": 1108 }, { "epoch": 1.1835464915312823, "grad_norm": 0.7615208980732178, "learning_rate": 6.203951219839953e-05, "loss": 0.6093, "num_tokens": 2589728383.0, "step": 1109 }, { "epoch": 1.1838921534739026, "grad_norm": 0.8614307141570999, "learning_rate": 6.199920904286365e-05, "loss": 0.6242, "num_tokens": 2592078549.0, "step": 1110 }, { "epoch": 1.1842378154165227, "grad_norm": 0.8596040846959309, "learning_rate": 6.195887384304714e-05, "loss": 0.6268, "num_tokens": 2594455129.0, "step": 1111 }, { "epoch": 1.1845834773591428, "grad_norm": 0.6812845432497595, "learning_rate": 6.191850665770375e-05, "loss": 0.629, "num_tokens": 2596941659.0, "step": 1112 }, { "epoch": 1.184929139301763, "grad_norm": 0.6225659113904932, "learning_rate": 6.187810754563385e-05, "loss": 0.6149, "num_tokens": 2599349438.0, "step": 1113 }, { "epoch": 1.185274801244383, "grad_norm": 0.5913140994331072, "learning_rate": 6.183767656568421e-05, "loss": 0.6196, "num_tokens": 2601802153.0, "step": 1114 }, { "epoch": 1.185620463187003, "grad_norm": 0.5346630677799714, "learning_rate": 6.179721377674817e-05, "loss": 0.6163, "num_tokens": 2604202180.0, "step": 1115 }, { "epoch": 1.1859661251296232, "grad_norm": 0.43402760999811735, "learning_rate": 6.175671923776529e-05, "loss": 0.6158, "num_tokens": 2606489187.0, "step": 1116 }, { "epoch": 1.1863117870722433, "grad_norm": 0.630188092739292, "learning_rate": 6.171619300772144e-05, "loss": 0.6131, "num_tokens": 2608781942.0, "step": 1117 }, { "epoch": 1.1866574490148634, "grad_norm": 0.3777676008270153, "learning_rate": 6.167563514564858e-05, "loss": 0.6173, "num_tokens": 2611062402.0, "step": 1118 }, { "epoch": 1.1870031109574837, "grad_norm": 0.7305396771794315, "learning_rate": 6.163504571062486e-05, "loss": 0.639, "num_tokens": 2613423153.0, "step": 1119 }, { "epoch": 1.1873487729001038, "grad_norm": 0.5051106839106165, "learning_rate": 6.15944247617743e-05, "loss": 0.6186, "num_tokens": 2615823720.0, "step": 1120 }, { "epoch": 1.1876944348427239, "grad_norm": 0.6964356076027165, "learning_rate": 6.155377235826693e-05, "loss": 0.6229, "num_tokens": 2618195407.0, "step": 1121 }, { "epoch": 1.188040096785344, "grad_norm": 0.5790787138190756, "learning_rate": 6.15130885593185e-05, "loss": 0.6063, "num_tokens": 2620449116.0, "step": 1122 }, { "epoch": 1.188385758727964, "grad_norm": 0.5126447255480284, "learning_rate": 6.147237342419056e-05, "loss": 0.626, "num_tokens": 2622699594.0, "step": 1123 }, { "epoch": 1.1887314206705841, "grad_norm": 0.7871564434471875, "learning_rate": 6.143162701219029e-05, "loss": 0.6264, "num_tokens": 2625092938.0, "step": 1124 }, { "epoch": 1.1890770826132042, "grad_norm": 0.5209536353255501, "learning_rate": 6.139084938267043e-05, "loss": 0.6245, "num_tokens": 2627432677.0, "step": 1125 }, { "epoch": 1.1894227445558243, "grad_norm": 1.062400785366239, "learning_rate": 6.135004059502917e-05, "loss": 0.6469, "num_tokens": 2629823448.0, "step": 1126 }, { "epoch": 1.1897684064984446, "grad_norm": 1.048597547035341, "learning_rate": 6.13092007087101e-05, "loss": 0.6512, "num_tokens": 2632276469.0, "step": 1127 }, { "epoch": 1.1901140684410647, "grad_norm": 0.6534021430570458, "learning_rate": 6.126832978320211e-05, "loss": 0.6141, "num_tokens": 2634686091.0, "step": 1128 }, { "epoch": 1.1904597303836848, "grad_norm": 0.7216571533557882, "learning_rate": 6.122742787803933e-05, "loss": 0.6037, "num_tokens": 2636910080.0, "step": 1129 }, { "epoch": 1.190805392326305, "grad_norm": 0.7289522656813919, "learning_rate": 6.118649505280098e-05, "loss": 0.636, "num_tokens": 2639278992.0, "step": 1130 }, { "epoch": 1.191151054268925, "grad_norm": 0.4122211881725999, "learning_rate": 6.114553136711132e-05, "loss": 0.6114, "num_tokens": 2641654613.0, "step": 1131 }, { "epoch": 1.191496716211545, "grad_norm": 0.8526245898721758, "learning_rate": 6.110453688063959e-05, "loss": 0.6248, "num_tokens": 2643911810.0, "step": 1132 }, { "epoch": 1.1918423781541652, "grad_norm": 0.6359581949957432, "learning_rate": 6.106351165309986e-05, "loss": 0.6182, "num_tokens": 2646095989.0, "step": 1133 }, { "epoch": 1.1921880400967853, "grad_norm": 0.8869903807214052, "learning_rate": 6.1022455744251006e-05, "loss": 0.6263, "num_tokens": 2648506830.0, "step": 1134 }, { "epoch": 1.1925337020394053, "grad_norm": 0.6551349551445427, "learning_rate": 6.0981369213896575e-05, "loss": 0.6428, "num_tokens": 2650875082.0, "step": 1135 }, { "epoch": 1.1928793639820257, "grad_norm": 0.9109586939150391, "learning_rate": 6.094025212188475e-05, "loss": 0.6082, "num_tokens": 2653130894.0, "step": 1136 }, { "epoch": 1.1932250259246457, "grad_norm": 0.6835566588970783, "learning_rate": 6.089910452810821e-05, "loss": 0.6474, "num_tokens": 2655388282.0, "step": 1137 }, { "epoch": 1.1935706878672658, "grad_norm": 0.9333519384329954, "learning_rate": 6.0857926492504065e-05, "loss": 0.6302, "num_tokens": 2657737132.0, "step": 1138 }, { "epoch": 1.193916349809886, "grad_norm": 0.788240280625609, "learning_rate": 6.081671807505373e-05, "loss": 0.6455, "num_tokens": 2660115732.0, "step": 1139 }, { "epoch": 1.194262011752506, "grad_norm": 0.8040181859841573, "learning_rate": 6.077547933578297e-05, "loss": 0.6077, "num_tokens": 2662456848.0, "step": 1140 }, { "epoch": 1.194607673695126, "grad_norm": 0.7549739870035803, "learning_rate": 6.0734210334761616e-05, "loss": 0.6221, "num_tokens": 2664682785.0, "step": 1141 }, { "epoch": 1.1949533356377462, "grad_norm": 0.6216844832284941, "learning_rate": 6.069291113210366e-05, "loss": 0.6025, "num_tokens": 2667009115.0, "step": 1142 }, { "epoch": 1.1952989975803665, "grad_norm": 0.5381473921686684, "learning_rate": 6.065158178796701e-05, "loss": 0.6262, "num_tokens": 2669356709.0, "step": 1143 }, { "epoch": 1.1956446595229866, "grad_norm": 0.7727860444247312, "learning_rate": 6.061022236255356e-05, "loss": 0.6156, "num_tokens": 2671661329.0, "step": 1144 }, { "epoch": 1.1959903214656067, "grad_norm": 0.5719313181027021, "learning_rate": 6.056883291610897e-05, "loss": 0.6211, "num_tokens": 2673931917.0, "step": 1145 }, { "epoch": 1.1963359834082268, "grad_norm": 0.9713052484474615, "learning_rate": 6.052741350892264e-05, "loss": 0.619, "num_tokens": 2676266149.0, "step": 1146 }, { "epoch": 1.1966816453508469, "grad_norm": 0.9268812950739193, "learning_rate": 6.048596420132759e-05, "loss": 0.6137, "num_tokens": 2678666910.0, "step": 1147 }, { "epoch": 1.197027307293467, "grad_norm": 0.6141942310831742, "learning_rate": 6.0444485053700465e-05, "loss": 0.6054, "num_tokens": 2680872223.0, "step": 1148 }, { "epoch": 1.197372969236087, "grad_norm": 0.7691361745603013, "learning_rate": 6.04029761264613e-05, "loss": 0.6422, "num_tokens": 2683283622.0, "step": 1149 }, { "epoch": 1.1977186311787071, "grad_norm": 0.5370561173015453, "learning_rate": 6.036143748007354e-05, "loss": 0.612, "num_tokens": 2685693129.0, "step": 1150 }, { "epoch": 1.1980642931213272, "grad_norm": 0.46517937709460283, "learning_rate": 6.03198691750439e-05, "loss": 0.6258, "num_tokens": 2687933966.0, "step": 1151 }, { "epoch": 1.1984099550639475, "grad_norm": 0.6352010947591583, "learning_rate": 6.027827127192235e-05, "loss": 0.641, "num_tokens": 2690272508.0, "step": 1152 }, { "epoch": 1.1987556170065676, "grad_norm": 0.3622663900883658, "learning_rate": 6.02366438313019e-05, "loss": 0.6042, "num_tokens": 2692637091.0, "step": 1153 }, { "epoch": 1.1991012789491877, "grad_norm": 0.8088722125784332, "learning_rate": 6.019498691381868e-05, "loss": 0.6106, "num_tokens": 2694981412.0, "step": 1154 }, { "epoch": 1.1994469408918078, "grad_norm": 0.6217728090568126, "learning_rate": 6.0153300580151614e-05, "loss": 0.6392, "num_tokens": 2697404110.0, "step": 1155 }, { "epoch": 1.199792602834428, "grad_norm": 0.948503068732415, "learning_rate": 6.011158489102264e-05, "loss": 0.6301, "num_tokens": 2699737033.0, "step": 1156 }, { "epoch": 2.00034566194262, "grad_norm": 1.6983983184643514, "learning_rate": 6.006983990719634e-05, "loss": 1.2661, "num_tokens": 2703542470.0, "step": 1157 }, { "epoch": 2.00069132388524, "grad_norm": 0.7442873843666701, "learning_rate": 6.0028065689480014e-05, "loss": 0.6203, "num_tokens": 2705960297.0, "step": 1158 }, { "epoch": 2.0010369858278603, "grad_norm": 0.7246510291053152, "learning_rate": 5.9986262298723524e-05, "loss": 0.6221, "num_tokens": 2708254510.0, "step": 1159 }, { "epoch": 2.0013826477704804, "grad_norm": 0.7731935930078484, "learning_rate": 5.994442979581924e-05, "loss": 0.6179, "num_tokens": 2710587145.0, "step": 1160 }, { "epoch": 2.0017283097131005, "grad_norm": 0.6661950010458624, "learning_rate": 5.990256824170196e-05, "loss": 0.634, "num_tokens": 2713023656.0, "step": 1161 }, { "epoch": 2.0020739716557205, "grad_norm": 0.7262493115905476, "learning_rate": 5.986067769734873e-05, "loss": 0.6133, "num_tokens": 2715262150.0, "step": 1162 }, { "epoch": 2.0024196335983406, "grad_norm": 0.6876518363658152, "learning_rate": 5.981875822377893e-05, "loss": 0.6325, "num_tokens": 2717621390.0, "step": 1163 }, { "epoch": 2.0027652955409607, "grad_norm": 0.7469791631147007, "learning_rate": 5.977680988205396e-05, "loss": 0.6241, "num_tokens": 2719919847.0, "step": 1164 }, { "epoch": 2.0031109574835813, "grad_norm": 0.7209876680831472, "learning_rate": 5.973483273327737e-05, "loss": 0.6276, "num_tokens": 2722153155.0, "step": 1165 }, { "epoch": 2.0034566194262013, "grad_norm": 0.6539876570046835, "learning_rate": 5.969282683859461e-05, "loss": 0.6026, "num_tokens": 2724415200.0, "step": 1166 }, { "epoch": 2.0038022813688214, "grad_norm": 0.6410589835786973, "learning_rate": 5.9650792259193044e-05, "loss": 0.6098, "num_tokens": 2726817372.0, "step": 1167 }, { "epoch": 2.0041479433114415, "grad_norm": 0.7593877771878029, "learning_rate": 5.960872905630177e-05, "loss": 0.6596, "num_tokens": 2729295162.0, "step": 1168 }, { "epoch": 2.0044936052540616, "grad_norm": 0.6666599626164708, "learning_rate": 5.9566637291191626e-05, "loss": 0.6072, "num_tokens": 2731497739.0, "step": 1169 }, { "epoch": 2.0048392671966817, "grad_norm": 0.6753379543754168, "learning_rate": 5.9524517025175034e-05, "loss": 0.6028, "num_tokens": 2733806000.0, "step": 1170 }, { "epoch": 2.005184929139302, "grad_norm": 0.6089275348410033, "learning_rate": 5.948236831960594e-05, "loss": 0.5982, "num_tokens": 2736124404.0, "step": 1171 }, { "epoch": 2.005530591081922, "grad_norm": 0.7992910351209958, "learning_rate": 5.9440191235879685e-05, "loss": 0.6141, "num_tokens": 2738545369.0, "step": 1172 }, { "epoch": 2.005876253024542, "grad_norm": 0.717465856861861, "learning_rate": 5.939798583543301e-05, "loss": 0.6297, "num_tokens": 2740941875.0, "step": 1173 }, { "epoch": 2.006221914967162, "grad_norm": 0.6993038527846662, "learning_rate": 5.935575217974383e-05, "loss": 0.6004, "num_tokens": 2743249517.0, "step": 1174 }, { "epoch": 2.006567576909782, "grad_norm": 0.6527277762520203, "learning_rate": 5.9313490330331296e-05, "loss": 0.6162, "num_tokens": 2745723971.0, "step": 1175 }, { "epoch": 2.0069132388524022, "grad_norm": 0.8193547453031245, "learning_rate": 5.9271200348755546e-05, "loss": 0.6039, "num_tokens": 2748023424.0, "step": 1176 }, { "epoch": 2.0072589007950223, "grad_norm": 0.6282131087809707, "learning_rate": 5.922888229661775e-05, "loss": 0.5974, "num_tokens": 2750262628.0, "step": 1177 }, { "epoch": 2.0076045627376424, "grad_norm": 0.6884009746594285, "learning_rate": 5.918653623555994e-05, "loss": 0.5996, "num_tokens": 2752620786.0, "step": 1178 }, { "epoch": 2.0079502246802625, "grad_norm": 0.5969485312674665, "learning_rate": 5.914416222726498e-05, "loss": 0.6153, "num_tokens": 2754938504.0, "step": 1179 }, { "epoch": 2.0082958866228826, "grad_norm": 0.8639766578152747, "learning_rate": 5.91017603334564e-05, "loss": 0.6221, "num_tokens": 2757255693.0, "step": 1180 }, { "epoch": 2.008641548565503, "grad_norm": 0.908447107151901, "learning_rate": 5.90593306158984e-05, "loss": 0.613, "num_tokens": 2759620378.0, "step": 1181 }, { "epoch": 2.0089872105081232, "grad_norm": 0.5440290004596824, "learning_rate": 5.901687313639563e-05, "loss": 0.6293, "num_tokens": 2761892250.0, "step": 1182 }, { "epoch": 2.0093328724507433, "grad_norm": 0.5866501794465553, "learning_rate": 5.8974387956793266e-05, "loss": 0.6264, "num_tokens": 2764251949.0, "step": 1183 }, { "epoch": 2.0096785343933634, "grad_norm": 0.6313320404507824, "learning_rate": 5.893187513897679e-05, "loss": 0.6075, "num_tokens": 2766587104.0, "step": 1184 }, { "epoch": 2.0100241963359835, "grad_norm": 0.46116049618639426, "learning_rate": 5.8889334744871936e-05, "loss": 0.6189, "num_tokens": 2768933002.0, "step": 1185 }, { "epoch": 2.0103698582786036, "grad_norm": 0.9071841441363677, "learning_rate": 5.884676683644463e-05, "loss": 0.6273, "num_tokens": 2771280788.0, "step": 1186 }, { "epoch": 2.0107155202212237, "grad_norm": 0.7035937211960908, "learning_rate": 5.880417147570086e-05, "loss": 0.6107, "num_tokens": 2773572746.0, "step": 1187 }, { "epoch": 2.0110611821638438, "grad_norm": 0.7991766387080663, "learning_rate": 5.876154872468661e-05, "loss": 0.6234, "num_tokens": 2775975921.0, "step": 1188 }, { "epoch": 2.011406844106464, "grad_norm": 0.807391846706665, "learning_rate": 5.8718898645487765e-05, "loss": 0.6161, "num_tokens": 2778337263.0, "step": 1189 }, { "epoch": 2.011752506049084, "grad_norm": 0.5562165017013784, "learning_rate": 5.867622130023e-05, "loss": 0.6199, "num_tokens": 2780668685.0, "step": 1190 }, { "epoch": 2.012098167991704, "grad_norm": 0.525918324832085, "learning_rate": 5.8633516751078715e-05, "loss": 0.6054, "num_tokens": 2782995071.0, "step": 1191 }, { "epoch": 2.012443829934324, "grad_norm": 0.742850834984646, "learning_rate": 5.8590785060238944e-05, "loss": 0.594, "num_tokens": 2785297846.0, "step": 1192 }, { "epoch": 2.0127894918769442, "grad_norm": 0.5818736602944322, "learning_rate": 5.8548026289955255e-05, "loss": 0.6226, "num_tokens": 2787606882.0, "step": 1193 }, { "epoch": 2.0131351538195643, "grad_norm": 0.8146841490819778, "learning_rate": 5.850524050251167e-05, "loss": 0.5705, "num_tokens": 2789917707.0, "step": 1194 }, { "epoch": 2.0134808157621844, "grad_norm": 0.7799587675986269, "learning_rate": 5.846242776023151e-05, "loss": 0.5862, "num_tokens": 2792208397.0, "step": 1195 }, { "epoch": 2.0138264777048045, "grad_norm": 0.49756545870411284, "learning_rate": 5.8419588125477454e-05, "loss": 0.5858, "num_tokens": 2794422122.0, "step": 1196 }, { "epoch": 2.014172139647425, "grad_norm": 0.588581034799447, "learning_rate": 5.837672166065128e-05, "loss": 0.6036, "num_tokens": 2796787647.0, "step": 1197 }, { "epoch": 2.014517801590045, "grad_norm": 0.5794257519385543, "learning_rate": 5.833382842819387e-05, "loss": 0.5926, "num_tokens": 2799008939.0, "step": 1198 }, { "epoch": 2.014863463532665, "grad_norm": 0.37542314287494183, "learning_rate": 5.8290908490585074e-05, "loss": 0.6004, "num_tokens": 2801343662.0, "step": 1199 }, { "epoch": 2.0152091254752853, "grad_norm": 0.9164441781041319, "learning_rate": 5.8247961910343695e-05, "loss": 0.5918, "num_tokens": 2803675383.0, "step": 1200 }, { "epoch": 2.0155547874179054, "grad_norm": 0.8543498168754647, "learning_rate": 5.820498875002731e-05, "loss": 0.6219, "num_tokens": 2806042191.0, "step": 1201 }, { "epoch": 2.0159004493605255, "grad_norm": 0.6893260214351075, "learning_rate": 5.8161989072232205e-05, "loss": 0.6061, "num_tokens": 2808320346.0, "step": 1202 }, { "epoch": 2.0162461113031456, "grad_norm": 0.7544457260809125, "learning_rate": 5.81189629395933e-05, "loss": 0.6172, "num_tokens": 2810811935.0, "step": 1203 }, { "epoch": 2.0165917732457657, "grad_norm": 0.5104916803984129, "learning_rate": 5.8075910414784084e-05, "loss": 0.6265, "num_tokens": 2813078604.0, "step": 1204 }, { "epoch": 2.0169374351883858, "grad_norm": 0.6381132131624556, "learning_rate": 5.8032831560516425e-05, "loss": 0.619, "num_tokens": 2815366899.0, "step": 1205 }, { "epoch": 2.017283097131006, "grad_norm": 0.5284200882949209, "learning_rate": 5.7989726439540605e-05, "loss": 0.6269, "num_tokens": 2817690994.0, "step": 1206 }, { "epoch": 2.017628759073626, "grad_norm": 0.34542786689797705, "learning_rate": 5.794659511464512e-05, "loss": 0.6117, "num_tokens": 2820055089.0, "step": 1207 }, { "epoch": 2.017974421016246, "grad_norm": 0.7560090082336607, "learning_rate": 5.7903437648656665e-05, "loss": 0.6042, "num_tokens": 2822288430.0, "step": 1208 }, { "epoch": 2.018320082958866, "grad_norm": 0.5898726828853388, "learning_rate": 5.786025410444002e-05, "loss": 0.6203, "num_tokens": 2824712012.0, "step": 1209 }, { "epoch": 2.018665744901486, "grad_norm": 0.8370990498105547, "learning_rate": 5.781704454489793e-05, "loss": 0.621, "num_tokens": 2827148049.0, "step": 1210 }, { "epoch": 2.0190114068441063, "grad_norm": 0.7862899234850369, "learning_rate": 5.7773809032971e-05, "loss": 0.6311, "num_tokens": 2829555653.0, "step": 1211 }, { "epoch": 2.0193570687867264, "grad_norm": 0.7267285714096522, "learning_rate": 5.7730547631637735e-05, "loss": 0.6334, "num_tokens": 2832078053.0, "step": 1212 }, { "epoch": 2.019702730729347, "grad_norm": 0.8117802091918718, "learning_rate": 5.7687260403914264e-05, "loss": 0.6026, "num_tokens": 2834508497.0, "step": 1213 }, { "epoch": 2.020048392671967, "grad_norm": 0.5476173083610995, "learning_rate": 5.764394741285438e-05, "loss": 0.6262, "num_tokens": 2836980452.0, "step": 1214 }, { "epoch": 2.020394054614587, "grad_norm": 0.7998449760195921, "learning_rate": 5.76006087215494e-05, "loss": 0.6229, "num_tokens": 2839284251.0, "step": 1215 }, { "epoch": 2.020739716557207, "grad_norm": 0.6449016538857358, "learning_rate": 5.7557244393128034e-05, "loss": 0.627, "num_tokens": 2841423648.0, "step": 1216 }, { "epoch": 2.0210853784998273, "grad_norm": 0.6873619631462554, "learning_rate": 5.751385449075641e-05, "loss": 0.6065, "num_tokens": 2843741178.0, "step": 1217 }, { "epoch": 2.0214310404424474, "grad_norm": 0.8482070608413074, "learning_rate": 5.7470439077637845e-05, "loss": 0.6135, "num_tokens": 2846061812.0, "step": 1218 }, { "epoch": 2.0217767023850675, "grad_norm": 0.5494497236481478, "learning_rate": 5.7426998217012835e-05, "loss": 0.6222, "num_tokens": 2848331389.0, "step": 1219 }, { "epoch": 2.0221223643276875, "grad_norm": 0.6863107082502556, "learning_rate": 5.738353197215897e-05, "loss": 0.6322, "num_tokens": 2850680680.0, "step": 1220 }, { "epoch": 2.0224680262703076, "grad_norm": 0.6738447393787746, "learning_rate": 5.734004040639076e-05, "loss": 0.6238, "num_tokens": 2853007171.0, "step": 1221 }, { "epoch": 2.0228136882129277, "grad_norm": 0.5542657186036472, "learning_rate": 5.7296523583059675e-05, "loss": 0.6069, "num_tokens": 2855360185.0, "step": 1222 }, { "epoch": 2.023159350155548, "grad_norm": 0.6568104984881573, "learning_rate": 5.7252981565553894e-05, "loss": 0.6078, "num_tokens": 2857628485.0, "step": 1223 }, { "epoch": 2.023505012098168, "grad_norm": 0.5947952196949869, "learning_rate": 5.7209414417298344e-05, "loss": 0.6074, "num_tokens": 2859902857.0, "step": 1224 }, { "epoch": 2.023850674040788, "grad_norm": 0.5879663371033316, "learning_rate": 5.716582220175456e-05, "loss": 0.598, "num_tokens": 2862224706.0, "step": 1225 }, { "epoch": 2.024196335983408, "grad_norm": 0.5175978532057498, "learning_rate": 5.712220498242057e-05, "loss": 0.6244, "num_tokens": 2864594415.0, "step": 1226 }, { "epoch": 2.024541997926028, "grad_norm": 0.6423216752918504, "learning_rate": 5.707856282283084e-05, "loss": 0.6179, "num_tokens": 2866860754.0, "step": 1227 }, { "epoch": 2.0248876598686483, "grad_norm": 0.47903563464562704, "learning_rate": 5.703489578655614e-05, "loss": 0.5939, "num_tokens": 2869075430.0, "step": 1228 }, { "epoch": 2.0252333218112684, "grad_norm": 0.6224231697723156, "learning_rate": 5.699120393720351e-05, "loss": 0.6153, "num_tokens": 2871371121.0, "step": 1229 }, { "epoch": 2.025578983753889, "grad_norm": 0.5854130733860494, "learning_rate": 5.6947487338416104e-05, "loss": 0.5909, "num_tokens": 2873664539.0, "step": 1230 }, { "epoch": 2.025924645696509, "grad_norm": 0.4681324858119423, "learning_rate": 5.690374605387318e-05, "loss": 0.6033, "num_tokens": 2875972761.0, "step": 1231 }, { "epoch": 2.026270307639129, "grad_norm": 0.5066873916492088, "learning_rate": 5.685998014728984e-05, "loss": 0.621, "num_tokens": 2878324473.0, "step": 1232 }, { "epoch": 2.026615969581749, "grad_norm": 0.5165774101959238, "learning_rate": 5.681618968241719e-05, "loss": 0.6011, "num_tokens": 2880743438.0, "step": 1233 }, { "epoch": 2.0269616315243693, "grad_norm": 0.5462166677946223, "learning_rate": 5.6772374723042016e-05, "loss": 0.6066, "num_tokens": 2883113562.0, "step": 1234 }, { "epoch": 2.0273072934669893, "grad_norm": 0.4991318134459581, "learning_rate": 5.672853533298683e-05, "loss": 0.6083, "num_tokens": 2885507881.0, "step": 1235 }, { "epoch": 2.0276529554096094, "grad_norm": 0.5519840728737752, "learning_rate": 5.668467157610968e-05, "loss": 0.602, "num_tokens": 2887789962.0, "step": 1236 }, { "epoch": 2.0279986173522295, "grad_norm": 0.4820652100026768, "learning_rate": 5.664078351630418e-05, "loss": 0.5947, "num_tokens": 2890127773.0, "step": 1237 }, { "epoch": 2.0283442792948496, "grad_norm": 0.5218030585256224, "learning_rate": 5.659687121749926e-05, "loss": 0.59, "num_tokens": 2892439637.0, "step": 1238 }, { "epoch": 2.0286899412374697, "grad_norm": 0.44313267257206534, "learning_rate": 5.655293474365925e-05, "loss": 0.6108, "num_tokens": 2894747159.0, "step": 1239 }, { "epoch": 2.02903560318009, "grad_norm": 0.5371641585740026, "learning_rate": 5.650897415878361e-05, "loss": 0.6041, "num_tokens": 2897077333.0, "step": 1240 }, { "epoch": 2.02938126512271, "grad_norm": 0.46624849879625346, "learning_rate": 5.6464989526906974e-05, "loss": 0.5776, "num_tokens": 2899461991.0, "step": 1241 }, { "epoch": 2.02972692706533, "grad_norm": 0.5401807002198875, "learning_rate": 5.642098091209899e-05, "loss": 0.5753, "num_tokens": 2901811919.0, "step": 1242 }, { "epoch": 2.03007258900795, "grad_norm": 0.5472715875192805, "learning_rate": 5.637694837846422e-05, "loss": 0.6168, "num_tokens": 2904210850.0, "step": 1243 }, { "epoch": 2.03041825095057, "grad_norm": 0.6275862815911735, "learning_rate": 5.633289199014211e-05, "loss": 0.6089, "num_tokens": 2906408823.0, "step": 1244 }, { "epoch": 2.0307639128931902, "grad_norm": 0.5182927643911286, "learning_rate": 5.6288811811306804e-05, "loss": 0.5971, "num_tokens": 2908814454.0, "step": 1245 }, { "epoch": 2.031109574835811, "grad_norm": 0.615480497109353, "learning_rate": 5.6244707906167136e-05, "loss": 0.5966, "num_tokens": 2911188765.0, "step": 1246 }, { "epoch": 2.031455236778431, "grad_norm": 0.5654307300109646, "learning_rate": 5.620058033896648e-05, "loss": 0.5871, "num_tokens": 2913544729.0, "step": 1247 }, { "epoch": 2.031800898721051, "grad_norm": 0.6759285074914977, "learning_rate": 5.615642917398271e-05, "loss": 0.5915, "num_tokens": 2915823840.0, "step": 1248 }, { "epoch": 2.032146560663671, "grad_norm": 0.6397580415247495, "learning_rate": 5.6112254475528006e-05, "loss": 0.6063, "num_tokens": 2918163854.0, "step": 1249 }, { "epoch": 2.032492222606291, "grad_norm": 0.5278698967861356, "learning_rate": 5.606805630794893e-05, "loss": 0.602, "num_tokens": 2920599492.0, "step": 1250 }, { "epoch": 2.0328378845489112, "grad_norm": 0.6757133057033519, "learning_rate": 5.6023834735626116e-05, "loss": 0.5675, "num_tokens": 2922914498.0, "step": 1251 }, { "epoch": 2.0331835464915313, "grad_norm": 0.4395604949882517, "learning_rate": 5.597958982297438e-05, "loss": 0.6082, "num_tokens": 2925267839.0, "step": 1252 }, { "epoch": 2.0335292084341514, "grad_norm": 0.6407649357139156, "learning_rate": 5.5935321634442474e-05, "loss": 0.6004, "num_tokens": 2927638944.0, "step": 1253 }, { "epoch": 2.0338748703767715, "grad_norm": 0.5781375209109065, "learning_rate": 5.5891030234513106e-05, "loss": 0.6385, "num_tokens": 2929963335.0, "step": 1254 }, { "epoch": 2.0342205323193916, "grad_norm": 0.5346773012866548, "learning_rate": 5.584671568770276e-05, "loss": 0.6219, "num_tokens": 2932283673.0, "step": 1255 }, { "epoch": 2.0345661942620117, "grad_norm": 0.5634585941402117, "learning_rate": 5.580237805856165e-05, "loss": 0.6091, "num_tokens": 2934624476.0, "step": 1256 }, { "epoch": 2.0349118562046318, "grad_norm": 0.5223123448500595, "learning_rate": 5.575801741167361e-05, "loss": 0.6148, "num_tokens": 2936811500.0, "step": 1257 }, { "epoch": 2.035257518147252, "grad_norm": 0.6100766238181008, "learning_rate": 5.5713633811656005e-05, "loss": 0.5992, "num_tokens": 2939199246.0, "step": 1258 }, { "epoch": 2.035603180089872, "grad_norm": 0.6300319578822967, "learning_rate": 5.566922732315962e-05, "loss": 0.6082, "num_tokens": 2941528529.0, "step": 1259 }, { "epoch": 2.035948842032492, "grad_norm": 0.6029478230716395, "learning_rate": 5.562479801086861e-05, "loss": 0.5989, "num_tokens": 2943818454.0, "step": 1260 }, { "epoch": 2.036294503975112, "grad_norm": 0.5963737260826928, "learning_rate": 5.5580345939500346e-05, "loss": 0.6141, "num_tokens": 2946126116.0, "step": 1261 }, { "epoch": 2.0366401659177327, "grad_norm": 0.5068278020650865, "learning_rate": 5.553587117380537e-05, "loss": 0.6114, "num_tokens": 2948525138.0, "step": 1262 }, { "epoch": 2.0369858278603528, "grad_norm": 0.5568417324171215, "learning_rate": 5.549137377856727e-05, "loss": 0.6304, "num_tokens": 2950884154.0, "step": 1263 }, { "epoch": 2.037331489802973, "grad_norm": 0.5213384527120767, "learning_rate": 5.5446853818602595e-05, "loss": 0.6153, "num_tokens": 2953141908.0, "step": 1264 }, { "epoch": 2.037677151745593, "grad_norm": 0.5321980257371528, "learning_rate": 5.540231135876077e-05, "loss": 0.6063, "num_tokens": 2955544112.0, "step": 1265 }, { "epoch": 2.038022813688213, "grad_norm": 0.5015596988009549, "learning_rate": 5.535774646392401e-05, "loss": 0.6156, "num_tokens": 2957931919.0, "step": 1266 }, { "epoch": 2.038368475630833, "grad_norm": 0.5832821256270071, "learning_rate": 5.531315919900717e-05, "loss": 0.6166, "num_tokens": 2960187122.0, "step": 1267 }, { "epoch": 2.038714137573453, "grad_norm": 0.45580269126113726, "learning_rate": 5.526854962895774e-05, "loss": 0.5977, "num_tokens": 2962475805.0, "step": 1268 }, { "epoch": 2.0390597995160733, "grad_norm": 0.6142537898377721, "learning_rate": 5.522391781875564e-05, "loss": 0.6137, "num_tokens": 2964868388.0, "step": 1269 }, { "epoch": 2.0394054614586934, "grad_norm": 0.5333676282289138, "learning_rate": 5.5179263833413234e-05, "loss": 0.5995, "num_tokens": 2967211344.0, "step": 1270 }, { "epoch": 2.0397511234013135, "grad_norm": 0.5725340938172345, "learning_rate": 5.513458773797519e-05, "loss": 0.6112, "num_tokens": 2969528142.0, "step": 1271 }, { "epoch": 2.0400967853439336, "grad_norm": 0.6170193208053968, "learning_rate": 5.5089889597518324e-05, "loss": 0.6145, "num_tokens": 2971957292.0, "step": 1272 }, { "epoch": 2.0404424472865537, "grad_norm": 0.5682381648597012, "learning_rate": 5.5045169477151645e-05, "loss": 0.608, "num_tokens": 2974134938.0, "step": 1273 }, { "epoch": 2.0407881092291738, "grad_norm": 0.4827937183125581, "learning_rate": 5.500042744201612e-05, "loss": 0.6031, "num_tokens": 2976572533.0, "step": 1274 }, { "epoch": 2.041133771171794, "grad_norm": 0.5383409905181379, "learning_rate": 5.495566355728465e-05, "loss": 0.5952, "num_tokens": 2978846275.0, "step": 1275 }, { "epoch": 2.041479433114414, "grad_norm": 0.4977390274277039, "learning_rate": 5.491087788816198e-05, "loss": 0.5927, "num_tokens": 2981218028.0, "step": 1276 }, { "epoch": 2.041825095057034, "grad_norm": 0.5743564152603742, "learning_rate": 5.4866070499884555e-05, "loss": 0.607, "num_tokens": 2983516829.0, "step": 1277 }, { "epoch": 2.0421707569996546, "grad_norm": 0.6016395301475899, "learning_rate": 5.482124145772051e-05, "loss": 0.6235, "num_tokens": 2985974853.0, "step": 1278 }, { "epoch": 2.0425164189422746, "grad_norm": 0.4587878650251555, "learning_rate": 5.477639082696947e-05, "loss": 0.6134, "num_tokens": 2988263957.0, "step": 1279 }, { "epoch": 2.0428620808848947, "grad_norm": 0.5170040264186122, "learning_rate": 5.473151867296254e-05, "loss": 0.6036, "num_tokens": 2990592023.0, "step": 1280 }, { "epoch": 2.043207742827515, "grad_norm": 0.4548539404380404, "learning_rate": 5.468662506106214e-05, "loss": 0.6334, "num_tokens": 2993000191.0, "step": 1281 }, { "epoch": 2.043553404770135, "grad_norm": 0.5033124591021085, "learning_rate": 5.464171005666198e-05, "loss": 0.614, "num_tokens": 2995344714.0, "step": 1282 }, { "epoch": 2.043899066712755, "grad_norm": 0.44667062989295664, "learning_rate": 5.459677372518692e-05, "loss": 0.6137, "num_tokens": 2997745826.0, "step": 1283 }, { "epoch": 2.044244728655375, "grad_norm": 0.43621104229171326, "learning_rate": 5.4551816132092876e-05, "loss": 0.5943, "num_tokens": 3000026030.0, "step": 1284 }, { "epoch": 2.044590390597995, "grad_norm": 0.42703493409671434, "learning_rate": 5.450683734286677e-05, "loss": 0.5903, "num_tokens": 3002321921.0, "step": 1285 }, { "epoch": 2.0449360525406153, "grad_norm": 0.5395855525690519, "learning_rate": 5.4461837423026355e-05, "loss": 0.5898, "num_tokens": 3004699940.0, "step": 1286 }, { "epoch": 2.0452817144832354, "grad_norm": 0.50678507152604, "learning_rate": 5.441681643812019e-05, "loss": 0.6183, "num_tokens": 3007112777.0, "step": 1287 }, { "epoch": 2.0456273764258555, "grad_norm": 0.6019317612983736, "learning_rate": 5.437177445372749e-05, "loss": 0.602, "num_tokens": 3009435776.0, "step": 1288 }, { "epoch": 2.0459730383684755, "grad_norm": 0.5863072150441261, "learning_rate": 5.432671153545811e-05, "loss": 0.6276, "num_tokens": 3011921411.0, "step": 1289 }, { "epoch": 2.0463187003110956, "grad_norm": 0.6254608431749489, "learning_rate": 5.428162774895234e-05, "loss": 0.6186, "num_tokens": 3014215887.0, "step": 1290 }, { "epoch": 2.0466643622537157, "grad_norm": 0.47843745798493226, "learning_rate": 5.423652315988093e-05, "loss": 0.6249, "num_tokens": 3016687372.0, "step": 1291 }, { "epoch": 2.047010024196336, "grad_norm": 0.4141659775595344, "learning_rate": 5.419139783394484e-05, "loss": 0.5966, "num_tokens": 3019034321.0, "step": 1292 }, { "epoch": 2.047355686138956, "grad_norm": 0.510810522892774, "learning_rate": 5.414625183687534e-05, "loss": 0.6017, "num_tokens": 3021292293.0, "step": 1293 }, { "epoch": 2.047701348081576, "grad_norm": 0.41829526657986776, "learning_rate": 5.4101085234433765e-05, "loss": 0.6111, "num_tokens": 3023533155.0, "step": 1294 }, { "epoch": 2.0480470100241965, "grad_norm": 0.48062734874510304, "learning_rate": 5.405589809241142e-05, "loss": 0.5914, "num_tokens": 3025891129.0, "step": 1295 }, { "epoch": 2.0483926719668166, "grad_norm": 0.3875636403621201, "learning_rate": 5.40106904766296e-05, "loss": 0.5977, "num_tokens": 3028170519.0, "step": 1296 }, { "epoch": 2.0487383339094367, "grad_norm": 0.40725036120715, "learning_rate": 5.39654624529394e-05, "loss": 0.614, "num_tokens": 3030533342.0, "step": 1297 }, { "epoch": 2.049083995852057, "grad_norm": 0.48169707685500246, "learning_rate": 5.392021408722161e-05, "loss": 0.6029, "num_tokens": 3032910008.0, "step": 1298 }, { "epoch": 2.049429657794677, "grad_norm": 0.4082236704550609, "learning_rate": 5.3874945445386706e-05, "loss": 0.62, "num_tokens": 3035353770.0, "step": 1299 }, { "epoch": 2.049775319737297, "grad_norm": 0.4478889348463514, "learning_rate": 5.3829656593374644e-05, "loss": 0.6221, "num_tokens": 3037661561.0, "step": 1300 }, { "epoch": 2.050120981679917, "grad_norm": 0.34116851464317116, "learning_rate": 5.3784347597154855e-05, "loss": 0.5979, "num_tokens": 3039995227.0, "step": 1301 }, { "epoch": 2.050466643622537, "grad_norm": 0.40013225507271305, "learning_rate": 5.373901852272611e-05, "loss": 0.6253, "num_tokens": 3042356211.0, "step": 1302 }, { "epoch": 2.0508123055651573, "grad_norm": 0.44099928828290136, "learning_rate": 5.369366943611641e-05, "loss": 0.6133, "num_tokens": 3044605964.0, "step": 1303 }, { "epoch": 2.0511579675077773, "grad_norm": 0.4080138900079332, "learning_rate": 5.364830040338293e-05, "loss": 0.6037, "num_tokens": 3046869637.0, "step": 1304 }, { "epoch": 2.0515036294503974, "grad_norm": 0.5169473108300595, "learning_rate": 5.3602911490611835e-05, "loss": 0.6161, "num_tokens": 3049203375.0, "step": 1305 }, { "epoch": 2.0518492913930175, "grad_norm": 0.4449557408995129, "learning_rate": 5.355750276391836e-05, "loss": 0.6061, "num_tokens": 3051445585.0, "step": 1306 }, { "epoch": 2.0521949533356376, "grad_norm": 0.3921152430483812, "learning_rate": 5.3512074289446514e-05, "loss": 0.5812, "num_tokens": 3053679376.0, "step": 1307 }, { "epoch": 2.0525406152782577, "grad_norm": 0.36286474115355816, "learning_rate": 5.34666261333691e-05, "loss": 0.6156, "num_tokens": 3056033295.0, "step": 1308 }, { "epoch": 2.052886277220878, "grad_norm": 0.41416395772979153, "learning_rate": 5.342115836188756e-05, "loss": 0.5873, "num_tokens": 3058237536.0, "step": 1309 }, { "epoch": 2.053231939163498, "grad_norm": 0.33231583844955537, "learning_rate": 5.3375671041231984e-05, "loss": 0.5975, "num_tokens": 3060555300.0, "step": 1310 }, { "epoch": 2.0535776011061184, "grad_norm": 0.3800244093214951, "learning_rate": 5.3330164237660844e-05, "loss": 0.6233, "num_tokens": 3062934789.0, "step": 1311 }, { "epoch": 2.0539232630487385, "grad_norm": 0.43947625057591977, "learning_rate": 5.328463801746108e-05, "loss": 0.6088, "num_tokens": 3065151430.0, "step": 1312 }, { "epoch": 2.0542689249913586, "grad_norm": 0.4474886647128191, "learning_rate": 5.323909244694782e-05, "loss": 0.6006, "num_tokens": 3067421239.0, "step": 1313 }, { "epoch": 2.0546145869339787, "grad_norm": 0.40717613745553277, "learning_rate": 5.319352759246447e-05, "loss": 0.6142, "num_tokens": 3069721674.0, "step": 1314 }, { "epoch": 2.054960248876599, "grad_norm": 0.3519313502011318, "learning_rate": 5.314794352038248e-05, "loss": 0.5775, "num_tokens": 3071984411.0, "step": 1315 }, { "epoch": 2.055305910819219, "grad_norm": 0.400389918759006, "learning_rate": 5.310234029710128e-05, "loss": 0.622, "num_tokens": 3074260919.0, "step": 1316 }, { "epoch": 2.055651572761839, "grad_norm": 0.38289231712531946, "learning_rate": 5.3056717989048236e-05, "loss": 0.5894, "num_tokens": 3076695076.0, "step": 1317 }, { "epoch": 2.055997234704459, "grad_norm": 0.39825730402005594, "learning_rate": 5.301107666267848e-05, "loss": 0.6225, "num_tokens": 3079048581.0, "step": 1318 }, { "epoch": 2.056342896647079, "grad_norm": 0.4892193674826655, "learning_rate": 5.2965416384474885e-05, "loss": 0.6363, "num_tokens": 3081481101.0, "step": 1319 }, { "epoch": 2.0566885585896992, "grad_norm": 0.2918629069714954, "learning_rate": 5.2919737220947876e-05, "loss": 0.6083, "num_tokens": 3083750248.0, "step": 1320 }, { "epoch": 2.0570342205323193, "grad_norm": 0.5704164770851035, "learning_rate": 5.28740392386354e-05, "loss": 0.5997, "num_tokens": 3086097092.0, "step": 1321 }, { "epoch": 2.0573798824749394, "grad_norm": 0.3637904345056428, "learning_rate": 5.2828322504102874e-05, "loss": 0.6133, "num_tokens": 3088445821.0, "step": 1322 }, { "epoch": 2.0577255444175595, "grad_norm": 0.4299012223391985, "learning_rate": 5.278258708394297e-05, "loss": 0.5662, "num_tokens": 3090655590.0, "step": 1323 }, { "epoch": 2.0580712063601796, "grad_norm": 0.36114495013871306, "learning_rate": 5.2736833044775595e-05, "loss": 0.6008, "num_tokens": 3093005486.0, "step": 1324 }, { "epoch": 2.0584168683027997, "grad_norm": 0.4646665458202692, "learning_rate": 5.269106045324778e-05, "loss": 0.6004, "num_tokens": 3095342757.0, "step": 1325 }, { "epoch": 2.0587625302454198, "grad_norm": 0.36559405257066435, "learning_rate": 5.264526937603358e-05, "loss": 0.5876, "num_tokens": 3097548940.0, "step": 1326 }, { "epoch": 2.0591081921880403, "grad_norm": 0.5504694490825284, "learning_rate": 5.259945987983397e-05, "loss": 0.586, "num_tokens": 3099855164.0, "step": 1327 }, { "epoch": 2.0594538541306604, "grad_norm": 0.4343717465517934, "learning_rate": 5.255363203137676e-05, "loss": 0.5903, "num_tokens": 3102151487.0, "step": 1328 }, { "epoch": 2.0597995160732805, "grad_norm": 0.504407525936016, "learning_rate": 5.25077858974165e-05, "loss": 0.6026, "num_tokens": 3104563621.0, "step": 1329 }, { "epoch": 2.0601451780159006, "grad_norm": 0.4659436131566551, "learning_rate": 5.246192154473436e-05, "loss": 0.6019, "num_tokens": 3106973953.0, "step": 1330 }, { "epoch": 2.0604908399585207, "grad_norm": 0.46577612027413723, "learning_rate": 5.241603904013806e-05, "loss": 0.5681, "num_tokens": 3109334071.0, "step": 1331 }, { "epoch": 2.0608365019011408, "grad_norm": 0.4534767046573194, "learning_rate": 5.237013845046175e-05, "loss": 0.6072, "num_tokens": 3111623711.0, "step": 1332 }, { "epoch": 2.061182163843761, "grad_norm": 0.46583401123679735, "learning_rate": 5.2324219842565955e-05, "loss": 0.5823, "num_tokens": 3113946994.0, "step": 1333 }, { "epoch": 2.061527825786381, "grad_norm": 0.4975240998433116, "learning_rate": 5.227828328333739e-05, "loss": 0.619, "num_tokens": 3116247769.0, "step": 1334 }, { "epoch": 2.061873487729001, "grad_norm": 0.36868129528835913, "learning_rate": 5.223232883968896e-05, "loss": 0.6155, "num_tokens": 3118478766.0, "step": 1335 }, { "epoch": 2.062219149671621, "grad_norm": 0.4457842705164704, "learning_rate": 5.218635657855961e-05, "loss": 0.5931, "num_tokens": 3120772785.0, "step": 1336 }, { "epoch": 2.062564811614241, "grad_norm": 0.43226589878053656, "learning_rate": 5.214036656691425e-05, "loss": 0.6273, "num_tokens": 3123260271.0, "step": 1337 }, { "epoch": 2.0629104735568613, "grad_norm": 0.3710556079437764, "learning_rate": 5.209435887174363e-05, "loss": 0.5939, "num_tokens": 3125735346.0, "step": 1338 }, { "epoch": 2.0632561354994814, "grad_norm": 0.5689559844563727, "learning_rate": 5.204833356006426e-05, "loss": 0.613, "num_tokens": 3127993693.0, "step": 1339 }, { "epoch": 2.0636017974421015, "grad_norm": 0.50140646818568, "learning_rate": 5.200229069891831e-05, "loss": 0.6027, "num_tokens": 3130303716.0, "step": 1340 }, { "epoch": 2.0639474593847216, "grad_norm": 0.5337983095997584, "learning_rate": 5.195623035537353e-05, "loss": 0.5967, "num_tokens": 3132513082.0, "step": 1341 }, { "epoch": 2.0642931213273417, "grad_norm": 0.5868167137154597, "learning_rate": 5.191015259652313e-05, "loss": 0.5918, "num_tokens": 3134785282.0, "step": 1342 }, { "epoch": 2.064638783269962, "grad_norm": 0.5340252361151071, "learning_rate": 5.186405748948566e-05, "loss": 0.6076, "num_tokens": 3137241546.0, "step": 1343 }, { "epoch": 2.0649844452125823, "grad_norm": 0.638440713322839, "learning_rate": 5.1817945101404976e-05, "loss": 0.6011, "num_tokens": 3139627948.0, "step": 1344 }, { "epoch": 2.0653301071552024, "grad_norm": 0.5318842486596531, "learning_rate": 5.177181549945009e-05, "loss": 0.603, "num_tokens": 3142033289.0, "step": 1345 }, { "epoch": 2.0656757690978225, "grad_norm": 0.7233091124619869, "learning_rate": 5.172566875081508e-05, "loss": 0.6052, "num_tokens": 3144284285.0, "step": 1346 }, { "epoch": 2.0660214310404426, "grad_norm": 0.6217843162035248, "learning_rate": 5.167950492271903e-05, "loss": 0.5999, "num_tokens": 3146659120.0, "step": 1347 }, { "epoch": 2.0663670929830626, "grad_norm": 0.5013759738254941, "learning_rate": 5.1633324082405855e-05, "loss": 0.5952, "num_tokens": 3149054559.0, "step": 1348 }, { "epoch": 2.0667127549256827, "grad_norm": 0.5499401018826537, "learning_rate": 5.1587126297144305e-05, "loss": 0.5903, "num_tokens": 3151378550.0, "step": 1349 }, { "epoch": 2.067058416868303, "grad_norm": 0.4752547351654341, "learning_rate": 5.1540911634227774e-05, "loss": 0.5833, "num_tokens": 3153763861.0, "step": 1350 }, { "epoch": 2.067404078810923, "grad_norm": 0.4971518106908343, "learning_rate": 5.149468016097426e-05, "loss": 0.6177, "num_tokens": 3156218779.0, "step": 1351 }, { "epoch": 2.067749740753543, "grad_norm": 0.5237165597670185, "learning_rate": 5.144843194472622e-05, "loss": 0.6009, "num_tokens": 3158730293.0, "step": 1352 }, { "epoch": 2.068095402696163, "grad_norm": 0.35430393340268224, "learning_rate": 5.140216705285054e-05, "loss": 0.5819, "num_tokens": 3161095133.0, "step": 1353 }, { "epoch": 2.068441064638783, "grad_norm": 0.4699734048766594, "learning_rate": 5.135588555273838e-05, "loss": 0.6057, "num_tokens": 3163412660.0, "step": 1354 }, { "epoch": 2.0687867265814033, "grad_norm": 0.403406170186645, "learning_rate": 5.130958751180508e-05, "loss": 0.5547, "num_tokens": 3165788784.0, "step": 1355 }, { "epoch": 2.0691323885240234, "grad_norm": 0.5128392197639187, "learning_rate": 5.126327299749008e-05, "loss": 0.5981, "num_tokens": 3168152449.0, "step": 1356 }, { "epoch": 2.0694780504666435, "grad_norm": 0.49078424504025125, "learning_rate": 5.1216942077256814e-05, "loss": 0.5823, "num_tokens": 3170440932.0, "step": 1357 }, { "epoch": 2.0698237124092635, "grad_norm": 0.4623384473764788, "learning_rate": 5.117059481859263e-05, "loss": 0.5764, "num_tokens": 3172676957.0, "step": 1358 }, { "epoch": 2.0701693743518836, "grad_norm": 0.4749201080992479, "learning_rate": 5.1124231289008664e-05, "loss": 0.6076, "num_tokens": 3175013843.0, "step": 1359 }, { "epoch": 2.070515036294504, "grad_norm": 0.43539691480416165, "learning_rate": 5.107785155603973e-05, "loss": 0.5762, "num_tokens": 3177306106.0, "step": 1360 }, { "epoch": 2.0708606982371243, "grad_norm": 0.4321376119569733, "learning_rate": 5.1031455687244286e-05, "loss": 0.6027, "num_tokens": 3179598307.0, "step": 1361 }, { "epoch": 2.0712063601797444, "grad_norm": 0.3908657109199825, "learning_rate": 5.0985043750204266e-05, "loss": 0.5823, "num_tokens": 3181990757.0, "step": 1362 }, { "epoch": 2.0715520221223644, "grad_norm": 0.3959646593624416, "learning_rate": 5.0938615812525037e-05, "loss": 0.5983, "num_tokens": 3184393405.0, "step": 1363 }, { "epoch": 2.0718976840649845, "grad_norm": 0.33415542714869745, "learning_rate": 5.089217194183523e-05, "loss": 0.5887, "num_tokens": 3186807958.0, "step": 1364 }, { "epoch": 2.0722433460076046, "grad_norm": 0.4126534313619065, "learning_rate": 5.08457122057867e-05, "loss": 0.6033, "num_tokens": 3189021466.0, "step": 1365 }, { "epoch": 2.0725890079502247, "grad_norm": 0.3377253409568065, "learning_rate": 5.079923667205445e-05, "loss": 0.5959, "num_tokens": 3191253057.0, "step": 1366 }, { "epoch": 2.072934669892845, "grad_norm": 0.5145287857230376, "learning_rate": 5.075274540833645e-05, "loss": 0.6107, "num_tokens": 3193667262.0, "step": 1367 }, { "epoch": 2.073280331835465, "grad_norm": 0.46953729460069205, "learning_rate": 5.07062384823536e-05, "loss": 0.6133, "num_tokens": 3196054273.0, "step": 1368 }, { "epoch": 2.073625993778085, "grad_norm": 0.45222944451378894, "learning_rate": 5.065971596184962e-05, "loss": 0.5877, "num_tokens": 3198364427.0, "step": 1369 }, { "epoch": 2.073971655720705, "grad_norm": 0.5030619973621816, "learning_rate": 5.0613177914590915e-05, "loss": 0.5906, "num_tokens": 3200742851.0, "step": 1370 }, { "epoch": 2.074317317663325, "grad_norm": 0.4688222159601526, "learning_rate": 5.056662440836654e-05, "loss": 0.6092, "num_tokens": 3203218957.0, "step": 1371 }, { "epoch": 2.0746629796059453, "grad_norm": 0.49551805449835007, "learning_rate": 5.052005551098808e-05, "loss": 0.5844, "num_tokens": 3205476093.0, "step": 1372 }, { "epoch": 2.0750086415485653, "grad_norm": 0.5537450404177507, "learning_rate": 5.0473471290289485e-05, "loss": 0.5916, "num_tokens": 3207792093.0, "step": 1373 }, { "epoch": 2.0753543034911854, "grad_norm": 0.4241531512083838, "learning_rate": 5.0426871814127077e-05, "loss": 0.577, "num_tokens": 3210086339.0, "step": 1374 }, { "epoch": 2.0756999654338055, "grad_norm": 0.5404564702691456, "learning_rate": 5.038025715037937e-05, "loss": 0.59, "num_tokens": 3212357764.0, "step": 1375 }, { "epoch": 2.076045627376426, "grad_norm": 0.43073252489205993, "learning_rate": 5.0333627366947015e-05, "loss": 0.5672, "num_tokens": 3214718419.0, "step": 1376 }, { "epoch": 2.076391289319046, "grad_norm": 0.48967298286161537, "learning_rate": 5.028698253175268e-05, "loss": 0.6083, "num_tokens": 3217184509.0, "step": 1377 }, { "epoch": 2.0767369512616662, "grad_norm": 0.43217246640406853, "learning_rate": 5.024032271274096e-05, "loss": 0.6098, "num_tokens": 3219412170.0, "step": 1378 }, { "epoch": 2.0770826132042863, "grad_norm": 0.4047987816392751, "learning_rate": 5.019364797787829e-05, "loss": 0.6011, "num_tokens": 3221700253.0, "step": 1379 }, { "epoch": 2.0774282751469064, "grad_norm": 0.3769070366388324, "learning_rate": 5.014695839515281e-05, "loss": 0.6011, "num_tokens": 3224131887.0, "step": 1380 }, { "epoch": 2.0777739370895265, "grad_norm": 0.45695592521803, "learning_rate": 5.0100254032574285e-05, "loss": 0.5986, "num_tokens": 3226482394.0, "step": 1381 }, { "epoch": 2.0781195990321466, "grad_norm": 0.4135402265139777, "learning_rate": 5.005353495817404e-05, "loss": 0.6158, "num_tokens": 3228795415.0, "step": 1382 }, { "epoch": 2.0784652609747667, "grad_norm": 0.4158986146867342, "learning_rate": 5.000680124000481e-05, "loss": 0.6316, "num_tokens": 3231197437.0, "step": 1383 }, { "epoch": 2.078810922917387, "grad_norm": 0.3989372903988358, "learning_rate": 4.996005294614064e-05, "loss": 0.5867, "num_tokens": 3233511135.0, "step": 1384 }, { "epoch": 2.079156584860007, "grad_norm": 0.446182580372595, "learning_rate": 4.991329014467685e-05, "loss": 0.5949, "num_tokens": 3235830712.0, "step": 1385 }, { "epoch": 2.079502246802627, "grad_norm": 0.39912662186878356, "learning_rate": 4.986651290372985e-05, "loss": 0.6008, "num_tokens": 3238161243.0, "step": 1386 }, { "epoch": 2.079847908745247, "grad_norm": 0.534903402940179, "learning_rate": 4.981972129143711e-05, "loss": 0.5866, "num_tokens": 3240351048.0, "step": 1387 }, { "epoch": 2.080193570687867, "grad_norm": 0.3752422616304984, "learning_rate": 4.9772915375957044e-05, "loss": 0.5874, "num_tokens": 3242642472.0, "step": 1388 }, { "epoch": 2.0805392326304872, "grad_norm": 0.5699021025370616, "learning_rate": 4.972609522546887e-05, "loss": 0.6095, "num_tokens": 3244951493.0, "step": 1389 }, { "epoch": 2.0808848945731073, "grad_norm": 0.558996067689069, "learning_rate": 4.967926090817253e-05, "loss": 0.6075, "num_tokens": 3247249640.0, "step": 1390 }, { "epoch": 2.0812305565157274, "grad_norm": 0.4722460415367258, "learning_rate": 4.963241249228867e-05, "loss": 0.6016, "num_tokens": 3249684782.0, "step": 1391 }, { "epoch": 2.081576218458348, "grad_norm": 0.5845853284207462, "learning_rate": 4.9585550046058404e-05, "loss": 0.5765, "num_tokens": 3251989661.0, "step": 1392 }, { "epoch": 2.081921880400968, "grad_norm": 0.42409143982270275, "learning_rate": 4.9538673637743324e-05, "loss": 0.5693, "num_tokens": 3254431747.0, "step": 1393 }, { "epoch": 2.082267542343588, "grad_norm": 0.6343660772321286, "learning_rate": 4.9491783335625326e-05, "loss": 0.597, "num_tokens": 3256803363.0, "step": 1394 }, { "epoch": 2.082613204286208, "grad_norm": 0.488192163273591, "learning_rate": 4.944487920800657e-05, "loss": 0.5916, "num_tokens": 3259174624.0, "step": 1395 }, { "epoch": 2.0829588662288283, "grad_norm": 0.5034102009509795, "learning_rate": 4.939796132320934e-05, "loss": 0.5693, "num_tokens": 3261486423.0, "step": 1396 }, { "epoch": 2.0833045281714484, "grad_norm": 0.43475066943592616, "learning_rate": 4.935102974957598e-05, "loss": 0.5814, "num_tokens": 3263831155.0, "step": 1397 }, { "epoch": 2.0836501901140685, "grad_norm": 0.5919902106353729, "learning_rate": 4.930408455546874e-05, "loss": 0.5553, "num_tokens": 3266193553.0, "step": 1398 }, { "epoch": 2.0839958520566886, "grad_norm": 0.4188249775572424, "learning_rate": 4.9257125809269753e-05, "loss": 0.5639, "num_tokens": 3268498134.0, "step": 1399 }, { "epoch": 2.0843415139993087, "grad_norm": 0.5166414747770294, "learning_rate": 4.9210153579380846e-05, "loss": 0.5944, "num_tokens": 3270823792.0, "step": 1400 }, { "epoch": 2.0846871759419288, "grad_norm": 0.45806391476180136, "learning_rate": 4.916316793422353e-05, "loss": 0.5729, "num_tokens": 3273050648.0, "step": 1401 }, { "epoch": 2.085032837884549, "grad_norm": 0.3724607613035754, "learning_rate": 4.9116168942238835e-05, "loss": 0.6057, "num_tokens": 3275356153.0, "step": 1402 }, { "epoch": 2.085378499827169, "grad_norm": 0.6505604917192587, "learning_rate": 4.9069156671887216e-05, "loss": 0.6041, "num_tokens": 3277732548.0, "step": 1403 }, { "epoch": 2.085724161769789, "grad_norm": 0.48129075010108735, "learning_rate": 4.902213119164851e-05, "loss": 0.6157, "num_tokens": 3280207621.0, "step": 1404 }, { "epoch": 2.086069823712409, "grad_norm": 0.8202288157223512, "learning_rate": 4.897509257002176e-05, "loss": 0.5978, "num_tokens": 3282480595.0, "step": 1405 }, { "epoch": 2.086415485655029, "grad_norm": 0.8040169194748262, "learning_rate": 4.8928040875525176e-05, "loss": 0.6092, "num_tokens": 3284840094.0, "step": 1406 }, { "epoch": 2.0867611475976493, "grad_norm": 0.5836619310287828, "learning_rate": 4.8880976176695995e-05, "loss": 0.5913, "num_tokens": 3287111698.0, "step": 1407 }, { "epoch": 2.08710680954027, "grad_norm": 0.7486404750312236, "learning_rate": 4.8833898542090395e-05, "loss": 0.5991, "num_tokens": 3289332013.0, "step": 1408 }, { "epoch": 2.08745247148289, "grad_norm": 0.6290656818287904, "learning_rate": 4.878680804028341e-05, "loss": 0.6063, "num_tokens": 3291628252.0, "step": 1409 }, { "epoch": 2.08779813342551, "grad_norm": 0.6409232716485557, "learning_rate": 4.873970473986882e-05, "loss": 0.6024, "num_tokens": 3294060415.0, "step": 1410 }, { "epoch": 2.08814379536813, "grad_norm": 0.6547673585890731, "learning_rate": 4.869258870945903e-05, "loss": 0.609, "num_tokens": 3296299276.0, "step": 1411 }, { "epoch": 2.08848945731075, "grad_norm": 0.5486147956355585, "learning_rate": 4.864546001768498e-05, "loss": 0.5896, "num_tokens": 3298647340.0, "step": 1412 }, { "epoch": 2.0888351192533703, "grad_norm": 0.5052878907147205, "learning_rate": 4.85983187331961e-05, "loss": 0.5976, "num_tokens": 3300966145.0, "step": 1413 }, { "epoch": 2.0891807811959904, "grad_norm": 0.5805886398260992, "learning_rate": 4.855116492466012e-05, "loss": 0.6025, "num_tokens": 3303379605.0, "step": 1414 }, { "epoch": 2.0895264431386105, "grad_norm": 0.5012579039196586, "learning_rate": 4.850399866076301e-05, "loss": 0.5893, "num_tokens": 3305761447.0, "step": 1415 }, { "epoch": 2.0898721050812306, "grad_norm": 0.4913502668881687, "learning_rate": 4.845682001020892e-05, "loss": 0.5746, "num_tokens": 3308107096.0, "step": 1416 }, { "epoch": 2.0902177670238506, "grad_norm": 0.48518963539608134, "learning_rate": 4.8409629041719995e-05, "loss": 0.5926, "num_tokens": 3310476165.0, "step": 1417 }, { "epoch": 2.0905634289664707, "grad_norm": 0.5074854506829566, "learning_rate": 4.8362425824036373e-05, "loss": 0.5672, "num_tokens": 3312834036.0, "step": 1418 }, { "epoch": 2.090909090909091, "grad_norm": 0.3805762297279639, "learning_rate": 4.831521042591601e-05, "loss": 0.6126, "num_tokens": 3315239045.0, "step": 1419 }, { "epoch": 2.091254752851711, "grad_norm": 0.44891464557705857, "learning_rate": 4.82679829161346e-05, "loss": 0.5839, "num_tokens": 3317598412.0, "step": 1420 }, { "epoch": 2.091600414794331, "grad_norm": 0.3691877616880071, "learning_rate": 4.822074336348547e-05, "loss": 0.5823, "num_tokens": 3319951359.0, "step": 1421 }, { "epoch": 2.091946076736951, "grad_norm": 0.5335786191913499, "learning_rate": 4.8173491836779516e-05, "loss": 0.5784, "num_tokens": 3322246526.0, "step": 1422 }, { "epoch": 2.092291738679571, "grad_norm": 0.45654871941771114, "learning_rate": 4.8126228404845066e-05, "loss": 0.5885, "num_tokens": 3324533316.0, "step": 1423 }, { "epoch": 2.0926374006221913, "grad_norm": 0.5336701829321912, "learning_rate": 4.807895313652778e-05, "loss": 0.5992, "num_tokens": 3326841076.0, "step": 1424 }, { "epoch": 2.092983062564812, "grad_norm": 0.48665791463487357, "learning_rate": 4.803166610069057e-05, "loss": 0.5765, "num_tokens": 3329107465.0, "step": 1425 }, { "epoch": 2.093328724507432, "grad_norm": 0.5608239329390864, "learning_rate": 4.798436736621348e-05, "loss": 0.6104, "num_tokens": 3331391244.0, "step": 1426 }, { "epoch": 2.093674386450052, "grad_norm": 0.5366131652356881, "learning_rate": 4.793705700199362e-05, "loss": 0.5859, "num_tokens": 3333692925.0, "step": 1427 }, { "epoch": 2.094020048392672, "grad_norm": 0.6052726358649559, "learning_rate": 4.788973507694499e-05, "loss": 0.5674, "num_tokens": 3335996348.0, "step": 1428 }, { "epoch": 2.094365710335292, "grad_norm": 0.5249649617991476, "learning_rate": 4.784240165999847e-05, "loss": 0.5682, "num_tokens": 3338217982.0, "step": 1429 }, { "epoch": 2.0947113722779123, "grad_norm": 0.6264789215811467, "learning_rate": 4.779505682010168e-05, "loss": 0.5833, "num_tokens": 3340472885.0, "step": 1430 }, { "epoch": 2.0950570342205324, "grad_norm": 0.47708969906447374, "learning_rate": 4.774770062621886e-05, "loss": 0.6063, "num_tokens": 3342753091.0, "step": 1431 }, { "epoch": 2.0954026961631524, "grad_norm": 0.6774414755916596, "learning_rate": 4.770033314733081e-05, "loss": 0.5766, "num_tokens": 3345017940.0, "step": 1432 }, { "epoch": 2.0957483581057725, "grad_norm": 0.5183321834799592, "learning_rate": 4.765295445243472e-05, "loss": 0.6124, "num_tokens": 3347387473.0, "step": 1433 }, { "epoch": 2.0960940200483926, "grad_norm": 0.630312611680502, "learning_rate": 4.76055646105442e-05, "loss": 0.6003, "num_tokens": 3349724179.0, "step": 1434 }, { "epoch": 2.0964396819910127, "grad_norm": 0.5415849422988853, "learning_rate": 4.755816369068902e-05, "loss": 0.5965, "num_tokens": 3352053792.0, "step": 1435 }, { "epoch": 2.096785343933633, "grad_norm": 0.5421067514682311, "learning_rate": 4.751075176191513e-05, "loss": 0.5963, "num_tokens": 3354431365.0, "step": 1436 }, { "epoch": 2.097131005876253, "grad_norm": 0.5416433545303763, "learning_rate": 4.746332889328448e-05, "loss": 0.5859, "num_tokens": 3356695537.0, "step": 1437 }, { "epoch": 2.097476667818873, "grad_norm": 0.5588170582375704, "learning_rate": 4.7415895153875015e-05, "loss": 0.5918, "num_tokens": 3359086245.0, "step": 1438 }, { "epoch": 2.097822329761493, "grad_norm": 0.5201846710638441, "learning_rate": 4.736845061278044e-05, "loss": 0.6001, "num_tokens": 3361405739.0, "step": 1439 }, { "epoch": 2.098167991704113, "grad_norm": 0.5859383484532398, "learning_rate": 4.7320995339110273e-05, "loss": 0.594, "num_tokens": 3363689862.0, "step": 1440 }, { "epoch": 2.0985136536467337, "grad_norm": 0.42113565050373614, "learning_rate": 4.7273529401989585e-05, "loss": 0.5798, "num_tokens": 3366019511.0, "step": 1441 }, { "epoch": 2.098859315589354, "grad_norm": 0.6237849130858489, "learning_rate": 4.722605287055904e-05, "loss": 0.584, "num_tokens": 3368379525.0, "step": 1442 }, { "epoch": 2.099204977531974, "grad_norm": 0.4586223582503875, "learning_rate": 4.7178565813974715e-05, "loss": 0.5975, "num_tokens": 3370713258.0, "step": 1443 }, { "epoch": 2.099550639474594, "grad_norm": 0.5422602907137821, "learning_rate": 4.7131068301408e-05, "loss": 0.5712, "num_tokens": 3373040733.0, "step": 1444 }, { "epoch": 2.099896301417214, "grad_norm": 0.46879892452079386, "learning_rate": 4.708356040204556e-05, "loss": 0.5768, "num_tokens": 3375342076.0, "step": 1445 }, { "epoch": 2.100241963359834, "grad_norm": 0.48930034636831554, "learning_rate": 4.703604218508912e-05, "loss": 0.5872, "num_tokens": 3377632930.0, "step": 1446 }, { "epoch": 2.1005876253024542, "grad_norm": 0.46962094420586087, "learning_rate": 4.698851371975552e-05, "loss": 0.5981, "num_tokens": 3379981420.0, "step": 1447 }, { "epoch": 2.1009332872450743, "grad_norm": 0.4846185853901907, "learning_rate": 4.6940975075276463e-05, "loss": 0.5945, "num_tokens": 3382410671.0, "step": 1448 }, { "epoch": 2.1012789491876944, "grad_norm": 0.41576146001612885, "learning_rate": 4.689342632089851e-05, "loss": 0.5858, "num_tokens": 3384812643.0, "step": 1449 }, { "epoch": 2.1016246111303145, "grad_norm": 0.3412626515858297, "learning_rate": 4.6845867525882914e-05, "loss": 0.5738, "num_tokens": 3387285255.0, "step": 1450 }, { "epoch": 2.1019702730729346, "grad_norm": 0.46776387820593257, "learning_rate": 4.6798298759505614e-05, "loss": 0.5681, "num_tokens": 3389686868.0, "step": 1451 }, { "epoch": 2.1023159350155547, "grad_norm": 0.3848216810159128, "learning_rate": 4.6750720091057005e-05, "loss": 0.5961, "num_tokens": 3392083494.0, "step": 1452 }, { "epoch": 2.102661596958175, "grad_norm": 0.5155178693498613, "learning_rate": 4.670313158984197e-05, "loss": 0.5741, "num_tokens": 3394419830.0, "step": 1453 }, { "epoch": 2.103007258900795, "grad_norm": 0.5443959647458134, "learning_rate": 4.6655533325179666e-05, "loss": 0.5752, "num_tokens": 3396718212.0, "step": 1454 }, { "epoch": 2.103352920843415, "grad_norm": 0.5041536646693336, "learning_rate": 4.660792536640348e-05, "loss": 0.5871, "num_tokens": 3399018515.0, "step": 1455 }, { "epoch": 2.103698582786035, "grad_norm": 0.5690919491663442, "learning_rate": 4.656030778286096e-05, "loss": 0.5843, "num_tokens": 3401370094.0, "step": 1456 }, { "epoch": 2.1040442447286556, "grad_norm": 0.518871558275388, "learning_rate": 4.651268064391362e-05, "loss": 0.5759, "num_tokens": 3403662743.0, "step": 1457 }, { "epoch": 2.1043899066712757, "grad_norm": 0.6898761905106405, "learning_rate": 4.6465044018936916e-05, "loss": 0.5806, "num_tokens": 3405979837.0, "step": 1458 }, { "epoch": 2.1047355686138958, "grad_norm": 0.4738716491350161, "learning_rate": 4.641739797732012e-05, "loss": 0.596, "num_tokens": 3408291704.0, "step": 1459 }, { "epoch": 2.105081230556516, "grad_norm": 0.5746644977651222, "learning_rate": 4.6369742588466226e-05, "loss": 0.5635, "num_tokens": 3410550762.0, "step": 1460 }, { "epoch": 2.105426892499136, "grad_norm": 0.5932643602677807, "learning_rate": 4.632207792179187e-05, "loss": 0.5917, "num_tokens": 3412933686.0, "step": 1461 }, { "epoch": 2.105772554441756, "grad_norm": 0.37588434761962064, "learning_rate": 4.627440404672712e-05, "loss": 0.6056, "num_tokens": 3415267705.0, "step": 1462 }, { "epoch": 2.106118216384376, "grad_norm": 0.4363904470182498, "learning_rate": 4.622672103271553e-05, "loss": 0.5937, "num_tokens": 3417596121.0, "step": 1463 }, { "epoch": 2.106463878326996, "grad_norm": 0.3447751095469651, "learning_rate": 4.617902894921395e-05, "loss": 0.6219, "num_tokens": 3420052184.0, "step": 1464 }, { "epoch": 2.1068095402696163, "grad_norm": 0.49389399695766245, "learning_rate": 4.613132786569246e-05, "loss": 0.5934, "num_tokens": 3422400090.0, "step": 1465 }, { "epoch": 2.1071552022122364, "grad_norm": 0.49247303494521105, "learning_rate": 4.608361785163418e-05, "loss": 0.6083, "num_tokens": 3424791575.0, "step": 1466 }, { "epoch": 2.1075008641548565, "grad_norm": 0.3629743537621346, "learning_rate": 4.603589897653532e-05, "loss": 0.6106, "num_tokens": 3427277003.0, "step": 1467 }, { "epoch": 2.1078465260974766, "grad_norm": 0.6773288249904736, "learning_rate": 4.5988171309904936e-05, "loss": 0.5976, "num_tokens": 3429692183.0, "step": 1468 }, { "epoch": 2.1081921880400967, "grad_norm": 0.49861825156068956, "learning_rate": 4.594043492126494e-05, "loss": 0.5674, "num_tokens": 3432019803.0, "step": 1469 }, { "epoch": 2.1085378499827168, "grad_norm": 0.6280652330983727, "learning_rate": 4.58926898801499e-05, "loss": 0.5832, "num_tokens": 3434228306.0, "step": 1470 }, { "epoch": 2.108883511925337, "grad_norm": 0.5762792896475177, "learning_rate": 4.5844936256107036e-05, "loss": 0.6049, "num_tokens": 3436700598.0, "step": 1471 }, { "epoch": 2.109229173867957, "grad_norm": 0.39382035499885637, "learning_rate": 4.579717411869603e-05, "loss": 0.5822, "num_tokens": 3439032685.0, "step": 1472 }, { "epoch": 2.1095748358105775, "grad_norm": 0.4473620336132255, "learning_rate": 4.5749403537489e-05, "loss": 0.6021, "num_tokens": 3441374442.0, "step": 1473 }, { "epoch": 2.1099204977531976, "grad_norm": 0.4585866444065292, "learning_rate": 4.570162458207034e-05, "loss": 0.5859, "num_tokens": 3443709704.0, "step": 1474 }, { "epoch": 2.1102661596958177, "grad_norm": 0.364684118166688, "learning_rate": 4.565383732203662e-05, "loss": 0.5883, "num_tokens": 3446103480.0, "step": 1475 }, { "epoch": 2.1106118216384377, "grad_norm": 0.5153032271117641, "learning_rate": 4.560604182699656e-05, "loss": 0.5614, "num_tokens": 3448483544.0, "step": 1476 }, { "epoch": 2.110957483581058, "grad_norm": 0.41422749477160964, "learning_rate": 4.555823816657085e-05, "loss": 0.5664, "num_tokens": 3450755965.0, "step": 1477 }, { "epoch": 2.111303145523678, "grad_norm": 0.5455588438015295, "learning_rate": 4.551042641039208e-05, "loss": 0.5919, "num_tokens": 3453034301.0, "step": 1478 }, { "epoch": 2.111648807466298, "grad_norm": 0.5401825233451718, "learning_rate": 4.5462606628104594e-05, "loss": 0.6011, "num_tokens": 3455373690.0, "step": 1479 }, { "epoch": 2.111994469408918, "grad_norm": 0.5583967317519248, "learning_rate": 4.5414778889364494e-05, "loss": 0.5847, "num_tokens": 3457697808.0, "step": 1480 }, { "epoch": 2.112340131351538, "grad_norm": 0.5334758538056495, "learning_rate": 4.536694326383941e-05, "loss": 0.5998, "num_tokens": 3460091172.0, "step": 1481 }, { "epoch": 2.1126857932941583, "grad_norm": 0.46818213520330704, "learning_rate": 4.531909982120852e-05, "loss": 0.5714, "num_tokens": 3462415354.0, "step": 1482 }, { "epoch": 2.1130314552367784, "grad_norm": 0.5792435318300084, "learning_rate": 4.527124863116234e-05, "loss": 0.5971, "num_tokens": 3464709383.0, "step": 1483 }, { "epoch": 2.1133771171793985, "grad_norm": 0.4709965304517349, "learning_rate": 4.522338976340266e-05, "loss": 0.5839, "num_tokens": 3467093607.0, "step": 1484 }, { "epoch": 2.1137227791220186, "grad_norm": 0.6020637019196002, "learning_rate": 4.5175523287642513e-05, "loss": 0.5815, "num_tokens": 3469393160.0, "step": 1485 }, { "epoch": 2.1140684410646386, "grad_norm": 0.5945573427376513, "learning_rate": 4.512764927360597e-05, "loss": 0.579, "num_tokens": 3471780093.0, "step": 1486 }, { "epoch": 2.1144141030072587, "grad_norm": 0.5332782847648369, "learning_rate": 4.50797677910281e-05, "loss": 0.596, "num_tokens": 3474181577.0, "step": 1487 }, { "epoch": 2.114759764949879, "grad_norm": 0.5511499382254517, "learning_rate": 4.503187890965486e-05, "loss": 0.5822, "num_tokens": 3476605643.0, "step": 1488 }, { "epoch": 2.115105426892499, "grad_norm": 0.6185706934473375, "learning_rate": 4.498398269924291e-05, "loss": 0.5918, "num_tokens": 3478966828.0, "step": 1489 }, { "epoch": 2.1154510888351195, "grad_norm": 0.44621909558067707, "learning_rate": 4.493607922955971e-05, "loss": 0.6, "num_tokens": 3481151839.0, "step": 1490 }, { "epoch": 2.1157967507777395, "grad_norm": 0.5135248871613949, "learning_rate": 4.4888168570383226e-05, "loss": 0.5915, "num_tokens": 3483550123.0, "step": 1491 }, { "epoch": 2.1161424127203596, "grad_norm": 0.43408892246476505, "learning_rate": 4.484025079150186e-05, "loss": 0.581, "num_tokens": 3485866600.0, "step": 1492 }, { "epoch": 2.1164880746629797, "grad_norm": 0.3633011465325275, "learning_rate": 4.4792325962714436e-05, "loss": 0.5935, "num_tokens": 3488198570.0, "step": 1493 }, { "epoch": 2.1168337366056, "grad_norm": 0.35940000396493293, "learning_rate": 4.474439415383006e-05, "loss": 0.5886, "num_tokens": 3490537109.0, "step": 1494 }, { "epoch": 2.11717939854822, "grad_norm": 0.3878789934424021, "learning_rate": 4.469645543466797e-05, "loss": 0.5964, "num_tokens": 3492755477.0, "step": 1495 }, { "epoch": 2.11752506049084, "grad_norm": 0.33345901092344943, "learning_rate": 4.464850987505747e-05, "loss": 0.5735, "num_tokens": 3495098110.0, "step": 1496 }, { "epoch": 2.11787072243346, "grad_norm": 0.34213964080475556, "learning_rate": 4.4600557544837847e-05, "loss": 0.5881, "num_tokens": 3497386067.0, "step": 1497 }, { "epoch": 2.11821638437608, "grad_norm": 0.357226310955312, "learning_rate": 4.4552598513858235e-05, "loss": 0.6108, "num_tokens": 3499756171.0, "step": 1498 }, { "epoch": 2.1185620463187003, "grad_norm": 0.27708489722957935, "learning_rate": 4.450463285197755e-05, "loss": 0.5772, "num_tokens": 3502091701.0, "step": 1499 }, { "epoch": 2.1189077082613204, "grad_norm": 0.41232959144279124, "learning_rate": 4.4456660629064354e-05, "loss": 0.5709, "num_tokens": 3504410297.0, "step": 1500 }, { "epoch": 2.1192533702039404, "grad_norm": 0.35667810338884975, "learning_rate": 4.440868191499675e-05, "loss": 0.5719, "num_tokens": 3506620011.0, "step": 1501 }, { "epoch": 2.1195990321465605, "grad_norm": 0.40654290165366097, "learning_rate": 4.43606967796623e-05, "loss": 0.5871, "num_tokens": 3508884566.0, "step": 1502 }, { "epoch": 2.1199446940891806, "grad_norm": 0.44387074653271946, "learning_rate": 4.431270529295797e-05, "loss": 0.5713, "num_tokens": 3511263472.0, "step": 1503 }, { "epoch": 2.1202903560318007, "grad_norm": 0.39723237218678403, "learning_rate": 4.4264707524789924e-05, "loss": 0.5752, "num_tokens": 3513597448.0, "step": 1504 }, { "epoch": 2.120636017974421, "grad_norm": 0.4459400326476763, "learning_rate": 4.421670354507347e-05, "loss": 0.6013, "num_tokens": 3515905910.0, "step": 1505 }, { "epoch": 2.1209816799170413, "grad_norm": 0.43501625717579756, "learning_rate": 4.4168693423733e-05, "loss": 0.5937, "num_tokens": 3518346169.0, "step": 1506 }, { "epoch": 2.1213273418596614, "grad_norm": 0.38917831788586815, "learning_rate": 4.412067723070184e-05, "loss": 0.5605, "num_tokens": 3520633844.0, "step": 1507 }, { "epoch": 2.1216730038022815, "grad_norm": 0.40692183654120423, "learning_rate": 4.4072655035922145e-05, "loss": 0.5819, "num_tokens": 3522976219.0, "step": 1508 }, { "epoch": 2.1220186657449016, "grad_norm": 0.376500991496363, "learning_rate": 4.4024626909344834e-05, "loss": 0.571, "num_tokens": 3525411195.0, "step": 1509 }, { "epoch": 2.1223643276875217, "grad_norm": 0.48864380071217056, "learning_rate": 4.3976592920929436e-05, "loss": 0.568, "num_tokens": 3527751502.0, "step": 1510 }, { "epoch": 2.122709989630142, "grad_norm": 0.3452412067287764, "learning_rate": 4.392855314064408e-05, "loss": 0.5531, "num_tokens": 3530065862.0, "step": 1511 }, { "epoch": 2.123055651572762, "grad_norm": 0.5304391072595286, "learning_rate": 4.388050763846524e-05, "loss": 0.5554, "num_tokens": 3532296308.0, "step": 1512 }, { "epoch": 2.123401313515382, "grad_norm": 0.5218528518368897, "learning_rate": 4.3832456484377814e-05, "loss": 0.6138, "num_tokens": 3534765692.0, "step": 1513 }, { "epoch": 2.123746975458002, "grad_norm": 0.444695742396165, "learning_rate": 4.378439974837488e-05, "loss": 0.5766, "num_tokens": 3536962895.0, "step": 1514 }, { "epoch": 2.124092637400622, "grad_norm": 0.5016819751222594, "learning_rate": 4.373633750045765e-05, "loss": 0.5882, "num_tokens": 3539318128.0, "step": 1515 }, { "epoch": 2.1244382993432422, "grad_norm": 0.4350757709078293, "learning_rate": 4.36882698106354e-05, "loss": 0.6014, "num_tokens": 3541585063.0, "step": 1516 }, { "epoch": 2.1247839612858623, "grad_norm": 0.3998049436175027, "learning_rate": 4.3640196748925294e-05, "loss": 0.5815, "num_tokens": 3543977438.0, "step": 1517 }, { "epoch": 2.1251296232284824, "grad_norm": 0.4314261366017844, "learning_rate": 4.359211838535232e-05, "loss": 0.5983, "num_tokens": 3546286233.0, "step": 1518 }, { "epoch": 2.1254752851711025, "grad_norm": 0.4476508542513684, "learning_rate": 4.354403478994924e-05, "loss": 0.5935, "num_tokens": 3548541267.0, "step": 1519 }, { "epoch": 2.1258209471137226, "grad_norm": 0.37433606778038175, "learning_rate": 4.3495946032756374e-05, "loss": 0.5864, "num_tokens": 3550874391.0, "step": 1520 }, { "epoch": 2.1261666090563427, "grad_norm": 0.46467280179911385, "learning_rate": 4.34478521838216e-05, "loss": 0.5745, "num_tokens": 3553174508.0, "step": 1521 }, { "epoch": 2.126512270998963, "grad_norm": 0.45372222407536217, "learning_rate": 4.33997533132002e-05, "loss": 0.5888, "num_tokens": 3555537942.0, "step": 1522 }, { "epoch": 2.1268579329415833, "grad_norm": 0.49096504342195496, "learning_rate": 4.3351649490954764e-05, "loss": 0.5842, "num_tokens": 3557916966.0, "step": 1523 }, { "epoch": 2.1272035948842034, "grad_norm": 0.39642832214995855, "learning_rate": 4.330354078715512e-05, "loss": 0.5732, "num_tokens": 3560249867.0, "step": 1524 }, { "epoch": 2.1275492568268235, "grad_norm": 0.44299922760462324, "learning_rate": 4.3255427271878155e-05, "loss": 0.5773, "num_tokens": 3562557716.0, "step": 1525 }, { "epoch": 2.1278949187694436, "grad_norm": 0.4306769491884419, "learning_rate": 4.320730901520783e-05, "loss": 0.5865, "num_tokens": 3565015887.0, "step": 1526 }, { "epoch": 2.1282405807120637, "grad_norm": 0.4663182269941624, "learning_rate": 4.315918608723497e-05, "loss": 0.5959, "num_tokens": 3567296964.0, "step": 1527 }, { "epoch": 2.1285862426546838, "grad_norm": 0.44856056495356916, "learning_rate": 4.311105855805722e-05, "loss": 0.5827, "num_tokens": 3569632351.0, "step": 1528 }, { "epoch": 2.128931904597304, "grad_norm": 0.4020148740100099, "learning_rate": 4.3062926497778924e-05, "loss": 0.5815, "num_tokens": 3571894689.0, "step": 1529 }, { "epoch": 2.129277566539924, "grad_norm": 0.3967396751993151, "learning_rate": 4.301478997651101e-05, "loss": 0.6162, "num_tokens": 3574334376.0, "step": 1530 }, { "epoch": 2.129623228482544, "grad_norm": 0.33280922514754935, "learning_rate": 4.2966649064370937e-05, "loss": 0.6012, "num_tokens": 3576721263.0, "step": 1531 }, { "epoch": 2.129968890425164, "grad_norm": 0.49112133215390136, "learning_rate": 4.2918503831482534e-05, "loss": 0.5908, "num_tokens": 3579152413.0, "step": 1532 }, { "epoch": 2.130314552367784, "grad_norm": 0.4029924244476603, "learning_rate": 4.2870354347975923e-05, "loss": 0.5715, "num_tokens": 3581580238.0, "step": 1533 }, { "epoch": 2.1306602143104043, "grad_norm": 0.5797849101549221, "learning_rate": 4.2822200683987445e-05, "loss": 0.6057, "num_tokens": 3583885606.0, "step": 1534 }, { "epoch": 2.1310058762530244, "grad_norm": 0.5545419293403632, "learning_rate": 4.27740429096595e-05, "loss": 0.5836, "num_tokens": 3586094085.0, "step": 1535 }, { "epoch": 2.1313515381956445, "grad_norm": 0.39544120446802755, "learning_rate": 4.2725881095140494e-05, "loss": 0.5931, "num_tokens": 3588394128.0, "step": 1536 }, { "epoch": 2.1316972001382646, "grad_norm": 0.5414136784004402, "learning_rate": 4.267771531058471e-05, "loss": 0.5696, "num_tokens": 3590731600.0, "step": 1537 }, { "epoch": 2.132042862080885, "grad_norm": 0.38650157575223854, "learning_rate": 4.2629545626152205e-05, "loss": 0.6047, "num_tokens": 3593037684.0, "step": 1538 }, { "epoch": 2.132388524023505, "grad_norm": 0.6449431876129881, "learning_rate": 4.2581372112008725e-05, "loss": 0.5775, "num_tokens": 3595425955.0, "step": 1539 }, { "epoch": 2.1327341859661253, "grad_norm": 0.6446301043232713, "learning_rate": 4.2533194838325616e-05, "loss": 0.5921, "num_tokens": 3597824590.0, "step": 1540 }, { "epoch": 2.1330798479087454, "grad_norm": 0.47956740063433745, "learning_rate": 4.248501387527966e-05, "loss": 0.613, "num_tokens": 3600213089.0, "step": 1541 }, { "epoch": 2.1334255098513655, "grad_norm": 0.5929778445476284, "learning_rate": 4.243682929305306e-05, "loss": 0.6075, "num_tokens": 3602513874.0, "step": 1542 }, { "epoch": 2.1337711717939856, "grad_norm": 0.4233463449994534, "learning_rate": 4.238864116183322e-05, "loss": 0.5964, "num_tokens": 3604801371.0, "step": 1543 }, { "epoch": 2.1341168337366057, "grad_norm": 0.4319308428438848, "learning_rate": 4.234044955181281e-05, "loss": 0.6005, "num_tokens": 3607264893.0, "step": 1544 }, { "epoch": 2.1344624956792257, "grad_norm": 0.4721185051044637, "learning_rate": 4.2292254533189475e-05, "loss": 0.586, "num_tokens": 3609595515.0, "step": 1545 }, { "epoch": 2.134808157621846, "grad_norm": 0.3626804909078634, "learning_rate": 4.2244056176165896e-05, "loss": 0.5545, "num_tokens": 3611744975.0, "step": 1546 }, { "epoch": 2.135153819564466, "grad_norm": 0.4367178541922521, "learning_rate": 4.219585455094955e-05, "loss": 0.5758, "num_tokens": 3614154877.0, "step": 1547 }, { "epoch": 2.135499481507086, "grad_norm": 0.4597158653426491, "learning_rate": 4.2147649727752755e-05, "loss": 0.5696, "num_tokens": 3616446292.0, "step": 1548 }, { "epoch": 2.135845143449706, "grad_norm": 0.48742549902524335, "learning_rate": 4.20994417767924e-05, "loss": 0.5887, "num_tokens": 3618771550.0, "step": 1549 }, { "epoch": 2.136190805392326, "grad_norm": 0.421195380403554, "learning_rate": 4.205123076829001e-05, "loss": 0.5643, "num_tokens": 3621106294.0, "step": 1550 }, { "epoch": 2.1365364673349463, "grad_norm": 0.4962367428200045, "learning_rate": 4.2003016772471505e-05, "loss": 0.5726, "num_tokens": 3623422035.0, "step": 1551 }, { "epoch": 2.1368821292775664, "grad_norm": 0.43707162666899874, "learning_rate": 4.1954799859567176e-05, "loss": 0.5717, "num_tokens": 3625688520.0, "step": 1552 }, { "epoch": 2.1372277912201865, "grad_norm": 0.42731496762786897, "learning_rate": 4.190658009981158e-05, "loss": 0.5736, "num_tokens": 3628089484.0, "step": 1553 }, { "epoch": 2.1375734531628066, "grad_norm": 0.48091479194805914, "learning_rate": 4.1858357563443384e-05, "loss": 0.585, "num_tokens": 3630383875.0, "step": 1554 }, { "epoch": 2.137919115105427, "grad_norm": 0.4403560738343109, "learning_rate": 4.1810132320705365e-05, "loss": 0.5719, "num_tokens": 3632784416.0, "step": 1555 }, { "epoch": 2.138264777048047, "grad_norm": 0.4760848017039719, "learning_rate": 4.176190444184416e-05, "loss": 0.5749, "num_tokens": 3635039143.0, "step": 1556 }, { "epoch": 2.1386104389906673, "grad_norm": 0.426180017500231, "learning_rate": 4.1713673997110285e-05, "loss": 0.5781, "num_tokens": 3637356892.0, "step": 1557 }, { "epoch": 2.1389561009332874, "grad_norm": 0.45476029314308675, "learning_rate": 4.166544105675801e-05, "loss": 0.5996, "num_tokens": 3639682333.0, "step": 1558 }, { "epoch": 2.1393017628759075, "grad_norm": 0.3592081374070471, "learning_rate": 4.161720569104523e-05, "loss": 0.5516, "num_tokens": 3641984410.0, "step": 1559 }, { "epoch": 2.1396474248185275, "grad_norm": 0.4009572816418318, "learning_rate": 4.1568967970233325e-05, "loss": 0.5788, "num_tokens": 3644313717.0, "step": 1560 }, { "epoch": 2.1399930867611476, "grad_norm": 0.35149549551799997, "learning_rate": 4.15207279645872e-05, "loss": 0.5586, "num_tokens": 3646648804.0, "step": 1561 }, { "epoch": 2.1403387487037677, "grad_norm": 0.5059356602348714, "learning_rate": 4.1472485744375006e-05, "loss": 0.5752, "num_tokens": 3648990511.0, "step": 1562 }, { "epoch": 2.140684410646388, "grad_norm": 0.41615788756120015, "learning_rate": 4.142424137986816e-05, "loss": 0.5638, "num_tokens": 3651351991.0, "step": 1563 }, { "epoch": 2.141030072589008, "grad_norm": 0.44677868070682053, "learning_rate": 4.137599494134118e-05, "loss": 0.5728, "num_tokens": 3653679901.0, "step": 1564 }, { "epoch": 2.141375734531628, "grad_norm": 0.5081129234433894, "learning_rate": 4.132774649907162e-05, "loss": 0.5838, "num_tokens": 3656020743.0, "step": 1565 }, { "epoch": 2.141721396474248, "grad_norm": 0.40991654980282255, "learning_rate": 4.127949612333996e-05, "loss": 0.5782, "num_tokens": 3658276861.0, "step": 1566 }, { "epoch": 2.142067058416868, "grad_norm": 0.4045643934993304, "learning_rate": 4.1231243884429476e-05, "loss": 0.5561, "num_tokens": 3660554400.0, "step": 1567 }, { "epoch": 2.1424127203594883, "grad_norm": 0.3315821456992701, "learning_rate": 4.1182989852626175e-05, "loss": 0.5721, "num_tokens": 3662928870.0, "step": 1568 }, { "epoch": 2.1427583823021084, "grad_norm": 0.349729997771642, "learning_rate": 4.113473409821866e-05, "loss": 0.5791, "num_tokens": 3665234551.0, "step": 1569 }, { "epoch": 2.143104044244729, "grad_norm": 0.3229124633389833, "learning_rate": 4.108647669149804e-05, "loss": 0.5539, "num_tokens": 3667471908.0, "step": 1570 }, { "epoch": 2.143449706187349, "grad_norm": 0.40862242253722186, "learning_rate": 4.1038217702757876e-05, "loss": 0.5742, "num_tokens": 3669930740.0, "step": 1571 }, { "epoch": 2.143795368129969, "grad_norm": 0.35957973163615636, "learning_rate": 4.0989957202293966e-05, "loss": 0.5708, "num_tokens": 3672314150.0, "step": 1572 }, { "epoch": 2.144141030072589, "grad_norm": 0.35633507648705237, "learning_rate": 4.094169526040436e-05, "loss": 0.5469, "num_tokens": 3674632925.0, "step": 1573 }, { "epoch": 2.1444866920152093, "grad_norm": 0.3303090242476595, "learning_rate": 4.089343194738919e-05, "loss": 0.5803, "num_tokens": 3676986083.0, "step": 1574 }, { "epoch": 2.1448323539578293, "grad_norm": 0.35448412932832873, "learning_rate": 4.084516733355059e-05, "loss": 0.5355, "num_tokens": 3679279499.0, "step": 1575 }, { "epoch": 2.1451780159004494, "grad_norm": 0.3909475116919189, "learning_rate": 4.079690148919259e-05, "loss": 0.5935, "num_tokens": 3681707807.0, "step": 1576 }, { "epoch": 2.1455236778430695, "grad_norm": 0.35393198281782284, "learning_rate": 4.0748634484620995e-05, "loss": 0.5517, "num_tokens": 3684079934.0, "step": 1577 }, { "epoch": 2.1458693397856896, "grad_norm": 0.42075058534895854, "learning_rate": 4.0700366390143295e-05, "loss": 0.5792, "num_tokens": 3686404471.0, "step": 1578 }, { "epoch": 2.1462150017283097, "grad_norm": 0.2995313447082663, "learning_rate": 4.065209727606863e-05, "loss": 0.5747, "num_tokens": 3688806237.0, "step": 1579 }, { "epoch": 2.14656066367093, "grad_norm": 0.5343983022659001, "learning_rate": 4.060382721270755e-05, "loss": 0.599, "num_tokens": 3691135757.0, "step": 1580 }, { "epoch": 2.14690632561355, "grad_norm": 0.4438786339745476, "learning_rate": 4.055555627037203e-05, "loss": 0.6006, "num_tokens": 3693355136.0, "step": 1581 }, { "epoch": 2.14725198755617, "grad_norm": 0.4941069824134752, "learning_rate": 4.050728451937531e-05, "loss": 0.5821, "num_tokens": 3695714669.0, "step": 1582 }, { "epoch": 2.14759764949879, "grad_norm": 0.4031208896621595, "learning_rate": 4.04590120300318e-05, "loss": 0.5731, "num_tokens": 3697986604.0, "step": 1583 }, { "epoch": 2.14794331144141, "grad_norm": 0.44305852363219267, "learning_rate": 4.041073887265702e-05, "loss": 0.5596, "num_tokens": 3700330742.0, "step": 1584 }, { "epoch": 2.1482889733840302, "grad_norm": 0.39280717857519387, "learning_rate": 4.036246511756743e-05, "loss": 0.581, "num_tokens": 3702747047.0, "step": 1585 }, { "epoch": 2.1486346353266503, "grad_norm": 0.36926333880870965, "learning_rate": 4.031419083508037e-05, "loss": 0.5768, "num_tokens": 3705117151.0, "step": 1586 }, { "epoch": 2.1489802972692704, "grad_norm": 0.3246583198619419, "learning_rate": 4.0265916095513936e-05, "loss": 0.5779, "num_tokens": 3707455326.0, "step": 1587 }, { "epoch": 2.149325959211891, "grad_norm": 0.368634262075389, "learning_rate": 4.021764096918693e-05, "loss": 0.5827, "num_tokens": 3709742556.0, "step": 1588 }, { "epoch": 2.149671621154511, "grad_norm": 0.30572720892886823, "learning_rate": 4.016936552641868e-05, "loss": 0.555, "num_tokens": 3712072511.0, "step": 1589 }, { "epoch": 2.150017283097131, "grad_norm": 0.49248626919039834, "learning_rate": 4.012108983752897e-05, "loss": 0.5662, "num_tokens": 3714334512.0, "step": 1590 }, { "epoch": 2.1503629450397512, "grad_norm": 0.4360364207942161, "learning_rate": 4.007281397283796e-05, "loss": 0.5945, "num_tokens": 3716685869.0, "step": 1591 }, { "epoch": 2.1507086069823713, "grad_norm": 0.522127775857114, "learning_rate": 4.00245380026661e-05, "loss": 0.5856, "num_tokens": 3719018791.0, "step": 1592 }, { "epoch": 2.1510542689249914, "grad_norm": 0.5594619646273812, "learning_rate": 3.9976261997333914e-05, "loss": 0.6005, "num_tokens": 3721391823.0, "step": 1593 }, { "epoch": 2.1513999308676115, "grad_norm": 0.38787599697682235, "learning_rate": 3.992798602716204e-05, "loss": 0.5752, "num_tokens": 3723691361.0, "step": 1594 }, { "epoch": 2.1517455928102316, "grad_norm": 0.4459806993929073, "learning_rate": 3.9879710162471054e-05, "loss": 0.5693, "num_tokens": 3726088092.0, "step": 1595 }, { "epoch": 2.1520912547528517, "grad_norm": 0.34125732773785955, "learning_rate": 3.983143447358134e-05, "loss": 0.5805, "num_tokens": 3728511986.0, "step": 1596 }, { "epoch": 2.1524369166954718, "grad_norm": 0.4854586263264401, "learning_rate": 3.978315903081308e-05, "loss": 0.5759, "num_tokens": 3730839307.0, "step": 1597 }, { "epoch": 2.152782578638092, "grad_norm": 0.386244758997106, "learning_rate": 3.9734883904486065e-05, "loss": 0.5767, "num_tokens": 3733220980.0, "step": 1598 }, { "epoch": 2.153128240580712, "grad_norm": 0.5040568212989219, "learning_rate": 3.968660916491966e-05, "loss": 0.546, "num_tokens": 3735510614.0, "step": 1599 }, { "epoch": 2.153473902523332, "grad_norm": 0.4522781544588036, "learning_rate": 3.9638334882432585e-05, "loss": 0.5969, "num_tokens": 3737849023.0, "step": 1600 }, { "epoch": 2.153819564465952, "grad_norm": 0.5147815585535901, "learning_rate": 3.959006112734299e-05, "loss": 0.574, "num_tokens": 3740134532.0, "step": 1601 }, { "epoch": 2.154165226408572, "grad_norm": 0.4672815081830139, "learning_rate": 3.9541787969968205e-05, "loss": 0.5934, "num_tokens": 3742446477.0, "step": 1602 }, { "epoch": 2.1545108883511928, "grad_norm": 0.42399242565611145, "learning_rate": 3.9493515480624714e-05, "loss": 0.5505, "num_tokens": 3744841618.0, "step": 1603 }, { "epoch": 2.154856550293813, "grad_norm": 0.40969670996276963, "learning_rate": 3.9445243729627985e-05, "loss": 0.5677, "num_tokens": 3747208815.0, "step": 1604 }, { "epoch": 2.155202212236433, "grad_norm": 0.4552923328944928, "learning_rate": 3.939697278729246e-05, "loss": 0.5727, "num_tokens": 3749583666.0, "step": 1605 }, { "epoch": 2.155547874179053, "grad_norm": 0.35678686798820974, "learning_rate": 3.9348702723931376e-05, "loss": 0.5982, "num_tokens": 3751880865.0, "step": 1606 }, { "epoch": 2.155893536121673, "grad_norm": 0.47752650155402254, "learning_rate": 3.930043360985671e-05, "loss": 0.5815, "num_tokens": 3754180557.0, "step": 1607 }, { "epoch": 2.156239198064293, "grad_norm": 0.3887701798950676, "learning_rate": 3.925216551537903e-05, "loss": 0.5726, "num_tokens": 3756398897.0, "step": 1608 }, { "epoch": 2.1565848600069133, "grad_norm": 0.39243930632609825, "learning_rate": 3.920389851080742e-05, "loss": 0.5639, "num_tokens": 3758706586.0, "step": 1609 }, { "epoch": 2.1569305219495334, "grad_norm": 0.42972319302645756, "learning_rate": 3.9155632666449415e-05, "loss": 0.5796, "num_tokens": 3761140111.0, "step": 1610 }, { "epoch": 2.1572761838921535, "grad_norm": 0.3470801351212925, "learning_rate": 3.910736805261082e-05, "loss": 0.5635, "num_tokens": 3763460414.0, "step": 1611 }, { "epoch": 2.1576218458347736, "grad_norm": 0.5074819719261473, "learning_rate": 3.905910473959564e-05, "loss": 0.5658, "num_tokens": 3765800756.0, "step": 1612 }, { "epoch": 2.1579675077773937, "grad_norm": 0.40571157567100113, "learning_rate": 3.9010842797706035e-05, "loss": 0.596, "num_tokens": 3768207619.0, "step": 1613 }, { "epoch": 2.1583131697200137, "grad_norm": 0.5015041501523927, "learning_rate": 3.896258229724214e-05, "loss": 0.5691, "num_tokens": 3770527547.0, "step": 1614 }, { "epoch": 2.158658831662634, "grad_norm": 0.5003281357026551, "learning_rate": 3.891432330850197e-05, "loss": 0.5558, "num_tokens": 3772852814.0, "step": 1615 }, { "epoch": 2.159004493605254, "grad_norm": 0.39819019361785885, "learning_rate": 3.886606590178135e-05, "loss": 0.5498, "num_tokens": 3775145704.0, "step": 1616 }, { "epoch": 2.159350155547874, "grad_norm": 0.4382841730052522, "learning_rate": 3.8817810147373826e-05, "loss": 0.5813, "num_tokens": 3777547542.0, "step": 1617 }, { "epoch": 2.159695817490494, "grad_norm": 0.35900507831841894, "learning_rate": 3.876955611557053e-05, "loss": 0.5639, "num_tokens": 3779747682.0, "step": 1618 }, { "epoch": 2.160041479433114, "grad_norm": 0.4589987629771109, "learning_rate": 3.8721303876660045e-05, "loss": 0.5932, "num_tokens": 3782057777.0, "step": 1619 }, { "epoch": 2.1603871413757347, "grad_norm": 0.3620142433657542, "learning_rate": 3.867305350092838e-05, "loss": 0.6081, "num_tokens": 3784387192.0, "step": 1620 }, { "epoch": 2.160732803318355, "grad_norm": 0.4420137108586777, "learning_rate": 3.8624805058658824e-05, "loss": 0.603, "num_tokens": 3786728617.0, "step": 1621 }, { "epoch": 2.161078465260975, "grad_norm": 0.36168094848982324, "learning_rate": 3.857655862013185e-05, "loss": 0.5828, "num_tokens": 3789097835.0, "step": 1622 }, { "epoch": 2.161424127203595, "grad_norm": 0.5168728609871764, "learning_rate": 3.852831425562501e-05, "loss": 0.5956, "num_tokens": 3791356470.0, "step": 1623 }, { "epoch": 2.161769789146215, "grad_norm": 0.37723357438553606, "learning_rate": 3.84800720354128e-05, "loss": 0.5684, "num_tokens": 3793747531.0, "step": 1624 }, { "epoch": 2.162115451088835, "grad_norm": 0.5464799620004885, "learning_rate": 3.843183202976667e-05, "loss": 0.615, "num_tokens": 3796165964.0, "step": 1625 }, { "epoch": 2.1624611130314553, "grad_norm": 0.4108637334345988, "learning_rate": 3.838359430895479e-05, "loss": 0.5737, "num_tokens": 3798367885.0, "step": 1626 }, { "epoch": 2.1628067749740754, "grad_norm": 0.46423095918540663, "learning_rate": 3.833535894324201e-05, "loss": 0.5843, "num_tokens": 3800727725.0, "step": 1627 }, { "epoch": 2.1631524369166955, "grad_norm": 0.4715351834469286, "learning_rate": 3.828712600288972e-05, "loss": 0.5855, "num_tokens": 3803174059.0, "step": 1628 }, { "epoch": 2.1634980988593155, "grad_norm": 0.4608463076050439, "learning_rate": 3.8238895558155854e-05, "loss": 0.598, "num_tokens": 3805513338.0, "step": 1629 }, { "epoch": 2.1638437608019356, "grad_norm": 0.5289605500181319, "learning_rate": 3.819066767929466e-05, "loss": 0.5896, "num_tokens": 3807807850.0, "step": 1630 }, { "epoch": 2.1641894227445557, "grad_norm": 0.6843097301058589, "learning_rate": 3.8142442436556625e-05, "loss": 0.5935, "num_tokens": 3810121801.0, "step": 1631 }, { "epoch": 2.164535084687176, "grad_norm": 0.6028055748980918, "learning_rate": 3.809421990018843e-05, "loss": 0.5851, "num_tokens": 3812446360.0, "step": 1632 }, { "epoch": 2.164880746629796, "grad_norm": 0.4922345520406235, "learning_rate": 3.8046000140432826e-05, "loss": 0.5867, "num_tokens": 3814740571.0, "step": 1633 }, { "epoch": 2.165226408572416, "grad_norm": 0.6687640322126353, "learning_rate": 3.799778322752851e-05, "loss": 0.5949, "num_tokens": 3817120162.0, "step": 1634 }, { "epoch": 2.1655720705150365, "grad_norm": 0.5579615685628413, "learning_rate": 3.794956923171e-05, "loss": 0.5881, "num_tokens": 3819363439.0, "step": 1635 }, { "epoch": 2.1659177324576566, "grad_norm": 0.5151441528085586, "learning_rate": 3.790135822320761e-05, "loss": 0.5668, "num_tokens": 3821594912.0, "step": 1636 }, { "epoch": 2.1662633944002767, "grad_norm": 0.6955999134965759, "learning_rate": 3.785315027224727e-05, "loss": 0.5885, "num_tokens": 3823831911.0, "step": 1637 }, { "epoch": 2.166609056342897, "grad_norm": 0.5911351599728437, "learning_rate": 3.7804945449050456e-05, "loss": 0.5931, "num_tokens": 3826240000.0, "step": 1638 }, { "epoch": 2.166954718285517, "grad_norm": 0.47681908925145744, "learning_rate": 3.775674382383412e-05, "loss": 0.5926, "num_tokens": 3828554280.0, "step": 1639 }, { "epoch": 2.167300380228137, "grad_norm": 0.5637941079453287, "learning_rate": 3.770854546681053e-05, "loss": 0.5633, "num_tokens": 3830736107.0, "step": 1640 }, { "epoch": 2.167646042170757, "grad_norm": 0.6560763691253044, "learning_rate": 3.7660350448187216e-05, "loss": 0.5758, "num_tokens": 3833114775.0, "step": 1641 }, { "epoch": 2.167991704113377, "grad_norm": 0.37057116614678604, "learning_rate": 3.7612158838166786e-05, "loss": 0.5662, "num_tokens": 3835453534.0, "step": 1642 }, { "epoch": 2.1683373660559973, "grad_norm": 0.5788086706023449, "learning_rate": 3.756397070694696e-05, "loss": 0.5542, "num_tokens": 3837662669.0, "step": 1643 }, { "epoch": 2.1686830279986173, "grad_norm": 0.6129402349533551, "learning_rate": 3.751578612472035e-05, "loss": 0.5677, "num_tokens": 3840013052.0, "step": 1644 }, { "epoch": 2.1690286899412374, "grad_norm": 0.371156729607469, "learning_rate": 3.7467605161674406e-05, "loss": 0.5875, "num_tokens": 3842461152.0, "step": 1645 }, { "epoch": 2.1693743518838575, "grad_norm": 0.5392213992727422, "learning_rate": 3.741942788799129e-05, "loss": 0.5697, "num_tokens": 3844716205.0, "step": 1646 }, { "epoch": 2.1697200138264776, "grad_norm": 0.48273504754983154, "learning_rate": 3.737125437384781e-05, "loss": 0.5981, "num_tokens": 3847095801.0, "step": 1647 }, { "epoch": 2.1700656757690977, "grad_norm": 0.40866544936507615, "learning_rate": 3.7323084689415306e-05, "loss": 0.6022, "num_tokens": 3849384483.0, "step": 1648 }, { "epoch": 2.170411337711718, "grad_norm": 0.47430210882285406, "learning_rate": 3.727491890485953e-05, "loss": 0.588, "num_tokens": 3851658649.0, "step": 1649 }, { "epoch": 2.170756999654338, "grad_norm": 0.5029182108552628, "learning_rate": 3.722675709034051e-05, "loss": 0.551, "num_tokens": 3853946535.0, "step": 1650 }, { "epoch": 2.171102661596958, "grad_norm": 0.47222407166530844, "learning_rate": 3.717859931601256e-05, "loss": 0.5765, "num_tokens": 3856287417.0, "step": 1651 }, { "epoch": 2.171448323539578, "grad_norm": 0.4248156581917221, "learning_rate": 3.7130445652024085e-05, "loss": 0.5584, "num_tokens": 3858679688.0, "step": 1652 }, { "epoch": 2.1717939854821986, "grad_norm": 0.4535261692200449, "learning_rate": 3.7082296168517494e-05, "loss": 0.5773, "num_tokens": 3861121000.0, "step": 1653 }, { "epoch": 2.1721396474248187, "grad_norm": 0.312373249306112, "learning_rate": 3.703415093562908e-05, "loss": 0.5669, "num_tokens": 3863403886.0, "step": 1654 }, { "epoch": 2.1724853093674388, "grad_norm": 0.4621334089991505, "learning_rate": 3.6986010023489e-05, "loss": 0.5301, "num_tokens": 3865679721.0, "step": 1655 }, { "epoch": 2.172830971310059, "grad_norm": 0.38147592404275305, "learning_rate": 3.693787350222109e-05, "loss": 0.5527, "num_tokens": 3867933649.0, "step": 1656 }, { "epoch": 2.173176633252679, "grad_norm": 0.5125259932427738, "learning_rate": 3.68897414419428e-05, "loss": 0.5699, "num_tokens": 3870159012.0, "step": 1657 }, { "epoch": 2.173522295195299, "grad_norm": 0.4348251802703402, "learning_rate": 3.684161391276505e-05, "loss": 0.555, "num_tokens": 3872523399.0, "step": 1658 }, { "epoch": 2.173867957137919, "grad_norm": 0.41104162167740904, "learning_rate": 3.6793490984792175e-05, "loss": 0.5866, "num_tokens": 3874887786.0, "step": 1659 }, { "epoch": 2.1742136190805392, "grad_norm": 0.4844079362517981, "learning_rate": 3.674537272812185e-05, "loss": 0.5539, "num_tokens": 3877132256.0, "step": 1660 }, { "epoch": 2.1745592810231593, "grad_norm": 0.4312784903033744, "learning_rate": 3.6697259212844905e-05, "loss": 0.5668, "num_tokens": 3879437798.0, "step": 1661 }, { "epoch": 2.1749049429657794, "grad_norm": 0.4481543658200724, "learning_rate": 3.6649150509045244e-05, "loss": 0.5516, "num_tokens": 3881767743.0, "step": 1662 }, { "epoch": 2.1752506049083995, "grad_norm": 0.4448007881232388, "learning_rate": 3.660104668679981e-05, "loss": 0.5893, "num_tokens": 3884097463.0, "step": 1663 }, { "epoch": 2.1755962668510196, "grad_norm": 0.29651920272278304, "learning_rate": 3.655294781617841e-05, "loss": 0.5417, "num_tokens": 3886439225.0, "step": 1664 }, { "epoch": 2.1759419287936397, "grad_norm": 0.3052619543124406, "learning_rate": 3.6504853967243634e-05, "loss": 0.5542, "num_tokens": 3888704797.0, "step": 1665 }, { "epoch": 2.1762875907362598, "grad_norm": 0.3547086546099954, "learning_rate": 3.645676521005077e-05, "loss": 0.5778, "num_tokens": 3890992988.0, "step": 1666 }, { "epoch": 2.17663325267888, "grad_norm": 0.2762549379423686, "learning_rate": 3.640868161464768e-05, "loss": 0.5723, "num_tokens": 3893294065.0, "step": 1667 }, { "epoch": 2.1769789146215004, "grad_norm": 0.4059613017175796, "learning_rate": 3.636060325107473e-05, "loss": 0.5764, "num_tokens": 3895769991.0, "step": 1668 }, { "epoch": 2.1773245765641205, "grad_norm": 0.3474366918655778, "learning_rate": 3.6312530189364614e-05, "loss": 0.5679, "num_tokens": 3898051577.0, "step": 1669 }, { "epoch": 2.1776702385067406, "grad_norm": 0.39132664501943865, "learning_rate": 3.626446249954236e-05, "loss": 0.5719, "num_tokens": 3900484293.0, "step": 1670 }, { "epoch": 2.1780159004493607, "grad_norm": 0.3113923774174792, "learning_rate": 3.6216400251625124e-05, "loss": 0.556, "num_tokens": 3902863416.0, "step": 1671 }, { "epoch": 2.1783615623919808, "grad_norm": 0.30332646255432144, "learning_rate": 3.61683435156222e-05, "loss": 0.5612, "num_tokens": 3905076450.0, "step": 1672 }, { "epoch": 2.178707224334601, "grad_norm": 0.4113288561124863, "learning_rate": 3.612029236153476e-05, "loss": 0.5696, "num_tokens": 3907417790.0, "step": 1673 }, { "epoch": 2.179052886277221, "grad_norm": 0.35240037904750227, "learning_rate": 3.607224685935594e-05, "loss": 0.5611, "num_tokens": 3909804374.0, "step": 1674 }, { "epoch": 2.179398548219841, "grad_norm": 0.48687355247249303, "learning_rate": 3.602420707907056e-05, "loss": 0.5792, "num_tokens": 3912203016.0, "step": 1675 }, { "epoch": 2.179744210162461, "grad_norm": 0.4098875075701958, "learning_rate": 3.597617309065519e-05, "loss": 0.5496, "num_tokens": 3914502362.0, "step": 1676 }, { "epoch": 2.180089872105081, "grad_norm": 0.4304335772544368, "learning_rate": 3.5928144964077864e-05, "loss": 0.5962, "num_tokens": 3916882888.0, "step": 1677 }, { "epoch": 2.1804355340477013, "grad_norm": 0.4785925068848899, "learning_rate": 3.5880122769298175e-05, "loss": 0.5948, "num_tokens": 3919174544.0, "step": 1678 }, { "epoch": 2.1807811959903214, "grad_norm": 0.46746268909684413, "learning_rate": 3.5832106576266995e-05, "loss": 0.6129, "num_tokens": 3921622264.0, "step": 1679 }, { "epoch": 2.1811268579329415, "grad_norm": 0.4163477885566145, "learning_rate": 3.578409645492655e-05, "loss": 0.5856, "num_tokens": 3923875335.0, "step": 1680 }, { "epoch": 2.1814725198755616, "grad_norm": 0.4532640252382832, "learning_rate": 3.5736092475210084e-05, "loss": 0.6094, "num_tokens": 3926146162.0, "step": 1681 }, { "epoch": 2.1818181818181817, "grad_norm": 0.46035438824564406, "learning_rate": 3.5688094707042035e-05, "loss": 0.6068, "num_tokens": 3928536514.0, "step": 1682 }, { "epoch": 2.1821638437608017, "grad_norm": 0.4353662263060845, "learning_rate": 3.564010322033769e-05, "loss": 0.6003, "num_tokens": 3930858866.0, "step": 1683 }, { "epoch": 2.182509505703422, "grad_norm": 0.4850276992486815, "learning_rate": 3.559211808500327e-05, "loss": 0.6219, "num_tokens": 3933245557.0, "step": 1684 }, { "epoch": 2.1828551676460424, "grad_norm": 0.5131446191738124, "learning_rate": 3.554413937093566e-05, "loss": 0.5848, "num_tokens": 3935545307.0, "step": 1685 }, { "epoch": 2.1832008295886625, "grad_norm": 0.4677052907972313, "learning_rate": 3.549616714802246e-05, "loss": 0.6118, "num_tokens": 3937967699.0, "step": 1686 }, { "epoch": 2.1835464915312826, "grad_norm": 0.44589653162288206, "learning_rate": 3.544820148614177e-05, "loss": 0.5742, "num_tokens": 3940325335.0, "step": 1687 }, { "epoch": 2.1838921534739026, "grad_norm": 0.49780096945699487, "learning_rate": 3.5400242455162175e-05, "loss": 0.5932, "num_tokens": 3942675501.0, "step": 1688 }, { "epoch": 2.1842378154165227, "grad_norm": 0.4936410748096351, "learning_rate": 3.5352290124942547e-05, "loss": 0.5968, "num_tokens": 3945052081.0, "step": 1689 }, { "epoch": 2.184583477359143, "grad_norm": 0.45272207924575136, "learning_rate": 3.530434456533204e-05, "loss": 0.5893, "num_tokens": 3947538611.0, "step": 1690 }, { "epoch": 2.184929139301763, "grad_norm": 0.4872721715786065, "learning_rate": 3.525640584616995e-05, "loss": 0.5719, "num_tokens": 3949946390.0, "step": 1691 }, { "epoch": 2.185274801244383, "grad_norm": 0.36201367992005706, "learning_rate": 3.520847403728558e-05, "loss": 0.5747, "num_tokens": 3952399105.0, "step": 1692 }, { "epoch": 2.185620463187003, "grad_norm": 0.463602572864178, "learning_rate": 3.516054920849815e-05, "loss": 0.5674, "num_tokens": 3954799132.0, "step": 1693 }, { "epoch": 2.185966125129623, "grad_norm": 0.36234238128339163, "learning_rate": 3.511263142961678e-05, "loss": 0.5583, "num_tokens": 3957086139.0, "step": 1694 }, { "epoch": 2.1863117870722433, "grad_norm": 0.5317641049169286, "learning_rate": 3.5064720770440296e-05, "loss": 0.5742, "num_tokens": 3959378894.0, "step": 1695 }, { "epoch": 2.1866574490148634, "grad_norm": 0.471573403778105, "learning_rate": 3.501681730075709e-05, "loss": 0.5505, "num_tokens": 3961659354.0, "step": 1696 }, { "epoch": 2.1870031109574835, "grad_norm": 0.41216022549670006, "learning_rate": 3.4968921090345165e-05, "loss": 0.6026, "num_tokens": 3964020105.0, "step": 1697 }, { "epoch": 2.1873487729001035, "grad_norm": 0.42323122533725727, "learning_rate": 3.49210322089719e-05, "loss": 0.5675, "num_tokens": 3966420672.0, "step": 1698 }, { "epoch": 2.1876944348427236, "grad_norm": 0.3138653948971231, "learning_rate": 3.487315072639404e-05, "loss": 0.5869, "num_tokens": 3968792359.0, "step": 1699 }, { "epoch": 2.188040096785344, "grad_norm": 0.40319030343127277, "learning_rate": 3.48252767123575e-05, "loss": 0.5596, "num_tokens": 3971046068.0, "step": 1700 }, { "epoch": 2.1883857587279643, "grad_norm": 0.3610504504125991, "learning_rate": 3.4777410236597353e-05, "loss": 0.5726, "num_tokens": 3973296546.0, "step": 1701 }, { "epoch": 2.1887314206705843, "grad_norm": 0.3554967056008757, "learning_rate": 3.472955136883768e-05, "loss": 0.5923, "num_tokens": 3975689890.0, "step": 1702 }, { "epoch": 2.1890770826132044, "grad_norm": 0.39904146644921934, "learning_rate": 3.4681700178791495e-05, "loss": 0.5721, "num_tokens": 3978029629.0, "step": 1703 }, { "epoch": 2.1894227445558245, "grad_norm": 0.3755173027738497, "learning_rate": 3.46338567361606e-05, "loss": 0.6228, "num_tokens": 3980420400.0, "step": 1704 }, { "epoch": 2.1897684064984446, "grad_norm": 0.497751920434996, "learning_rate": 3.4586021110635515e-05, "loss": 0.6244, "num_tokens": 3982873421.0, "step": 1705 }, { "epoch": 2.1901140684410647, "grad_norm": 0.5148781680410864, "learning_rate": 3.453819337189541e-05, "loss": 0.5741, "num_tokens": 3985283043.0, "step": 1706 }, { "epoch": 2.190459730383685, "grad_norm": 0.4066198393868349, "learning_rate": 3.449037358960794e-05, "loss": 0.5666, "num_tokens": 3987507032.0, "step": 1707 }, { "epoch": 2.190805392326305, "grad_norm": 0.3836308625845691, "learning_rate": 3.4442561833429166e-05, "loss": 0.5972, "num_tokens": 3989875944.0, "step": 1708 }, { "epoch": 2.191151054268925, "grad_norm": 0.35836301484995864, "learning_rate": 3.439475817300345e-05, "loss": 0.5462, "num_tokens": 3992251565.0, "step": 1709 }, { "epoch": 2.191496716211545, "grad_norm": 0.4104946221458571, "learning_rate": 3.4346962677963386e-05, "loss": 0.5931, "num_tokens": 3994508762.0, "step": 1710 }, { "epoch": 2.191842378154165, "grad_norm": 0.4124014388600809, "learning_rate": 3.429917541792969e-05, "loss": 0.5722, "num_tokens": 3996692941.0, "step": 1711 }, { "epoch": 2.1921880400967853, "grad_norm": 0.4503361710607658, "learning_rate": 3.425139646251102e-05, "loss": 0.5987, "num_tokens": 3999103782.0, "step": 1712 }, { "epoch": 2.1925337020394053, "grad_norm": 0.5352192987620799, "learning_rate": 3.4203625881303976e-05, "loss": 0.6011, "num_tokens": 4001472034.0, "step": 1713 }, { "epoch": 2.1928793639820254, "grad_norm": 0.4382444346671446, "learning_rate": 3.415586374389297e-05, "loss": 0.5808, "num_tokens": 4003727846.0, "step": 1714 }, { "epoch": 2.1932250259246455, "grad_norm": 0.46278263438291717, "learning_rate": 3.4108110119850114e-05, "loss": 0.6062, "num_tokens": 4005985234.0, "step": 1715 }, { "epoch": 2.1935706878672656, "grad_norm": 0.4514643181419856, "learning_rate": 3.406036507873508e-05, "loss": 0.602, "num_tokens": 4008334084.0, "step": 1716 }, { "epoch": 2.1939163498098857, "grad_norm": 0.5221191447664183, "learning_rate": 3.401262869009508e-05, "loss": 0.6126, "num_tokens": 4010712684.0, "step": 1717 }, { "epoch": 2.1942620117525062, "grad_norm": 0.42153524973538975, "learning_rate": 3.3964901023464697e-05, "loss": 0.5748, "num_tokens": 4013053800.0, "step": 1718 }, { "epoch": 2.1946076736951263, "grad_norm": 0.45106538070198826, "learning_rate": 3.3917182148365836e-05, "loss": 0.5883, "num_tokens": 4015279737.0, "step": 1719 }, { "epoch": 2.1949533356377464, "grad_norm": 0.40253657755700084, "learning_rate": 3.386947213430755e-05, "loss": 0.5622, "num_tokens": 4017606067.0, "step": 1720 }, { "epoch": 2.1952989975803665, "grad_norm": 0.5191393403861773, "learning_rate": 3.3821771050786045e-05, "loss": 0.5771, "num_tokens": 4019953661.0, "step": 1721 }, { "epoch": 2.1956446595229866, "grad_norm": 0.5145188361850536, "learning_rate": 3.3774078967284486e-05, "loss": 0.5803, "num_tokens": 4022258281.0, "step": 1722 }, { "epoch": 2.1959903214656067, "grad_norm": 0.4371924657081148, "learning_rate": 3.37263959532729e-05, "loss": 0.5759, "num_tokens": 4024528869.0, "step": 1723 }, { "epoch": 2.1963359834082268, "grad_norm": 0.5693070825878815, "learning_rate": 3.367872207820815e-05, "loss": 0.5923, "num_tokens": 4026863101.0, "step": 1724 }, { "epoch": 2.196681645350847, "grad_norm": 0.35133547147608174, "learning_rate": 3.363105741153377e-05, "loss": 0.5869, "num_tokens": 4029263862.0, "step": 1725 }, { "epoch": 2.197027307293467, "grad_norm": 0.6060415450474594, "learning_rate": 3.3583402022679897e-05, "loss": 0.5601, "num_tokens": 4031469175.0, "step": 1726 }, { "epoch": 2.197372969236087, "grad_norm": 0.46437751488956786, "learning_rate": 3.3535755981063106e-05, "loss": 0.6094, "num_tokens": 4033880574.0, "step": 1727 }, { "epoch": 2.197718631178707, "grad_norm": 0.5097295330864767, "learning_rate": 3.34881193560864e-05, "loss": 0.5665, "num_tokens": 4036290081.0, "step": 1728 }, { "epoch": 2.1980642931213272, "grad_norm": 0.5628972758761455, "learning_rate": 3.3440492217139055e-05, "loss": 0.5714, "num_tokens": 4038530918.0, "step": 1729 }, { "epoch": 2.1984099550639473, "grad_norm": 0.4055640721344835, "learning_rate": 3.3392874633596534e-05, "loss": 0.6007, "num_tokens": 4040869460.0, "step": 1730 }, { "epoch": 2.1987556170065674, "grad_norm": 0.353572067518476, "learning_rate": 3.334526667482035e-05, "loss": 0.535, "num_tokens": 4043234043.0, "step": 1731 }, { "epoch": 2.1991012789491875, "grad_norm": 0.44273212246122495, "learning_rate": 3.3297668410158036e-05, "loss": 0.5799, "num_tokens": 4045578364.0, "step": 1732 }, { "epoch": 2.199446940891808, "grad_norm": 0.3080731708087997, "learning_rate": 3.3250079908943e-05, "loss": 0.5986, "num_tokens": 4048001062.0, "step": 1733 }, { "epoch": 2.199792602834428, "grad_norm": 0.5738468906760364, "learning_rate": 3.3202501240494414e-05, "loss": 0.6045, "num_tokens": 4050333985.0, "step": 1734 } ], "logging_steps": 1, "max_steps": 2893, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.4440977488812704e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }