diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13906 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.199792602834428, + "eval_steps": 500, + "global_step": 1734, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003456619426201175, + "grad_norm": 34.52766429032349, + "learning_rate": 0.0, + "loss": 1.7402, + "num_tokens": 2348566.0, + "step": 1 + }, + { + "epoch": 0.000691323885240235, + "grad_norm": 34.916246724175195, + "learning_rate": 2.7586206896551726e-07, + "loss": 1.7643, + "num_tokens": 4766393.0, + "step": 2 + }, + { + "epoch": 0.0010369858278603526, + "grad_norm": 37.60414874561209, + "learning_rate": 5.517241379310345e-07, + "loss": 1.7635, + "num_tokens": 7060606.0, + "step": 3 + }, + { + "epoch": 0.00138264777048047, + "grad_norm": 28.067921013815816, + "learning_rate": 8.275862068965518e-07, + "loss": 1.7722, + "num_tokens": 9393241.0, + "step": 4 + }, + { + "epoch": 0.0017283097131005876, + "grad_norm": 30.411817473811606, + "learning_rate": 1.103448275862069e-06, + "loss": 1.7679, + "num_tokens": 11829752.0, + "step": 5 + }, + { + "epoch": 0.002073971655720705, + "grad_norm": 34.72349566651572, + "learning_rate": 1.3793103448275862e-06, + "loss": 1.6802, + "num_tokens": 14068246.0, + "step": 6 + }, + { + "epoch": 0.0024196335983408227, + "grad_norm": 34.17638397070201, + "learning_rate": 1.6551724137931037e-06, + "loss": 1.6733, + "num_tokens": 16427486.0, + "step": 7 + }, + { + "epoch": 0.00276529554096094, + "grad_norm": 240.23501794389531, + "learning_rate": 1.9310344827586207e-06, + "loss": 1.6469, + "num_tokens": 18725943.0, + "step": 8 + }, + { + "epoch": 0.0031109574835810577, + "grad_norm": 199.16629728896086, + "learning_rate": 2.206896551724138e-06, + "loss": 1.6185, + "num_tokens": 20959251.0, + "step": 9 + }, + { + "epoch": 0.0034566194262011752, + "grad_norm": 127.5413424915123, + "learning_rate": 2.4827586206896555e-06, + "loss": 1.4787, + "num_tokens": 23221296.0, + "step": 10 + }, + { + "epoch": 0.0038022813688212928, + "grad_norm": 20.172387932030606, + "learning_rate": 2.7586206896551725e-06, + "loss": 1.4814, + "num_tokens": 25623468.0, + "step": 11 + }, + { + "epoch": 0.00414794331144141, + "grad_norm": 24.996893860630973, + "learning_rate": 3.03448275862069e-06, + "loss": 1.4958, + "num_tokens": 28101258.0, + "step": 12 + }, + { + "epoch": 0.004493605254061528, + "grad_norm": 15.978924269182224, + "learning_rate": 3.3103448275862073e-06, + "loss": 1.3222, + "num_tokens": 30303835.0, + "step": 13 + }, + { + "epoch": 0.004839267196681645, + "grad_norm": 5.829201156403896, + "learning_rate": 3.5862068965517243e-06, + "loss": 1.2836, + "num_tokens": 32612096.0, + "step": 14 + }, + { + "epoch": 0.005184929139301763, + "grad_norm": 103.68722318826303, + "learning_rate": 3.862068965517241e-06, + "loss": 1.2761, + "num_tokens": 34930500.0, + "step": 15 + }, + { + "epoch": 0.00553059108192188, + "grad_norm": 124.0907542896587, + "learning_rate": 4.137931034482759e-06, + "loss": 1.2774, + "num_tokens": 37351465.0, + "step": 16 + }, + { + "epoch": 0.005876253024541998, + "grad_norm": 97.23488726579467, + "learning_rate": 4.413793103448276e-06, + "loss": 1.2742, + "num_tokens": 39747971.0, + "step": 17 + }, + { + "epoch": 0.006221914967162115, + "grad_norm": 26.087299275986652, + "learning_rate": 4.689655172413793e-06, + "loss": 1.1917, + "num_tokens": 42055613.0, + "step": 18 + }, + { + "epoch": 0.006567576909782233, + "grad_norm": 16.581625382992545, + "learning_rate": 4.965517241379311e-06, + "loss": 1.1738, + "num_tokens": 44530067.0, + "step": 19 + }, + { + "epoch": 0.0069132388524023505, + "grad_norm": 9.370695208101724, + "learning_rate": 5.241379310344829e-06, + "loss": 1.1508, + "num_tokens": 46829520.0, + "step": 20 + }, + { + "epoch": 0.007258900795022468, + "grad_norm": 2.7546835662098337, + "learning_rate": 5.517241379310345e-06, + "loss": 1.1012, + "num_tokens": 49068724.0, + "step": 21 + }, + { + "epoch": 0.0076045627376425855, + "grad_norm": 2.3825001318118812, + "learning_rate": 5.793103448275863e-06, + "loss": 1.0944, + "num_tokens": 51426882.0, + "step": 22 + }, + { + "epoch": 0.007950224680262703, + "grad_norm": 2.035556006407366, + "learning_rate": 6.06896551724138e-06, + "loss": 1.0721, + "num_tokens": 53744600.0, + "step": 23 + }, + { + "epoch": 0.00829588662288282, + "grad_norm": 1.463922532189421, + "learning_rate": 6.344827586206898e-06, + "loss": 1.0609, + "num_tokens": 56061789.0, + "step": 24 + }, + { + "epoch": 0.008641548565502939, + "grad_norm": 1.5095068946017134, + "learning_rate": 6.620689655172415e-06, + "loss": 1.0261, + "num_tokens": 58426474.0, + "step": 25 + }, + { + "epoch": 0.008987210508123056, + "grad_norm": 1.7812545085036195, + "learning_rate": 6.896551724137932e-06, + "loss": 1.0006, + "num_tokens": 60698346.0, + "step": 26 + }, + { + "epoch": 0.009332872450743173, + "grad_norm": 1.471823589913617, + "learning_rate": 7.172413793103449e-06, + "loss": 0.98, + "num_tokens": 63058045.0, + "step": 27 + }, + { + "epoch": 0.00967853439336329, + "grad_norm": 1.131991682045555, + "learning_rate": 7.4482758620689665e-06, + "loss": 0.9489, + "num_tokens": 65393200.0, + "step": 28 + }, + { + "epoch": 0.010024196335983409, + "grad_norm": 0.9832756068048666, + "learning_rate": 7.724137931034483e-06, + "loss": 0.9484, + "num_tokens": 67739098.0, + "step": 29 + }, + { + "epoch": 0.010369858278603527, + "grad_norm": 1.1765295545080312, + "learning_rate": 8.000000000000001e-06, + "loss": 0.9475, + "num_tokens": 70086884.0, + "step": 30 + }, + { + "epoch": 0.010715520221223643, + "grad_norm": 1.2194966845549524, + "learning_rate": 8.275862068965518e-06, + "loss": 0.9175, + "num_tokens": 72378842.0, + "step": 31 + }, + { + "epoch": 0.01106118216384376, + "grad_norm": 0.7236272581153216, + "learning_rate": 8.551724137931035e-06, + "loss": 0.9239, + "num_tokens": 74782017.0, + "step": 32 + }, + { + "epoch": 0.011406844106463879, + "grad_norm": 0.63227886221642, + "learning_rate": 8.827586206896552e-06, + "loss": 0.9039, + "num_tokens": 77143359.0, + "step": 33 + }, + { + "epoch": 0.011752506049083997, + "grad_norm": 0.4895603891264605, + "learning_rate": 9.10344827586207e-06, + "loss": 0.9001, + "num_tokens": 79474781.0, + "step": 34 + }, + { + "epoch": 0.012098167991704113, + "grad_norm": 0.6296333140665148, + "learning_rate": 9.379310344827586e-06, + "loss": 0.8816, + "num_tokens": 81801167.0, + "step": 35 + }, + { + "epoch": 0.01244382993432423, + "grad_norm": 0.5574668525090068, + "learning_rate": 9.655172413793105e-06, + "loss": 0.8644, + "num_tokens": 84103942.0, + "step": 36 + }, + { + "epoch": 0.012789491876944349, + "grad_norm": 0.6251940562087835, + "learning_rate": 9.931034482758622e-06, + "loss": 0.8805, + "num_tokens": 86412978.0, + "step": 37 + }, + { + "epoch": 0.013135153819564467, + "grad_norm": 0.4609208716458492, + "learning_rate": 1.0206896551724139e-05, + "loss": 0.8375, + "num_tokens": 88723803.0, + "step": 38 + }, + { + "epoch": 0.013480815762184583, + "grad_norm": 0.5437599793260961, + "learning_rate": 1.0482758620689658e-05, + "loss": 0.83, + "num_tokens": 91014493.0, + "step": 39 + }, + { + "epoch": 0.013826477704804701, + "grad_norm": 0.618952678909033, + "learning_rate": 1.0758620689655173e-05, + "loss": 0.8354, + "num_tokens": 93228218.0, + "step": 40 + }, + { + "epoch": 0.014172139647424819, + "grad_norm": 0.48878986093827087, + "learning_rate": 1.103448275862069e-05, + "loss": 0.8456, + "num_tokens": 95593743.0, + "step": 41 + }, + { + "epoch": 0.014517801590044937, + "grad_norm": 0.5522829608269526, + "learning_rate": 1.1310344827586209e-05, + "loss": 0.8281, + "num_tokens": 97815035.0, + "step": 42 + }, + { + "epoch": 0.014863463532665053, + "grad_norm": 0.49672837733164593, + "learning_rate": 1.1586206896551726e-05, + "loss": 0.8335, + "num_tokens": 100149758.0, + "step": 43 + }, + { + "epoch": 0.015209125475285171, + "grad_norm": 0.5362244996387824, + "learning_rate": 1.1862068965517241e-05, + "loss": 0.8195, + "num_tokens": 102481479.0, + "step": 44 + }, + { + "epoch": 0.015554787417905289, + "grad_norm": 0.4699636631130888, + "learning_rate": 1.213793103448276e-05, + "loss": 0.8245, + "num_tokens": 104848287.0, + "step": 45 + }, + { + "epoch": 0.015900449360525405, + "grad_norm": 0.352102114661968, + "learning_rate": 1.2413793103448277e-05, + "loss": 0.819, + "num_tokens": 107126442.0, + "step": 46 + }, + { + "epoch": 0.016246111303145523, + "grad_norm": 0.5888506749748358, + "learning_rate": 1.2689655172413795e-05, + "loss": 0.8087, + "num_tokens": 109618031.0, + "step": 47 + }, + { + "epoch": 0.01659177324576564, + "grad_norm": 0.49382888158547594, + "learning_rate": 1.296551724137931e-05, + "loss": 0.8189, + "num_tokens": 111884700.0, + "step": 48 + }, + { + "epoch": 0.01693743518838576, + "grad_norm": 0.593880802739918, + "learning_rate": 1.324137931034483e-05, + "loss": 0.8023, + "num_tokens": 114172995.0, + "step": 49 + }, + { + "epoch": 0.017283097131005877, + "grad_norm": 0.6031087194755608, + "learning_rate": 1.3517241379310346e-05, + "loss": 0.814, + "num_tokens": 116497090.0, + "step": 50 + }, + { + "epoch": 0.017628759073625995, + "grad_norm": 0.40468651934265243, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.7937, + "num_tokens": 118861185.0, + "step": 51 + }, + { + "epoch": 0.017974421016246113, + "grad_norm": 0.6833857006865406, + "learning_rate": 1.406896551724138e-05, + "loss": 0.7932, + "num_tokens": 121094526.0, + "step": 52 + }, + { + "epoch": 0.018320082958866227, + "grad_norm": 0.4664454198315879, + "learning_rate": 1.4344827586206897e-05, + "loss": 0.7926, + "num_tokens": 123518108.0, + "step": 53 + }, + { + "epoch": 0.018665744901486345, + "grad_norm": 0.4700018167854046, + "learning_rate": 1.4620689655172416e-05, + "loss": 0.7997, + "num_tokens": 125954145.0, + "step": 54 + }, + { + "epoch": 0.019011406844106463, + "grad_norm": 0.4845178396389685, + "learning_rate": 1.4896551724137933e-05, + "loss": 0.7979, + "num_tokens": 128361749.0, + "step": 55 + }, + { + "epoch": 0.01935706878672658, + "grad_norm": 0.33585679598083323, + "learning_rate": 1.5172413793103448e-05, + "loss": 0.7995, + "num_tokens": 130884149.0, + "step": 56 + }, + { + "epoch": 0.0197027307293467, + "grad_norm": 0.5946698053058292, + "learning_rate": 1.5448275862068965e-05, + "loss": 0.7851, + "num_tokens": 133314593.0, + "step": 57 + }, + { + "epoch": 0.020048392671966817, + "grad_norm": 0.44935201498855654, + "learning_rate": 1.5724137931034484e-05, + "loss": 0.7891, + "num_tokens": 135786548.0, + "step": 58 + }, + { + "epoch": 0.020394054614586935, + "grad_norm": 0.6016698684545718, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.7945, + "num_tokens": 138090347.0, + "step": 59 + }, + { + "epoch": 0.020739716557207053, + "grad_norm": 0.5149935440294358, + "learning_rate": 1.6275862068965518e-05, + "loss": 0.7856, + "num_tokens": 140229744.0, + "step": 60 + }, + { + "epoch": 0.021085378499827168, + "grad_norm": 0.42442139251406474, + "learning_rate": 1.6551724137931037e-05, + "loss": 0.7638, + "num_tokens": 142547274.0, + "step": 61 + }, + { + "epoch": 0.021431040442447286, + "grad_norm": 0.6165584439482034, + "learning_rate": 1.6827586206896552e-05, + "loss": 0.7839, + "num_tokens": 144867908.0, + "step": 62 + }, + { + "epoch": 0.021776702385067404, + "grad_norm": 0.615695106401857, + "learning_rate": 1.710344827586207e-05, + "loss": 0.7789, + "num_tokens": 147137485.0, + "step": 63 + }, + { + "epoch": 0.02212236432768752, + "grad_norm": 0.5447303795295104, + "learning_rate": 1.7379310344827586e-05, + "loss": 0.7905, + "num_tokens": 149486776.0, + "step": 64 + }, + { + "epoch": 0.02246802627030764, + "grad_norm": 0.7075502076257513, + "learning_rate": 1.7655172413793105e-05, + "loss": 0.7792, + "num_tokens": 151813267.0, + "step": 65 + }, + { + "epoch": 0.022813688212927757, + "grad_norm": 0.5470719758055983, + "learning_rate": 1.7931034482758623e-05, + "loss": 0.7651, + "num_tokens": 154166281.0, + "step": 66 + }, + { + "epoch": 0.023159350155547875, + "grad_norm": 0.4628479897033203, + "learning_rate": 1.820689655172414e-05, + "loss": 0.7731, + "num_tokens": 156434581.0, + "step": 67 + }, + { + "epoch": 0.023505012098167993, + "grad_norm": 0.7961906721005769, + "learning_rate": 1.8482758620689657e-05, + "loss": 0.7699, + "num_tokens": 158708953.0, + "step": 68 + }, + { + "epoch": 0.023850674040788108, + "grad_norm": 0.4658600860763261, + "learning_rate": 1.8758620689655173e-05, + "loss": 0.7683, + "num_tokens": 161030802.0, + "step": 69 + }, + { + "epoch": 0.024196335983408226, + "grad_norm": 0.9531225369625607, + "learning_rate": 1.903448275862069e-05, + "loss": 0.7758, + "num_tokens": 163400511.0, + "step": 70 + }, + { + "epoch": 0.024541997926028344, + "grad_norm": 0.7489302770832681, + "learning_rate": 1.931034482758621e-05, + "loss": 0.7671, + "num_tokens": 165666850.0, + "step": 71 + }, + { + "epoch": 0.02488765986864846, + "grad_norm": 0.9597008407747405, + "learning_rate": 1.9586206896551725e-05, + "loss": 0.7429, + "num_tokens": 167881526.0, + "step": 72 + }, + { + "epoch": 0.02523332181126858, + "grad_norm": 0.9313302737360822, + "learning_rate": 1.9862068965517244e-05, + "loss": 0.7706, + "num_tokens": 170177217.0, + "step": 73 + }, + { + "epoch": 0.025578983753888698, + "grad_norm": 0.5602528199834497, + "learning_rate": 2.013793103448276e-05, + "loss": 0.7507, + "num_tokens": 172470635.0, + "step": 74 + }, + { + "epoch": 0.025924645696508816, + "grad_norm": 0.8676148008868482, + "learning_rate": 2.0413793103448278e-05, + "loss": 0.7661, + "num_tokens": 174778857.0, + "step": 75 + }, + { + "epoch": 0.026270307639128933, + "grad_norm": 0.6668635259628354, + "learning_rate": 2.0689655172413797e-05, + "loss": 0.7661, + "num_tokens": 177130569.0, + "step": 76 + }, + { + "epoch": 0.026615969581749048, + "grad_norm": 0.8038788640178728, + "learning_rate": 2.0965517241379315e-05, + "loss": 0.7649, + "num_tokens": 179549534.0, + "step": 77 + }, + { + "epoch": 0.026961631524369166, + "grad_norm": 0.8880977133991036, + "learning_rate": 2.1241379310344827e-05, + "loss": 0.7634, + "num_tokens": 181919658.0, + "step": 78 + }, + { + "epoch": 0.027307293466989284, + "grad_norm": 0.7417404170940146, + "learning_rate": 2.1517241379310346e-05, + "loss": 0.762, + "num_tokens": 184313977.0, + "step": 79 + }, + { + "epoch": 0.027652955409609402, + "grad_norm": 0.7639055493485868, + "learning_rate": 2.1793103448275865e-05, + "loss": 0.7583, + "num_tokens": 186596058.0, + "step": 80 + }, + { + "epoch": 0.02799861735222952, + "grad_norm": 0.8099691607022619, + "learning_rate": 2.206896551724138e-05, + "loss": 0.7512, + "num_tokens": 188933869.0, + "step": 81 + }, + { + "epoch": 0.028344279294849638, + "grad_norm": 0.6860198325986622, + "learning_rate": 2.23448275862069e-05, + "loss": 0.7521, + "num_tokens": 191245733.0, + "step": 82 + }, + { + "epoch": 0.028689941237469756, + "grad_norm": 0.6846981754633444, + "learning_rate": 2.2620689655172417e-05, + "loss": 0.7551, + "num_tokens": 193553255.0, + "step": 83 + }, + { + "epoch": 0.029035603180089874, + "grad_norm": 0.6501174117629096, + "learning_rate": 2.2896551724137933e-05, + "loss": 0.7473, + "num_tokens": 195883429.0, + "step": 84 + }, + { + "epoch": 0.029381265122709988, + "grad_norm": 0.6914354541215395, + "learning_rate": 2.317241379310345e-05, + "loss": 0.7447, + "num_tokens": 198268087.0, + "step": 85 + }, + { + "epoch": 0.029726927065330106, + "grad_norm": 0.4755320816841892, + "learning_rate": 2.3448275862068967e-05, + "loss": 0.7341, + "num_tokens": 200618015.0, + "step": 86 + }, + { + "epoch": 0.030072589007950224, + "grad_norm": 0.9557281222429161, + "learning_rate": 2.3724137931034482e-05, + "loss": 0.7603, + "num_tokens": 203016946.0, + "step": 87 + }, + { + "epoch": 0.030418250950570342, + "grad_norm": 0.7349019646066728, + "learning_rate": 2.4e-05, + "loss": 0.7622, + "num_tokens": 205214919.0, + "step": 88 + }, + { + "epoch": 0.03076391289319046, + "grad_norm": 0.9116469040257529, + "learning_rate": 2.427586206896552e-05, + "loss": 0.7433, + "num_tokens": 207620550.0, + "step": 89 + }, + { + "epoch": 0.031109574835810578, + "grad_norm": 0.8510261221064249, + "learning_rate": 2.4551724137931038e-05, + "loss": 0.7444, + "num_tokens": 209994861.0, + "step": 90 + }, + { + "epoch": 0.03145523677843069, + "grad_norm": 0.6153876226538251, + "learning_rate": 2.4827586206896553e-05, + "loss": 0.7406, + "num_tokens": 212350825.0, + "step": 91 + }, + { + "epoch": 0.03180089872105081, + "grad_norm": 0.810675759688931, + "learning_rate": 2.5103448275862072e-05, + "loss": 0.7456, + "num_tokens": 214629936.0, + "step": 92 + }, + { + "epoch": 0.03214656066367093, + "grad_norm": 0.4884049293244507, + "learning_rate": 2.537931034482759e-05, + "loss": 0.7481, + "num_tokens": 216969950.0, + "step": 93 + }, + { + "epoch": 0.032492222606291046, + "grad_norm": 0.8446277273383646, + "learning_rate": 2.5655172413793103e-05, + "loss": 0.7458, + "num_tokens": 219405588.0, + "step": 94 + }, + { + "epoch": 0.032837884548911164, + "grad_norm": 0.5289015523887819, + "learning_rate": 2.593103448275862e-05, + "loss": 0.7386, + "num_tokens": 221720594.0, + "step": 95 + }, + { + "epoch": 0.03318354649153128, + "grad_norm": 0.6889357864803288, + "learning_rate": 2.620689655172414e-05, + "loss": 0.7356, + "num_tokens": 224073935.0, + "step": 96 + }, + { + "epoch": 0.0335292084341514, + "grad_norm": 0.6007223645180073, + "learning_rate": 2.648275862068966e-05, + "loss": 0.7472, + "num_tokens": 226445040.0, + "step": 97 + }, + { + "epoch": 0.03387487037677152, + "grad_norm": 0.7387047295293775, + "learning_rate": 2.6758620689655174e-05, + "loss": 0.7612, + "num_tokens": 228769431.0, + "step": 98 + }, + { + "epoch": 0.034220532319391636, + "grad_norm": 0.7764160324273132, + "learning_rate": 2.7034482758620693e-05, + "loss": 0.75, + "num_tokens": 231089769.0, + "step": 99 + }, + { + "epoch": 0.034566194262011754, + "grad_norm": 0.5933258036073403, + "learning_rate": 2.731034482758621e-05, + "loss": 0.7489, + "num_tokens": 233430572.0, + "step": 100 + }, + { + "epoch": 0.03491185620463187, + "grad_norm": 0.7297206869591315, + "learning_rate": 2.7586206896551727e-05, + "loss": 0.7472, + "num_tokens": 235617596.0, + "step": 101 + }, + { + "epoch": 0.03525751814725199, + "grad_norm": 0.7684655927441242, + "learning_rate": 2.7862068965517242e-05, + "loss": 0.7367, + "num_tokens": 238005342.0, + "step": 102 + }, + { + "epoch": 0.03560318008987211, + "grad_norm": 0.7579682420634565, + "learning_rate": 2.813793103448276e-05, + "loss": 0.7439, + "num_tokens": 240334625.0, + "step": 103 + }, + { + "epoch": 0.035948842032492226, + "grad_norm": 0.9043719301469854, + "learning_rate": 2.8413793103448276e-05, + "loss": 0.7428, + "num_tokens": 242624550.0, + "step": 104 + }, + { + "epoch": 0.03629450397511234, + "grad_norm": 0.471809452106959, + "learning_rate": 2.8689655172413795e-05, + "loss": 0.7391, + "num_tokens": 244932212.0, + "step": 105 + }, + { + "epoch": 0.036640165917732455, + "grad_norm": 0.888134097580562, + "learning_rate": 2.8965517241379313e-05, + "loss": 0.7456, + "num_tokens": 247331234.0, + "step": 106 + }, + { + "epoch": 0.03698582786035257, + "grad_norm": 0.4435173245087347, + "learning_rate": 2.9241379310344832e-05, + "loss": 0.7504, + "num_tokens": 249690250.0, + "step": 107 + }, + { + "epoch": 0.03733148980297269, + "grad_norm": 1.1172848964096243, + "learning_rate": 2.9517241379310347e-05, + "loss": 0.733, + "num_tokens": 251948004.0, + "step": 108 + }, + { + "epoch": 0.03767715174559281, + "grad_norm": 0.922205477104342, + "learning_rate": 2.9793103448275866e-05, + "loss": 0.7505, + "num_tokens": 254350208.0, + "step": 109 + }, + { + "epoch": 0.03802281368821293, + "grad_norm": 0.9360098315240186, + "learning_rate": 3.006896551724138e-05, + "loss": 0.7312, + "num_tokens": 256738015.0, + "step": 110 + }, + { + "epoch": 0.038368475630833045, + "grad_norm": 0.9493746052640399, + "learning_rate": 3.0344827586206897e-05, + "loss": 0.7453, + "num_tokens": 258993218.0, + "step": 111 + }, + { + "epoch": 0.03871413757345316, + "grad_norm": 0.6772582936432366, + "learning_rate": 3.0620689655172415e-05, + "loss": 0.7197, + "num_tokens": 261281901.0, + "step": 112 + }, + { + "epoch": 0.03905979951607328, + "grad_norm": 0.9116093959201665, + "learning_rate": 3.089655172413793e-05, + "loss": 0.7446, + "num_tokens": 263674484.0, + "step": 113 + }, + { + "epoch": 0.0394054614586934, + "grad_norm": 0.7495578225130799, + "learning_rate": 3.117241379310345e-05, + "loss": 0.7256, + "num_tokens": 266017440.0, + "step": 114 + }, + { + "epoch": 0.039751123401313516, + "grad_norm": 0.8068183580979901, + "learning_rate": 3.144827586206897e-05, + "loss": 0.7423, + "num_tokens": 268334238.0, + "step": 115 + }, + { + "epoch": 0.040096785343933634, + "grad_norm": 0.7124730040409233, + "learning_rate": 3.172413793103448e-05, + "loss": 0.7334, + "num_tokens": 270763388.0, + "step": 116 + }, + { + "epoch": 0.04044244728655375, + "grad_norm": 0.8089459947320503, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.7299, + "num_tokens": 272941034.0, + "step": 117 + }, + { + "epoch": 0.04078810922917387, + "grad_norm": 0.7259513784231448, + "learning_rate": 3.227586206896552e-05, + "loss": 0.7329, + "num_tokens": 275378629.0, + "step": 118 + }, + { + "epoch": 0.04113377117179399, + "grad_norm": 0.7332446808409854, + "learning_rate": 3.2551724137931036e-05, + "loss": 0.7249, + "num_tokens": 277652371.0, + "step": 119 + }, + { + "epoch": 0.041479433114414106, + "grad_norm": 0.6421931090818569, + "learning_rate": 3.282758620689655e-05, + "loss": 0.7278, + "num_tokens": 280024124.0, + "step": 120 + }, + { + "epoch": 0.04182509505703422, + "grad_norm": 0.569024220548962, + "learning_rate": 3.310344827586207e-05, + "loss": 0.7489, + "num_tokens": 282322925.0, + "step": 121 + }, + { + "epoch": 0.042170756999654335, + "grad_norm": 0.5543917946977904, + "learning_rate": 3.337931034482759e-05, + "loss": 0.7497, + "num_tokens": 284780949.0, + "step": 122 + }, + { + "epoch": 0.04251641894227445, + "grad_norm": 0.6745056035927581, + "learning_rate": 3.3655172413793104e-05, + "loss": 0.7376, + "num_tokens": 287070053.0, + "step": 123 + }, + { + "epoch": 0.04286208088489457, + "grad_norm": 0.8057112613121699, + "learning_rate": 3.3931034482758626e-05, + "loss": 0.7461, + "num_tokens": 289398119.0, + "step": 124 + }, + { + "epoch": 0.04320774282751469, + "grad_norm": 0.7435738834081497, + "learning_rate": 3.420689655172414e-05, + "loss": 0.7451, + "num_tokens": 291806287.0, + "step": 125 + }, + { + "epoch": 0.04355340477013481, + "grad_norm": 0.607350592322234, + "learning_rate": 3.4482758620689657e-05, + "loss": 0.738, + "num_tokens": 294150810.0, + "step": 126 + }, + { + "epoch": 0.043899066712754925, + "grad_norm": 0.7157013623524531, + "learning_rate": 3.475862068965517e-05, + "loss": 0.7336, + "num_tokens": 296551922.0, + "step": 127 + }, + { + "epoch": 0.04424472865537504, + "grad_norm": 0.8341954990905753, + "learning_rate": 3.5034482758620694e-05, + "loss": 0.7249, + "num_tokens": 298832126.0, + "step": 128 + }, + { + "epoch": 0.04459039059799516, + "grad_norm": 0.817494247455751, + "learning_rate": 3.531034482758621e-05, + "loss": 0.7219, + "num_tokens": 301128017.0, + "step": 129 + }, + { + "epoch": 0.04493605254061528, + "grad_norm": 0.7585436099018116, + "learning_rate": 3.5586206896551725e-05, + "loss": 0.7321, + "num_tokens": 303506036.0, + "step": 130 + }, + { + "epoch": 0.0452817144832354, + "grad_norm": 0.9678755528109797, + "learning_rate": 3.586206896551725e-05, + "loss": 0.7276, + "num_tokens": 305918873.0, + "step": 131 + }, + { + "epoch": 0.045627376425855515, + "grad_norm": 0.718133726568121, + "learning_rate": 3.613793103448276e-05, + "loss": 0.7251, + "num_tokens": 308241872.0, + "step": 132 + }, + { + "epoch": 0.04597303836847563, + "grad_norm": 1.0973484475823698, + "learning_rate": 3.641379310344828e-05, + "loss": 0.7379, + "num_tokens": 310727507.0, + "step": 133 + }, + { + "epoch": 0.04631870031109575, + "grad_norm": 0.7416516400117827, + "learning_rate": 3.668965517241379e-05, + "loss": 0.7346, + "num_tokens": 313021983.0, + "step": 134 + }, + { + "epoch": 0.04666436225371587, + "grad_norm": 1.3284478552185184, + "learning_rate": 3.6965517241379315e-05, + "loss": 0.7346, + "num_tokens": 315493468.0, + "step": 135 + }, + { + "epoch": 0.04701002419633599, + "grad_norm": 1.0152349598840011, + "learning_rate": 3.724137931034483e-05, + "loss": 0.7208, + "num_tokens": 317840417.0, + "step": 136 + }, + { + "epoch": 0.0473556861389561, + "grad_norm": 1.064480784927161, + "learning_rate": 3.7517241379310345e-05, + "loss": 0.7145, + "num_tokens": 320098389.0, + "step": 137 + }, + { + "epoch": 0.047701348081576216, + "grad_norm": 0.9745304292393031, + "learning_rate": 3.779310344827587e-05, + "loss": 0.7346, + "num_tokens": 322339251.0, + "step": 138 + }, + { + "epoch": 0.048047010024196334, + "grad_norm": 1.0646363969058728, + "learning_rate": 3.806896551724138e-05, + "loss": 0.7034, + "num_tokens": 324697225.0, + "step": 139 + }, + { + "epoch": 0.04839267196681645, + "grad_norm": 0.8084137804059474, + "learning_rate": 3.83448275862069e-05, + "loss": 0.7181, + "num_tokens": 326976615.0, + "step": 140 + }, + { + "epoch": 0.04873833390943657, + "grad_norm": 1.0998528107081083, + "learning_rate": 3.862068965517242e-05, + "loss": 0.7248, + "num_tokens": 329339438.0, + "step": 141 + }, + { + "epoch": 0.04908399585205669, + "grad_norm": 1.0307723947265541, + "learning_rate": 3.8896551724137935e-05, + "loss": 0.7212, + "num_tokens": 331716104.0, + "step": 142 + }, + { + "epoch": 0.049429657794676805, + "grad_norm": 1.0104830173773098, + "learning_rate": 3.917241379310345e-05, + "loss": 0.7343, + "num_tokens": 334159866.0, + "step": 143 + }, + { + "epoch": 0.04977531973729692, + "grad_norm": 1.2284184845848667, + "learning_rate": 3.9448275862068966e-05, + "loss": 0.7398, + "num_tokens": 336467657.0, + "step": 144 + }, + { + "epoch": 0.05012098167991704, + "grad_norm": 0.8918997830916775, + "learning_rate": 3.972413793103449e-05, + "loss": 0.7313, + "num_tokens": 338801323.0, + "step": 145 + }, + { + "epoch": 0.05046664362253716, + "grad_norm": 1.082726102460824, + "learning_rate": 4e-05, + "loss": 0.7263, + "num_tokens": 341162307.0, + "step": 146 + }, + { + "epoch": 0.05081230556515728, + "grad_norm": 0.8971107129336452, + "learning_rate": 4.027586206896552e-05, + "loss": 0.7235, + "num_tokens": 343412060.0, + "step": 147 + }, + { + "epoch": 0.051157967507777395, + "grad_norm": 0.9412664497744425, + "learning_rate": 4.055172413793104e-05, + "loss": 0.7048, + "num_tokens": 345675733.0, + "step": 148 + }, + { + "epoch": 0.05150362945039751, + "grad_norm": 0.8661134234474723, + "learning_rate": 4.0827586206896556e-05, + "loss": 0.7193, + "num_tokens": 348009471.0, + "step": 149 + }, + { + "epoch": 0.05184929139301763, + "grad_norm": 0.7772855074917321, + "learning_rate": 4.110344827586207e-05, + "loss": 0.7195, + "num_tokens": 350251681.0, + "step": 150 + }, + { + "epoch": 0.05219495333563775, + "grad_norm": 0.7497371938056758, + "learning_rate": 4.137931034482759e-05, + "loss": 0.6977, + "num_tokens": 352485472.0, + "step": 151 + }, + { + "epoch": 0.05254061527825787, + "grad_norm": 0.7819238079711345, + "learning_rate": 4.165517241379311e-05, + "loss": 0.7238, + "num_tokens": 354839391.0, + "step": 152 + }, + { + "epoch": 0.05288627722087798, + "grad_norm": 0.8592104884503061, + "learning_rate": 4.193103448275863e-05, + "loss": 0.7108, + "num_tokens": 357043632.0, + "step": 153 + }, + { + "epoch": 0.053231939163498096, + "grad_norm": 0.7706232047174564, + "learning_rate": 4.2206896551724146e-05, + "loss": 0.7029, + "num_tokens": 359361396.0, + "step": 154 + }, + { + "epoch": 0.053577601106118214, + "grad_norm": 0.9870020438536392, + "learning_rate": 4.2482758620689655e-05, + "loss": 0.7304, + "num_tokens": 361740885.0, + "step": 155 + }, + { + "epoch": 0.05392326304873833, + "grad_norm": 1.0554218992326692, + "learning_rate": 4.275862068965517e-05, + "loss": 0.7145, + "num_tokens": 363957526.0, + "step": 156 + }, + { + "epoch": 0.05426892499135845, + "grad_norm": 0.582139071481871, + "learning_rate": 4.303448275862069e-05, + "loss": 0.7146, + "num_tokens": 366227335.0, + "step": 157 + }, + { + "epoch": 0.05461458693397857, + "grad_norm": 1.068448965677722, + "learning_rate": 4.331034482758621e-05, + "loss": 0.7232, + "num_tokens": 368527770.0, + "step": 158 + }, + { + "epoch": 0.054960248876598686, + "grad_norm": 0.5329171240099785, + "learning_rate": 4.358620689655173e-05, + "loss": 0.7035, + "num_tokens": 370790507.0, + "step": 159 + }, + { + "epoch": 0.055305910819218804, + "grad_norm": 1.2586026783810411, + "learning_rate": 4.3862068965517245e-05, + "loss": 0.7326, + "num_tokens": 373067015.0, + "step": 160 + }, + { + "epoch": 0.05565157276183892, + "grad_norm": 0.9837880569753787, + "learning_rate": 4.413793103448276e-05, + "loss": 0.7167, + "num_tokens": 375501172.0, + "step": 161 + }, + { + "epoch": 0.05599723470445904, + "grad_norm": 0.9384633099741303, + "learning_rate": 4.441379310344828e-05, + "loss": 0.7226, + "num_tokens": 377854677.0, + "step": 162 + }, + { + "epoch": 0.05634289664707916, + "grad_norm": 1.1519738556824626, + "learning_rate": 4.46896551724138e-05, + "loss": 0.7362, + "num_tokens": 380287197.0, + "step": 163 + }, + { + "epoch": 0.056688558589699276, + "grad_norm": 1.043350430252056, + "learning_rate": 4.496551724137931e-05, + "loss": 0.7272, + "num_tokens": 382556344.0, + "step": 164 + }, + { + "epoch": 0.057034220532319393, + "grad_norm": 1.1247382477517336, + "learning_rate": 4.5241379310344835e-05, + "loss": 0.7083, + "num_tokens": 384903188.0, + "step": 165 + }, + { + "epoch": 0.05737988247493951, + "grad_norm": 0.8855937671393632, + "learning_rate": 4.551724137931035e-05, + "loss": 0.7303, + "num_tokens": 387251917.0, + "step": 166 + }, + { + "epoch": 0.05772554441755963, + "grad_norm": 1.1411647243629852, + "learning_rate": 4.5793103448275865e-05, + "loss": 0.6884, + "num_tokens": 389461686.0, + "step": 167 + }, + { + "epoch": 0.05807120636017975, + "grad_norm": 0.8973573737935918, + "learning_rate": 4.606896551724139e-05, + "loss": 0.7232, + "num_tokens": 391811582.0, + "step": 168 + }, + { + "epoch": 0.05841686830279986, + "grad_norm": 1.1526902094508222, + "learning_rate": 4.63448275862069e-05, + "loss": 0.7206, + "num_tokens": 394148853.0, + "step": 169 + }, + { + "epoch": 0.058762530245419976, + "grad_norm": 1.0746984364432095, + "learning_rate": 4.6620689655172425e-05, + "loss": 0.7144, + "num_tokens": 396355036.0, + "step": 170 + }, + { + "epoch": 0.059108192188040094, + "grad_norm": 0.9518903963086779, + "learning_rate": 4.689655172413793e-05, + "loss": 0.703, + "num_tokens": 398661260.0, + "step": 171 + }, + { + "epoch": 0.05945385413066021, + "grad_norm": 0.9995903536249783, + "learning_rate": 4.717241379310345e-05, + "loss": 0.7163, + "num_tokens": 400957583.0, + "step": 172 + }, + { + "epoch": 0.05979951607328033, + "grad_norm": 0.8203958785857348, + "learning_rate": 4.7448275862068964e-05, + "loss": 0.7223, + "num_tokens": 403369717.0, + "step": 173 + }, + { + "epoch": 0.06014517801590045, + "grad_norm": 1.2147731990281987, + "learning_rate": 4.7724137931034486e-05, + "loss": 0.7222, + "num_tokens": 405780049.0, + "step": 174 + }, + { + "epoch": 0.060490839958520566, + "grad_norm": 0.9022809717301816, + "learning_rate": 4.8e-05, + "loss": 0.7013, + "num_tokens": 408140167.0, + "step": 175 + }, + { + "epoch": 0.060836501901140684, + "grad_norm": 1.050648714230654, + "learning_rate": 4.827586206896552e-05, + "loss": 0.7138, + "num_tokens": 410429807.0, + "step": 176 + }, + { + "epoch": 0.0611821638437608, + "grad_norm": 0.8145480570320937, + "learning_rate": 4.855172413793104e-05, + "loss": 0.7149, + "num_tokens": 412753090.0, + "step": 177 + }, + { + "epoch": 0.06152782578638092, + "grad_norm": 1.0054753624866357, + "learning_rate": 4.8827586206896554e-05, + "loss": 0.7104, + "num_tokens": 415053865.0, + "step": 178 + }, + { + "epoch": 0.06187348772900104, + "grad_norm": 0.8548827349446674, + "learning_rate": 4.9103448275862076e-05, + "loss": 0.7128, + "num_tokens": 417284862.0, + "step": 179 + }, + { + "epoch": 0.062219149671621156, + "grad_norm": 0.6002105552755639, + "learning_rate": 4.937931034482759e-05, + "loss": 0.7118, + "num_tokens": 419578881.0, + "step": 180 + }, + { + "epoch": 0.06256481161424127, + "grad_norm": 1.107683107673729, + "learning_rate": 4.9655172413793107e-05, + "loss": 0.7282, + "num_tokens": 422066367.0, + "step": 181 + }, + { + "epoch": 0.06291047355686138, + "grad_norm": 0.8850477934882971, + "learning_rate": 4.993103448275863e-05, + "loss": 0.7062, + "num_tokens": 424541442.0, + "step": 182 + }, + { + "epoch": 0.0632561354994815, + "grad_norm": 1.0418647998717137, + "learning_rate": 5.0206896551724144e-05, + "loss": 0.7164, + "num_tokens": 426799789.0, + "step": 183 + }, + { + "epoch": 0.06360179744210162, + "grad_norm": 0.9866322854571545, + "learning_rate": 5.048275862068966e-05, + "loss": 0.7095, + "num_tokens": 429109812.0, + "step": 184 + }, + { + "epoch": 0.06394745938472174, + "grad_norm": 0.8220500322939559, + "learning_rate": 5.075862068965518e-05, + "loss": 0.711, + "num_tokens": 431319178.0, + "step": 185 + }, + { + "epoch": 0.06429312132734186, + "grad_norm": 1.1643380349268293, + "learning_rate": 5.10344827586207e-05, + "loss": 0.7034, + "num_tokens": 433591378.0, + "step": 186 + }, + { + "epoch": 0.06463878326996197, + "grad_norm": 0.7953856479900898, + "learning_rate": 5.1310344827586205e-05, + "loss": 0.7172, + "num_tokens": 436047642.0, + "step": 187 + }, + { + "epoch": 0.06498444521258209, + "grad_norm": 1.3416422670812653, + "learning_rate": 5.158620689655173e-05, + "loss": 0.7138, + "num_tokens": 438434044.0, + "step": 188 + }, + { + "epoch": 0.06533010715520221, + "grad_norm": 0.9030922035012002, + "learning_rate": 5.186206896551724e-05, + "loss": 0.7141, + "num_tokens": 440839385.0, + "step": 189 + }, + { + "epoch": 0.06567576909782233, + "grad_norm": 1.4125631245405847, + "learning_rate": 5.213793103448276e-05, + "loss": 0.7134, + "num_tokens": 443090381.0, + "step": 190 + }, + { + "epoch": 0.06602143104044245, + "grad_norm": 1.078421315455212, + "learning_rate": 5.241379310344828e-05, + "loss": 0.71, + "num_tokens": 445465216.0, + "step": 191 + }, + { + "epoch": 0.06636709298306256, + "grad_norm": 1.1339002364196766, + "learning_rate": 5.2689655172413795e-05, + "loss": 0.71, + "num_tokens": 447860655.0, + "step": 192 + }, + { + "epoch": 0.06671275492568268, + "grad_norm": 0.8596178566442743, + "learning_rate": 5.296551724137932e-05, + "loss": 0.7057, + "num_tokens": 450184646.0, + "step": 193 + }, + { + "epoch": 0.0670584168683028, + "grad_norm": 1.3790678568270909, + "learning_rate": 5.324137931034483e-05, + "loss": 0.7035, + "num_tokens": 452569957.0, + "step": 194 + }, + { + "epoch": 0.06740407881092292, + "grad_norm": 1.0390787748036805, + "learning_rate": 5.351724137931035e-05, + "loss": 0.7319, + "num_tokens": 455024875.0, + "step": 195 + }, + { + "epoch": 0.06774974075354304, + "grad_norm": 1.309925274269295, + "learning_rate": 5.379310344827587e-05, + "loss": 0.7166, + "num_tokens": 457536389.0, + "step": 196 + }, + { + "epoch": 0.06809540269616315, + "grad_norm": 1.0203712559081182, + "learning_rate": 5.4068965517241385e-05, + "loss": 0.7138, + "num_tokens": 459901229.0, + "step": 197 + }, + { + "epoch": 0.06844106463878327, + "grad_norm": 1.0971769212525957, + "learning_rate": 5.43448275862069e-05, + "loss": 0.7201, + "num_tokens": 462218756.0, + "step": 198 + }, + { + "epoch": 0.06878672658140339, + "grad_norm": 0.876460559693676, + "learning_rate": 5.462068965517242e-05, + "loss": 0.6966, + "num_tokens": 464594880.0, + "step": 199 + }, + { + "epoch": 0.06913238852402351, + "grad_norm": 1.1316941943804402, + "learning_rate": 5.489655172413794e-05, + "loss": 0.7152, + "num_tokens": 466958545.0, + "step": 200 + }, + { + "epoch": 0.06947805046664363, + "grad_norm": 0.855185780085789, + "learning_rate": 5.517241379310345e-05, + "loss": 0.7089, + "num_tokens": 469247028.0, + "step": 201 + }, + { + "epoch": 0.06982371240926374, + "grad_norm": 1.1055725145143143, + "learning_rate": 5.5448275862068975e-05, + "loss": 0.6987, + "num_tokens": 471483053.0, + "step": 202 + }, + { + "epoch": 0.07016937435188386, + "grad_norm": 0.9170336897065865, + "learning_rate": 5.5724137931034484e-05, + "loss": 0.7236, + "num_tokens": 473819939.0, + "step": 203 + }, + { + "epoch": 0.07051503629450398, + "grad_norm": 0.9278362049523065, + "learning_rate": 5.6e-05, + "loss": 0.7055, + "num_tokens": 476112202.0, + "step": 204 + }, + { + "epoch": 0.0708606982371241, + "grad_norm": 0.9449160154937349, + "learning_rate": 5.627586206896552e-05, + "loss": 0.7072, + "num_tokens": 478404403.0, + "step": 205 + }, + { + "epoch": 0.07120636017974422, + "grad_norm": 0.6526098622181347, + "learning_rate": 5.6551724137931037e-05, + "loss": 0.7235, + "num_tokens": 480796853.0, + "step": 206 + }, + { + "epoch": 0.07155202212236433, + "grad_norm": 1.31179922805551, + "learning_rate": 5.682758620689655e-05, + "loss": 0.7034, + "num_tokens": 483199501.0, + "step": 207 + }, + { + "epoch": 0.07189768406498445, + "grad_norm": 0.8958524832278518, + "learning_rate": 5.7103448275862074e-05, + "loss": 0.7189, + "num_tokens": 485614054.0, + "step": 208 + }, + { + "epoch": 0.07224334600760456, + "grad_norm": 1.2912439498883648, + "learning_rate": 5.737931034482759e-05, + "loss": 0.7083, + "num_tokens": 487827562.0, + "step": 209 + }, + { + "epoch": 0.07258900795022467, + "grad_norm": 1.0421251696269234, + "learning_rate": 5.765517241379311e-05, + "loss": 0.7031, + "num_tokens": 490059153.0, + "step": 210 + }, + { + "epoch": 0.07293466989284479, + "grad_norm": 1.0197574543609216, + "learning_rate": 5.7931034482758627e-05, + "loss": 0.7186, + "num_tokens": 492473358.0, + "step": 211 + }, + { + "epoch": 0.07328033183546491, + "grad_norm": 0.8318654597079473, + "learning_rate": 5.820689655172414e-05, + "loss": 0.7182, + "num_tokens": 494860369.0, + "step": 212 + }, + { + "epoch": 0.07362599377808503, + "grad_norm": 1.030706449769229, + "learning_rate": 5.8482758620689664e-05, + "loss": 0.7, + "num_tokens": 497170523.0, + "step": 213 + }, + { + "epoch": 0.07397165572070515, + "grad_norm": 0.7476341956398788, + "learning_rate": 5.875862068965518e-05, + "loss": 0.7001, + "num_tokens": 499548947.0, + "step": 214 + }, + { + "epoch": 0.07431731766332526, + "grad_norm": 0.8966754075924577, + "learning_rate": 5.9034482758620695e-05, + "loss": 0.7171, + "num_tokens": 502025053.0, + "step": 215 + }, + { + "epoch": 0.07466297960594538, + "grad_norm": 0.7105787426449255, + "learning_rate": 5.931034482758622e-05, + "loss": 0.694, + "num_tokens": 504282189.0, + "step": 216 + }, + { + "epoch": 0.0750086415485655, + "grad_norm": 0.8647591899831232, + "learning_rate": 5.958620689655173e-05, + "loss": 0.7126, + "num_tokens": 506598189.0, + "step": 217 + }, + { + "epoch": 0.07535430349118562, + "grad_norm": 0.6779689764303697, + "learning_rate": 5.986206896551725e-05, + "loss": 0.6988, + "num_tokens": 508892435.0, + "step": 218 + }, + { + "epoch": 0.07569996543380574, + "grad_norm": 0.7049539628997222, + "learning_rate": 6.013793103448276e-05, + "loss": 0.6986, + "num_tokens": 511163860.0, + "step": 219 + }, + { + "epoch": 0.07604562737642585, + "grad_norm": 0.9144778322979157, + "learning_rate": 6.041379310344828e-05, + "loss": 0.6948, + "num_tokens": 513524515.0, + "step": 220 + }, + { + "epoch": 0.07639128931904597, + "grad_norm": 0.9507080106150033, + "learning_rate": 6.068965517241379e-05, + "loss": 0.704, + "num_tokens": 515990605.0, + "step": 221 + }, + { + "epoch": 0.07673695126166609, + "grad_norm": 0.9946450620958396, + "learning_rate": 6.0965517241379315e-05, + "loss": 0.7101, + "num_tokens": 518218266.0, + "step": 222 + }, + { + "epoch": 0.07708261320428621, + "grad_norm": 0.5988102158537464, + "learning_rate": 6.124137931034483e-05, + "loss": 0.7101, + "num_tokens": 520506349.0, + "step": 223 + }, + { + "epoch": 0.07742827514690633, + "grad_norm": 1.7159339172962982, + "learning_rate": 6.151724137931035e-05, + "loss": 0.7036, + "num_tokens": 522937983.0, + "step": 224 + }, + { + "epoch": 0.07777393708952644, + "grad_norm": 1.1605650590574434, + "learning_rate": 6.179310344827586e-05, + "loss": 0.7114, + "num_tokens": 525288490.0, + "step": 225 + }, + { + "epoch": 0.07811959903214656, + "grad_norm": 1.984813505030011, + "learning_rate": 6.206896551724138e-05, + "loss": 0.7215, + "num_tokens": 527601511.0, + "step": 226 + }, + { + "epoch": 0.07846526097476668, + "grad_norm": 1.9410175057220138, + "learning_rate": 6.23448275862069e-05, + "loss": 0.7365, + "num_tokens": 530003533.0, + "step": 227 + }, + { + "epoch": 0.0788109229173868, + "grad_norm": 1.2043718596372153, + "learning_rate": 6.262068965517241e-05, + "loss": 0.7026, + "num_tokens": 532317231.0, + "step": 228 + }, + { + "epoch": 0.07915658486000691, + "grad_norm": 1.548664678240791, + "learning_rate": 6.289655172413794e-05, + "loss": 0.6977, + "num_tokens": 534636808.0, + "step": 229 + }, + { + "epoch": 0.07950224680262703, + "grad_norm": 1.1504835175618584, + "learning_rate": 6.317241379310346e-05, + "loss": 0.7124, + "num_tokens": 536967339.0, + "step": 230 + }, + { + "epoch": 0.07984790874524715, + "grad_norm": 1.5900417245686578, + "learning_rate": 6.344827586206897e-05, + "loss": 0.6941, + "num_tokens": 539157144.0, + "step": 231 + }, + { + "epoch": 0.08019357068786727, + "grad_norm": 1.4941892722346075, + "learning_rate": 6.372413793103449e-05, + "loss": 0.7118, + "num_tokens": 541448568.0, + "step": 232 + }, + { + "epoch": 0.08053923263048739, + "grad_norm": 1.1506457919531141, + "learning_rate": 6.400000000000001e-05, + "loss": 0.7173, + "num_tokens": 543757589.0, + "step": 233 + }, + { + "epoch": 0.0808848945731075, + "grad_norm": 1.3591672059610354, + "learning_rate": 6.427586206896553e-05, + "loss": 0.709, + "num_tokens": 546055736.0, + "step": 234 + }, + { + "epoch": 0.08123055651572762, + "grad_norm": 1.099852732427473, + "learning_rate": 6.455172413793104e-05, + "loss": 0.7136, + "num_tokens": 548490878.0, + "step": 235 + }, + { + "epoch": 0.08157621845834774, + "grad_norm": 1.1866711082429422, + "learning_rate": 6.482758620689655e-05, + "loss": 0.6989, + "num_tokens": 550795757.0, + "step": 236 + }, + { + "epoch": 0.08192188040096786, + "grad_norm": 1.0683161397356664, + "learning_rate": 6.510344827586207e-05, + "loss": 0.6915, + "num_tokens": 553237843.0, + "step": 237 + }, + { + "epoch": 0.08226754234358798, + "grad_norm": 1.0014269061191867, + "learning_rate": 6.53793103448276e-05, + "loss": 0.7067, + "num_tokens": 555609459.0, + "step": 238 + }, + { + "epoch": 0.0826132042862081, + "grad_norm": 0.7434483096314644, + "learning_rate": 6.56551724137931e-05, + "loss": 0.7027, + "num_tokens": 557980720.0, + "step": 239 + }, + { + "epoch": 0.08295886622882821, + "grad_norm": 1.0347156189570355, + "learning_rate": 6.593103448275862e-05, + "loss": 0.6948, + "num_tokens": 560292519.0, + "step": 240 + }, + { + "epoch": 0.08330452817144833, + "grad_norm": 1.1930974097925942, + "learning_rate": 6.620689655172415e-05, + "loss": 0.6944, + "num_tokens": 562637251.0, + "step": 241 + }, + { + "epoch": 0.08365019011406843, + "grad_norm": 0.6666553380523713, + "learning_rate": 6.648275862068966e-05, + "loss": 0.6945, + "num_tokens": 564999649.0, + "step": 242 + }, + { + "epoch": 0.08399585205668855, + "grad_norm": 1.9243727581703054, + "learning_rate": 6.675862068965518e-05, + "loss": 0.6772, + "num_tokens": 567304230.0, + "step": 243 + }, + { + "epoch": 0.08434151399930867, + "grad_norm": 1.3840601785141495, + "learning_rate": 6.70344827586207e-05, + "loss": 0.7033, + "num_tokens": 569629888.0, + "step": 244 + }, + { + "epoch": 0.08468717594192879, + "grad_norm": 1.9433935671131355, + "learning_rate": 6.731034482758621e-05, + "loss": 0.7017, + "num_tokens": 571856744.0, + "step": 245 + }, + { + "epoch": 0.0850328378845489, + "grad_norm": 1.7727674338865027, + "learning_rate": 6.758620689655173e-05, + "loss": 0.7033, + "num_tokens": 574162249.0, + "step": 246 + }, + { + "epoch": 0.08537849982716902, + "grad_norm": 1.3722317873887826, + "learning_rate": 6.786206896551725e-05, + "loss": 0.7165, + "num_tokens": 576538644.0, + "step": 247 + }, + { + "epoch": 0.08572416176978914, + "grad_norm": 1.1680037272703208, + "learning_rate": 6.813793103448276e-05, + "loss": 0.7074, + "num_tokens": 579013717.0, + "step": 248 + }, + { + "epoch": 0.08606982371240926, + "grad_norm": 1.5957327301656017, + "learning_rate": 6.841379310344828e-05, + "loss": 0.6988, + "num_tokens": 581286691.0, + "step": 249 + }, + { + "epoch": 0.08641548565502938, + "grad_norm": 1.2046009460189067, + "learning_rate": 6.86896551724138e-05, + "loss": 0.7019, + "num_tokens": 583646190.0, + "step": 250 + }, + { + "epoch": 0.0867611475976495, + "grad_norm": 1.7449366521877085, + "learning_rate": 6.896551724137931e-05, + "loss": 0.7005, + "num_tokens": 585917794.0, + "step": 251 + }, + { + "epoch": 0.08710680954026961, + "grad_norm": 1.4306064719383027, + "learning_rate": 6.924137931034484e-05, + "loss": 0.6924, + "num_tokens": 588138109.0, + "step": 252 + }, + { + "epoch": 0.08745247148288973, + "grad_norm": 1.5456545611198949, + "learning_rate": 6.951724137931034e-05, + "loss": 0.6958, + "num_tokens": 590434348.0, + "step": 253 + }, + { + "epoch": 0.08779813342550985, + "grad_norm": 1.3933380155584882, + "learning_rate": 6.979310344827587e-05, + "loss": 0.7071, + "num_tokens": 592866511.0, + "step": 254 + }, + { + "epoch": 0.08814379536812997, + "grad_norm": 1.487962732840059, + "learning_rate": 7.006896551724139e-05, + "loss": 0.7021, + "num_tokens": 595105372.0, + "step": 255 + }, + { + "epoch": 0.08848945731075009, + "grad_norm": 1.2753789290830655, + "learning_rate": 7.03448275862069e-05, + "loss": 0.6908, + "num_tokens": 597453436.0, + "step": 256 + }, + { + "epoch": 0.0888351192533702, + "grad_norm": 1.298149849085115, + "learning_rate": 7.062068965517242e-05, + "loss": 0.7023, + "num_tokens": 599772241.0, + "step": 257 + }, + { + "epoch": 0.08918078119599032, + "grad_norm": 1.217125692018453, + "learning_rate": 7.089655172413794e-05, + "loss": 0.7012, + "num_tokens": 602185701.0, + "step": 258 + }, + { + "epoch": 0.08952644313861044, + "grad_norm": 1.1883730812217992, + "learning_rate": 7.117241379310345e-05, + "loss": 0.6959, + "num_tokens": 604567543.0, + "step": 259 + }, + { + "epoch": 0.08987210508123056, + "grad_norm": 1.0831663735834525, + "learning_rate": 7.144827586206897e-05, + "loss": 0.6934, + "num_tokens": 606913192.0, + "step": 260 + }, + { + "epoch": 0.09021776702385068, + "grad_norm": 0.9746730620076356, + "learning_rate": 7.17241379310345e-05, + "loss": 0.7008, + "num_tokens": 609282261.0, + "step": 261 + }, + { + "epoch": 0.0905634289664708, + "grad_norm": 0.7402597608567184, + "learning_rate": 7.2e-05, + "loss": 0.6883, + "num_tokens": 611640132.0, + "step": 262 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.0055165785713271, + "learning_rate": 7.227586206896552e-05, + "loss": 0.7139, + "num_tokens": 614045141.0, + "step": 263 + }, + { + "epoch": 0.09125475285171103, + "grad_norm": 0.5464502547515364, + "learning_rate": 7.255172413793105e-05, + "loss": 0.7089, + "num_tokens": 616404508.0, + "step": 264 + }, + { + "epoch": 0.09160041479433115, + "grad_norm": 1.135509676557934, + "learning_rate": 7.282758620689655e-05, + "loss": 0.6976, + "num_tokens": 618757455.0, + "step": 265 + }, + { + "epoch": 0.09194607673695127, + "grad_norm": 0.7458841157543672, + "learning_rate": 7.310344827586208e-05, + "loss": 0.6889, + "num_tokens": 621052622.0, + "step": 266 + }, + { + "epoch": 0.09229173867957138, + "grad_norm": 1.2455439399749555, + "learning_rate": 7.337931034482759e-05, + "loss": 0.7043, + "num_tokens": 623339412.0, + "step": 267 + }, + { + "epoch": 0.0926374006221915, + "grad_norm": 1.138316413962965, + "learning_rate": 7.365517241379311e-05, + "loss": 0.6958, + "num_tokens": 625647172.0, + "step": 268 + }, + { + "epoch": 0.09298306256481162, + "grad_norm": 0.8726454283956959, + "learning_rate": 7.393103448275863e-05, + "loss": 0.6806, + "num_tokens": 627913561.0, + "step": 269 + }, + { + "epoch": 0.09332872450743174, + "grad_norm": 1.0876201492366608, + "learning_rate": 7.420689655172414e-05, + "loss": 0.7119, + "num_tokens": 630197340.0, + "step": 270 + }, + { + "epoch": 0.09367438645005186, + "grad_norm": 0.7733630637075924, + "learning_rate": 7.448275862068966e-05, + "loss": 0.6931, + "num_tokens": 632499021.0, + "step": 271 + }, + { + "epoch": 0.09402004839267197, + "grad_norm": 1.111664877807677, + "learning_rate": 7.475862068965518e-05, + "loss": 0.6797, + "num_tokens": 634802444.0, + "step": 272 + }, + { + "epoch": 0.09436571033529209, + "grad_norm": 0.9122161783683577, + "learning_rate": 7.503448275862069e-05, + "loss": 0.6848, + "num_tokens": 637024078.0, + "step": 273 + }, + { + "epoch": 0.0947113722779122, + "grad_norm": 0.9000420275340484, + "learning_rate": 7.531034482758621e-05, + "loss": 0.7097, + "num_tokens": 639278981.0, + "step": 274 + }, + { + "epoch": 0.09505703422053231, + "grad_norm": 0.9927997257677583, + "learning_rate": 7.558620689655173e-05, + "loss": 0.7084, + "num_tokens": 641559187.0, + "step": 275 + }, + { + "epoch": 0.09540269616315243, + "grad_norm": 0.595089510074961, + "learning_rate": 7.586206896551724e-05, + "loss": 0.6981, + "num_tokens": 643824036.0, + "step": 276 + }, + { + "epoch": 0.09574835810577255, + "grad_norm": 1.454341628047847, + "learning_rate": 7.613793103448277e-05, + "loss": 0.7075, + "num_tokens": 646193569.0, + "step": 277 + }, + { + "epoch": 0.09609402004839267, + "grad_norm": 1.2806872879871047, + "learning_rate": 7.641379310344829e-05, + "loss": 0.7046, + "num_tokens": 648530275.0, + "step": 278 + }, + { + "epoch": 0.09643968199101279, + "grad_norm": 0.8952511336846225, + "learning_rate": 7.66896551724138e-05, + "loss": 0.6971, + "num_tokens": 650859888.0, + "step": 279 + }, + { + "epoch": 0.0967853439336329, + "grad_norm": 1.3874720133983718, + "learning_rate": 7.696551724137932e-05, + "loss": 0.6986, + "num_tokens": 653237461.0, + "step": 280 + }, + { + "epoch": 0.09713100587625302, + "grad_norm": 0.8975662650525933, + "learning_rate": 7.724137931034484e-05, + "loss": 0.697, + "num_tokens": 655501633.0, + "step": 281 + }, + { + "epoch": 0.09747666781887314, + "grad_norm": 1.2127287102857875, + "learning_rate": 7.751724137931035e-05, + "loss": 0.698, + "num_tokens": 657892341.0, + "step": 282 + }, + { + "epoch": 0.09782232976149326, + "grad_norm": 1.0684794860622633, + "learning_rate": 7.779310344827587e-05, + "loss": 0.7027, + "num_tokens": 660211835.0, + "step": 283 + }, + { + "epoch": 0.09816799170411337, + "grad_norm": 0.9724390611007337, + "learning_rate": 7.806896551724138e-05, + "loss": 0.7086, + "num_tokens": 662495958.0, + "step": 284 + }, + { + "epoch": 0.09851365364673349, + "grad_norm": 0.9890746582625942, + "learning_rate": 7.83448275862069e-05, + "loss": 0.682, + "num_tokens": 664825607.0, + "step": 285 + }, + { + "epoch": 0.09885931558935361, + "grad_norm": 0.9521052208344568, + "learning_rate": 7.862068965517242e-05, + "loss": 0.6967, + "num_tokens": 667185621.0, + "step": 286 + }, + { + "epoch": 0.09920497753197373, + "grad_norm": 0.5872551608920765, + "learning_rate": 7.889655172413793e-05, + "loss": 0.7138, + "num_tokens": 669519354.0, + "step": 287 + }, + { + "epoch": 0.09955063947459385, + "grad_norm": 1.0873675042837323, + "learning_rate": 7.917241379310345e-05, + "loss": 0.6948, + "num_tokens": 671846829.0, + "step": 288 + }, + { + "epoch": 0.09989630141721396, + "grad_norm": 0.8648169705503227, + "learning_rate": 7.944827586206898e-05, + "loss": 0.6924, + "num_tokens": 674148172.0, + "step": 289 + }, + { + "epoch": 0.10024196335983408, + "grad_norm": 0.9216484637326665, + "learning_rate": 7.972413793103448e-05, + "loss": 0.7007, + "num_tokens": 676439026.0, + "step": 290 + }, + { + "epoch": 0.1005876253024542, + "grad_norm": 0.8906713041792911, + "learning_rate": 8e-05, + "loss": 0.7089, + "num_tokens": 678787516.0, + "step": 291 + }, + { + "epoch": 0.10093328724507432, + "grad_norm": 0.8857322378121782, + "learning_rate": 7.999997086755005e-05, + "loss": 0.6986, + "num_tokens": 681216767.0, + "step": 292 + }, + { + "epoch": 0.10127894918769444, + "grad_norm": 0.9887428852571887, + "learning_rate": 7.99998834702426e-05, + "loss": 0.6984, + "num_tokens": 683618739.0, + "step": 293 + }, + { + "epoch": 0.10162461113031455, + "grad_norm": 0.7687099383073623, + "learning_rate": 7.999973780820498e-05, + "loss": 0.6858, + "num_tokens": 686091351.0, + "step": 294 + }, + { + "epoch": 0.10197027307293467, + "grad_norm": 0.8070721314266507, + "learning_rate": 7.999953388164936e-05, + "loss": 0.6963, + "num_tokens": 688492964.0, + "step": 295 + }, + { + "epoch": 0.10231593501555479, + "grad_norm": 0.8901939076819861, + "learning_rate": 7.999927169087277e-05, + "loss": 0.7001, + "num_tokens": 690889590.0, + "step": 296 + }, + { + "epoch": 0.10266159695817491, + "grad_norm": 0.8617005315161973, + "learning_rate": 7.999895123625716e-05, + "loss": 0.6967, + "num_tokens": 693225926.0, + "step": 297 + }, + { + "epoch": 0.10300725890079503, + "grad_norm": 0.596840269904407, + "learning_rate": 7.99985725182693e-05, + "loss": 0.6885, + "num_tokens": 695524308.0, + "step": 298 + }, + { + "epoch": 0.10335292084341514, + "grad_norm": 1.0173827901463235, + "learning_rate": 7.999813553746083e-05, + "loss": 0.7039, + "num_tokens": 697824611.0, + "step": 299 + }, + { + "epoch": 0.10369858278603526, + "grad_norm": 0.8874306379940305, + "learning_rate": 7.99976402944683e-05, + "loss": 0.6888, + "num_tokens": 700176190.0, + "step": 300 + }, + { + "epoch": 0.10404424472865538, + "grad_norm": 0.7489150598188141, + "learning_rate": 7.999708679001307e-05, + "loss": 0.6942, + "num_tokens": 702468839.0, + "step": 301 + }, + { + "epoch": 0.1043899066712755, + "grad_norm": 1.157959309996441, + "learning_rate": 7.999647502490141e-05, + "loss": 0.6909, + "num_tokens": 704785933.0, + "step": 302 + }, + { + "epoch": 0.10473556861389562, + "grad_norm": 0.7119099167867494, + "learning_rate": 7.999580500002444e-05, + "loss": 0.708, + "num_tokens": 707097800.0, + "step": 303 + }, + { + "epoch": 0.10508123055651573, + "grad_norm": 1.1233987008921111, + "learning_rate": 7.999507671635814e-05, + "loss": 0.6944, + "num_tokens": 709356858.0, + "step": 304 + }, + { + "epoch": 0.10542689249913585, + "grad_norm": 0.7726501218242386, + "learning_rate": 7.999429017496337e-05, + "loss": 0.7018, + "num_tokens": 711739782.0, + "step": 305 + }, + { + "epoch": 0.10577255444175596, + "grad_norm": 1.3481051637224188, + "learning_rate": 7.999344537698581e-05, + "loss": 0.7111, + "num_tokens": 714073801.0, + "step": 306 + }, + { + "epoch": 0.10611821638437607, + "grad_norm": 1.0783453329483956, + "learning_rate": 7.999254232365605e-05, + "loss": 0.7191, + "num_tokens": 716402217.0, + "step": 307 + }, + { + "epoch": 0.10646387832699619, + "grad_norm": 1.0700151008928058, + "learning_rate": 7.99915810162895e-05, + "loss": 0.7191, + "num_tokens": 718858280.0, + "step": 308 + }, + { + "epoch": 0.10680954026961631, + "grad_norm": 1.0968425201547842, + "learning_rate": 7.999056145628642e-05, + "loss": 0.7117, + "num_tokens": 721206186.0, + "step": 309 + }, + { + "epoch": 0.10715520221223643, + "grad_norm": 0.8334145125925164, + "learning_rate": 7.998948364513197e-05, + "loss": 0.7115, + "num_tokens": 723597671.0, + "step": 310 + }, + { + "epoch": 0.10750086415485655, + "grad_norm": 1.554717653070663, + "learning_rate": 7.99883475843961e-05, + "loss": 0.7149, + "num_tokens": 726083099.0, + "step": 311 + }, + { + "epoch": 0.10784652609747666, + "grad_norm": 1.3739057268202843, + "learning_rate": 7.998715327573365e-05, + "loss": 0.7023, + "num_tokens": 728498279.0, + "step": 312 + }, + { + "epoch": 0.10819218804009678, + "grad_norm": 0.8360408141229195, + "learning_rate": 7.99859007208843e-05, + "loss": 0.6845, + "num_tokens": 730825899.0, + "step": 313 + }, + { + "epoch": 0.1085378499827169, + "grad_norm": 1.0524586634295543, + "learning_rate": 7.998458992167255e-05, + "loss": 0.6874, + "num_tokens": 733034402.0, + "step": 314 + }, + { + "epoch": 0.10888351192533702, + "grad_norm": 0.696929349143732, + "learning_rate": 7.998322088000779e-05, + "loss": 0.7127, + "num_tokens": 735506694.0, + "step": 315 + }, + { + "epoch": 0.10922917386795714, + "grad_norm": 1.0044519332658897, + "learning_rate": 7.998179359788418e-05, + "loss": 0.6951, + "num_tokens": 737838781.0, + "step": 316 + }, + { + "epoch": 0.10957483581057725, + "grad_norm": 0.7708743421728953, + "learning_rate": 7.998030807738079e-05, + "loss": 0.7129, + "num_tokens": 740180538.0, + "step": 317 + }, + { + "epoch": 0.10992049775319737, + "grad_norm": 0.8077441390505878, + "learning_rate": 7.997876432066145e-05, + "loss": 0.7047, + "num_tokens": 742515800.0, + "step": 318 + }, + { + "epoch": 0.11026615969581749, + "grad_norm": 0.7676928527488951, + "learning_rate": 7.997716232997487e-05, + "loss": 0.7054, + "num_tokens": 744909576.0, + "step": 319 + }, + { + "epoch": 0.11061182163843761, + "grad_norm": 0.8499848951090085, + "learning_rate": 7.997550210765457e-05, + "loss": 0.682, + "num_tokens": 747289640.0, + "step": 320 + }, + { + "epoch": 0.11095748358105773, + "grad_norm": 0.8416097531462226, + "learning_rate": 7.997378365611888e-05, + "loss": 0.6867, + "num_tokens": 749562061.0, + "step": 321 + }, + { + "epoch": 0.11130314552367784, + "grad_norm": 0.7444937632171702, + "learning_rate": 7.997200697787098e-05, + "loss": 0.6921, + "num_tokens": 751840397.0, + "step": 322 + }, + { + "epoch": 0.11164880746629796, + "grad_norm": 0.8015095549688365, + "learning_rate": 7.997017207549884e-05, + "loss": 0.6978, + "num_tokens": 754179786.0, + "step": 323 + }, + { + "epoch": 0.11199446940891808, + "grad_norm": 0.6871825143542051, + "learning_rate": 7.996827895167523e-05, + "loss": 0.6874, + "num_tokens": 756503904.0, + "step": 324 + }, + { + "epoch": 0.1123401313515382, + "grad_norm": 0.958497866215431, + "learning_rate": 7.996632760915775e-05, + "loss": 0.6911, + "num_tokens": 758897268.0, + "step": 325 + }, + { + "epoch": 0.11268579329415832, + "grad_norm": 0.6873164249150165, + "learning_rate": 7.996431805078881e-05, + "loss": 0.6883, + "num_tokens": 761221450.0, + "step": 326 + }, + { + "epoch": 0.11303145523677843, + "grad_norm": 1.0817764647800767, + "learning_rate": 7.996225027949559e-05, + "loss": 0.695, + "num_tokens": 763515479.0, + "step": 327 + }, + { + "epoch": 0.11337711717939855, + "grad_norm": 0.8611981023591838, + "learning_rate": 7.996012429829011e-05, + "loss": 0.6959, + "num_tokens": 765899703.0, + "step": 328 + }, + { + "epoch": 0.11372277912201867, + "grad_norm": 0.9859302359610935, + "learning_rate": 7.995794011026911e-05, + "loss": 0.6899, + "num_tokens": 768199256.0, + "step": 329 + }, + { + "epoch": 0.11406844106463879, + "grad_norm": 0.6293751886043986, + "learning_rate": 7.995569771861421e-05, + "loss": 0.693, + "num_tokens": 770586189.0, + "step": 330 + }, + { + "epoch": 0.1144141030072589, + "grad_norm": 1.041080382816587, + "learning_rate": 7.995339712659173e-05, + "loss": 0.6927, + "num_tokens": 772987673.0, + "step": 331 + }, + { + "epoch": 0.11475976494987902, + "grad_norm": 1.0792671151467503, + "learning_rate": 7.995103833755279e-05, + "loss": 0.6871, + "num_tokens": 775411739.0, + "step": 332 + }, + { + "epoch": 0.11510542689249914, + "grad_norm": 0.6136864084317608, + "learning_rate": 7.99486213549333e-05, + "loss": 0.6952, + "num_tokens": 777772924.0, + "step": 333 + }, + { + "epoch": 0.11545108883511926, + "grad_norm": 1.515540323199239, + "learning_rate": 7.994614618225395e-05, + "loss": 0.6921, + "num_tokens": 779957935.0, + "step": 334 + }, + { + "epoch": 0.11579675077773938, + "grad_norm": 0.8138618474246415, + "learning_rate": 7.994361282312013e-05, + "loss": 0.7008, + "num_tokens": 782356219.0, + "step": 335 + }, + { + "epoch": 0.1161424127203595, + "grad_norm": 1.6282351949417546, + "learning_rate": 7.994102128122205e-05, + "loss": 0.6846, + "num_tokens": 784672696.0, + "step": 336 + }, + { + "epoch": 0.11648807466297961, + "grad_norm": 0.9385244196948598, + "learning_rate": 7.993837156033463e-05, + "loss": 0.6934, + "num_tokens": 787004666.0, + "step": 337 + }, + { + "epoch": 0.11683373660559972, + "grad_norm": 2.0223773205509956, + "learning_rate": 7.993566366431757e-05, + "loss": 0.6999, + "num_tokens": 789343205.0, + "step": 338 + }, + { + "epoch": 0.11717939854821983, + "grad_norm": 1.6593409472481893, + "learning_rate": 7.993289759711528e-05, + "loss": 0.6956, + "num_tokens": 791561573.0, + "step": 339 + }, + { + "epoch": 0.11752506049083995, + "grad_norm": 1.4144489016925788, + "learning_rate": 7.993007336275693e-05, + "loss": 0.6733, + "num_tokens": 793904206.0, + "step": 340 + }, + { + "epoch": 0.11787072243346007, + "grad_norm": 1.442725495786414, + "learning_rate": 7.992719096535636e-05, + "loss": 0.6999, + "num_tokens": 796192163.0, + "step": 341 + }, + { + "epoch": 0.11821638437608019, + "grad_norm": 1.1210163277532932, + "learning_rate": 7.992425040911223e-05, + "loss": 0.7081, + "num_tokens": 798562267.0, + "step": 342 + }, + { + "epoch": 0.1185620463187003, + "grad_norm": 0.9521213158543966, + "learning_rate": 7.992125169830782e-05, + "loss": 0.6864, + "num_tokens": 800897797.0, + "step": 343 + }, + { + "epoch": 0.11890770826132042, + "grad_norm": 1.3346389933713596, + "learning_rate": 7.99181948373112e-05, + "loss": 0.6735, + "num_tokens": 803216393.0, + "step": 344 + }, + { + "epoch": 0.11925337020394054, + "grad_norm": 0.9460110716948854, + "learning_rate": 7.991507983057512e-05, + "loss": 0.6994, + "num_tokens": 805426107.0, + "step": 345 + }, + { + "epoch": 0.11959903214656066, + "grad_norm": 1.636088119332799, + "learning_rate": 7.991190668263696e-05, + "loss": 0.6859, + "num_tokens": 807690662.0, + "step": 346 + }, + { + "epoch": 0.11994469408918078, + "grad_norm": 1.4743393864626981, + "learning_rate": 7.990867539811886e-05, + "loss": 0.6857, + "num_tokens": 810069568.0, + "step": 347 + }, + { + "epoch": 0.1202903560318009, + "grad_norm": 1.144471301810938, + "learning_rate": 7.990538598172766e-05, + "loss": 0.6786, + "num_tokens": 812403544.0, + "step": 348 + }, + { + "epoch": 0.12063601797442101, + "grad_norm": 0.9041477229403952, + "learning_rate": 7.990203843825481e-05, + "loss": 0.7015, + "num_tokens": 814712006.0, + "step": 349 + }, + { + "epoch": 0.12098167991704113, + "grad_norm": 1.4901426257473107, + "learning_rate": 7.98986327725765e-05, + "loss": 0.7064, + "num_tokens": 817152265.0, + "step": 350 + }, + { + "epoch": 0.12132734185966125, + "grad_norm": 1.1130971912265126, + "learning_rate": 7.989516898965351e-05, + "loss": 0.6852, + "num_tokens": 819439940.0, + "step": 351 + }, + { + "epoch": 0.12167300380228137, + "grad_norm": 1.3889429435407803, + "learning_rate": 7.989164709453134e-05, + "loss": 0.6804, + "num_tokens": 821782315.0, + "step": 352 + }, + { + "epoch": 0.12201866574490149, + "grad_norm": 1.0771950560624082, + "learning_rate": 7.988806709234012e-05, + "loss": 0.6767, + "num_tokens": 824217291.0, + "step": 353 + }, + { + "epoch": 0.1223643276875216, + "grad_norm": 1.333269216833232, + "learning_rate": 7.988442898829459e-05, + "loss": 0.6942, + "num_tokens": 826557598.0, + "step": 354 + }, + { + "epoch": 0.12270998963014172, + "grad_norm": 1.146515196854587, + "learning_rate": 7.988073278769417e-05, + "loss": 0.6761, + "num_tokens": 828871958.0, + "step": 355 + }, + { + "epoch": 0.12305565157276184, + "grad_norm": 1.2418382185081331, + "learning_rate": 7.987697849592287e-05, + "loss": 0.6724, + "num_tokens": 831102404.0, + "step": 356 + }, + { + "epoch": 0.12340131351538196, + "grad_norm": 1.1675477808642603, + "learning_rate": 7.987316611844933e-05, + "loss": 0.6951, + "num_tokens": 833571788.0, + "step": 357 + }, + { + "epoch": 0.12374697545800208, + "grad_norm": 1.1212649136823032, + "learning_rate": 7.986929566082681e-05, + "loss": 0.6843, + "num_tokens": 835768991.0, + "step": 358 + }, + { + "epoch": 0.1240926374006222, + "grad_norm": 1.1001762048921309, + "learning_rate": 7.986536712869315e-05, + "loss": 0.68, + "num_tokens": 838124224.0, + "step": 359 + }, + { + "epoch": 0.12443829934324231, + "grad_norm": 1.230695121465166, + "learning_rate": 7.986138052777079e-05, + "loss": 0.6884, + "num_tokens": 840391159.0, + "step": 360 + }, + { + "epoch": 0.12478396128586243, + "grad_norm": 1.0495160483194945, + "learning_rate": 7.985733586386678e-05, + "loss": 0.6862, + "num_tokens": 842783534.0, + "step": 361 + }, + { + "epoch": 0.12512962322848253, + "grad_norm": 1.2386372164603723, + "learning_rate": 7.985323314287272e-05, + "loss": 0.69, + "num_tokens": 845092329.0, + "step": 362 + }, + { + "epoch": 0.12547528517110265, + "grad_norm": 1.209782625725921, + "learning_rate": 7.984907237076479e-05, + "loss": 0.6924, + "num_tokens": 847347363.0, + "step": 363 + }, + { + "epoch": 0.12582094711372277, + "grad_norm": 1.0125950689075163, + "learning_rate": 7.984485355360371e-05, + "loss": 0.6806, + "num_tokens": 849680487.0, + "step": 364 + }, + { + "epoch": 0.1261666090563429, + "grad_norm": 0.79931824340975, + "learning_rate": 7.984057669753477e-05, + "loss": 0.6876, + "num_tokens": 851980604.0, + "step": 365 + }, + { + "epoch": 0.126512270998963, + "grad_norm": 1.489545534073644, + "learning_rate": 7.983624180878782e-05, + "loss": 0.6825, + "num_tokens": 854344038.0, + "step": 366 + }, + { + "epoch": 0.12685793294158312, + "grad_norm": 1.25142217259151, + "learning_rate": 7.983184889367718e-05, + "loss": 0.6875, + "num_tokens": 856723062.0, + "step": 367 + }, + { + "epoch": 0.12720359488420324, + "grad_norm": 1.1365016908699426, + "learning_rate": 7.982739795860177e-05, + "loss": 0.6782, + "num_tokens": 859055963.0, + "step": 368 + }, + { + "epoch": 0.12754925682682336, + "grad_norm": 1.0574870548419908, + "learning_rate": 7.982288901004496e-05, + "loss": 0.6665, + "num_tokens": 861363812.0, + "step": 369 + }, + { + "epoch": 0.12789491876944348, + "grad_norm": 1.0996762728867564, + "learning_rate": 7.981832205457467e-05, + "loss": 0.69, + "num_tokens": 863821983.0, + "step": 370 + }, + { + "epoch": 0.1282405807120636, + "grad_norm": 1.0464576250091726, + "learning_rate": 7.981369709884329e-05, + "loss": 0.6844, + "num_tokens": 866103060.0, + "step": 371 + }, + { + "epoch": 0.1285862426546837, + "grad_norm": 1.1173050819432968, + "learning_rate": 7.980901414958769e-05, + "loss": 0.6753, + "num_tokens": 868438447.0, + "step": 372 + }, + { + "epoch": 0.12893190459730383, + "grad_norm": 0.9322098736898108, + "learning_rate": 7.980427321362925e-05, + "loss": 0.6734, + "num_tokens": 870700785.0, + "step": 373 + }, + { + "epoch": 0.12927756653992395, + "grad_norm": 1.3037928514085664, + "learning_rate": 7.979947429787378e-05, + "loss": 0.7043, + "num_tokens": 873140472.0, + "step": 374 + }, + { + "epoch": 0.12962322848254407, + "grad_norm": 1.0995241694419917, + "learning_rate": 7.979461740931156e-05, + "loss": 0.6937, + "num_tokens": 875527359.0, + "step": 375 + }, + { + "epoch": 0.12996889042516419, + "grad_norm": 1.1447417908456636, + "learning_rate": 7.97897025550173e-05, + "loss": 0.6875, + "num_tokens": 877958509.0, + "step": 376 + }, + { + "epoch": 0.1303145523677843, + "grad_norm": 0.9761571723754631, + "learning_rate": 7.978472974215019e-05, + "loss": 0.6559, + "num_tokens": 880386334.0, + "step": 377 + }, + { + "epoch": 0.13066021431040442, + "grad_norm": 1.1843923554722389, + "learning_rate": 7.97796989779538e-05, + "loss": 0.692, + "num_tokens": 882691702.0, + "step": 378 + }, + { + "epoch": 0.13100587625302454, + "grad_norm": 0.9786774014037657, + "learning_rate": 7.977461026975612e-05, + "loss": 0.675, + "num_tokens": 884900181.0, + "step": 379 + }, + { + "epoch": 0.13135153819564466, + "grad_norm": 1.2325916494671205, + "learning_rate": 7.976946362496955e-05, + "loss": 0.6795, + "num_tokens": 887200224.0, + "step": 380 + }, + { + "epoch": 0.13169720013826478, + "grad_norm": 1.1587378094828298, + "learning_rate": 7.976425905109089e-05, + "loss": 0.6735, + "num_tokens": 889537696.0, + "step": 381 + }, + { + "epoch": 0.1320428620808849, + "grad_norm": 0.9681318246124302, + "learning_rate": 7.975899655570131e-05, + "loss": 0.7018, + "num_tokens": 891843780.0, + "step": 382 + }, + { + "epoch": 0.132388524023505, + "grad_norm": 0.9417073841478321, + "learning_rate": 7.975367614646637e-05, + "loss": 0.6736, + "num_tokens": 894232051.0, + "step": 383 + }, + { + "epoch": 0.13273418596612513, + "grad_norm": 1.1778735880407256, + "learning_rate": 7.974829783113594e-05, + "loss": 0.6925, + "num_tokens": 896630686.0, + "step": 384 + }, + { + "epoch": 0.13307984790874525, + "grad_norm": 1.0275387052157854, + "learning_rate": 7.974286161754433e-05, + "loss": 0.6989, + "num_tokens": 899019185.0, + "step": 385 + }, + { + "epoch": 0.13342550985136536, + "grad_norm": 1.1894463153596653, + "learning_rate": 7.973736751361008e-05, + "loss": 0.6976, + "num_tokens": 901319970.0, + "step": 386 + }, + { + "epoch": 0.13377117179398548, + "grad_norm": 1.062230542849221, + "learning_rate": 7.973181552733613e-05, + "loss": 0.6849, + "num_tokens": 903607467.0, + "step": 387 + }, + { + "epoch": 0.1341168337366056, + "grad_norm": 1.1234192776019418, + "learning_rate": 7.972620566680972e-05, + "loss": 0.6935, + "num_tokens": 906070989.0, + "step": 388 + }, + { + "epoch": 0.13446249567922572, + "grad_norm": 0.9277483816737434, + "learning_rate": 7.972053794020234e-05, + "loss": 0.6837, + "num_tokens": 908401611.0, + "step": 389 + }, + { + "epoch": 0.13480815762184584, + "grad_norm": 1.2627915267734544, + "learning_rate": 7.971481235576984e-05, + "loss": 0.6668, + "num_tokens": 910551071.0, + "step": 390 + }, + { + "epoch": 0.13515381956446595, + "grad_norm": 1.1495363608257083, + "learning_rate": 7.970902892185232e-05, + "loss": 0.673, + "num_tokens": 912960973.0, + "step": 391 + }, + { + "epoch": 0.13549948150708607, + "grad_norm": 1.0304687786805438, + "learning_rate": 7.970318764687412e-05, + "loss": 0.6735, + "num_tokens": 915252388.0, + "step": 392 + }, + { + "epoch": 0.1358451434497062, + "grad_norm": 1.096791445079324, + "learning_rate": 7.96972885393439e-05, + "loss": 0.6733, + "num_tokens": 917577646.0, + "step": 393 + }, + { + "epoch": 0.1361908053923263, + "grad_norm": 0.8600198478534011, + "learning_rate": 7.969133160785448e-05, + "loss": 0.6648, + "num_tokens": 919912390.0, + "step": 394 + }, + { + "epoch": 0.13653646733494643, + "grad_norm": 0.699489837547432, + "learning_rate": 7.968531686108295e-05, + "loss": 0.677, + "num_tokens": 922228131.0, + "step": 395 + }, + { + "epoch": 0.13688212927756654, + "grad_norm": 1.1046038398651836, + "learning_rate": 7.967924430779064e-05, + "loss": 0.6762, + "num_tokens": 924494616.0, + "step": 396 + }, + { + "epoch": 0.13722779122018666, + "grad_norm": 0.6919660405873397, + "learning_rate": 7.967311395682304e-05, + "loss": 0.6808, + "num_tokens": 926895580.0, + "step": 397 + }, + { + "epoch": 0.13757345316280678, + "grad_norm": 1.4317507548793176, + "learning_rate": 7.966692581710984e-05, + "loss": 0.675, + "num_tokens": 929189971.0, + "step": 398 + }, + { + "epoch": 0.1379191151054269, + "grad_norm": 1.2476788183063163, + "learning_rate": 7.966067989766492e-05, + "loss": 0.6732, + "num_tokens": 931590512.0, + "step": 399 + }, + { + "epoch": 0.13826477704804702, + "grad_norm": 0.8410229284623401, + "learning_rate": 7.965437620758632e-05, + "loss": 0.6818, + "num_tokens": 933845239.0, + "step": 400 + }, + { + "epoch": 0.13861043899066713, + "grad_norm": 0.9060763725603179, + "learning_rate": 7.964801475605622e-05, + "loss": 0.6876, + "num_tokens": 936162988.0, + "step": 401 + }, + { + "epoch": 0.13895610093328725, + "grad_norm": 0.9462088723512216, + "learning_rate": 7.964159555234097e-05, + "loss": 0.6855, + "num_tokens": 938488429.0, + "step": 402 + }, + { + "epoch": 0.13930176287590737, + "grad_norm": 0.6655687384076501, + "learning_rate": 7.963511860579099e-05, + "loss": 0.677, + "num_tokens": 940790506.0, + "step": 403 + }, + { + "epoch": 0.1396474248185275, + "grad_norm": 1.0083914414843063, + "learning_rate": 7.962858392584084e-05, + "loss": 0.6835, + "num_tokens": 943119813.0, + "step": 404 + }, + { + "epoch": 0.1399930867611476, + "grad_norm": 0.6008615479710875, + "learning_rate": 7.962199152200922e-05, + "loss": 0.6779, + "num_tokens": 945454900.0, + "step": 405 + }, + { + "epoch": 0.14033874870376772, + "grad_norm": 1.0299508450761872, + "learning_rate": 7.961534140389882e-05, + "loss": 0.6818, + "num_tokens": 947796607.0, + "step": 406 + }, + { + "epoch": 0.14068441064638784, + "grad_norm": 0.9121016239330308, + "learning_rate": 7.960863358119647e-05, + "loss": 0.6804, + "num_tokens": 950158087.0, + "step": 407 + }, + { + "epoch": 0.14103007258900796, + "grad_norm": 1.0707388282989765, + "learning_rate": 7.960186806367304e-05, + "loss": 0.6799, + "num_tokens": 952485997.0, + "step": 408 + }, + { + "epoch": 0.14137573453162808, + "grad_norm": 1.0978738875444578, + "learning_rate": 7.959504486118343e-05, + "loss": 0.6886, + "num_tokens": 954826839.0, + "step": 409 + }, + { + "epoch": 0.1417213964742482, + "grad_norm": 0.8018759058420323, + "learning_rate": 7.958816398366654e-05, + "loss": 0.6824, + "num_tokens": 957082957.0, + "step": 410 + }, + { + "epoch": 0.1420670584168683, + "grad_norm": 0.7716008658301883, + "learning_rate": 7.958122544114536e-05, + "loss": 0.6657, + "num_tokens": 959360496.0, + "step": 411 + }, + { + "epoch": 0.14241272035948843, + "grad_norm": 0.7593118913119389, + "learning_rate": 7.957422924372679e-05, + "loss": 0.6726, + "num_tokens": 961734966.0, + "step": 412 + }, + { + "epoch": 0.14275838230210855, + "grad_norm": 0.6802341786838565, + "learning_rate": 7.956717540160176e-05, + "loss": 0.6825, + "num_tokens": 964040647.0, + "step": 413 + }, + { + "epoch": 0.14310404424472867, + "grad_norm": 0.6992548603440866, + "learning_rate": 7.956006392504518e-05, + "loss": 0.6612, + "num_tokens": 966278004.0, + "step": 414 + }, + { + "epoch": 0.14344970618734879, + "grad_norm": 0.6525582814227087, + "learning_rate": 7.955289482441587e-05, + "loss": 0.6786, + "num_tokens": 968736836.0, + "step": 415 + }, + { + "epoch": 0.1437953681299689, + "grad_norm": 0.8040583434216484, + "learning_rate": 7.95456681101566e-05, + "loss": 0.6862, + "num_tokens": 971120246.0, + "step": 416 + }, + { + "epoch": 0.14414103007258902, + "grad_norm": 0.7531792973959063, + "learning_rate": 7.953838379279409e-05, + "loss": 0.6784, + "num_tokens": 973439021.0, + "step": 417 + }, + { + "epoch": 0.1444866920152091, + "grad_norm": 1.0098266689438589, + "learning_rate": 7.953104188293892e-05, + "loss": 0.681, + "num_tokens": 975792179.0, + "step": 418 + }, + { + "epoch": 0.14483235395782923, + "grad_norm": 0.5490089234056582, + "learning_rate": 7.952364239128564e-05, + "loss": 0.6614, + "num_tokens": 978085595.0, + "step": 419 + }, + { + "epoch": 0.14517801590044935, + "grad_norm": 1.1613951853625943, + "learning_rate": 7.951618532861257e-05, + "loss": 0.6878, + "num_tokens": 980513903.0, + "step": 420 + }, + { + "epoch": 0.14552367784306947, + "grad_norm": 0.8329947426192129, + "learning_rate": 7.950867070578195e-05, + "loss": 0.676, + "num_tokens": 982886030.0, + "step": 421 + }, + { + "epoch": 0.14586933978568958, + "grad_norm": 1.190100036888434, + "learning_rate": 7.950109853373988e-05, + "loss": 0.665, + "num_tokens": 985210567.0, + "step": 422 + }, + { + "epoch": 0.1462150017283097, + "grad_norm": 0.8935123344795749, + "learning_rate": 7.949346882351627e-05, + "loss": 0.6721, + "num_tokens": 987612333.0, + "step": 423 + }, + { + "epoch": 0.14656066367092982, + "grad_norm": 1.0971269133187929, + "learning_rate": 7.94857815862248e-05, + "loss": 0.6882, + "num_tokens": 989941853.0, + "step": 424 + }, + { + "epoch": 0.14690632561354994, + "grad_norm": 0.9269661639824344, + "learning_rate": 7.947803683306303e-05, + "loss": 0.6944, + "num_tokens": 992161232.0, + "step": 425 + }, + { + "epoch": 0.14725198755617006, + "grad_norm": 1.1080505270489063, + "learning_rate": 7.947023457531223e-05, + "loss": 0.6747, + "num_tokens": 994520765.0, + "step": 426 + }, + { + "epoch": 0.14759764949879017, + "grad_norm": 0.9310890523912794, + "learning_rate": 7.946237482433747e-05, + "loss": 0.6777, + "num_tokens": 996792700.0, + "step": 427 + }, + { + "epoch": 0.1479433114414103, + "grad_norm": 0.8982719874466853, + "learning_rate": 7.945445759158753e-05, + "loss": 0.6719, + "num_tokens": 999136838.0, + "step": 428 + }, + { + "epoch": 0.1482889733840304, + "grad_norm": 0.7354535611353327, + "learning_rate": 7.944648288859498e-05, + "loss": 0.6746, + "num_tokens": 1001553143.0, + "step": 429 + }, + { + "epoch": 0.14863463532665053, + "grad_norm": 1.1992702747213542, + "learning_rate": 7.943845072697605e-05, + "loss": 0.6714, + "num_tokens": 1003923247.0, + "step": 430 + }, + { + "epoch": 0.14898029726927065, + "grad_norm": 0.8315287546815002, + "learning_rate": 7.943036111843067e-05, + "loss": 0.6793, + "num_tokens": 1006261422.0, + "step": 431 + }, + { + "epoch": 0.14932595921189076, + "grad_norm": 1.218025167995165, + "learning_rate": 7.94222140747425e-05, + "loss": 0.682, + "num_tokens": 1008548652.0, + "step": 432 + }, + { + "epoch": 0.14967162115451088, + "grad_norm": 1.052629611601623, + "learning_rate": 7.941400960777879e-05, + "loss": 0.6555, + "num_tokens": 1010878607.0, + "step": 433 + }, + { + "epoch": 0.150017283097131, + "grad_norm": 0.9383825053952549, + "learning_rate": 7.940574772949048e-05, + "loss": 0.6736, + "num_tokens": 1013140608.0, + "step": 434 + }, + { + "epoch": 0.15036294503975112, + "grad_norm": 0.8027960200380267, + "learning_rate": 7.939742845191215e-05, + "loss": 0.6813, + "num_tokens": 1015491965.0, + "step": 435 + }, + { + "epoch": 0.15070860698237123, + "grad_norm": 1.0650836491529845, + "learning_rate": 7.938905178716194e-05, + "loss": 0.6847, + "num_tokens": 1017824887.0, + "step": 436 + }, + { + "epoch": 0.15105426892499135, + "grad_norm": 0.9543048286901269, + "learning_rate": 7.938061774744162e-05, + "loss": 0.6855, + "num_tokens": 1020197919.0, + "step": 437 + }, + { + "epoch": 0.15139993086761147, + "grad_norm": 0.817908007481354, + "learning_rate": 7.937212634503652e-05, + "loss": 0.6721, + "num_tokens": 1022497457.0, + "step": 438 + }, + { + "epoch": 0.1517455928102316, + "grad_norm": 0.7267433998540934, + "learning_rate": 7.936357759231555e-05, + "loss": 0.6658, + "num_tokens": 1024894188.0, + "step": 439 + }, + { + "epoch": 0.1520912547528517, + "grad_norm": 1.2551259308776896, + "learning_rate": 7.935497150173113e-05, + "loss": 0.6753, + "num_tokens": 1027318082.0, + "step": 440 + }, + { + "epoch": 0.15243691669547182, + "grad_norm": 1.0274370888599909, + "learning_rate": 7.93463080858192e-05, + "loss": 0.6741, + "num_tokens": 1029645403.0, + "step": 441 + }, + { + "epoch": 0.15278257863809194, + "grad_norm": 1.091659933559066, + "learning_rate": 7.933758735719923e-05, + "loss": 0.6665, + "num_tokens": 1032027076.0, + "step": 442 + }, + { + "epoch": 0.15312824058071206, + "grad_norm": 0.9876493246318193, + "learning_rate": 7.932880932857413e-05, + "loss": 0.6581, + "num_tokens": 1034316710.0, + "step": 443 + }, + { + "epoch": 0.15347390252333218, + "grad_norm": 0.8648602252980715, + "learning_rate": 7.931997401273034e-05, + "loss": 0.6916, + "num_tokens": 1036655119.0, + "step": 444 + }, + { + "epoch": 0.1538195644659523, + "grad_norm": 0.8849497074968393, + "learning_rate": 7.931108142253767e-05, + "loss": 0.6815, + "num_tokens": 1038940628.0, + "step": 445 + }, + { + "epoch": 0.15416522640857241, + "grad_norm": 0.7717310075577019, + "learning_rate": 7.930213157094943e-05, + "loss": 0.6853, + "num_tokens": 1041252573.0, + "step": 446 + }, + { + "epoch": 0.15451088835119253, + "grad_norm": 0.6163082225472927, + "learning_rate": 7.929312447100229e-05, + "loss": 0.6708, + "num_tokens": 1043647714.0, + "step": 447 + }, + { + "epoch": 0.15485655029381265, + "grad_norm": 0.9150930137138809, + "learning_rate": 7.928406013581631e-05, + "loss": 0.6646, + "num_tokens": 1046014911.0, + "step": 448 + }, + { + "epoch": 0.15520221223643277, + "grad_norm": 0.7211587607974769, + "learning_rate": 7.927493857859496e-05, + "loss": 0.6787, + "num_tokens": 1048389762.0, + "step": 449 + }, + { + "epoch": 0.1555478741790529, + "grad_norm": 1.2928863484162245, + "learning_rate": 7.926575981262503e-05, + "loss": 0.6902, + "num_tokens": 1050686961.0, + "step": 450 + }, + { + "epoch": 0.155893536121673, + "grad_norm": 1.1210231001273558, + "learning_rate": 7.925652385127665e-05, + "loss": 0.6712, + "num_tokens": 1052986653.0, + "step": 451 + }, + { + "epoch": 0.15623919806429312, + "grad_norm": 1.0286890019270256, + "learning_rate": 7.924723070800327e-05, + "loss": 0.6677, + "num_tokens": 1055204993.0, + "step": 452 + }, + { + "epoch": 0.15658486000691324, + "grad_norm": 0.9430942323978379, + "learning_rate": 7.923788039634162e-05, + "loss": 0.6558, + "num_tokens": 1057512682.0, + "step": 453 + }, + { + "epoch": 0.15693052194953336, + "grad_norm": 0.8444811867150017, + "learning_rate": 7.922847292991171e-05, + "loss": 0.6901, + "num_tokens": 1059946207.0, + "step": 454 + }, + { + "epoch": 0.15727618389215348, + "grad_norm": 0.5828469372887928, + "learning_rate": 7.92190083224168e-05, + "loss": 0.6747, + "num_tokens": 1062266510.0, + "step": 455 + }, + { + "epoch": 0.1576218458347736, + "grad_norm": 1.0082901340857904, + "learning_rate": 7.920948658764342e-05, + "loss": 0.6737, + "num_tokens": 1064606852.0, + "step": 456 + }, + { + "epoch": 0.1579675077773937, + "grad_norm": 0.8237080604879339, + "learning_rate": 7.919990773946123e-05, + "loss": 0.6872, + "num_tokens": 1067013715.0, + "step": 457 + }, + { + "epoch": 0.15831316972001383, + "grad_norm": 0.8386534497432427, + "learning_rate": 7.919027179182317e-05, + "loss": 0.6627, + "num_tokens": 1069333643.0, + "step": 458 + }, + { + "epoch": 0.15865883166263395, + "grad_norm": 1.0835608654140119, + "learning_rate": 7.918057875876532e-05, + "loss": 0.6623, + "num_tokens": 1071658910.0, + "step": 459 + }, + { + "epoch": 0.15900449360525407, + "grad_norm": 0.6694495830725079, + "learning_rate": 7.917082865440688e-05, + "loss": 0.6738, + "num_tokens": 1073951800.0, + "step": 460 + }, + { + "epoch": 0.15935015554787418, + "grad_norm": 1.10456055169353, + "learning_rate": 7.916102149295025e-05, + "loss": 0.6713, + "num_tokens": 1076353638.0, + "step": 461 + }, + { + "epoch": 0.1596958174904943, + "grad_norm": 0.8130739395781956, + "learning_rate": 7.915115728868089e-05, + "loss": 0.6694, + "num_tokens": 1078553778.0, + "step": 462 + }, + { + "epoch": 0.16004147943311442, + "grad_norm": 0.7826133063248987, + "learning_rate": 7.914123605596737e-05, + "loss": 0.6934, + "num_tokens": 1080863873.0, + "step": 463 + }, + { + "epoch": 0.16038714137573454, + "grad_norm": 1.2920416455139134, + "learning_rate": 7.913125780926131e-05, + "loss": 0.7044, + "num_tokens": 1083193288.0, + "step": 464 + }, + { + "epoch": 0.16073280331835466, + "grad_norm": 0.8252255936972961, + "learning_rate": 7.912122256309742e-05, + "loss": 0.6844, + "num_tokens": 1085534713.0, + "step": 465 + }, + { + "epoch": 0.16107846526097477, + "grad_norm": 1.5618718094935777, + "learning_rate": 7.911113033209337e-05, + "loss": 0.6779, + "num_tokens": 1087903931.0, + "step": 466 + }, + { + "epoch": 0.1614241272035949, + "grad_norm": 1.505487500370681, + "learning_rate": 7.910098113094992e-05, + "loss": 0.6662, + "num_tokens": 1090162566.0, + "step": 467 + }, + { + "epoch": 0.161769789146215, + "grad_norm": 0.6953409699459869, + "learning_rate": 7.909077497445076e-05, + "loss": 0.6573, + "num_tokens": 1092553627.0, + "step": 468 + }, + { + "epoch": 0.16211545108883513, + "grad_norm": 0.9654921052884959, + "learning_rate": 7.908051187746254e-05, + "loss": 0.6911, + "num_tokens": 1094972060.0, + "step": 469 + }, + { + "epoch": 0.16246111303145525, + "grad_norm": 0.6278143521364654, + "learning_rate": 7.90701918549349e-05, + "loss": 0.6567, + "num_tokens": 1097173981.0, + "step": 470 + }, + { + "epoch": 0.16280677497407536, + "grad_norm": 0.9249160263967247, + "learning_rate": 7.905981492190031e-05, + "loss": 0.6823, + "num_tokens": 1099533821.0, + "step": 471 + }, + { + "epoch": 0.16315243691669548, + "grad_norm": 0.6192907922843272, + "learning_rate": 7.904938109347426e-05, + "loss": 0.6641, + "num_tokens": 1101980155.0, + "step": 472 + }, + { + "epoch": 0.1634980988593156, + "grad_norm": 0.8155912226414017, + "learning_rate": 7.903889038485502e-05, + "loss": 0.6725, + "num_tokens": 1104319434.0, + "step": 473 + }, + { + "epoch": 0.16384376080193572, + "grad_norm": 0.6876666371102291, + "learning_rate": 7.902834281132375e-05, + "loss": 0.6841, + "num_tokens": 1106613946.0, + "step": 474 + }, + { + "epoch": 0.16418942274455584, + "grad_norm": 0.5926398033709877, + "learning_rate": 7.901773838824445e-05, + "loss": 0.6801, + "num_tokens": 1108927897.0, + "step": 475 + }, + { + "epoch": 0.16453508468717595, + "grad_norm": 0.7476375239223323, + "learning_rate": 7.900707713106386e-05, + "loss": 0.6644, + "num_tokens": 1111252456.0, + "step": 476 + }, + { + "epoch": 0.16488074662979607, + "grad_norm": 0.7480269918567899, + "learning_rate": 7.899635905531163e-05, + "loss": 0.6717, + "num_tokens": 1113546667.0, + "step": 477 + }, + { + "epoch": 0.1652264085724162, + "grad_norm": 0.655120894018493, + "learning_rate": 7.898558417660008e-05, + "loss": 0.683, + "num_tokens": 1115926258.0, + "step": 478 + }, + { + "epoch": 0.1655720705150363, + "grad_norm": 1.060339861534569, + "learning_rate": 7.89747525106243e-05, + "loss": 0.6702, + "num_tokens": 1118169535.0, + "step": 479 + }, + { + "epoch": 0.16591773245765642, + "grad_norm": 0.8083687453887654, + "learning_rate": 7.896386407316208e-05, + "loss": 0.6543, + "num_tokens": 1120401008.0, + "step": 480 + }, + { + "epoch": 0.16626339440027654, + "grad_norm": 1.0787398612002346, + "learning_rate": 7.895291888007394e-05, + "loss": 0.6621, + "num_tokens": 1122638007.0, + "step": 481 + }, + { + "epoch": 0.16660905634289666, + "grad_norm": 0.8944554890516988, + "learning_rate": 7.894191694730306e-05, + "loss": 0.6805, + "num_tokens": 1125046096.0, + "step": 482 + }, + { + "epoch": 0.16695471828551675, + "grad_norm": 0.8487740075876676, + "learning_rate": 7.893085829087524e-05, + "loss": 0.6761, + "num_tokens": 1127360376.0, + "step": 483 + }, + { + "epoch": 0.16730038022813687, + "grad_norm": 0.7858720558284045, + "learning_rate": 7.891974292689895e-05, + "loss": 0.6531, + "num_tokens": 1129542203.0, + "step": 484 + }, + { + "epoch": 0.167646042170757, + "grad_norm": 0.8222752831081003, + "learning_rate": 7.890857087156523e-05, + "loss": 0.6746, + "num_tokens": 1131920871.0, + "step": 485 + }, + { + "epoch": 0.1679917041133771, + "grad_norm": 0.7046265083128342, + "learning_rate": 7.88973421411477e-05, + "loss": 0.6614, + "num_tokens": 1134259630.0, + "step": 486 + }, + { + "epoch": 0.16833736605599722, + "grad_norm": 0.7102915836053934, + "learning_rate": 7.888605675200256e-05, + "loss": 0.6596, + "num_tokens": 1136468765.0, + "step": 487 + }, + { + "epoch": 0.16868302799861734, + "grad_norm": 0.6433044756393974, + "learning_rate": 7.887471472056853e-05, + "loss": 0.6676, + "num_tokens": 1138819148.0, + "step": 488 + }, + { + "epoch": 0.16902868994123746, + "grad_norm": 0.7785247513497286, + "learning_rate": 7.886331606336681e-05, + "loss": 0.6829, + "num_tokens": 1141267248.0, + "step": 489 + }, + { + "epoch": 0.16937435188385758, + "grad_norm": 0.40877372901756215, + "learning_rate": 7.885186079700114e-05, + "loss": 0.6861, + "num_tokens": 1143522301.0, + "step": 490 + }, + { + "epoch": 0.1697200138264777, + "grad_norm": 1.1849688808923968, + "learning_rate": 7.884034893815766e-05, + "loss": 0.6856, + "num_tokens": 1145901897.0, + "step": 491 + }, + { + "epoch": 0.1700656757690978, + "grad_norm": 0.9648441292139598, + "learning_rate": 7.8828780503605e-05, + "loss": 0.6905, + "num_tokens": 1148190579.0, + "step": 492 + }, + { + "epoch": 0.17041133771171793, + "grad_norm": 1.0573858450199947, + "learning_rate": 7.881715551019413e-05, + "loss": 0.682, + "num_tokens": 1150464745.0, + "step": 493 + }, + { + "epoch": 0.17075699965433805, + "grad_norm": 0.9662953298133525, + "learning_rate": 7.880547397485845e-05, + "loss": 0.6518, + "num_tokens": 1152752631.0, + "step": 494 + }, + { + "epoch": 0.17110266159695817, + "grad_norm": 0.8972158687927207, + "learning_rate": 7.879373591461376e-05, + "loss": 0.6773, + "num_tokens": 1155093513.0, + "step": 495 + }, + { + "epoch": 0.17144832353957828, + "grad_norm": 0.8317367232125701, + "learning_rate": 7.878194134655811e-05, + "loss": 0.671, + "num_tokens": 1157485784.0, + "step": 496 + }, + { + "epoch": 0.1717939854821984, + "grad_norm": 0.8276925000128146, + "learning_rate": 7.877009028787193e-05, + "loss": 0.6676, + "num_tokens": 1159927096.0, + "step": 497 + }, + { + "epoch": 0.17213964742481852, + "grad_norm": 0.6585278048213279, + "learning_rate": 7.875818275581789e-05, + "loss": 0.6764, + "num_tokens": 1162209982.0, + "step": 498 + }, + { + "epoch": 0.17248530936743864, + "grad_norm": 0.7902236865020444, + "learning_rate": 7.874621876774098e-05, + "loss": 0.6585, + "num_tokens": 1164485817.0, + "step": 499 + }, + { + "epoch": 0.17283097131005876, + "grad_norm": 0.5377156888057961, + "learning_rate": 7.873419834106834e-05, + "loss": 0.6755, + "num_tokens": 1166739745.0, + "step": 500 + }, + { + "epoch": 0.17317663325267887, + "grad_norm": 0.8325757930919285, + "learning_rate": 7.87221214933094e-05, + "loss": 0.6733, + "num_tokens": 1168965108.0, + "step": 501 + }, + { + "epoch": 0.173522295195299, + "grad_norm": 0.5209795529066652, + "learning_rate": 7.870998824205574e-05, + "loss": 0.6763, + "num_tokens": 1171329495.0, + "step": 502 + }, + { + "epoch": 0.1738679571379191, + "grad_norm": 0.871350196893921, + "learning_rate": 7.86977986049811e-05, + "loss": 0.6905, + "num_tokens": 1173693882.0, + "step": 503 + }, + { + "epoch": 0.17421361908053923, + "grad_norm": 0.6806218449613248, + "learning_rate": 7.868555259984136e-05, + "loss": 0.6775, + "num_tokens": 1175938352.0, + "step": 504 + }, + { + "epoch": 0.17455928102315935, + "grad_norm": 0.8212712061291401, + "learning_rate": 7.86732502444745e-05, + "loss": 0.6826, + "num_tokens": 1178243894.0, + "step": 505 + }, + { + "epoch": 0.17490494296577946, + "grad_norm": 0.7765756131504727, + "learning_rate": 7.866089155680059e-05, + "loss": 0.6677, + "num_tokens": 1180573839.0, + "step": 506 + }, + { + "epoch": 0.17525060490839958, + "grad_norm": 0.7285054193287712, + "learning_rate": 7.864847655482174e-05, + "loss": 0.6907, + "num_tokens": 1182903559.0, + "step": 507 + }, + { + "epoch": 0.1755962668510197, + "grad_norm": 0.6267126802608064, + "learning_rate": 7.863600525662213e-05, + "loss": 0.6598, + "num_tokens": 1185245321.0, + "step": 508 + }, + { + "epoch": 0.17594192879363982, + "grad_norm": 0.7333527420740727, + "learning_rate": 7.862347768036789e-05, + "loss": 0.6639, + "num_tokens": 1187510893.0, + "step": 509 + }, + { + "epoch": 0.17628759073625994, + "grad_norm": 0.7171956079690914, + "learning_rate": 7.861089384430716e-05, + "loss": 0.6856, + "num_tokens": 1189799084.0, + "step": 510 + }, + { + "epoch": 0.17663325267888005, + "grad_norm": 0.5582524277149231, + "learning_rate": 7.859825376677003e-05, + "loss": 0.6778, + "num_tokens": 1192100161.0, + "step": 511 + }, + { + "epoch": 0.17697891462150017, + "grad_norm": 0.6331823404553163, + "learning_rate": 7.858555746616849e-05, + "loss": 0.6943, + "num_tokens": 1194576087.0, + "step": 512 + }, + { + "epoch": 0.1773245765641203, + "grad_norm": 0.6178396677336605, + "learning_rate": 7.857280496099645e-05, + "loss": 0.673, + "num_tokens": 1196857673.0, + "step": 513 + }, + { + "epoch": 0.1776702385067404, + "grad_norm": 0.511850960357818, + "learning_rate": 7.855999626982969e-05, + "loss": 0.6776, + "num_tokens": 1199290389.0, + "step": 514 + }, + { + "epoch": 0.17801590044936053, + "grad_norm": 0.6488623819045878, + "learning_rate": 7.854713141132582e-05, + "loss": 0.6791, + "num_tokens": 1201669512.0, + "step": 515 + }, + { + "epoch": 0.17836156239198064, + "grad_norm": 0.6244876293998658, + "learning_rate": 7.853421040422425e-05, + "loss": 0.6635, + "num_tokens": 1203882546.0, + "step": 516 + }, + { + "epoch": 0.17870722433460076, + "grad_norm": 0.6677352096577931, + "learning_rate": 7.852123326734622e-05, + "loss": 0.6816, + "num_tokens": 1206223886.0, + "step": 517 + }, + { + "epoch": 0.17905288627722088, + "grad_norm": 0.5031659879400073, + "learning_rate": 7.85082000195947e-05, + "loss": 0.678, + "num_tokens": 1208610470.0, + "step": 518 + }, + { + "epoch": 0.179398548219841, + "grad_norm": 0.5923629631419254, + "learning_rate": 7.849511067995442e-05, + "loss": 0.6775, + "num_tokens": 1211009112.0, + "step": 519 + }, + { + "epoch": 0.17974421016246112, + "grad_norm": 1.1653871353142766, + "learning_rate": 7.848196526749177e-05, + "loss": 0.6673, + "num_tokens": 1213308458.0, + "step": 520 + }, + { + "epoch": 0.18008987210508123, + "grad_norm": 0.5410225868283237, + "learning_rate": 7.846876380135487e-05, + "loss": 0.6839, + "num_tokens": 1215688984.0, + "step": 521 + }, + { + "epoch": 0.18043553404770135, + "grad_norm": 1.5214356912711706, + "learning_rate": 7.845550630077344e-05, + "loss": 0.6839, + "num_tokens": 1217980640.0, + "step": 522 + }, + { + "epoch": 0.18078119599032147, + "grad_norm": 1.2033703081470895, + "learning_rate": 7.844219278505885e-05, + "loss": 0.6866, + "num_tokens": 1220428360.0, + "step": 523 + }, + { + "epoch": 0.1811268579329416, + "grad_norm": 1.4132729530968586, + "learning_rate": 7.842882327360408e-05, + "loss": 0.6811, + "num_tokens": 1222681431.0, + "step": 524 + }, + { + "epoch": 0.1814725198755617, + "grad_norm": 1.4301108376708938, + "learning_rate": 7.841539778588363e-05, + "loss": 0.6789, + "num_tokens": 1224952258.0, + "step": 525 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.9368165413612812, + "learning_rate": 7.840191634145359e-05, + "loss": 0.68, + "num_tokens": 1227342610.0, + "step": 526 + }, + { + "epoch": 0.18216384376080194, + "grad_norm": 1.0912150879874103, + "learning_rate": 7.838837895995152e-05, + "loss": 0.6731, + "num_tokens": 1229664962.0, + "step": 527 + }, + { + "epoch": 0.18250950570342206, + "grad_norm": 0.7021981369200088, + "learning_rate": 7.837478566109646e-05, + "loss": 0.7008, + "num_tokens": 1232051653.0, + "step": 528 + }, + { + "epoch": 0.18285516764604218, + "grad_norm": 0.8469380222389131, + "learning_rate": 7.836113646468895e-05, + "loss": 0.6772, + "num_tokens": 1234351403.0, + "step": 529 + }, + { + "epoch": 0.1832008295886623, + "grad_norm": 0.6776079201803205, + "learning_rate": 7.834743139061087e-05, + "loss": 0.6881, + "num_tokens": 1236773795.0, + "step": 530 + }, + { + "epoch": 0.1835464915312824, + "grad_norm": 0.5809153316766027, + "learning_rate": 7.833367045882555e-05, + "loss": 0.6711, + "num_tokens": 1239131431.0, + "step": 531 + }, + { + "epoch": 0.18389215347390253, + "grad_norm": 0.7855283862086249, + "learning_rate": 7.83198536893777e-05, + "loss": 0.6749, + "num_tokens": 1241481597.0, + "step": 532 + }, + { + "epoch": 0.18423781541652265, + "grad_norm": 0.5047940805183389, + "learning_rate": 7.83059811023933e-05, + "loss": 0.6915, + "num_tokens": 1243858177.0, + "step": 533 + }, + { + "epoch": 0.18458347735914277, + "grad_norm": 0.9191573296170649, + "learning_rate": 7.829205271807971e-05, + "loss": 0.6766, + "num_tokens": 1246344707.0, + "step": 534 + }, + { + "epoch": 0.18492913930176288, + "grad_norm": 0.666795099542455, + "learning_rate": 7.827806855672551e-05, + "loss": 0.6774, + "num_tokens": 1248752486.0, + "step": 535 + }, + { + "epoch": 0.185274801244383, + "grad_norm": 1.0774963584795936, + "learning_rate": 7.826402863870054e-05, + "loss": 0.6654, + "num_tokens": 1251205201.0, + "step": 536 + }, + { + "epoch": 0.18562046318700312, + "grad_norm": 1.038314745443513, + "learning_rate": 7.82499329844559e-05, + "loss": 0.6682, + "num_tokens": 1253605228.0, + "step": 537 + }, + { + "epoch": 0.18596612512962324, + "grad_norm": 0.9180557168921101, + "learning_rate": 7.823578161452381e-05, + "loss": 0.6651, + "num_tokens": 1255892235.0, + "step": 538 + }, + { + "epoch": 0.18631178707224336, + "grad_norm": 0.9925632131953006, + "learning_rate": 7.822157454951769e-05, + "loss": 0.6647, + "num_tokens": 1258184990.0, + "step": 539 + }, + { + "epoch": 0.18665744901486347, + "grad_norm": 0.8383349696801654, + "learning_rate": 7.820731181013207e-05, + "loss": 0.6728, + "num_tokens": 1260465450.0, + "step": 540 + }, + { + "epoch": 0.1870031109574836, + "grad_norm": 0.9762719269307527, + "learning_rate": 7.819299341714261e-05, + "loss": 0.6869, + "num_tokens": 1262826201.0, + "step": 541 + }, + { + "epoch": 0.1873487729001037, + "grad_norm": 0.9367760544885605, + "learning_rate": 7.817861939140599e-05, + "loss": 0.6667, + "num_tokens": 1265226768.0, + "step": 542 + }, + { + "epoch": 0.18769443484272383, + "grad_norm": 0.9220242492008244, + "learning_rate": 7.816418975385994e-05, + "loss": 0.6694, + "num_tokens": 1267598455.0, + "step": 543 + }, + { + "epoch": 0.18804009678534395, + "grad_norm": 0.6819054463315197, + "learning_rate": 7.814970452552323e-05, + "loss": 0.6651, + "num_tokens": 1269852164.0, + "step": 544 + }, + { + "epoch": 0.18838575872796406, + "grad_norm": 0.9081684293803073, + "learning_rate": 7.813516372749556e-05, + "loss": 0.6744, + "num_tokens": 1272102642.0, + "step": 545 + }, + { + "epoch": 0.18873142067058418, + "grad_norm": 0.6787871397214766, + "learning_rate": 7.812056738095761e-05, + "loss": 0.6829, + "num_tokens": 1274495986.0, + "step": 546 + }, + { + "epoch": 0.18907708261320427, + "grad_norm": 0.9023949680304635, + "learning_rate": 7.810591550717094e-05, + "loss": 0.6697, + "num_tokens": 1276835725.0, + "step": 547 + }, + { + "epoch": 0.1894227445558244, + "grad_norm": 0.8285672328337838, + "learning_rate": 7.809120812747804e-05, + "loss": 0.6957, + "num_tokens": 1279226496.0, + "step": 548 + }, + { + "epoch": 0.1897684064984445, + "grad_norm": 1.0394614062239516, + "learning_rate": 7.807644526330221e-05, + "loss": 0.6888, + "num_tokens": 1281679517.0, + "step": 549 + }, + { + "epoch": 0.19011406844106463, + "grad_norm": 0.672644873647182, + "learning_rate": 7.806162693614758e-05, + "loss": 0.6687, + "num_tokens": 1284089139.0, + "step": 550 + }, + { + "epoch": 0.19045973038368474, + "grad_norm": 0.9719066237721203, + "learning_rate": 7.804675316759908e-05, + "loss": 0.6483, + "num_tokens": 1286313128.0, + "step": 551 + }, + { + "epoch": 0.19080539232630486, + "grad_norm": 0.8531831087346086, + "learning_rate": 7.803182397932238e-05, + "loss": 0.6823, + "num_tokens": 1288682040.0, + "step": 552 + }, + { + "epoch": 0.19115105426892498, + "grad_norm": 0.9558185039898183, + "learning_rate": 7.801683939306392e-05, + "loss": 0.6562, + "num_tokens": 1291057661.0, + "step": 553 + }, + { + "epoch": 0.1914967162115451, + "grad_norm": 0.77773027782198, + "learning_rate": 7.800179943065079e-05, + "loss": 0.6736, + "num_tokens": 1293314858.0, + "step": 554 + }, + { + "epoch": 0.19184237815416522, + "grad_norm": 0.999974470709146, + "learning_rate": 7.798670411399073e-05, + "loss": 0.6587, + "num_tokens": 1295499037.0, + "step": 555 + }, + { + "epoch": 0.19218804009678533, + "grad_norm": 0.6478375977079255, + "learning_rate": 7.797155346507217e-05, + "loss": 0.6802, + "num_tokens": 1297909878.0, + "step": 556 + }, + { + "epoch": 0.19253370203940545, + "grad_norm": 1.0916868446338521, + "learning_rate": 7.795634750596409e-05, + "loss": 0.6797, + "num_tokens": 1300278130.0, + "step": 557 + }, + { + "epoch": 0.19287936398202557, + "grad_norm": 0.7546860168829089, + "learning_rate": 7.794108625881607e-05, + "loss": 0.6575, + "num_tokens": 1302533942.0, + "step": 558 + }, + { + "epoch": 0.1932250259246457, + "grad_norm": 1.3434612262311425, + "learning_rate": 7.79257697458582e-05, + "loss": 0.682, + "num_tokens": 1304791330.0, + "step": 559 + }, + { + "epoch": 0.1935706878672658, + "grad_norm": 1.0425810918243734, + "learning_rate": 7.791039798940107e-05, + "loss": 0.6716, + "num_tokens": 1307140180.0, + "step": 560 + }, + { + "epoch": 0.19391634980988592, + "grad_norm": 1.0906088111340324, + "learning_rate": 7.789497101183575e-05, + "loss": 0.6853, + "num_tokens": 1309518780.0, + "step": 561 + }, + { + "epoch": 0.19426201175250604, + "grad_norm": 0.9435489855538719, + "learning_rate": 7.787948883563377e-05, + "loss": 0.6519, + "num_tokens": 1311859896.0, + "step": 562 + }, + { + "epoch": 0.19460767369512616, + "grad_norm": 0.9643748878302044, + "learning_rate": 7.786395148334702e-05, + "loss": 0.6663, + "num_tokens": 1314085833.0, + "step": 563 + }, + { + "epoch": 0.19495333563774628, + "grad_norm": 0.7297840357431394, + "learning_rate": 7.784835897760779e-05, + "loss": 0.6551, + "num_tokens": 1316412163.0, + "step": 564 + }, + { + "epoch": 0.1952989975803664, + "grad_norm": 1.2098275099270528, + "learning_rate": 7.783271134112871e-05, + "loss": 0.6637, + "num_tokens": 1318759757.0, + "step": 565 + }, + { + "epoch": 0.1956446595229865, + "grad_norm": 0.9537223424552935, + "learning_rate": 7.78170085967027e-05, + "loss": 0.6619, + "num_tokens": 1321064377.0, + "step": 566 + }, + { + "epoch": 0.19599032146560663, + "grad_norm": 0.9629544273357176, + "learning_rate": 7.780125076720295e-05, + "loss": 0.665, + "num_tokens": 1323334965.0, + "step": 567 + }, + { + "epoch": 0.19633598340822675, + "grad_norm": 0.8301967484227063, + "learning_rate": 7.778543787558292e-05, + "loss": 0.6684, + "num_tokens": 1325669197.0, + "step": 568 + }, + { + "epoch": 0.19668164535084687, + "grad_norm": 0.9722655568473246, + "learning_rate": 7.776956994487624e-05, + "loss": 0.6553, + "num_tokens": 1328069958.0, + "step": 569 + }, + { + "epoch": 0.19702730729346699, + "grad_norm": 0.9022977588977088, + "learning_rate": 7.775364699819672e-05, + "loss": 0.653, + "num_tokens": 1330275271.0, + "step": 570 + }, + { + "epoch": 0.1973729692360871, + "grad_norm": 1.1122557436678455, + "learning_rate": 7.773766905873832e-05, + "loss": 0.6797, + "num_tokens": 1332686670.0, + "step": 571 + }, + { + "epoch": 0.19771863117870722, + "grad_norm": 1.0125111788960826, + "learning_rate": 7.77216361497751e-05, + "loss": 0.6542, + "num_tokens": 1335096177.0, + "step": 572 + }, + { + "epoch": 0.19806429312132734, + "grad_norm": 0.9219632068442798, + "learning_rate": 7.770554829466121e-05, + "loss": 0.6706, + "num_tokens": 1337337014.0, + "step": 573 + }, + { + "epoch": 0.19840995506394746, + "grad_norm": 0.9012051143389893, + "learning_rate": 7.768940551683079e-05, + "loss": 0.6875, + "num_tokens": 1339675556.0, + "step": 574 + }, + { + "epoch": 0.19875561700656758, + "grad_norm": 0.8489487220322667, + "learning_rate": 7.767320783979801e-05, + "loss": 0.6506, + "num_tokens": 1342040139.0, + "step": 575 + }, + { + "epoch": 0.1991012789491877, + "grad_norm": 0.731314279133667, + "learning_rate": 7.765695528715701e-05, + "loss": 0.6629, + "num_tokens": 1344384460.0, + "step": 576 + }, + { + "epoch": 0.1994469408918078, + "grad_norm": 0.9249952296907413, + "learning_rate": 7.764064788258185e-05, + "loss": 0.6805, + "num_tokens": 1346807158.0, + "step": 577 + }, + { + "epoch": 0.19979260283442793, + "grad_norm": 0.8180799509386987, + "learning_rate": 7.762428564982653e-05, + "loss": 0.6752, + "num_tokens": 1349140081.0, + "step": 578 + }, + { + "epoch": 1.00034566194262, + "grad_norm": 2.1142211852665396, + "learning_rate": 7.760786861272486e-05, + "loss": 1.3232, + "num_tokens": 1352945518.0, + "step": 579 + }, + { + "epoch": 1.0006913238852402, + "grad_norm": 0.8740528567855538, + "learning_rate": 7.759139679519052e-05, + "loss": 0.663, + "num_tokens": 1355363345.0, + "step": 580 + }, + { + "epoch": 1.0010369858278603, + "grad_norm": 0.9346383668192436, + "learning_rate": 7.757487022121696e-05, + "loss": 0.6632, + "num_tokens": 1357657558.0, + "step": 581 + }, + { + "epoch": 1.0013826477704804, + "grad_norm": 0.5707352247994628, + "learning_rate": 7.755828891487739e-05, + "loss": 0.6759, + "num_tokens": 1359990193.0, + "step": 582 + }, + { + "epoch": 1.0017283097131007, + "grad_norm": 0.8777230050047098, + "learning_rate": 7.754165290032477e-05, + "loss": 0.6772, + "num_tokens": 1362426704.0, + "step": 583 + }, + { + "epoch": 1.0020739716557208, + "grad_norm": 0.7456478205951602, + "learning_rate": 7.752496220179175e-05, + "loss": 0.6631, + "num_tokens": 1364665198.0, + "step": 584 + }, + { + "epoch": 1.0024196335983409, + "grad_norm": 1.1036043922339784, + "learning_rate": 7.750821684359059e-05, + "loss": 0.6725, + "num_tokens": 1367024438.0, + "step": 585 + }, + { + "epoch": 1.002765295540961, + "grad_norm": 1.1358749685551512, + "learning_rate": 7.74914168501132e-05, + "loss": 0.6646, + "num_tokens": 1369322895.0, + "step": 586 + }, + { + "epoch": 1.003110957483581, + "grad_norm": 0.7777353302105828, + "learning_rate": 7.74745622458311e-05, + "loss": 0.6761, + "num_tokens": 1371556203.0, + "step": 587 + }, + { + "epoch": 1.0034566194262011, + "grad_norm": 0.9061793479058321, + "learning_rate": 7.745765305529532e-05, + "loss": 0.6487, + "num_tokens": 1373818248.0, + "step": 588 + }, + { + "epoch": 1.0038022813688212, + "grad_norm": 0.6363269674236014, + "learning_rate": 7.744068930313641e-05, + "loss": 0.663, + "num_tokens": 1376220420.0, + "step": 589 + }, + { + "epoch": 1.0041479433114413, + "grad_norm": 1.1400955700651099, + "learning_rate": 7.742367101406439e-05, + "loss": 0.6966, + "num_tokens": 1378698210.0, + "step": 590 + }, + { + "epoch": 1.0044936052540616, + "grad_norm": 0.6931490878384134, + "learning_rate": 7.740659821286875e-05, + "loss": 0.662, + "num_tokens": 1380900787.0, + "step": 591 + }, + { + "epoch": 1.0048392671966817, + "grad_norm": 0.7438861467017077, + "learning_rate": 7.738947092441834e-05, + "loss": 0.6537, + "num_tokens": 1383209048.0, + "step": 592 + }, + { + "epoch": 1.0051849291393018, + "grad_norm": 0.6822502682454515, + "learning_rate": 7.737228917366145e-05, + "loss": 0.6525, + "num_tokens": 1385527452.0, + "step": 593 + }, + { + "epoch": 1.0055305910819219, + "grad_norm": 0.5169468185232555, + "learning_rate": 7.73550529856256e-05, + "loss": 0.6754, + "num_tokens": 1387948417.0, + "step": 594 + }, + { + "epoch": 1.005876253024542, + "grad_norm": 0.7060860786596713, + "learning_rate": 7.73377623854177e-05, + "loss": 0.6822, + "num_tokens": 1390344923.0, + "step": 595 + }, + { + "epoch": 1.006221914967162, + "grad_norm": 0.6111485872093079, + "learning_rate": 7.732041739822384e-05, + "loss": 0.6585, + "num_tokens": 1392652565.0, + "step": 596 + }, + { + "epoch": 1.0065675769097822, + "grad_norm": 0.7018961454326624, + "learning_rate": 7.730301804930942e-05, + "loss": 0.6706, + "num_tokens": 1395127019.0, + "step": 597 + }, + { + "epoch": 1.0069132388524022, + "grad_norm": 0.5014693464006867, + "learning_rate": 7.728556436401894e-05, + "loss": 0.6698, + "num_tokens": 1397426472.0, + "step": 598 + }, + { + "epoch": 1.0072589007950226, + "grad_norm": 0.9465512205353946, + "learning_rate": 7.72680563677761e-05, + "loss": 0.6437, + "num_tokens": 1399665676.0, + "step": 599 + }, + { + "epoch": 1.0076045627376427, + "grad_norm": 0.596902876231457, + "learning_rate": 7.725049408608367e-05, + "loss": 0.6598, + "num_tokens": 1402023834.0, + "step": 600 + }, + { + "epoch": 1.0079502246802627, + "grad_norm": 0.9792241758241436, + "learning_rate": 7.723287754452357e-05, + "loss": 0.6617, + "num_tokens": 1404341552.0, + "step": 601 + }, + { + "epoch": 1.0082958866228828, + "grad_norm": 0.8572224392994607, + "learning_rate": 7.721520676875667e-05, + "loss": 0.6727, + "num_tokens": 1406658741.0, + "step": 602 + }, + { + "epoch": 1.008641548565503, + "grad_norm": 0.8019852222089114, + "learning_rate": 7.719748178452291e-05, + "loss": 0.6623, + "num_tokens": 1409023426.0, + "step": 603 + }, + { + "epoch": 1.008987210508123, + "grad_norm": 1.147902468568786, + "learning_rate": 7.717970261764111e-05, + "loss": 0.6705, + "num_tokens": 1411295298.0, + "step": 604 + }, + { + "epoch": 1.009332872450743, + "grad_norm": 0.8975050298448578, + "learning_rate": 7.716186929400909e-05, + "loss": 0.673, + "num_tokens": 1413654997.0, + "step": 605 + }, + { + "epoch": 1.0096785343933632, + "grad_norm": 0.8818019982825493, + "learning_rate": 7.714398183960353e-05, + "loss": 0.6545, + "num_tokens": 1415990152.0, + "step": 606 + }, + { + "epoch": 1.0100241963359835, + "grad_norm": 0.8515555098083399, + "learning_rate": 7.712604028047998e-05, + "loss": 0.6674, + "num_tokens": 1418336050.0, + "step": 607 + }, + { + "epoch": 1.0103698582786036, + "grad_norm": 0.6813012204492377, + "learning_rate": 7.710804464277275e-05, + "loss": 0.6805, + "num_tokens": 1420683836.0, + "step": 608 + }, + { + "epoch": 1.0107155202212237, + "grad_norm": 0.790969572407373, + "learning_rate": 7.708999495269496e-05, + "loss": 0.66, + "num_tokens": 1422975794.0, + "step": 609 + }, + { + "epoch": 1.0110611821638438, + "grad_norm": 0.6093040695520983, + "learning_rate": 7.707189123653845e-05, + "loss": 0.6799, + "num_tokens": 1425378969.0, + "step": 610 + }, + { + "epoch": 1.0114068441064639, + "grad_norm": 0.7038104402467534, + "learning_rate": 7.705373352067378e-05, + "loss": 0.6669, + "num_tokens": 1427740311.0, + "step": 611 + }, + { + "epoch": 1.011752506049084, + "grad_norm": 0.789700190060106, + "learning_rate": 7.703552183155015e-05, + "loss": 0.6677, + "num_tokens": 1430071733.0, + "step": 612 + }, + { + "epoch": 1.012098167991704, + "grad_norm": 0.5631211448705197, + "learning_rate": 7.701725619569535e-05, + "loss": 0.6644, + "num_tokens": 1432398119.0, + "step": 613 + }, + { + "epoch": 1.0124438299343241, + "grad_norm": 0.5728940307152167, + "learning_rate": 7.699893663971584e-05, + "loss": 0.6536, + "num_tokens": 1434700894.0, + "step": 614 + }, + { + "epoch": 1.0127894918769444, + "grad_norm": 0.6833111994010771, + "learning_rate": 7.698056319029653e-05, + "loss": 0.6753, + "num_tokens": 1437009930.0, + "step": 615 + }, + { + "epoch": 1.0131351538195645, + "grad_norm": 0.48898801558173766, + "learning_rate": 7.696213587420087e-05, + "loss": 0.6389, + "num_tokens": 1439320755.0, + "step": 616 + }, + { + "epoch": 1.0134808157621846, + "grad_norm": 0.7247165377476263, + "learning_rate": 7.694365471827077e-05, + "loss": 0.6367, + "num_tokens": 1441611445.0, + "step": 617 + }, + { + "epoch": 1.0138264777048047, + "grad_norm": 0.5533567676626411, + "learning_rate": 7.692511974942656e-05, + "loss": 0.6495, + "num_tokens": 1443825170.0, + "step": 618 + }, + { + "epoch": 1.0141721396474248, + "grad_norm": 0.5441193139815216, + "learning_rate": 7.690653099466699e-05, + "loss": 0.6673, + "num_tokens": 1446190695.0, + "step": 619 + }, + { + "epoch": 1.014517801590045, + "grad_norm": 0.5913180366540489, + "learning_rate": 7.68878884810691e-05, + "loss": 0.6536, + "num_tokens": 1448411987.0, + "step": 620 + }, + { + "epoch": 1.014863463532665, + "grad_norm": 0.5526213731310409, + "learning_rate": 7.686919223578827e-05, + "loss": 0.6622, + "num_tokens": 1450746710.0, + "step": 621 + }, + { + "epoch": 1.015209125475285, + "grad_norm": 0.5661299027415501, + "learning_rate": 7.685044228605816e-05, + "loss": 0.654, + "num_tokens": 1453078431.0, + "step": 622 + }, + { + "epoch": 1.0155547874179054, + "grad_norm": 0.9323005467571946, + "learning_rate": 7.683163865919063e-05, + "loss": 0.6656, + "num_tokens": 1455445239.0, + "step": 623 + }, + { + "epoch": 1.0159004493605255, + "grad_norm": 0.6680306010717404, + "learning_rate": 7.681278138257574e-05, + "loss": 0.6591, + "num_tokens": 1457723394.0, + "step": 624 + }, + { + "epoch": 1.0162461113031456, + "grad_norm": 1.1434659857406162, + "learning_rate": 7.679387048368171e-05, + "loss": 0.6588, + "num_tokens": 1460214983.0, + "step": 625 + }, + { + "epoch": 1.0165917732457657, + "grad_norm": 1.0012988697589547, + "learning_rate": 7.677490599005484e-05, + "loss": 0.671, + "num_tokens": 1462481652.0, + "step": 626 + }, + { + "epoch": 1.0169374351883858, + "grad_norm": 1.2261873867858468, + "learning_rate": 7.675588792931951e-05, + "loss": 0.6589, + "num_tokens": 1464769947.0, + "step": 627 + }, + { + "epoch": 1.0172830971310058, + "grad_norm": 1.0217974991617935, + "learning_rate": 7.673681632917817e-05, + "loss": 0.6725, + "num_tokens": 1467094042.0, + "step": 628 + }, + { + "epoch": 1.017628759073626, + "grad_norm": 1.11399376740334, + "learning_rate": 7.671769121741117e-05, + "loss": 0.6515, + "num_tokens": 1469458137.0, + "step": 629 + }, + { + "epoch": 1.017974421016246, + "grad_norm": 0.8987001634998655, + "learning_rate": 7.669851262187688e-05, + "loss": 0.6549, + "num_tokens": 1471691478.0, + "step": 630 + }, + { + "epoch": 1.0183200829588663, + "grad_norm": 1.2179402114743725, + "learning_rate": 7.667928057051157e-05, + "loss": 0.6559, + "num_tokens": 1474115060.0, + "step": 631 + }, + { + "epoch": 1.0186657449014864, + "grad_norm": 0.9968381361277421, + "learning_rate": 7.665999509132932e-05, + "loss": 0.6653, + "num_tokens": 1476551097.0, + "step": 632 + }, + { + "epoch": 1.0190114068441065, + "grad_norm": 1.134854009966637, + "learning_rate": 7.664065621242211e-05, + "loss": 0.667, + "num_tokens": 1478958701.0, + "step": 633 + }, + { + "epoch": 1.0193570687867266, + "grad_norm": 1.2239518100956406, + "learning_rate": 7.662126396195964e-05, + "loss": 0.6697, + "num_tokens": 1481481101.0, + "step": 634 + }, + { + "epoch": 1.0197027307293467, + "grad_norm": 0.6181330954809839, + "learning_rate": 7.660181836818941e-05, + "loss": 0.6564, + "num_tokens": 1483911545.0, + "step": 635 + }, + { + "epoch": 1.0200483926719668, + "grad_norm": 1.0680644611640846, + "learning_rate": 7.658231945943657e-05, + "loss": 0.6664, + "num_tokens": 1486383500.0, + "step": 636 + }, + { + "epoch": 1.0203940546145869, + "grad_norm": 0.7730549075716262, + "learning_rate": 7.656276726410397e-05, + "loss": 0.6735, + "num_tokens": 1488687299.0, + "step": 637 + }, + { + "epoch": 1.020739716557207, + "grad_norm": 1.0976599516446086, + "learning_rate": 7.654316181067203e-05, + "loss": 0.6645, + "num_tokens": 1490826696.0, + "step": 638 + }, + { + "epoch": 1.0210853784998273, + "grad_norm": 1.1945816374119282, + "learning_rate": 7.652350312769883e-05, + "loss": 0.6482, + "num_tokens": 1493144226.0, + "step": 639 + }, + { + "epoch": 1.0214310404424474, + "grad_norm": 0.6691192026146077, + "learning_rate": 7.650379124381988e-05, + "loss": 0.6634, + "num_tokens": 1495464860.0, + "step": 640 + }, + { + "epoch": 1.0217767023850675, + "grad_norm": 1.1196536032510578, + "learning_rate": 7.648402618774828e-05, + "loss": 0.6654, + "num_tokens": 1497734437.0, + "step": 641 + }, + { + "epoch": 1.0221223643276875, + "grad_norm": 0.9040819601814349, + "learning_rate": 7.646420798827453e-05, + "loss": 0.6806, + "num_tokens": 1500083728.0, + "step": 642 + }, + { + "epoch": 1.0224680262703076, + "grad_norm": 0.7950902913298484, + "learning_rate": 7.644433667426657e-05, + "loss": 0.6666, + "num_tokens": 1502410219.0, + "step": 643 + }, + { + "epoch": 1.0228136882129277, + "grad_norm": 0.7992043464604883, + "learning_rate": 7.642441227466967e-05, + "loss": 0.6559, + "num_tokens": 1504763233.0, + "step": 644 + }, + { + "epoch": 1.0231593501555478, + "grad_norm": 0.7218935596775387, + "learning_rate": 7.640443481850646e-05, + "loss": 0.6567, + "num_tokens": 1507031533.0, + "step": 645 + }, + { + "epoch": 1.023505012098168, + "grad_norm": 0.728114072202148, + "learning_rate": 7.638440433487684e-05, + "loss": 0.6598, + "num_tokens": 1509305905.0, + "step": 646 + }, + { + "epoch": 1.023850674040788, + "grad_norm": 0.5741261548647706, + "learning_rate": 7.636432085295795e-05, + "loss": 0.6555, + "num_tokens": 1511627754.0, + "step": 647 + }, + { + "epoch": 1.0241963359834083, + "grad_norm": 0.8008762284547368, + "learning_rate": 7.634418440200414e-05, + "loss": 0.6697, + "num_tokens": 1513997463.0, + "step": 648 + }, + { + "epoch": 1.0245419979260284, + "grad_norm": 0.8399321071744058, + "learning_rate": 7.632399501134692e-05, + "loss": 0.6615, + "num_tokens": 1516263802.0, + "step": 649 + }, + { + "epoch": 1.0248876598686485, + "grad_norm": 0.8072936105247144, + "learning_rate": 7.630375271039489e-05, + "loss": 0.6359, + "num_tokens": 1518478478.0, + "step": 650 + }, + { + "epoch": 1.0252333218112686, + "grad_norm": 0.6932729539087006, + "learning_rate": 7.628345752863374e-05, + "loss": 0.667, + "num_tokens": 1520774169.0, + "step": 651 + }, + { + "epoch": 1.0255789837538887, + "grad_norm": 0.5840938727847315, + "learning_rate": 7.626310949562619e-05, + "loss": 0.644, + "num_tokens": 1523067587.0, + "step": 652 + }, + { + "epoch": 1.0259246456965088, + "grad_norm": 0.543434169191495, + "learning_rate": 7.624270864101193e-05, + "loss": 0.6608, + "num_tokens": 1525375809.0, + "step": 653 + }, + { + "epoch": 1.0262703076391289, + "grad_norm": 0.7933480253455424, + "learning_rate": 7.622225499450761e-05, + "loss": 0.6627, + "num_tokens": 1527727521.0, + "step": 654 + }, + { + "epoch": 1.026615969581749, + "grad_norm": 0.4965360215465145, + "learning_rate": 7.620174858590675e-05, + "loss": 0.6632, + "num_tokens": 1530146486.0, + "step": 655 + }, + { + "epoch": 1.0269616315243693, + "grad_norm": 0.5876542604728499, + "learning_rate": 7.618118944507978e-05, + "loss": 0.6623, + "num_tokens": 1532516610.0, + "step": 656 + }, + { + "epoch": 1.0273072934669893, + "grad_norm": 0.5680896434244347, + "learning_rate": 7.616057760197388e-05, + "loss": 0.6615, + "num_tokens": 1534910929.0, + "step": 657 + }, + { + "epoch": 1.0276529554096094, + "grad_norm": 0.5644813651143472, + "learning_rate": 7.613991308661302e-05, + "loss": 0.6584, + "num_tokens": 1537193010.0, + "step": 658 + }, + { + "epoch": 1.0279986173522295, + "grad_norm": 0.557082880787213, + "learning_rate": 7.611919592909792e-05, + "loss": 0.6512, + "num_tokens": 1539530821.0, + "step": 659 + }, + { + "epoch": 1.0283442792948496, + "grad_norm": 0.5163226932167695, + "learning_rate": 7.609842615960595e-05, + "loss": 0.6527, + "num_tokens": 1541842685.0, + "step": 660 + }, + { + "epoch": 1.0286899412374697, + "grad_norm": 0.6494316991627198, + "learning_rate": 7.607760380839111e-05, + "loss": 0.6599, + "num_tokens": 1544150207.0, + "step": 661 + }, + { + "epoch": 1.0290356031800898, + "grad_norm": 0.7169128975167005, + "learning_rate": 7.605672890578404e-05, + "loss": 0.6505, + "num_tokens": 1546480381.0, + "step": 662 + }, + { + "epoch": 1.0293812651227099, + "grad_norm": 0.45838851670608066, + "learning_rate": 7.60358014821919e-05, + "loss": 0.6455, + "num_tokens": 1548865039.0, + "step": 663 + }, + { + "epoch": 1.0297269270653302, + "grad_norm": 0.5637559218656475, + "learning_rate": 7.60148215680983e-05, + "loss": 0.6351, + "num_tokens": 1551214967.0, + "step": 664 + }, + { + "epoch": 1.0300725890079503, + "grad_norm": 0.6933103396811869, + "learning_rate": 7.59937891940634e-05, + "loss": 0.6665, + "num_tokens": 1553613898.0, + "step": 665 + }, + { + "epoch": 1.0304182509505704, + "grad_norm": 0.5474278756097691, + "learning_rate": 7.597270439072372e-05, + "loss": 0.6688, + "num_tokens": 1555811871.0, + "step": 666 + }, + { + "epoch": 1.0307639128931905, + "grad_norm": 0.585996743924975, + "learning_rate": 7.59515671887922e-05, + "loss": 0.6524, + "num_tokens": 1558217502.0, + "step": 667 + }, + { + "epoch": 1.0311095748358106, + "grad_norm": 0.590909112214886, + "learning_rate": 7.593037761905803e-05, + "loss": 0.6528, + "num_tokens": 1560591813.0, + "step": 668 + }, + { + "epoch": 1.0314552367784307, + "grad_norm": 0.5120051549837813, + "learning_rate": 7.590913571238673e-05, + "loss": 0.6467, + "num_tokens": 1562947777.0, + "step": 669 + }, + { + "epoch": 1.0318008987210507, + "grad_norm": 0.5374538070518096, + "learning_rate": 7.588784149972007e-05, + "loss": 0.6529, + "num_tokens": 1565226888.0, + "step": 670 + }, + { + "epoch": 1.0321465606636708, + "grad_norm": 0.6514603483818726, + "learning_rate": 7.586649501207596e-05, + "loss": 0.6555, + "num_tokens": 1567566902.0, + "step": 671 + }, + { + "epoch": 1.0324922226062911, + "grad_norm": 0.5880500827656041, + "learning_rate": 7.58450962805485e-05, + "loss": 0.6552, + "num_tokens": 1570002540.0, + "step": 672 + }, + { + "epoch": 1.0328378845489112, + "grad_norm": 0.40116340005740103, + "learning_rate": 7.58236453363079e-05, + "loss": 0.644, + "num_tokens": 1572317546.0, + "step": 673 + }, + { + "epoch": 1.0331835464915313, + "grad_norm": 0.8968079763045643, + "learning_rate": 7.580214221060037e-05, + "loss": 0.6468, + "num_tokens": 1574670887.0, + "step": 674 + }, + { + "epoch": 1.0335292084341514, + "grad_norm": 0.5400714101142915, + "learning_rate": 7.578058693474817e-05, + "loss": 0.6583, + "num_tokens": 1577041992.0, + "step": 675 + }, + { + "epoch": 1.0338748703767715, + "grad_norm": 1.167715203666084, + "learning_rate": 7.57589795401495e-05, + "loss": 0.6759, + "num_tokens": 1579366383.0, + "step": 676 + }, + { + "epoch": 1.0342205323193916, + "grad_norm": 0.8548395155995829, + "learning_rate": 7.57373200582785e-05, + "loss": 0.6687, + "num_tokens": 1581686721.0, + "step": 677 + }, + { + "epoch": 1.0345661942620117, + "grad_norm": 0.737877349090235, + "learning_rate": 7.571560852068518e-05, + "loss": 0.6601, + "num_tokens": 1584027524.0, + "step": 678 + }, + { + "epoch": 1.0349118562046318, + "grad_norm": 0.7724750351761767, + "learning_rate": 7.569384495899537e-05, + "loss": 0.6643, + "num_tokens": 1586214548.0, + "step": 679 + }, + { + "epoch": 1.035257518147252, + "grad_norm": 0.6427434281966125, + "learning_rate": 7.567202940491068e-05, + "loss": 0.6529, + "num_tokens": 1588602294.0, + "step": 680 + }, + { + "epoch": 1.0356031800898722, + "grad_norm": 0.7060933592824846, + "learning_rate": 7.565016189020846e-05, + "loss": 0.6602, + "num_tokens": 1590931577.0, + "step": 681 + }, + { + "epoch": 1.0359488420324923, + "grad_norm": 0.5749061643754176, + "learning_rate": 7.562824244674171e-05, + "loss": 0.6586, + "num_tokens": 1593221502.0, + "step": 682 + }, + { + "epoch": 1.0362945039751124, + "grad_norm": 0.9674203250964503, + "learning_rate": 7.560627110643914e-05, + "loss": 0.652, + "num_tokens": 1595529164.0, + "step": 683 + }, + { + "epoch": 1.0366401659177324, + "grad_norm": 0.636148254062618, + "learning_rate": 7.558424790130502e-05, + "loss": 0.6649, + "num_tokens": 1597928186.0, + "step": 684 + }, + { + "epoch": 1.0369858278603525, + "grad_norm": 1.2858714510085154, + "learning_rate": 7.556217286341914e-05, + "loss": 0.6637, + "num_tokens": 1600287202.0, + "step": 685 + }, + { + "epoch": 1.0373314898029726, + "grad_norm": 1.1981294794756567, + "learning_rate": 7.554004602493684e-05, + "loss": 0.6578, + "num_tokens": 1602544956.0, + "step": 686 + }, + { + "epoch": 1.0376771517455927, + "grad_norm": 0.5485746693126576, + "learning_rate": 7.551786741808888e-05, + "loss": 0.667, + "num_tokens": 1604947160.0, + "step": 687 + }, + { + "epoch": 1.038022813688213, + "grad_norm": 0.9849890404623954, + "learning_rate": 7.549563707518146e-05, + "loss": 0.6575, + "num_tokens": 1607334967.0, + "step": 688 + }, + { + "epoch": 1.0383684756308331, + "grad_norm": 0.7416291684802955, + "learning_rate": 7.54733550285961e-05, + "loss": 0.6699, + "num_tokens": 1609590170.0, + "step": 689 + }, + { + "epoch": 1.0387141375734532, + "grad_norm": 0.9466627868323391, + "learning_rate": 7.545102131078966e-05, + "loss": 0.6364, + "num_tokens": 1611878853.0, + "step": 690 + }, + { + "epoch": 1.0390597995160733, + "grad_norm": 0.6625787569147549, + "learning_rate": 7.542863595429427e-05, + "loss": 0.6701, + "num_tokens": 1614271436.0, + "step": 691 + }, + { + "epoch": 1.0394054614586934, + "grad_norm": 0.8851057627269082, + "learning_rate": 7.540619899171724e-05, + "loss": 0.6434, + "num_tokens": 1616614392.0, + "step": 692 + }, + { + "epoch": 1.0397511234013135, + "grad_norm": 0.6865229050413474, + "learning_rate": 7.538371045574113e-05, + "loss": 0.6642, + "num_tokens": 1618931190.0, + "step": 693 + }, + { + "epoch": 1.0400967853439336, + "grad_norm": 0.9536804293374974, + "learning_rate": 7.536117037912354e-05, + "loss": 0.6541, + "num_tokens": 1621360340.0, + "step": 694 + }, + { + "epoch": 1.0404424472865537, + "grad_norm": 0.9243922530318902, + "learning_rate": 7.53385787946972e-05, + "loss": 0.6518, + "num_tokens": 1623537986.0, + "step": 695 + }, + { + "epoch": 1.040788109229174, + "grad_norm": 0.6489092556853862, + "learning_rate": 7.531593573536985e-05, + "loss": 0.6542, + "num_tokens": 1625975581.0, + "step": 696 + }, + { + "epoch": 1.041133771171794, + "grad_norm": 0.7070904565887195, + "learning_rate": 7.529324123412417e-05, + "loss": 0.6446, + "num_tokens": 1628249323.0, + "step": 697 + }, + { + "epoch": 1.0414794331144142, + "grad_norm": 0.5605860382938546, + "learning_rate": 7.527049532401786e-05, + "loss": 0.6494, + "num_tokens": 1630621076.0, + "step": 698 + }, + { + "epoch": 1.0418250950570342, + "grad_norm": 0.5440822681356616, + "learning_rate": 7.524769803818344e-05, + "loss": 0.6657, + "num_tokens": 1632919877.0, + "step": 699 + }, + { + "epoch": 1.0421707569996543, + "grad_norm": 0.6879646283882763, + "learning_rate": 7.522484940982828e-05, + "loss": 0.6683, + "num_tokens": 1635377901.0, + "step": 700 + }, + { + "epoch": 1.0425164189422744, + "grad_norm": 0.7656100461663494, + "learning_rate": 7.520194947223452e-05, + "loss": 0.6579, + "num_tokens": 1637667005.0, + "step": 701 + }, + { + "epoch": 1.0428620808848945, + "grad_norm": 0.5020779204302981, + "learning_rate": 7.517899825875908e-05, + "loss": 0.6673, + "num_tokens": 1639995071.0, + "step": 702 + }, + { + "epoch": 1.0432077428275146, + "grad_norm": 1.0248831881344556, + "learning_rate": 7.515599580283355e-05, + "loss": 0.6676, + "num_tokens": 1642403239.0, + "step": 703 + }, + { + "epoch": 1.043553404770135, + "grad_norm": 0.7934296382607025, + "learning_rate": 7.513294213796416e-05, + "loss": 0.6595, + "num_tokens": 1644747762.0, + "step": 704 + }, + { + "epoch": 1.043899066712755, + "grad_norm": 0.785876435755504, + "learning_rate": 7.510983729773172e-05, + "loss": 0.6544, + "num_tokens": 1647148874.0, + "step": 705 + }, + { + "epoch": 1.044244728655375, + "grad_norm": 0.6653142046134285, + "learning_rate": 7.50866813157916e-05, + "loss": 0.6476, + "num_tokens": 1649429078.0, + "step": 706 + }, + { + "epoch": 1.0445903905979952, + "grad_norm": 0.5805927347944138, + "learning_rate": 7.506347422587367e-05, + "loss": 0.6459, + "num_tokens": 1651724969.0, + "step": 707 + }, + { + "epoch": 1.0449360525406153, + "grad_norm": 0.4995620326946016, + "learning_rate": 7.504021606178223e-05, + "loss": 0.652, + "num_tokens": 1654102988.0, + "step": 708 + }, + { + "epoch": 1.0452817144832354, + "grad_norm": 1.0024879397761142, + "learning_rate": 7.5016906857396e-05, + "loss": 0.6537, + "num_tokens": 1656515825.0, + "step": 709 + }, + { + "epoch": 1.0456273764258555, + "grad_norm": 0.7784919709657042, + "learning_rate": 7.499354664666799e-05, + "loss": 0.6468, + "num_tokens": 1658838824.0, + "step": 710 + }, + { + "epoch": 1.0459730383684755, + "grad_norm": 0.9938225644209574, + "learning_rate": 7.49701354636256e-05, + "loss": 0.6632, + "num_tokens": 1661324459.0, + "step": 711 + }, + { + "epoch": 1.0463187003110956, + "grad_norm": 0.8426641245749431, + "learning_rate": 7.494667334237038e-05, + "loss": 0.6584, + "num_tokens": 1663618935.0, + "step": 712 + }, + { + "epoch": 1.046664362253716, + "grad_norm": 1.0918729539916363, + "learning_rate": 7.492316031707816e-05, + "loss": 0.6582, + "num_tokens": 1666090420.0, + "step": 713 + }, + { + "epoch": 1.047010024196336, + "grad_norm": 0.7066020239858003, + "learning_rate": 7.489959642199887e-05, + "loss": 0.6445, + "num_tokens": 1668437369.0, + "step": 714 + }, + { + "epoch": 1.0473556861389561, + "grad_norm": 1.0404264832931147, + "learning_rate": 7.487598169145655e-05, + "loss": 0.6375, + "num_tokens": 1670695341.0, + "step": 715 + }, + { + "epoch": 1.0477013480815762, + "grad_norm": 0.7302789837282395, + "learning_rate": 7.485231615984931e-05, + "loss": 0.6578, + "num_tokens": 1672936203.0, + "step": 716 + }, + { + "epoch": 1.0480470100241963, + "grad_norm": 0.99046289830908, + "learning_rate": 7.482859986164923e-05, + "loss": 0.6295, + "num_tokens": 1675294177.0, + "step": 717 + }, + { + "epoch": 1.0483926719668164, + "grad_norm": 0.8661096408910032, + "learning_rate": 7.480483283140234e-05, + "loss": 0.6394, + "num_tokens": 1677573567.0, + "step": 718 + }, + { + "epoch": 1.0487383339094365, + "grad_norm": 0.8909145637324993, + "learning_rate": 7.478101510372859e-05, + "loss": 0.6531, + "num_tokens": 1679936390.0, + "step": 719 + }, + { + "epoch": 1.0490839958520566, + "grad_norm": 0.8656962894943192, + "learning_rate": 7.475714671332174e-05, + "loss": 0.6437, + "num_tokens": 1682313056.0, + "step": 720 + }, + { + "epoch": 1.049429657794677, + "grad_norm": 0.7777445530331969, + "learning_rate": 7.473322769494939e-05, + "loss": 0.6629, + "num_tokens": 1684756818.0, + "step": 721 + }, + { + "epoch": 1.049775319737297, + "grad_norm": 0.7722760692911367, + "learning_rate": 7.470925808345288e-05, + "loss": 0.667, + "num_tokens": 1687064609.0, + "step": 722 + }, + { + "epoch": 1.050120981679917, + "grad_norm": 0.5156006867367259, + "learning_rate": 7.468523791374722e-05, + "loss": 0.656, + "num_tokens": 1689398275.0, + "step": 723 + }, + { + "epoch": 1.0504666436225372, + "grad_norm": 1.2427373802965145, + "learning_rate": 7.466116722082109e-05, + "loss": 0.6556, + "num_tokens": 1691759259.0, + "step": 724 + }, + { + "epoch": 1.0508123055651573, + "grad_norm": 0.9602253747856707, + "learning_rate": 7.463704603973674e-05, + "loss": 0.651, + "num_tokens": 1694009012.0, + "step": 725 + }, + { + "epoch": 1.0511579675077773, + "grad_norm": 1.2299356232976921, + "learning_rate": 7.461287440562998e-05, + "loss": 0.6354, + "num_tokens": 1696272685.0, + "step": 726 + }, + { + "epoch": 1.0515036294503974, + "grad_norm": 1.2578384724260376, + "learning_rate": 7.458865235371014e-05, + "loss": 0.6485, + "num_tokens": 1698606423.0, + "step": 727 + }, + { + "epoch": 1.0518492913930175, + "grad_norm": 0.8398174663359371, + "learning_rate": 7.45643799192599e-05, + "loss": 0.6457, + "num_tokens": 1700848633.0, + "step": 728 + }, + { + "epoch": 1.0521949533356378, + "grad_norm": 0.8690683195161508, + "learning_rate": 7.454005713763542e-05, + "loss": 0.6229, + "num_tokens": 1703082424.0, + "step": 729 + }, + { + "epoch": 1.052540615278258, + "grad_norm": 0.8994921927069949, + "learning_rate": 7.451568404426616e-05, + "loss": 0.6531, + "num_tokens": 1705436343.0, + "step": 730 + }, + { + "epoch": 1.052886277220878, + "grad_norm": 0.6372820534131493, + "learning_rate": 7.449126067465489e-05, + "loss": 0.6384, + "num_tokens": 1707640584.0, + "step": 731 + }, + { + "epoch": 1.053231939163498, + "grad_norm": 1.0529172440016912, + "learning_rate": 7.446678706437757e-05, + "loss": 0.6301, + "num_tokens": 1709958348.0, + "step": 732 + }, + { + "epoch": 1.0535776011061182, + "grad_norm": 0.8723977841586724, + "learning_rate": 7.444226324908337e-05, + "loss": 0.6614, + "num_tokens": 1712337837.0, + "step": 733 + }, + { + "epoch": 1.0539232630487383, + "grad_norm": 0.9742638243269647, + "learning_rate": 7.441768926449462e-05, + "loss": 0.6454, + "num_tokens": 1714554478.0, + "step": 734 + }, + { + "epoch": 1.0542689249913584, + "grad_norm": 0.8825078820815238, + "learning_rate": 7.439306514640664e-05, + "loss": 0.6373, + "num_tokens": 1716824287.0, + "step": 735 + }, + { + "epoch": 1.0546145869339785, + "grad_norm": 0.8084516685896288, + "learning_rate": 7.436839093068789e-05, + "loss": 0.6554, + "num_tokens": 1719124722.0, + "step": 736 + }, + { + "epoch": 1.0549602488765988, + "grad_norm": 0.6476128238630405, + "learning_rate": 7.434366665327972e-05, + "loss": 0.6243, + "num_tokens": 1721387459.0, + "step": 737 + }, + { + "epoch": 1.0553059108192189, + "grad_norm": 0.9501488634030735, + "learning_rate": 7.431889235019642e-05, + "loss": 0.6607, + "num_tokens": 1723663967.0, + "step": 738 + }, + { + "epoch": 1.055651572761839, + "grad_norm": 0.6065977588322126, + "learning_rate": 7.429406805752517e-05, + "loss": 0.6405, + "num_tokens": 1726098124.0, + "step": 739 + }, + { + "epoch": 1.055997234704459, + "grad_norm": 1.2323642179986805, + "learning_rate": 7.426919381142596e-05, + "loss": 0.6534, + "num_tokens": 1728451629.0, + "step": 740 + }, + { + "epoch": 1.0563428966470791, + "grad_norm": 1.2010325979432563, + "learning_rate": 7.424426964813154e-05, + "loss": 0.6679, + "num_tokens": 1730884149.0, + "step": 741 + }, + { + "epoch": 1.0566885585896992, + "grad_norm": 0.6975154125871768, + "learning_rate": 7.421929560394736e-05, + "loss": 0.6545, + "num_tokens": 1733153296.0, + "step": 742 + }, + { + "epoch": 1.0570342205323193, + "grad_norm": 0.78449847823142, + "learning_rate": 7.419427171525152e-05, + "loss": 0.6414, + "num_tokens": 1735500140.0, + "step": 743 + }, + { + "epoch": 1.0573798824749394, + "grad_norm": 0.6423226250173539, + "learning_rate": 7.416919801849479e-05, + "loss": 0.6577, + "num_tokens": 1737848869.0, + "step": 744 + }, + { + "epoch": 1.0577255444175597, + "grad_norm": 0.6105721895654297, + "learning_rate": 7.414407455020042e-05, + "loss": 0.6192, + "num_tokens": 1740058638.0, + "step": 745 + }, + { + "epoch": 1.0580712063601798, + "grad_norm": 0.5677178819217232, + "learning_rate": 7.411890134696417e-05, + "loss": 0.6526, + "num_tokens": 1742408534.0, + "step": 746 + }, + { + "epoch": 1.0584168683028, + "grad_norm": 0.5482402190242479, + "learning_rate": 7.40936784454543e-05, + "loss": 0.6536, + "num_tokens": 1744745805.0, + "step": 747 + }, + { + "epoch": 1.05876253024542, + "grad_norm": 0.547107451115583, + "learning_rate": 7.406840588241138e-05, + "loss": 0.6441, + "num_tokens": 1746951988.0, + "step": 748 + }, + { + "epoch": 1.05910819218804, + "grad_norm": 0.5905695130265809, + "learning_rate": 7.404308369464839e-05, + "loss": 0.6341, + "num_tokens": 1749258212.0, + "step": 749 + }, + { + "epoch": 1.0594538541306602, + "grad_norm": 0.5030246960359617, + "learning_rate": 7.401771191905056e-05, + "loss": 0.6479, + "num_tokens": 1751554535.0, + "step": 750 + }, + { + "epoch": 1.0597995160732803, + "grad_norm": 0.5832871767615319, + "learning_rate": 7.399229059257537e-05, + "loss": 0.652, + "num_tokens": 1753966669.0, + "step": 751 + }, + { + "epoch": 1.0601451780159004, + "grad_norm": 0.5121125935844938, + "learning_rate": 7.396681975225244e-05, + "loss": 0.6569, + "num_tokens": 1756377001.0, + "step": 752 + }, + { + "epoch": 1.0604908399585207, + "grad_norm": 0.4503017435065586, + "learning_rate": 7.394129943518356e-05, + "loss": 0.6311, + "num_tokens": 1758737119.0, + "step": 753 + }, + { + "epoch": 1.0608365019011408, + "grad_norm": 0.7949061846548016, + "learning_rate": 7.391572967854258e-05, + "loss": 0.648, + "num_tokens": 1761026759.0, + "step": 754 + }, + { + "epoch": 1.0611821638437609, + "grad_norm": 0.4560064351545243, + "learning_rate": 7.389011051957532e-05, + "loss": 0.6451, + "num_tokens": 1763350042.0, + "step": 755 + }, + { + "epoch": 1.061527825786381, + "grad_norm": 1.2347077769242014, + "learning_rate": 7.386444199559961e-05, + "loss": 0.6487, + "num_tokens": 1765650817.0, + "step": 756 + }, + { + "epoch": 1.061873487729001, + "grad_norm": 1.158583874700133, + "learning_rate": 7.38387241440052e-05, + "loss": 0.6504, + "num_tokens": 1767881814.0, + "step": 757 + }, + { + "epoch": 1.0622191496716211, + "grad_norm": 0.6448011263007218, + "learning_rate": 7.381295700225364e-05, + "loss": 0.641, + "num_tokens": 1770175833.0, + "step": 758 + }, + { + "epoch": 1.0625648116142412, + "grad_norm": 0.8917307508923848, + "learning_rate": 7.37871406078783e-05, + "loss": 0.667, + "num_tokens": 1772663319.0, + "step": 759 + }, + { + "epoch": 1.0629104735568613, + "grad_norm": 0.6104621491368218, + "learning_rate": 7.37612749984843e-05, + "loss": 0.6436, + "num_tokens": 1775138394.0, + "step": 760 + }, + { + "epoch": 1.0632561354994814, + "grad_norm": 0.8124160430251444, + "learning_rate": 7.373536021174847e-05, + "loss": 0.6536, + "num_tokens": 1777396741.0, + "step": 761 + }, + { + "epoch": 1.0636017974421017, + "grad_norm": 0.7039889909540522, + "learning_rate": 7.370939628541924e-05, + "loss": 0.6474, + "num_tokens": 1779706764.0, + "step": 762 + }, + { + "epoch": 1.0639474593847218, + "grad_norm": 0.689193292786864, + "learning_rate": 7.368338325731661e-05, + "loss": 0.6439, + "num_tokens": 1781916130.0, + "step": 763 + }, + { + "epoch": 1.0642931213273419, + "grad_norm": 0.6390137507725259, + "learning_rate": 7.365732116533211e-05, + "loss": 0.6419, + "num_tokens": 1784188330.0, + "step": 764 + }, + { + "epoch": 1.064638783269962, + "grad_norm": 0.6554849571391587, + "learning_rate": 7.363121004742878e-05, + "loss": 0.6519, + "num_tokens": 1786644594.0, + "step": 765 + }, + { + "epoch": 1.064984445212582, + "grad_norm": 0.6312688213053872, + "learning_rate": 7.360504994164103e-05, + "loss": 0.6507, + "num_tokens": 1789030996.0, + "step": 766 + }, + { + "epoch": 1.0653301071552022, + "grad_norm": 0.6189738003958034, + "learning_rate": 7.357884088607464e-05, + "loss": 0.6488, + "num_tokens": 1791436337.0, + "step": 767 + }, + { + "epoch": 1.0656757690978222, + "grad_norm": 0.6977938063772677, + "learning_rate": 7.355258291890668e-05, + "loss": 0.6499, + "num_tokens": 1793687333.0, + "step": 768 + }, + { + "epoch": 1.0660214310404426, + "grad_norm": 0.619630514606608, + "learning_rate": 7.352627607838552e-05, + "loss": 0.6462, + "num_tokens": 1796062168.0, + "step": 769 + }, + { + "epoch": 1.0663670929830626, + "grad_norm": 0.5828130974841995, + "learning_rate": 7.349992040283063e-05, + "loss": 0.645, + "num_tokens": 1798457607.0, + "step": 770 + }, + { + "epoch": 1.0667127549256827, + "grad_norm": 0.6199761505982225, + "learning_rate": 7.347351593063268e-05, + "loss": 0.6372, + "num_tokens": 1800781598.0, + "step": 771 + }, + { + "epoch": 1.0670584168683028, + "grad_norm": 0.5266254097971709, + "learning_rate": 7.344706270025341e-05, + "loss": 0.6395, + "num_tokens": 1803166909.0, + "step": 772 + }, + { + "epoch": 1.067404078810923, + "grad_norm": 0.6359171321541497, + "learning_rate": 7.342056075022558e-05, + "loss": 0.6643, + "num_tokens": 1805621827.0, + "step": 773 + }, + { + "epoch": 1.067749740753543, + "grad_norm": 0.5002793760377864, + "learning_rate": 7.339401011915288e-05, + "loss": 0.6553, + "num_tokens": 1808133341.0, + "step": 774 + }, + { + "epoch": 1.068095402696163, + "grad_norm": 0.45445502906255464, + "learning_rate": 7.336741084571e-05, + "loss": 0.6436, + "num_tokens": 1810498181.0, + "step": 775 + }, + { + "epoch": 1.0684410646387832, + "grad_norm": 0.5993541693473192, + "learning_rate": 7.334076296864237e-05, + "loss": 0.6566, + "num_tokens": 1812815708.0, + "step": 776 + }, + { + "epoch": 1.0687867265814033, + "grad_norm": 0.39715208242474315, + "learning_rate": 7.331406652676631e-05, + "loss": 0.6257, + "num_tokens": 1815191832.0, + "step": 777 + }, + { + "epoch": 1.0691323885240236, + "grad_norm": 0.5807411421752339, + "learning_rate": 7.328732155896883e-05, + "loss": 0.6512, + "num_tokens": 1817555497.0, + "step": 778 + }, + { + "epoch": 1.0694780504666437, + "grad_norm": 0.536870812872521, + "learning_rate": 7.326052810420765e-05, + "loss": 0.6386, + "num_tokens": 1819843980.0, + "step": 779 + }, + { + "epoch": 1.0698237124092638, + "grad_norm": 0.4966890078545703, + "learning_rate": 7.323368620151112e-05, + "loss": 0.6371, + "num_tokens": 1822080005.0, + "step": 780 + }, + { + "epoch": 1.0701693743518839, + "grad_norm": 0.5976501128568695, + "learning_rate": 7.320679588997813e-05, + "loss": 0.6588, + "num_tokens": 1824416891.0, + "step": 781 + }, + { + "epoch": 1.070515036294504, + "grad_norm": 0.46247030445853615, + "learning_rate": 7.317985720877812e-05, + "loss": 0.6402, + "num_tokens": 1826709154.0, + "step": 782 + }, + { + "epoch": 1.070860698237124, + "grad_norm": 0.7536096132054806, + "learning_rate": 7.315287019715096e-05, + "loss": 0.6437, + "num_tokens": 1829001355.0, + "step": 783 + }, + { + "epoch": 1.0712063601797441, + "grad_norm": 0.38994420778625705, + "learning_rate": 7.312583489440692e-05, + "loss": 0.6538, + "num_tokens": 1831393805.0, + "step": 784 + }, + { + "epoch": 1.0715520221223644, + "grad_norm": 0.7109690429009989, + "learning_rate": 7.309875133992666e-05, + "loss": 0.6455, + "num_tokens": 1833796453.0, + "step": 785 + }, + { + "epoch": 1.0718976840649845, + "grad_norm": 0.4852179051637593, + "learning_rate": 7.307161957316106e-05, + "loss": 0.6527, + "num_tokens": 1836211006.0, + "step": 786 + }, + { + "epoch": 1.0722433460076046, + "grad_norm": 0.7083451989057316, + "learning_rate": 7.304443963363126e-05, + "loss": 0.6522, + "num_tokens": 1838424514.0, + "step": 787 + }, + { + "epoch": 1.0725890079502247, + "grad_norm": 0.7573885040764021, + "learning_rate": 7.301721156092858e-05, + "loss": 0.6398, + "num_tokens": 1840656105.0, + "step": 788 + }, + { + "epoch": 1.0729346698928448, + "grad_norm": 0.6114814000393922, + "learning_rate": 7.298993539471443e-05, + "loss": 0.6598, + "num_tokens": 1843070310.0, + "step": 789 + }, + { + "epoch": 1.073280331835465, + "grad_norm": 0.6934632391747559, + "learning_rate": 7.29626111747203e-05, + "loss": 0.656, + "num_tokens": 1845457321.0, + "step": 790 + }, + { + "epoch": 1.073625993778085, + "grad_norm": 0.5843166128213365, + "learning_rate": 7.293523894074763e-05, + "loss": 0.641, + "num_tokens": 1847767475.0, + "step": 791 + }, + { + "epoch": 1.073971655720705, + "grad_norm": 0.667579078831484, + "learning_rate": 7.290781873266787e-05, + "loss": 0.6349, + "num_tokens": 1850145899.0, + "step": 792 + }, + { + "epoch": 1.0743173176633252, + "grad_norm": 0.6197358398924984, + "learning_rate": 7.288035059042229e-05, + "loss": 0.6564, + "num_tokens": 1852622005.0, + "step": 793 + }, + { + "epoch": 1.0746629796059455, + "grad_norm": 0.7001967420866541, + "learning_rate": 7.285283455402199e-05, + "loss": 0.6277, + "num_tokens": 1854879141.0, + "step": 794 + }, + { + "epoch": 1.0750086415485656, + "grad_norm": 0.5182855704796356, + "learning_rate": 7.282527066354787e-05, + "loss": 0.6497, + "num_tokens": 1857195141.0, + "step": 795 + }, + { + "epoch": 1.0753543034911857, + "grad_norm": 0.5523797722375886, + "learning_rate": 7.279765895915051e-05, + "loss": 0.63, + "num_tokens": 1859489387.0, + "step": 796 + }, + { + "epoch": 1.0756999654338057, + "grad_norm": 0.760152374250873, + "learning_rate": 7.276999948105014e-05, + "loss": 0.6309, + "num_tokens": 1861760812.0, + "step": 797 + }, + { + "epoch": 1.0760456273764258, + "grad_norm": 0.4682843426946511, + "learning_rate": 7.274229226953658e-05, + "loss": 0.631, + "num_tokens": 1864121467.0, + "step": 798 + }, + { + "epoch": 1.076391289319046, + "grad_norm": 0.8337644900637217, + "learning_rate": 7.271453736496918e-05, + "loss": 0.6447, + "num_tokens": 1866587557.0, + "step": 799 + }, + { + "epoch": 1.076736951261666, + "grad_norm": 0.7879071626071465, + "learning_rate": 7.268673480777676e-05, + "loss": 0.6505, + "num_tokens": 1868815218.0, + "step": 800 + }, + { + "epoch": 1.077082613204286, + "grad_norm": 0.8061431677035321, + "learning_rate": 7.265888463845758e-05, + "loss": 0.6389, + "num_tokens": 1871103301.0, + "step": 801 + }, + { + "epoch": 1.0774282751469064, + "grad_norm": 0.6987493343066828, + "learning_rate": 7.26309868975792e-05, + "loss": 0.6489, + "num_tokens": 1873534935.0, + "step": 802 + }, + { + "epoch": 1.0777739370895265, + "grad_norm": 0.6400362720975927, + "learning_rate": 7.260304162577852e-05, + "loss": 0.6493, + "num_tokens": 1875885442.0, + "step": 803 + }, + { + "epoch": 1.0781195990321466, + "grad_norm": 0.6266223091351598, + "learning_rate": 7.257504886376164e-05, + "loss": 0.6655, + "num_tokens": 1878198463.0, + "step": 804 + }, + { + "epoch": 1.0784652609747667, + "grad_norm": 0.6739942656077761, + "learning_rate": 7.254700865230387e-05, + "loss": 0.6768, + "num_tokens": 1880600485.0, + "step": 805 + }, + { + "epoch": 1.0788109229173868, + "grad_norm": 0.5902245697276366, + "learning_rate": 7.251892103224961e-05, + "loss": 0.6397, + "num_tokens": 1882914183.0, + "step": 806 + }, + { + "epoch": 1.0791565848600069, + "grad_norm": 0.7020689395643801, + "learning_rate": 7.249078604451235e-05, + "loss": 0.6395, + "num_tokens": 1885233760.0, + "step": 807 + }, + { + "epoch": 1.079502246802627, + "grad_norm": 0.6115206656323819, + "learning_rate": 7.246260373007453e-05, + "loss": 0.6506, + "num_tokens": 1887564291.0, + "step": 808 + }, + { + "epoch": 1.079847908745247, + "grad_norm": 0.6114811908663207, + "learning_rate": 7.243437412998757e-05, + "loss": 0.6366, + "num_tokens": 1889754096.0, + "step": 809 + }, + { + "epoch": 1.0801935706878674, + "grad_norm": 0.46225512013206405, + "learning_rate": 7.240609728537177e-05, + "loss": 0.6512, + "num_tokens": 1892045520.0, + "step": 810 + }, + { + "epoch": 1.0805392326304875, + "grad_norm": 0.6560348649529575, + "learning_rate": 7.237777323741618e-05, + "loss": 0.6557, + "num_tokens": 1894354541.0, + "step": 811 + }, + { + "epoch": 1.0808848945731075, + "grad_norm": 0.6453921405854075, + "learning_rate": 7.23494020273787e-05, + "loss": 0.6525, + "num_tokens": 1896652688.0, + "step": 812 + }, + { + "epoch": 1.0812305565157276, + "grad_norm": 0.5369860393744597, + "learning_rate": 7.232098369658586e-05, + "loss": 0.6526, + "num_tokens": 1899087830.0, + "step": 813 + }, + { + "epoch": 1.0815762184583477, + "grad_norm": 0.4107974206461175, + "learning_rate": 7.229251828643286e-05, + "loss": 0.6424, + "num_tokens": 1901392709.0, + "step": 814 + }, + { + "epoch": 1.0819218804009678, + "grad_norm": 0.4578877114563842, + "learning_rate": 7.226400583838349e-05, + "loss": 0.6304, + "num_tokens": 1903834795.0, + "step": 815 + }, + { + "epoch": 1.082267542343588, + "grad_norm": 0.5944889433338477, + "learning_rate": 7.223544639397004e-05, + "loss": 0.6458, + "num_tokens": 1906206411.0, + "step": 816 + }, + { + "epoch": 1.082613204286208, + "grad_norm": 0.6184931999931358, + "learning_rate": 7.220683999479321e-05, + "loss": 0.6361, + "num_tokens": 1908577672.0, + "step": 817 + }, + { + "epoch": 1.0829588662288283, + "grad_norm": 0.45795276862694984, + "learning_rate": 7.217818668252218e-05, + "loss": 0.6331, + "num_tokens": 1910889471.0, + "step": 818 + }, + { + "epoch": 1.0833045281714484, + "grad_norm": 0.5132892740499396, + "learning_rate": 7.214948649889444e-05, + "loss": 0.6377, + "num_tokens": 1913234203.0, + "step": 819 + }, + { + "epoch": 1.0836501901140685, + "grad_norm": 0.3448583302603749, + "learning_rate": 7.212073948571568e-05, + "loss": 0.628, + "num_tokens": 1915596601.0, + "step": 820 + }, + { + "epoch": 1.0839958520566886, + "grad_norm": 0.5513522868846035, + "learning_rate": 7.209194568485995e-05, + "loss": 0.6233, + "num_tokens": 1917901182.0, + "step": 821 + }, + { + "epoch": 1.0843415139993087, + "grad_norm": 0.5866413038599926, + "learning_rate": 7.20631051382693e-05, + "loss": 0.6458, + "num_tokens": 1920226840.0, + "step": 822 + }, + { + "epoch": 1.0846871759419288, + "grad_norm": 0.4045684168469693, + "learning_rate": 7.203421788795396e-05, + "loss": 0.6471, + "num_tokens": 1922453696.0, + "step": 823 + }, + { + "epoch": 1.0850328378845489, + "grad_norm": 0.7869103482008692, + "learning_rate": 7.200528397599219e-05, + "loss": 0.6494, + "num_tokens": 1924759201.0, + "step": 824 + }, + { + "epoch": 1.085378499827169, + "grad_norm": 0.5410056370259639, + "learning_rate": 7.197630344453017e-05, + "loss": 0.6602, + "num_tokens": 1927135596.0, + "step": 825 + }, + { + "epoch": 1.085724161769789, + "grad_norm": 0.7958783353763306, + "learning_rate": 7.194727633578201e-05, + "loss": 0.6549, + "num_tokens": 1929610669.0, + "step": 826 + }, + { + "epoch": 1.0860698237124093, + "grad_norm": 0.6402841646465279, + "learning_rate": 7.19182026920297e-05, + "loss": 0.6459, + "num_tokens": 1931883643.0, + "step": 827 + }, + { + "epoch": 1.0864154856550294, + "grad_norm": 0.818857544167672, + "learning_rate": 7.188908255562297e-05, + "loss": 0.6481, + "num_tokens": 1934243142.0, + "step": 828 + }, + { + "epoch": 1.0867611475976495, + "grad_norm": 0.5363775378787705, + "learning_rate": 7.18599159689793e-05, + "loss": 0.6493, + "num_tokens": 1936514746.0, + "step": 829 + }, + { + "epoch": 1.0871068095402696, + "grad_norm": 0.864852068872538, + "learning_rate": 7.183070297458383e-05, + "loss": 0.6391, + "num_tokens": 1938735061.0, + "step": 830 + }, + { + "epoch": 1.0874524714828897, + "grad_norm": 0.8102716523037938, + "learning_rate": 7.180144361498927e-05, + "loss": 0.6468, + "num_tokens": 1941031300.0, + "step": 831 + }, + { + "epoch": 1.0877981334255098, + "grad_norm": 0.5484483948735375, + "learning_rate": 7.177213793281587e-05, + "loss": 0.6548, + "num_tokens": 1943463463.0, + "step": 832 + }, + { + "epoch": 1.0881437953681299, + "grad_norm": 0.7593067290211563, + "learning_rate": 7.174278597075143e-05, + "loss": 0.6518, + "num_tokens": 1945702324.0, + "step": 833 + }, + { + "epoch": 1.0884894573107502, + "grad_norm": 0.6533774156522734, + "learning_rate": 7.171338777155107e-05, + "loss": 0.6374, + "num_tokens": 1948050388.0, + "step": 834 + }, + { + "epoch": 1.0888351192533703, + "grad_norm": 0.5686216185926073, + "learning_rate": 7.16839433780373e-05, + "loss": 0.6499, + "num_tokens": 1950369193.0, + "step": 835 + }, + { + "epoch": 1.0891807811959904, + "grad_norm": 0.706657891467921, + "learning_rate": 7.165445283309989e-05, + "loss": 0.6464, + "num_tokens": 1952782653.0, + "step": 836 + }, + { + "epoch": 1.0895264431386105, + "grad_norm": 0.572975938004015, + "learning_rate": 7.162491617969592e-05, + "loss": 0.6411, + "num_tokens": 1955164495.0, + "step": 837 + }, + { + "epoch": 1.0898721050812306, + "grad_norm": 0.4864844513645345, + "learning_rate": 7.159533346084952e-05, + "loss": 0.6362, + "num_tokens": 1957510144.0, + "step": 838 + }, + { + "epoch": 1.0902177670238506, + "grad_norm": 0.5932761701971104, + "learning_rate": 7.156570471965199e-05, + "loss": 0.6434, + "num_tokens": 1959879213.0, + "step": 839 + }, + { + "epoch": 1.0905634289664707, + "grad_norm": 0.4599648868842703, + "learning_rate": 7.153602999926166e-05, + "loss": 0.6264, + "num_tokens": 1962237084.0, + "step": 840 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.6961549336858718, + "learning_rate": 7.150630934290383e-05, + "loss": 0.6564, + "num_tokens": 1964642093.0, + "step": 841 + }, + { + "epoch": 1.091254752851711, + "grad_norm": 0.5532430573581417, + "learning_rate": 7.147654279387071e-05, + "loss": 0.6357, + "num_tokens": 1967001460.0, + "step": 842 + }, + { + "epoch": 1.0916004147943312, + "grad_norm": 0.5394878914126686, + "learning_rate": 7.144673039552135e-05, + "loss": 0.6411, + "num_tokens": 1969354407.0, + "step": 843 + }, + { + "epoch": 1.0919460767369513, + "grad_norm": 0.6909141430882717, + "learning_rate": 7.14168721912816e-05, + "loss": 0.623, + "num_tokens": 1971649574.0, + "step": 844 + }, + { + "epoch": 1.0922917386795714, + "grad_norm": 0.5137859737101764, + "learning_rate": 7.138696822464401e-05, + "loss": 0.6486, + "num_tokens": 1973936364.0, + "step": 845 + }, + { + "epoch": 1.0926374006221915, + "grad_norm": 0.832982418739012, + "learning_rate": 7.135701853916784e-05, + "loss": 0.6382, + "num_tokens": 1976244124.0, + "step": 846 + }, + { + "epoch": 1.0929830625648116, + "grad_norm": 0.7229837251766013, + "learning_rate": 7.132702317847889e-05, + "loss": 0.6198, + "num_tokens": 1978510513.0, + "step": 847 + }, + { + "epoch": 1.0933287245074317, + "grad_norm": 0.7518776950417697, + "learning_rate": 7.129698218626951e-05, + "loss": 0.6523, + "num_tokens": 1980794292.0, + "step": 848 + }, + { + "epoch": 1.0936743864500518, + "grad_norm": 0.7267343038511015, + "learning_rate": 7.126689560629852e-05, + "loss": 0.629, + "num_tokens": 1983095973.0, + "step": 849 + }, + { + "epoch": 1.094020048392672, + "grad_norm": 0.5623206201756308, + "learning_rate": 7.123676348239117e-05, + "loss": 0.6211, + "num_tokens": 1985399396.0, + "step": 850 + }, + { + "epoch": 1.0943657103352922, + "grad_norm": 0.6394717281881187, + "learning_rate": 7.120658585843901e-05, + "loss": 0.6193, + "num_tokens": 1987621030.0, + "step": 851 + }, + { + "epoch": 1.0947113722779123, + "grad_norm": 0.4674349248748675, + "learning_rate": 7.117636277839989e-05, + "loss": 0.6468, + "num_tokens": 1989875933.0, + "step": 852 + }, + { + "epoch": 1.0950570342205324, + "grad_norm": 0.7775480942695264, + "learning_rate": 7.114609428629787e-05, + "loss": 0.6482, + "num_tokens": 1992156139.0, + "step": 853 + }, + { + "epoch": 1.0954026961631524, + "grad_norm": 0.5758101275193842, + "learning_rate": 7.111578042622317e-05, + "loss": 0.6259, + "num_tokens": 1994420988.0, + "step": 854 + }, + { + "epoch": 1.0957483581057725, + "grad_norm": 0.8718137102717207, + "learning_rate": 7.108542124233206e-05, + "loss": 0.6515, + "num_tokens": 1996790521.0, + "step": 855 + }, + { + "epoch": 1.0960940200483926, + "grad_norm": 0.7422005655497319, + "learning_rate": 7.105501677884686e-05, + "loss": 0.6457, + "num_tokens": 1999127227.0, + "step": 856 + }, + { + "epoch": 1.0964396819910127, + "grad_norm": 0.7689179759714174, + "learning_rate": 7.102456708005585e-05, + "loss": 0.6349, + "num_tokens": 2001456840.0, + "step": 857 + }, + { + "epoch": 1.0967853439336328, + "grad_norm": 0.6696473629178211, + "learning_rate": 7.099407219031317e-05, + "loss": 0.643, + "num_tokens": 2003834413.0, + "step": 858 + }, + { + "epoch": 1.0971310058762531, + "grad_norm": 0.7196473711071472, + "learning_rate": 7.096353215403882e-05, + "loss": 0.6297, + "num_tokens": 2006098585.0, + "step": 859 + }, + { + "epoch": 1.0974766678188732, + "grad_norm": 0.5928074700274769, + "learning_rate": 7.093294701571853e-05, + "loss": 0.6409, + "num_tokens": 2008489293.0, + "step": 860 + }, + { + "epoch": 1.0978223297614933, + "grad_norm": 0.7056246259237353, + "learning_rate": 7.090231681990379e-05, + "loss": 0.6414, + "num_tokens": 2010808787.0, + "step": 861 + }, + { + "epoch": 1.0981679917041134, + "grad_norm": 0.5486511854819776, + "learning_rate": 7.087164161121162e-05, + "loss": 0.6478, + "num_tokens": 2013092910.0, + "step": 862 + }, + { + "epoch": 1.0985136536467335, + "grad_norm": 0.7275998401947965, + "learning_rate": 7.084092143432472e-05, + "loss": 0.6219, + "num_tokens": 2015422559.0, + "step": 863 + }, + { + "epoch": 1.0988593155893536, + "grad_norm": 0.5902939257926976, + "learning_rate": 7.08101563339912e-05, + "loss": 0.635, + "num_tokens": 2017782573.0, + "step": 864 + }, + { + "epoch": 1.0992049775319737, + "grad_norm": 0.6373096265415296, + "learning_rate": 7.077934635502467e-05, + "loss": 0.6401, + "num_tokens": 2020116306.0, + "step": 865 + }, + { + "epoch": 1.0995506394745937, + "grad_norm": 0.5207584379771001, + "learning_rate": 7.074849154230407e-05, + "loss": 0.6322, + "num_tokens": 2022443781.0, + "step": 866 + }, + { + "epoch": 1.099896301417214, + "grad_norm": 0.6445001637743131, + "learning_rate": 7.071759194077368e-05, + "loss": 0.6243, + "num_tokens": 2024745124.0, + "step": 867 + }, + { + "epoch": 1.1002419633598342, + "grad_norm": 0.6259526617735324, + "learning_rate": 7.068664759544299e-05, + "loss": 0.6365, + "num_tokens": 2027035978.0, + "step": 868 + }, + { + "epoch": 1.1005876253024542, + "grad_norm": 0.6851587462113808, + "learning_rate": 7.065565855138669e-05, + "loss": 0.6415, + "num_tokens": 2029384468.0, + "step": 869 + }, + { + "epoch": 1.1009332872450743, + "grad_norm": 0.7118355424982044, + "learning_rate": 7.062462485374456e-05, + "loss": 0.6365, + "num_tokens": 2031813719.0, + "step": 870 + }, + { + "epoch": 1.1012789491876944, + "grad_norm": 0.5718826597746939, + "learning_rate": 7.059354654772145e-05, + "loss": 0.6371, + "num_tokens": 2034215691.0, + "step": 871 + }, + { + "epoch": 1.1016246111303145, + "grad_norm": 0.669027918816834, + "learning_rate": 7.056242367858716e-05, + "loss": 0.617, + "num_tokens": 2036688303.0, + "step": 872 + }, + { + "epoch": 1.1019702730729346, + "grad_norm": 0.4390920202449945, + "learning_rate": 7.05312562916764e-05, + "loss": 0.631, + "num_tokens": 2039089916.0, + "step": 873 + }, + { + "epoch": 1.1023159350155547, + "grad_norm": 0.7705947538110972, + "learning_rate": 7.050004443238879e-05, + "loss": 0.6365, + "num_tokens": 2041486542.0, + "step": 874 + }, + { + "epoch": 1.102661596958175, + "grad_norm": 0.5244246251307207, + "learning_rate": 7.046878814618862e-05, + "loss": 0.6338, + "num_tokens": 2043822878.0, + "step": 875 + }, + { + "epoch": 1.103007258900795, + "grad_norm": 0.6855156107505581, + "learning_rate": 7.0437487478605e-05, + "loss": 0.6157, + "num_tokens": 2046121260.0, + "step": 876 + }, + { + "epoch": 1.1033529208434152, + "grad_norm": 0.5914213555415903, + "learning_rate": 7.040614247523163e-05, + "loss": 0.6408, + "num_tokens": 2048421563.0, + "step": 877 + }, + { + "epoch": 1.1036985827860353, + "grad_norm": 0.6966852099859985, + "learning_rate": 7.037475318172679e-05, + "loss": 0.6263, + "num_tokens": 2050773142.0, + "step": 878 + }, + { + "epoch": 1.1040442447286554, + "grad_norm": 0.6057019635316307, + "learning_rate": 7.03433196438133e-05, + "loss": 0.6264, + "num_tokens": 2053065791.0, + "step": 879 + }, + { + "epoch": 1.1043899066712755, + "grad_norm": 0.6243191805663824, + "learning_rate": 7.031184190727843e-05, + "loss": 0.6318, + "num_tokens": 2055382885.0, + "step": 880 + }, + { + "epoch": 1.1047355686138955, + "grad_norm": 0.6471641175860846, + "learning_rate": 7.028032001797379e-05, + "loss": 0.6401, + "num_tokens": 2057694752.0, + "step": 881 + }, + { + "epoch": 1.1050812305565156, + "grad_norm": 0.4717662214072129, + "learning_rate": 7.024875402181535e-05, + "loss": 0.6279, + "num_tokens": 2059953810.0, + "step": 882 + }, + { + "epoch": 1.105426892499136, + "grad_norm": 0.6993358214802385, + "learning_rate": 7.02171439647833e-05, + "loss": 0.6328, + "num_tokens": 2062336734.0, + "step": 883 + }, + { + "epoch": 1.105772554441756, + "grad_norm": 0.6930172186519157, + "learning_rate": 7.018548989292204e-05, + "loss": 0.6507, + "num_tokens": 2064670753.0, + "step": 884 + }, + { + "epoch": 1.1061182163843761, + "grad_norm": 0.48513162283116845, + "learning_rate": 7.015379185234004e-05, + "loss": 0.654, + "num_tokens": 2066999169.0, + "step": 885 + }, + { + "epoch": 1.1064638783269962, + "grad_norm": 0.79984066658087, + "learning_rate": 7.012204988920986e-05, + "loss": 0.6583, + "num_tokens": 2069455232.0, + "step": 886 + }, + { + "epoch": 1.1068095402696163, + "grad_norm": 0.5615265223757545, + "learning_rate": 7.0090264049768e-05, + "loss": 0.6457, + "num_tokens": 2071803138.0, + "step": 887 + }, + { + "epoch": 1.1071552022122364, + "grad_norm": 0.7388769745641661, + "learning_rate": 7.00584343803149e-05, + "loss": 0.6471, + "num_tokens": 2074194623.0, + "step": 888 + }, + { + "epoch": 1.1075008641548565, + "grad_norm": 0.6596766010589992, + "learning_rate": 7.002656092721486e-05, + "loss": 0.6602, + "num_tokens": 2076680051.0, + "step": 889 + }, + { + "epoch": 1.1078465260974766, + "grad_norm": 0.6825923823675335, + "learning_rate": 6.99946437368959e-05, + "loss": 0.6425, + "num_tokens": 2079095231.0, + "step": 890 + }, + { + "epoch": 1.1081921880400967, + "grad_norm": 0.5054575681810504, + "learning_rate": 6.99626828558498e-05, + "loss": 0.6208, + "num_tokens": 2081422851.0, + "step": 891 + }, + { + "epoch": 1.108537849982717, + "grad_norm": 0.7101454920931178, + "learning_rate": 6.993067833063194e-05, + "loss": 0.6257, + "num_tokens": 2083631354.0, + "step": 892 + }, + { + "epoch": 1.108883511925337, + "grad_norm": 0.6813611778539539, + "learning_rate": 6.989863020786133e-05, + "loss": 0.6441, + "num_tokens": 2086103646.0, + "step": 893 + }, + { + "epoch": 1.1092291738679572, + "grad_norm": 0.5808563119676818, + "learning_rate": 6.986653853422046e-05, + "loss": 0.635, + "num_tokens": 2088435733.0, + "step": 894 + }, + { + "epoch": 1.1095748358105773, + "grad_norm": 0.6763093451632597, + "learning_rate": 6.983440335645522e-05, + "loss": 0.6454, + "num_tokens": 2090777490.0, + "step": 895 + }, + { + "epoch": 1.1099204977531973, + "grad_norm": 0.4916894744134045, + "learning_rate": 6.98022247213749e-05, + "loss": 0.6426, + "num_tokens": 2093112752.0, + "step": 896 + }, + { + "epoch": 1.1102661596958174, + "grad_norm": 0.5976052169547356, + "learning_rate": 6.977000267585211e-05, + "loss": 0.6368, + "num_tokens": 2095506528.0, + "step": 897 + }, + { + "epoch": 1.1106118216384375, + "grad_norm": 0.48367407629573994, + "learning_rate": 6.973773726682268e-05, + "loss": 0.6215, + "num_tokens": 2097886592.0, + "step": 898 + }, + { + "epoch": 1.1109574835810578, + "grad_norm": 0.4999548515884936, + "learning_rate": 6.970542854128557e-05, + "loss": 0.6227, + "num_tokens": 2100159013.0, + "step": 899 + }, + { + "epoch": 1.111303145523678, + "grad_norm": 0.8232656973391344, + "learning_rate": 6.967307654630291e-05, + "loss": 0.6276, + "num_tokens": 2102437349.0, + "step": 900 + }, + { + "epoch": 1.111648807466298, + "grad_norm": 0.7566580413902171, + "learning_rate": 6.964068132899979e-05, + "loss": 0.6366, + "num_tokens": 2104776738.0, + "step": 901 + }, + { + "epoch": 1.111994469408918, + "grad_norm": 0.73199001613691, + "learning_rate": 6.960824293656429e-05, + "loss": 0.6221, + "num_tokens": 2107100856.0, + "step": 902 + }, + { + "epoch": 1.1123401313515382, + "grad_norm": 0.8697615015805004, + "learning_rate": 6.957576141624736e-05, + "loss": 0.6346, + "num_tokens": 2109494220.0, + "step": 903 + }, + { + "epoch": 1.1126857932941583, + "grad_norm": 0.5199833480250547, + "learning_rate": 6.95432368153628e-05, + "loss": 0.6236, + "num_tokens": 2111818402.0, + "step": 904 + }, + { + "epoch": 1.1130314552367784, + "grad_norm": 0.6901009839572165, + "learning_rate": 6.951066918128716e-05, + "loss": 0.6405, + "num_tokens": 2114112431.0, + "step": 905 + }, + { + "epoch": 1.1133771171793985, + "grad_norm": 0.5531430275448762, + "learning_rate": 6.947805856145965e-05, + "loss": 0.6358, + "num_tokens": 2116496655.0, + "step": 906 + }, + { + "epoch": 1.1137227791220186, + "grad_norm": 0.555268446488502, + "learning_rate": 6.944540500338212e-05, + "loss": 0.634, + "num_tokens": 2118796208.0, + "step": 907 + }, + { + "epoch": 1.1140684410646389, + "grad_norm": 0.5692476125999714, + "learning_rate": 6.941270855461891e-05, + "loss": 0.6246, + "num_tokens": 2121183141.0, + "step": 908 + }, + { + "epoch": 1.114414103007259, + "grad_norm": 0.6671198843025686, + "learning_rate": 6.937996926279694e-05, + "loss": 0.6383, + "num_tokens": 2123584625.0, + "step": 909 + }, + { + "epoch": 1.114759764949879, + "grad_norm": 0.5598019317327989, + "learning_rate": 6.934718717560543e-05, + "loss": 0.633, + "num_tokens": 2126008691.0, + "step": 910 + }, + { + "epoch": 1.1151054268924991, + "grad_norm": 0.7795964951726142, + "learning_rate": 6.9314362340796e-05, + "loss": 0.6258, + "num_tokens": 2128369876.0, + "step": 911 + }, + { + "epoch": 1.1154510888351192, + "grad_norm": 0.753000224616792, + "learning_rate": 6.928149480618252e-05, + "loss": 0.6441, + "num_tokens": 2130554887.0, + "step": 912 + }, + { + "epoch": 1.1157967507777393, + "grad_norm": 0.6037643159868399, + "learning_rate": 6.924858461964108e-05, + "loss": 0.6383, + "num_tokens": 2132953171.0, + "step": 913 + }, + { + "epoch": 1.1161424127203594, + "grad_norm": 0.5991831241506346, + "learning_rate": 6.921563182910983e-05, + "loss": 0.6339, + "num_tokens": 2135269648.0, + "step": 914 + }, + { + "epoch": 1.1164880746629797, + "grad_norm": 0.7047653439113356, + "learning_rate": 6.918263648258906e-05, + "loss": 0.632, + "num_tokens": 2137601618.0, + "step": 915 + }, + { + "epoch": 1.1168337366055998, + "grad_norm": 0.47454492484100547, + "learning_rate": 6.914959862814103e-05, + "loss": 0.651, + "num_tokens": 2139940157.0, + "step": 916 + }, + { + "epoch": 1.11717939854822, + "grad_norm": 0.6793734322739983, + "learning_rate": 6.911651831388986e-05, + "loss": 0.6407, + "num_tokens": 2142158525.0, + "step": 917 + }, + { + "epoch": 1.11752506049084, + "grad_norm": 0.6206771689296277, + "learning_rate": 6.908339558802158e-05, + "loss": 0.6197, + "num_tokens": 2144501158.0, + "step": 918 + }, + { + "epoch": 1.11787072243346, + "grad_norm": 0.4976655024049246, + "learning_rate": 6.905023049878401e-05, + "loss": 0.6465, + "num_tokens": 2146789115.0, + "step": 919 + }, + { + "epoch": 1.1182163843760802, + "grad_norm": 0.7230013861607477, + "learning_rate": 6.901702309448659e-05, + "loss": 0.6503, + "num_tokens": 2149159219.0, + "step": 920 + }, + { + "epoch": 1.1185620463187003, + "grad_norm": 0.5678791087220733, + "learning_rate": 6.898377342350051e-05, + "loss": 0.6286, + "num_tokens": 2151494749.0, + "step": 921 + }, + { + "epoch": 1.1189077082613204, + "grad_norm": 0.5911803287324875, + "learning_rate": 6.895048153425845e-05, + "loss": 0.6209, + "num_tokens": 2153813345.0, + "step": 922 + }, + { + "epoch": 1.1192533702039404, + "grad_norm": 0.46345346327761283, + "learning_rate": 6.89171474752546e-05, + "loss": 0.638, + "num_tokens": 2156023059.0, + "step": 923 + }, + { + "epoch": 1.1195990321465608, + "grad_norm": 0.6110386431097571, + "learning_rate": 6.888377129504461e-05, + "loss": 0.6367, + "num_tokens": 2158287614.0, + "step": 924 + }, + { + "epoch": 1.1199446940891808, + "grad_norm": 0.4368089387289657, + "learning_rate": 6.885035304224543e-05, + "loss": 0.635, + "num_tokens": 2160666520.0, + "step": 925 + }, + { + "epoch": 1.120290356031801, + "grad_norm": 0.5372899511278134, + "learning_rate": 6.881689276553535e-05, + "loss": 0.6246, + "num_tokens": 2163000496.0, + "step": 926 + }, + { + "epoch": 1.120636017974421, + "grad_norm": 0.6538615604219101, + "learning_rate": 6.878339051365385e-05, + "loss": 0.6427, + "num_tokens": 2165308958.0, + "step": 927 + }, + { + "epoch": 1.1209816799170411, + "grad_norm": 0.4243914914199368, + "learning_rate": 6.874984633540154e-05, + "loss": 0.6577, + "num_tokens": 2167749217.0, + "step": 928 + }, + { + "epoch": 1.1213273418596612, + "grad_norm": 0.3893441256768904, + "learning_rate": 6.871626027964012e-05, + "loss": 0.6298, + "num_tokens": 2170036892.0, + "step": 929 + }, + { + "epoch": 1.1216730038022813, + "grad_norm": 0.5810904356526593, + "learning_rate": 6.868263239529226e-05, + "loss": 0.6304, + "num_tokens": 2172379267.0, + "step": 930 + }, + { + "epoch": 1.1220186657449014, + "grad_norm": 0.5814926105052469, + "learning_rate": 6.864896273134165e-05, + "loss": 0.6206, + "num_tokens": 2174814243.0, + "step": 931 + }, + { + "epoch": 1.1223643276875217, + "grad_norm": 0.3669268932228981, + "learning_rate": 6.861525133683269e-05, + "loss": 0.6445, + "num_tokens": 2177154550.0, + "step": 932 + }, + { + "epoch": 1.1227099896301418, + "grad_norm": 0.3750619980799489, + "learning_rate": 6.858149826087069e-05, + "loss": 0.6238, + "num_tokens": 2179468910.0, + "step": 933 + }, + { + "epoch": 1.1230556515727619, + "grad_norm": 0.3960254726510145, + "learning_rate": 6.854770355262162e-05, + "loss": 0.623, + "num_tokens": 2181699356.0, + "step": 934 + }, + { + "epoch": 1.123401313515382, + "grad_norm": 0.7447647609741658, + "learning_rate": 6.851386726131211e-05, + "loss": 0.6463, + "num_tokens": 2184168740.0, + "step": 935 + }, + { + "epoch": 1.123746975458002, + "grad_norm": 0.5180778483368027, + "learning_rate": 6.847998943622935e-05, + "loss": 0.6332, + "num_tokens": 2186365943.0, + "step": 936 + }, + { + "epoch": 1.1240926374006222, + "grad_norm": 0.6425397415246044, + "learning_rate": 6.844607012672104e-05, + "loss": 0.6317, + "num_tokens": 2188721176.0, + "step": 937 + }, + { + "epoch": 1.1244382993432422, + "grad_norm": 0.7044000930496582, + "learning_rate": 6.841210938219531e-05, + "loss": 0.6422, + "num_tokens": 2190988111.0, + "step": 938 + }, + { + "epoch": 1.1247839612858623, + "grad_norm": 0.49193188806039756, + "learning_rate": 6.837810725212062e-05, + "loss": 0.6372, + "num_tokens": 2193380486.0, + "step": 939 + }, + { + "epoch": 1.1251296232284824, + "grad_norm": 0.6704740327256191, + "learning_rate": 6.834406378602576e-05, + "loss": 0.6433, + "num_tokens": 2195689281.0, + "step": 940 + }, + { + "epoch": 1.1254752851711027, + "grad_norm": 0.5593393104513635, + "learning_rate": 6.830997903349968e-05, + "loss": 0.6456, + "num_tokens": 2197944315.0, + "step": 941 + }, + { + "epoch": 1.1258209471137228, + "grad_norm": 0.5875245810645537, + "learning_rate": 6.827585304419152e-05, + "loss": 0.6329, + "num_tokens": 2200277439.0, + "step": 942 + }, + { + "epoch": 1.126166609056343, + "grad_norm": 0.4498250279337379, + "learning_rate": 6.824168586781042e-05, + "loss": 0.6328, + "num_tokens": 2202577556.0, + "step": 943 + }, + { + "epoch": 1.126512270998963, + "grad_norm": 0.5893339159235107, + "learning_rate": 6.820747755412559e-05, + "loss": 0.6393, + "num_tokens": 2204940990.0, + "step": 944 + }, + { + "epoch": 1.126857932941583, + "grad_norm": 0.49392041985893986, + "learning_rate": 6.817322815296612e-05, + "loss": 0.6416, + "num_tokens": 2207320014.0, + "step": 945 + }, + { + "epoch": 1.1272035948842032, + "grad_norm": 0.46509019182473127, + "learning_rate": 6.813893771422095e-05, + "loss": 0.6315, + "num_tokens": 2209652915.0, + "step": 946 + }, + { + "epoch": 1.1275492568268233, + "grad_norm": 0.6723719169966719, + "learning_rate": 6.81046062878388e-05, + "loss": 0.6179, + "num_tokens": 2211960764.0, + "step": 947 + }, + { + "epoch": 1.1278949187694436, + "grad_norm": 0.4621703070283164, + "learning_rate": 6.807023392382812e-05, + "loss": 0.6438, + "num_tokens": 2214418935.0, + "step": 948 + }, + { + "epoch": 1.1282405807120637, + "grad_norm": 0.7218407737679211, + "learning_rate": 6.803582067225695e-05, + "loss": 0.6363, + "num_tokens": 2216700012.0, + "step": 949 + }, + { + "epoch": 1.1285862426546838, + "grad_norm": 0.5884391351103374, + "learning_rate": 6.800136658325292e-05, + "loss": 0.6304, + "num_tokens": 2219035399.0, + "step": 950 + }, + { + "epoch": 1.1289319045973039, + "grad_norm": 0.689816889563364, + "learning_rate": 6.796687170700312e-05, + "loss": 0.6236, + "num_tokens": 2221297737.0, + "step": 951 + }, + { + "epoch": 1.129277566539924, + "grad_norm": 0.6085290817918191, + "learning_rate": 6.793233609375408e-05, + "loss": 0.6618, + "num_tokens": 2223737424.0, + "step": 952 + }, + { + "epoch": 1.129623228482544, + "grad_norm": 0.637760361595735, + "learning_rate": 6.789775979381162e-05, + "loss": 0.6475, + "num_tokens": 2226124311.0, + "step": 953 + }, + { + "epoch": 1.1299688904251641, + "grad_norm": 0.5487522464458742, + "learning_rate": 6.786314285754091e-05, + "loss": 0.6422, + "num_tokens": 2228555461.0, + "step": 954 + }, + { + "epoch": 1.1303145523677842, + "grad_norm": 0.7340495531293658, + "learning_rate": 6.782848533536624e-05, + "loss": 0.6085, + "num_tokens": 2230983286.0, + "step": 955 + }, + { + "epoch": 1.1306602143104043, + "grad_norm": 0.6664998247310985, + "learning_rate": 6.779378727777103e-05, + "loss": 0.6479, + "num_tokens": 2233288654.0, + "step": 956 + }, + { + "epoch": 1.1310058762530246, + "grad_norm": 0.67988864669748, + "learning_rate": 6.775904873529778e-05, + "loss": 0.6235, + "num_tokens": 2235497133.0, + "step": 957 + }, + { + "epoch": 1.1313515381956447, + "grad_norm": 0.6796758006477891, + "learning_rate": 6.772426975854791e-05, + "loss": 0.6357, + "num_tokens": 2237797176.0, + "step": 958 + }, + { + "epoch": 1.1316972001382648, + "grad_norm": 0.48626440584935404, + "learning_rate": 6.76894503981818e-05, + "loss": 0.6259, + "num_tokens": 2240134648.0, + "step": 959 + }, + { + "epoch": 1.132042862080885, + "grad_norm": 0.5740460824237259, + "learning_rate": 6.765459070491859e-05, + "loss": 0.6521, + "num_tokens": 2242440732.0, + "step": 960 + }, + { + "epoch": 1.132388524023505, + "grad_norm": 0.5988155984820678, + "learning_rate": 6.761969072953624e-05, + "loss": 0.6227, + "num_tokens": 2244829003.0, + "step": 961 + }, + { + "epoch": 1.132734185966125, + "grad_norm": 0.5067837814332167, + "learning_rate": 6.758475052287126e-05, + "loss": 0.6461, + "num_tokens": 2247227638.0, + "step": 962 + }, + { + "epoch": 1.1330798479087452, + "grad_norm": 0.7304351732023366, + "learning_rate": 6.754977013581897e-05, + "loss": 0.6498, + "num_tokens": 2249616137.0, + "step": 963 + }, + { + "epoch": 1.1334255098513655, + "grad_norm": 0.6477905996842682, + "learning_rate": 6.751474961933303e-05, + "loss": 0.6509, + "num_tokens": 2251916922.0, + "step": 964 + }, + { + "epoch": 1.1337711717939856, + "grad_norm": 0.6482806465104399, + "learning_rate": 6.747968902442562e-05, + "loss": 0.6373, + "num_tokens": 2254204419.0, + "step": 965 + }, + { + "epoch": 1.1341168337366057, + "grad_norm": 0.6006819045968405, + "learning_rate": 6.744458840216731e-05, + "loss": 0.6461, + "num_tokens": 2256667941.0, + "step": 966 + }, + { + "epoch": 1.1344624956792257, + "grad_norm": 0.5337562138038692, + "learning_rate": 6.740944780368699e-05, + "loss": 0.634, + "num_tokens": 2258998563.0, + "step": 967 + }, + { + "epoch": 1.1348081576218458, + "grad_norm": 0.4223695542827033, + "learning_rate": 6.737426728017173e-05, + "loss": 0.62, + "num_tokens": 2261148023.0, + "step": 968 + }, + { + "epoch": 1.135153819564466, + "grad_norm": 0.5750274402311729, + "learning_rate": 6.733904688286678e-05, + "loss": 0.6249, + "num_tokens": 2263557925.0, + "step": 969 + }, + { + "epoch": 1.135499481507086, + "grad_norm": 0.5316012645950183, + "learning_rate": 6.73037866630755e-05, + "loss": 0.6228, + "num_tokens": 2265849340.0, + "step": 970 + }, + { + "epoch": 1.135845143449706, + "grad_norm": 0.7189913183050987, + "learning_rate": 6.726848667215923e-05, + "loss": 0.6266, + "num_tokens": 2268174598.0, + "step": 971 + }, + { + "epoch": 1.1361908053923262, + "grad_norm": 0.5800634854128125, + "learning_rate": 6.723314696153724e-05, + "loss": 0.6109, + "num_tokens": 2270509342.0, + "step": 972 + }, + { + "epoch": 1.1365364673349465, + "grad_norm": 0.6258730712234569, + "learning_rate": 6.719776758268666e-05, + "loss": 0.6162, + "num_tokens": 2272825083.0, + "step": 973 + }, + { + "epoch": 1.1368821292775666, + "grad_norm": 0.5329175998226954, + "learning_rate": 6.716234858714242e-05, + "loss": 0.6289, + "num_tokens": 2275091568.0, + "step": 974 + }, + { + "epoch": 1.1372277912201867, + "grad_norm": 0.5231537995187835, + "learning_rate": 6.71268900264971e-05, + "loss": 0.6236, + "num_tokens": 2277492532.0, + "step": 975 + }, + { + "epoch": 1.1375734531628068, + "grad_norm": 0.6601276082266523, + "learning_rate": 6.709139195240101e-05, + "loss": 0.6296, + "num_tokens": 2279786923.0, + "step": 976 + }, + { + "epoch": 1.1379191151054269, + "grad_norm": 0.4937968105353521, + "learning_rate": 6.70558544165619e-05, + "loss": 0.6264, + "num_tokens": 2282187464.0, + "step": 977 + }, + { + "epoch": 1.138264777048047, + "grad_norm": 0.5146181314844022, + "learning_rate": 6.702027747074512e-05, + "loss": 0.6268, + "num_tokens": 2284442191.0, + "step": 978 + }, + { + "epoch": 1.138610438990667, + "grad_norm": 0.4663909067350701, + "learning_rate": 6.698466116677332e-05, + "loss": 0.636, + "num_tokens": 2286759940.0, + "step": 979 + }, + { + "epoch": 1.1389561009332874, + "grad_norm": 0.7581406371080717, + "learning_rate": 6.694900555652656e-05, + "loss": 0.6342, + "num_tokens": 2289085381.0, + "step": 980 + }, + { + "epoch": 1.1393017628759075, + "grad_norm": 0.41439161112940287, + "learning_rate": 6.691331069194212e-05, + "loss": 0.615, + "num_tokens": 2291387458.0, + "step": 981 + }, + { + "epoch": 1.1396474248185275, + "grad_norm": 0.5545072363697835, + "learning_rate": 6.687757662501445e-05, + "loss": 0.6356, + "num_tokens": 2293716765.0, + "step": 982 + }, + { + "epoch": 1.1399930867611476, + "grad_norm": 0.5554150497991966, + "learning_rate": 6.684180340779512e-05, + "loss": 0.6111, + "num_tokens": 2296051852.0, + "step": 983 + }, + { + "epoch": 1.1403387487037677, + "grad_norm": 0.5447425206600714, + "learning_rate": 6.680599109239275e-05, + "loss": 0.6297, + "num_tokens": 2298393559.0, + "step": 984 + }, + { + "epoch": 1.1406844106463878, + "grad_norm": 0.4510165923624746, + "learning_rate": 6.677013973097283e-05, + "loss": 0.6252, + "num_tokens": 2300755039.0, + "step": 985 + }, + { + "epoch": 1.141030072589008, + "grad_norm": 0.510964276685836, + "learning_rate": 6.673424937575782e-05, + "loss": 0.63, + "num_tokens": 2303082949.0, + "step": 986 + }, + { + "epoch": 1.141375734531628, + "grad_norm": 0.48323428192819035, + "learning_rate": 6.669832007902694e-05, + "loss": 0.6395, + "num_tokens": 2305423791.0, + "step": 987 + }, + { + "epoch": 1.141721396474248, + "grad_norm": 0.6048317819158896, + "learning_rate": 6.666235189311613e-05, + "loss": 0.6237, + "num_tokens": 2307679909.0, + "step": 988 + }, + { + "epoch": 1.1420670584168684, + "grad_norm": 0.5291077952484509, + "learning_rate": 6.6626344870418e-05, + "loss": 0.6072, + "num_tokens": 2309957448.0, + "step": 989 + }, + { + "epoch": 1.1424127203594885, + "grad_norm": 0.6168406657051422, + "learning_rate": 6.65902990633817e-05, + "loss": 0.6155, + "num_tokens": 2312331918.0, + "step": 990 + }, + { + "epoch": 1.1427583823021086, + "grad_norm": 0.6559005577027325, + "learning_rate": 6.655421452451286e-05, + "loss": 0.6235, + "num_tokens": 2314637599.0, + "step": 991 + }, + { + "epoch": 1.1431040442447287, + "grad_norm": 0.628192778755691, + "learning_rate": 6.65180913063736e-05, + "loss": 0.6007, + "num_tokens": 2316874956.0, + "step": 992 + }, + { + "epoch": 1.1434497061873488, + "grad_norm": 0.5658899861342798, + "learning_rate": 6.64819294615823e-05, + "loss": 0.6197, + "num_tokens": 2319333788.0, + "step": 993 + }, + { + "epoch": 1.1437953681299688, + "grad_norm": 0.5068984419164501, + "learning_rate": 6.64457290428137e-05, + "loss": 0.6281, + "num_tokens": 2321717198.0, + "step": 994 + }, + { + "epoch": 1.144141030072589, + "grad_norm": 0.3814550398423423, + "learning_rate": 6.640949010279862e-05, + "loss": 0.6207, + "num_tokens": 2324035973.0, + "step": 995 + }, + { + "epoch": 1.144486692015209, + "grad_norm": 0.5236360790300648, + "learning_rate": 6.637321269432405e-05, + "loss": 0.6321, + "num_tokens": 2326389131.0, + "step": 996 + }, + { + "epoch": 1.1448323539578293, + "grad_norm": 0.4836360462587859, + "learning_rate": 6.633689687023302e-05, + "loss": 0.5903, + "num_tokens": 2328682547.0, + "step": 997 + }, + { + "epoch": 1.1451780159004494, + "grad_norm": 0.5848324586628182, + "learning_rate": 6.630054268342452e-05, + "loss": 0.6409, + "num_tokens": 2331110855.0, + "step": 998 + }, + { + "epoch": 1.1455236778430695, + "grad_norm": 0.4099450482794086, + "learning_rate": 6.626415018685338e-05, + "loss": 0.6192, + "num_tokens": 2333482982.0, + "step": 999 + }, + { + "epoch": 1.1458693397856896, + "grad_norm": 0.7408602110844928, + "learning_rate": 6.622771943353026e-05, + "loss": 0.6179, + "num_tokens": 2335807519.0, + "step": 1000 + }, + { + "epoch": 1.1462150017283097, + "grad_norm": 0.6012612937482089, + "learning_rate": 6.619125047652157e-05, + "loss": 0.619, + "num_tokens": 2338209285.0, + "step": 1001 + }, + { + "epoch": 1.1465606636709298, + "grad_norm": 0.6493454542590236, + "learning_rate": 6.615474336894931e-05, + "loss": 0.6402, + "num_tokens": 2340538805.0, + "step": 1002 + }, + { + "epoch": 1.1469063256135499, + "grad_norm": 0.6267501046711663, + "learning_rate": 6.611819816399114e-05, + "loss": 0.6433, + "num_tokens": 2342758184.0, + "step": 1003 + }, + { + "epoch": 1.14725198755617, + "grad_norm": 0.5923752473356027, + "learning_rate": 6.608161491488008e-05, + "loss": 0.6275, + "num_tokens": 2345117717.0, + "step": 1004 + }, + { + "epoch": 1.14759764949879, + "grad_norm": 0.5284346039261694, + "learning_rate": 6.604499367490472e-05, + "loss": 0.625, + "num_tokens": 2347389652.0, + "step": 1005 + }, + { + "epoch": 1.1479433114414104, + "grad_norm": 0.44923193031135483, + "learning_rate": 6.600833449740888e-05, + "loss": 0.6199, + "num_tokens": 2349733790.0, + "step": 1006 + }, + { + "epoch": 1.1482889733840305, + "grad_norm": 0.6672607541090939, + "learning_rate": 6.597163743579169e-05, + "loss": 0.6184, + "num_tokens": 2352150095.0, + "step": 1007 + }, + { + "epoch": 1.1486346353266506, + "grad_norm": 0.5203279901567358, + "learning_rate": 6.593490254350743e-05, + "loss": 0.6282, + "num_tokens": 2354520199.0, + "step": 1008 + }, + { + "epoch": 1.1489802972692706, + "grad_norm": 0.5390672046219768, + "learning_rate": 6.589812987406553e-05, + "loss": 0.6255, + "num_tokens": 2356858374.0, + "step": 1009 + }, + { + "epoch": 1.1493259592118907, + "grad_norm": 0.5219983962268486, + "learning_rate": 6.58613194810304e-05, + "loss": 0.6376, + "num_tokens": 2359145604.0, + "step": 1010 + }, + { + "epoch": 1.1496716211545108, + "grad_norm": 0.5189856868184768, + "learning_rate": 6.582447141802145e-05, + "loss": 0.607, + "num_tokens": 2361475559.0, + "step": 1011 + }, + { + "epoch": 1.150017283097131, + "grad_norm": 0.46311222356315457, + "learning_rate": 6.578758573871292e-05, + "loss": 0.6239, + "num_tokens": 2363737560.0, + "step": 1012 + }, + { + "epoch": 1.1503629450397512, + "grad_norm": 0.7127067671023645, + "learning_rate": 6.575066249683384e-05, + "loss": 0.6283, + "num_tokens": 2366088917.0, + "step": 1013 + }, + { + "epoch": 1.1507086069823713, + "grad_norm": 0.504798450260592, + "learning_rate": 6.5713701746168e-05, + "loss": 0.6385, + "num_tokens": 2368421839.0, + "step": 1014 + }, + { + "epoch": 1.1510542689249914, + "grad_norm": 0.6885582654012001, + "learning_rate": 6.567670354055379e-05, + "loss": 0.6367, + "num_tokens": 2370794871.0, + "step": 1015 + }, + { + "epoch": 1.1513999308676115, + "grad_norm": 0.6123015826183623, + "learning_rate": 6.563966793388416e-05, + "loss": 0.6187, + "num_tokens": 2373094409.0, + "step": 1016 + }, + { + "epoch": 1.1517455928102316, + "grad_norm": 0.5780681901246453, + "learning_rate": 6.560259498010656e-05, + "loss": 0.6117, + "num_tokens": 2375491140.0, + "step": 1017 + }, + { + "epoch": 1.1520912547528517, + "grad_norm": 0.5259881800886433, + "learning_rate": 6.556548473322283e-05, + "loss": 0.6335, + "num_tokens": 2377915034.0, + "step": 1018 + }, + { + "epoch": 1.1524369166954718, + "grad_norm": 0.49751243175953724, + "learning_rate": 6.552833724728911e-05, + "loss": 0.628, + "num_tokens": 2380242355.0, + "step": 1019 + }, + { + "epoch": 1.1527825786380919, + "grad_norm": 0.5436034825045782, + "learning_rate": 6.549115257641583e-05, + "loss": 0.6223, + "num_tokens": 2382624028.0, + "step": 1020 + }, + { + "epoch": 1.153128240580712, + "grad_norm": 0.44158626232659887, + "learning_rate": 6.545393077476755e-05, + "loss": 0.6077, + "num_tokens": 2384913662.0, + "step": 1021 + }, + { + "epoch": 1.1534739025233323, + "grad_norm": 0.5844912134081436, + "learning_rate": 6.541667189656292e-05, + "loss": 0.6401, + "num_tokens": 2387252071.0, + "step": 1022 + }, + { + "epoch": 1.1538195644659524, + "grad_norm": 0.48878099388322455, + "learning_rate": 6.537937599607459e-05, + "loss": 0.6298, + "num_tokens": 2389537580.0, + "step": 1023 + }, + { + "epoch": 1.1541652264085724, + "grad_norm": 0.6726480816242691, + "learning_rate": 6.53420431276292e-05, + "loss": 0.6321, + "num_tokens": 2391849525.0, + "step": 1024 + }, + { + "epoch": 1.1545108883511925, + "grad_norm": 0.4667586163082233, + "learning_rate": 6.530467334560713e-05, + "loss": 0.609, + "num_tokens": 2394244666.0, + "step": 1025 + }, + { + "epoch": 1.1548565502938126, + "grad_norm": 0.5946137675797538, + "learning_rate": 6.526726670444264e-05, + "loss": 0.6165, + "num_tokens": 2396611863.0, + "step": 1026 + }, + { + "epoch": 1.1552022122364327, + "grad_norm": 0.5617144788546772, + "learning_rate": 6.52298232586236e-05, + "loss": 0.6235, + "num_tokens": 2398986714.0, + "step": 1027 + }, + { + "epoch": 1.1555478741790528, + "grad_norm": 0.5669655200572348, + "learning_rate": 6.519234306269153e-05, + "loss": 0.6486, + "num_tokens": 2401283913.0, + "step": 1028 + }, + { + "epoch": 1.1558935361216731, + "grad_norm": 0.6117264014054326, + "learning_rate": 6.515482617124147e-05, + "loss": 0.6274, + "num_tokens": 2403583605.0, + "step": 1029 + }, + { + "epoch": 1.1562391980642932, + "grad_norm": 0.5556547485056625, + "learning_rate": 6.511727263892192e-05, + "loss": 0.6212, + "num_tokens": 2405801945.0, + "step": 1030 + }, + { + "epoch": 1.1565848600069133, + "grad_norm": 0.5911936235918205, + "learning_rate": 6.507968252043473e-05, + "loss": 0.6079, + "num_tokens": 2408109634.0, + "step": 1031 + }, + { + "epoch": 1.1569305219495334, + "grad_norm": 0.4503003052254557, + "learning_rate": 6.504205587053508e-05, + "loss": 0.6379, + "num_tokens": 2410543159.0, + "step": 1032 + }, + { + "epoch": 1.1572761838921535, + "grad_norm": 0.5267984089254603, + "learning_rate": 6.500439274403134e-05, + "loss": 0.6103, + "num_tokens": 2412863462.0, + "step": 1033 + }, + { + "epoch": 1.1576218458347736, + "grad_norm": 0.46879755191818, + "learning_rate": 6.496669319578502e-05, + "loss": 0.6254, + "num_tokens": 2415203804.0, + "step": 1034 + }, + { + "epoch": 1.1579675077773937, + "grad_norm": 0.6381091314475303, + "learning_rate": 6.492895728071065e-05, + "loss": 0.637, + "num_tokens": 2417610667.0, + "step": 1035 + }, + { + "epoch": 1.1583131697200137, + "grad_norm": 0.6094039963064175, + "learning_rate": 6.48911850537758e-05, + "loss": 0.6113, + "num_tokens": 2419930595.0, + "step": 1036 + }, + { + "epoch": 1.1586588316626338, + "grad_norm": 0.42868201291590824, + "learning_rate": 6.485337657000086e-05, + "loss": 0.6174, + "num_tokens": 2422255862.0, + "step": 1037 + }, + { + "epoch": 1.1590044936052541, + "grad_norm": 0.3945134222441782, + "learning_rate": 6.481553188445912e-05, + "loss": 0.6148, + "num_tokens": 2424548752.0, + "step": 1038 + }, + { + "epoch": 1.1593501555478742, + "grad_norm": 0.6144174418937509, + "learning_rate": 6.47776510522765e-05, + "loss": 0.6245, + "num_tokens": 2426950590.0, + "step": 1039 + }, + { + "epoch": 1.1596958174904943, + "grad_norm": 0.5967950663607829, + "learning_rate": 6.473973412863164e-05, + "loss": 0.6132, + "num_tokens": 2429150730.0, + "step": 1040 + }, + { + "epoch": 1.1600414794331144, + "grad_norm": 0.6114240026615225, + "learning_rate": 6.470178116875574e-05, + "loss": 0.6396, + "num_tokens": 2431460825.0, + "step": 1041 + }, + { + "epoch": 1.1603871413757345, + "grad_norm": 0.5204705855257178, + "learning_rate": 6.466379222793251e-05, + "loss": 0.6627, + "num_tokens": 2433790240.0, + "step": 1042 + }, + { + "epoch": 1.1607328033183546, + "grad_norm": 0.8581440433088964, + "learning_rate": 6.4625767361498e-05, + "loss": 0.6333, + "num_tokens": 2436131665.0, + "step": 1043 + }, + { + "epoch": 1.1610784652609747, + "grad_norm": 0.4973254034934891, + "learning_rate": 6.458770662484068e-05, + "loss": 0.6384, + "num_tokens": 2438500883.0, + "step": 1044 + }, + { + "epoch": 1.161424127203595, + "grad_norm": 1.0637891925815905, + "learning_rate": 6.454961007340122e-05, + "loss": 0.6244, + "num_tokens": 2440759518.0, + "step": 1045 + }, + { + "epoch": 1.161769789146215, + "grad_norm": 0.7984849801833085, + "learning_rate": 6.451147776267246e-05, + "loss": 0.6045, + "num_tokens": 2443150579.0, + "step": 1046 + }, + { + "epoch": 1.1621154510888352, + "grad_norm": 0.9159144404348797, + "learning_rate": 6.447330974819937e-05, + "loss": 0.6435, + "num_tokens": 2445569012.0, + "step": 1047 + }, + { + "epoch": 1.1624611130314553, + "grad_norm": 1.1611006690908596, + "learning_rate": 6.443510608557883e-05, + "loss": 0.5997, + "num_tokens": 2447770933.0, + "step": 1048 + }, + { + "epoch": 1.1628067749740754, + "grad_norm": 0.523598039649553, + "learning_rate": 6.439686683045977e-05, + "loss": 0.6353, + "num_tokens": 2450130773.0, + "step": 1049 + }, + { + "epoch": 1.1631524369166955, + "grad_norm": 1.2567326385582667, + "learning_rate": 6.435859203854287e-05, + "loss": 0.6071, + "num_tokens": 2452577107.0, + "step": 1050 + }, + { + "epoch": 1.1634980988593155, + "grad_norm": 1.241276400190654, + "learning_rate": 6.432028176558064e-05, + "loss": 0.6297, + "num_tokens": 2454916386.0, + "step": 1051 + }, + { + "epoch": 1.1638437608019356, + "grad_norm": 0.7572514942284563, + "learning_rate": 6.428193606737723e-05, + "loss": 0.6288, + "num_tokens": 2457210898.0, + "step": 1052 + }, + { + "epoch": 1.1641894227445557, + "grad_norm": 1.107044968013011, + "learning_rate": 6.42435549997884e-05, + "loss": 0.6209, + "num_tokens": 2459524849.0, + "step": 1053 + }, + { + "epoch": 1.164535084687176, + "grad_norm": 1.073624324225753, + "learning_rate": 6.420513861872144e-05, + "loss": 0.6202, + "num_tokens": 2461849408.0, + "step": 1054 + }, + { + "epoch": 1.1648807466297961, + "grad_norm": 0.9268276420827404, + "learning_rate": 6.416668698013507e-05, + "loss": 0.6195, + "num_tokens": 2464143619.0, + "step": 1055 + }, + { + "epoch": 1.1652264085724162, + "grad_norm": 0.9676910918328933, + "learning_rate": 6.412820014003938e-05, + "loss": 0.6253, + "num_tokens": 2466523210.0, + "step": 1056 + }, + { + "epoch": 1.1655720705150363, + "grad_norm": 0.8123373044427525, + "learning_rate": 6.408967815449572e-05, + "loss": 0.6303, + "num_tokens": 2468766487.0, + "step": 1057 + }, + { + "epoch": 1.1659177324576564, + "grad_norm": 0.9045271199461954, + "learning_rate": 6.405112107961664e-05, + "loss": 0.5998, + "num_tokens": 2470997960.0, + "step": 1058 + }, + { + "epoch": 1.1662633944002765, + "grad_norm": 0.9580608118315684, + "learning_rate": 6.401252897156583e-05, + "loss": 0.6196, + "num_tokens": 2473234959.0, + "step": 1059 + }, + { + "epoch": 1.1666090563428966, + "grad_norm": 0.6787626987069121, + "learning_rate": 6.397390188655797e-05, + "loss": 0.6342, + "num_tokens": 2475643048.0, + "step": 1060 + }, + { + "epoch": 1.1669547182855167, + "grad_norm": 0.8399074698635285, + "learning_rate": 6.393523988085868e-05, + "loss": 0.6241, + "num_tokens": 2477957328.0, + "step": 1061 + }, + { + "epoch": 1.167300380228137, + "grad_norm": 0.7876972855558251, + "learning_rate": 6.38965430107845e-05, + "loss": 0.6017, + "num_tokens": 2480139155.0, + "step": 1062 + }, + { + "epoch": 1.167646042170757, + "grad_norm": 0.5783847796837571, + "learning_rate": 6.38578113327027e-05, + "loss": 0.6223, + "num_tokens": 2482517823.0, + "step": 1063 + }, + { + "epoch": 1.1679917041133772, + "grad_norm": 0.6759839194351326, + "learning_rate": 6.381904490303132e-05, + "loss": 0.606, + "num_tokens": 2484856582.0, + "step": 1064 + }, + { + "epoch": 1.1683373660559973, + "grad_norm": 0.6068465246249596, + "learning_rate": 6.378024377823893e-05, + "loss": 0.6027, + "num_tokens": 2487065717.0, + "step": 1065 + }, + { + "epoch": 1.1686830279986173, + "grad_norm": 0.6928264910232758, + "learning_rate": 6.374140801484471e-05, + "loss": 0.6069, + "num_tokens": 2489416100.0, + "step": 1066 + }, + { + "epoch": 1.1690286899412374, + "grad_norm": 0.599808957540365, + "learning_rate": 6.370253766941829e-05, + "loss": 0.6342, + "num_tokens": 2491864200.0, + "step": 1067 + }, + { + "epoch": 1.1693743518838575, + "grad_norm": 0.6966398788529901, + "learning_rate": 6.36636327985796e-05, + "loss": 0.605, + "num_tokens": 2494119253.0, + "step": 1068 + }, + { + "epoch": 1.1697200138264776, + "grad_norm": 0.6340293147435363, + "learning_rate": 6.362469345899899e-05, + "loss": 0.6418, + "num_tokens": 2496498849.0, + "step": 1069 + }, + { + "epoch": 1.1700656757690977, + "grad_norm": 0.7845818371431056, + "learning_rate": 6.358571970739688e-05, + "loss": 0.6382, + "num_tokens": 2498787531.0, + "step": 1070 + }, + { + "epoch": 1.170411337711718, + "grad_norm": 0.6280136625834155, + "learning_rate": 6.354671160054393e-05, + "loss": 0.6329, + "num_tokens": 2501061697.0, + "step": 1071 + }, + { + "epoch": 1.170756999654338, + "grad_norm": 0.528329470195529, + "learning_rate": 6.350766919526076e-05, + "loss": 0.6004, + "num_tokens": 2503349583.0, + "step": 1072 + }, + { + "epoch": 1.1711026615969582, + "grad_norm": 0.5635540560003458, + "learning_rate": 6.3468592548418e-05, + "loss": 0.6229, + "num_tokens": 2505690465.0, + "step": 1073 + }, + { + "epoch": 1.1714483235395783, + "grad_norm": 0.42868148598577804, + "learning_rate": 6.342948171693615e-05, + "loss": 0.6167, + "num_tokens": 2508082736.0, + "step": 1074 + }, + { + "epoch": 1.1717939854821984, + "grad_norm": 0.6741695282043885, + "learning_rate": 6.339033675778548e-05, + "loss": 0.6138, + "num_tokens": 2510524048.0, + "step": 1075 + }, + { + "epoch": 1.1721396474248185, + "grad_norm": 0.538082711701654, + "learning_rate": 6.3351157727986e-05, + "loss": 0.6154, + "num_tokens": 2512806934.0, + "step": 1076 + }, + { + "epoch": 1.1724853093674386, + "grad_norm": 0.3757701154562901, + "learning_rate": 6.331194468460732e-05, + "loss": 0.6008, + "num_tokens": 2515082769.0, + "step": 1077 + }, + { + "epoch": 1.1728309713100589, + "grad_norm": 0.48310444096695987, + "learning_rate": 6.327269768476863e-05, + "loss": 0.6067, + "num_tokens": 2517336697.0, + "step": 1078 + }, + { + "epoch": 1.173176633252679, + "grad_norm": 0.5872091587422075, + "learning_rate": 6.323341678563855e-05, + "loss": 0.6199, + "num_tokens": 2519562060.0, + "step": 1079 + }, + { + "epoch": 1.173522295195299, + "grad_norm": 0.516393769174304, + "learning_rate": 6.319410204443512e-05, + "loss": 0.6068, + "num_tokens": 2521926447.0, + "step": 1080 + }, + { + "epoch": 1.1738679571379191, + "grad_norm": 0.5208911786116173, + "learning_rate": 6.315475351842563e-05, + "loss": 0.6398, + "num_tokens": 2524290834.0, + "step": 1081 + }, + { + "epoch": 1.1742136190805392, + "grad_norm": 0.47934323416880825, + "learning_rate": 6.311537126492659e-05, + "loss": 0.6139, + "num_tokens": 2526535304.0, + "step": 1082 + }, + { + "epoch": 1.1745592810231593, + "grad_norm": 0.4464631066895134, + "learning_rate": 6.307595534130368e-05, + "loss": 0.6302, + "num_tokens": 2528840846.0, + "step": 1083 + }, + { + "epoch": 1.1749049429657794, + "grad_norm": 0.44714748063814214, + "learning_rate": 6.303650580497158e-05, + "loss": 0.6127, + "num_tokens": 2531170791.0, + "step": 1084 + }, + { + "epoch": 1.1752506049083995, + "grad_norm": 0.5989513388106248, + "learning_rate": 6.299702271339393e-05, + "loss": 0.6333, + "num_tokens": 2533500511.0, + "step": 1085 + }, + { + "epoch": 1.1755962668510196, + "grad_norm": 0.43365235289137755, + "learning_rate": 6.29575061240833e-05, + "loss": 0.5995, + "num_tokens": 2535842273.0, + "step": 1086 + }, + { + "epoch": 1.17594192879364, + "grad_norm": 0.5173045270615718, + "learning_rate": 6.291795609460097e-05, + "loss": 0.6075, + "num_tokens": 2538107845.0, + "step": 1087 + }, + { + "epoch": 1.17628759073626, + "grad_norm": 0.5395280912616487, + "learning_rate": 6.287837268255705e-05, + "loss": 0.6298, + "num_tokens": 2540396036.0, + "step": 1088 + }, + { + "epoch": 1.17663325267888, + "grad_norm": 0.6559481496907851, + "learning_rate": 6.283875594561013e-05, + "loss": 0.6132, + "num_tokens": 2542697113.0, + "step": 1089 + }, + { + "epoch": 1.1769789146215002, + "grad_norm": 0.48535320926206027, + "learning_rate": 6.279910594146746e-05, + "loss": 0.6348, + "num_tokens": 2545173039.0, + "step": 1090 + }, + { + "epoch": 1.1773245765641203, + "grad_norm": 0.679245336559069, + "learning_rate": 6.275942272788469e-05, + "loss": 0.6106, + "num_tokens": 2547454625.0, + "step": 1091 + }, + { + "epoch": 1.1776702385067404, + "grad_norm": 0.6108275365498225, + "learning_rate": 6.271970636266588e-05, + "loss": 0.614, + "num_tokens": 2549887341.0, + "step": 1092 + }, + { + "epoch": 1.1780159004493604, + "grad_norm": 0.4108169662251059, + "learning_rate": 6.267995690366334e-05, + "loss": 0.6208, + "num_tokens": 2552266464.0, + "step": 1093 + }, + { + "epoch": 1.1783615623919808, + "grad_norm": 0.7010143528190987, + "learning_rate": 6.26401744087776e-05, + "loss": 0.6012, + "num_tokens": 2554479498.0, + "step": 1094 + }, + { + "epoch": 1.1787072243346008, + "grad_norm": 0.4813767164582428, + "learning_rate": 6.260035893595734e-05, + "loss": 0.6253, + "num_tokens": 2556820838.0, + "step": 1095 + }, + { + "epoch": 1.179052886277221, + "grad_norm": 0.5620214362860045, + "learning_rate": 6.256051054319924e-05, + "loss": 0.6073, + "num_tokens": 2559207422.0, + "step": 1096 + }, + { + "epoch": 1.179398548219841, + "grad_norm": 0.6506138277564101, + "learning_rate": 6.252062928854794e-05, + "loss": 0.6192, + "num_tokens": 2561606064.0, + "step": 1097 + }, + { + "epoch": 1.1797442101624611, + "grad_norm": 0.3888366779250104, + "learning_rate": 6.248071523009596e-05, + "loss": 0.6248, + "num_tokens": 2563905410.0, + "step": 1098 + }, + { + "epoch": 1.1800898721050812, + "grad_norm": 0.8996762071189687, + "learning_rate": 6.24407684259836e-05, + "loss": 0.6208, + "num_tokens": 2566285936.0, + "step": 1099 + }, + { + "epoch": 1.1804355340477013, + "grad_norm": 0.5789649137721405, + "learning_rate": 6.240078893439886e-05, + "loss": 0.642, + "num_tokens": 2568577592.0, + "step": 1100 + }, + { + "epoch": 1.1807811959903214, + "grad_norm": 0.9378808154139731, + "learning_rate": 6.236077681357731e-05, + "loss": 0.6387, + "num_tokens": 2571025312.0, + "step": 1101 + }, + { + "epoch": 1.1811268579329415, + "grad_norm": 0.5188144845407305, + "learning_rate": 6.232073212180217e-05, + "loss": 0.6378, + "num_tokens": 2573278383.0, + "step": 1102 + }, + { + "epoch": 1.1814725198755618, + "grad_norm": 1.129598569577214, + "learning_rate": 6.228065491740394e-05, + "loss": 0.6332, + "num_tokens": 2575549210.0, + "step": 1103 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 0.9901018669772305, + "learning_rate": 6.224054525876059e-05, + "loss": 0.6335, + "num_tokens": 2577939562.0, + "step": 1104 + }, + { + "epoch": 1.182163843760802, + "grad_norm": 0.9756545906779439, + "learning_rate": 6.220040320429736e-05, + "loss": 0.6273, + "num_tokens": 2580261914.0, + "step": 1105 + }, + { + "epoch": 1.182509505703422, + "grad_norm": 1.1311448720134305, + "learning_rate": 6.216022881248663e-05, + "loss": 0.6454, + "num_tokens": 2582648605.0, + "step": 1106 + }, + { + "epoch": 1.1828551676460421, + "grad_norm": 0.6024389344222203, + "learning_rate": 6.212002214184789e-05, + "loss": 0.6277, + "num_tokens": 2584948355.0, + "step": 1107 + }, + { + "epoch": 1.1832008295886622, + "grad_norm": 1.05897859012828, + "learning_rate": 6.207978325094772e-05, + "loss": 0.6342, + "num_tokens": 2587370747.0, + "step": 1108 + }, + { + "epoch": 1.1835464915312823, + "grad_norm": 0.7615208980732178, + "learning_rate": 6.203951219839953e-05, + "loss": 0.6093, + "num_tokens": 2589728383.0, + "step": 1109 + }, + { + "epoch": 1.1838921534739026, + "grad_norm": 0.8614307141570999, + "learning_rate": 6.199920904286365e-05, + "loss": 0.6242, + "num_tokens": 2592078549.0, + "step": 1110 + }, + { + "epoch": 1.1842378154165227, + "grad_norm": 0.8596040846959309, + "learning_rate": 6.195887384304714e-05, + "loss": 0.6268, + "num_tokens": 2594455129.0, + "step": 1111 + }, + { + "epoch": 1.1845834773591428, + "grad_norm": 0.6812845432497595, + "learning_rate": 6.191850665770375e-05, + "loss": 0.629, + "num_tokens": 2596941659.0, + "step": 1112 + }, + { + "epoch": 1.184929139301763, + "grad_norm": 0.6225659113904932, + "learning_rate": 6.187810754563385e-05, + "loss": 0.6149, + "num_tokens": 2599349438.0, + "step": 1113 + }, + { + "epoch": 1.185274801244383, + "grad_norm": 0.5913140994331072, + "learning_rate": 6.183767656568421e-05, + "loss": 0.6196, + "num_tokens": 2601802153.0, + "step": 1114 + }, + { + "epoch": 1.185620463187003, + "grad_norm": 0.5346630677799714, + "learning_rate": 6.179721377674817e-05, + "loss": 0.6163, + "num_tokens": 2604202180.0, + "step": 1115 + }, + { + "epoch": 1.1859661251296232, + "grad_norm": 0.43402760999811735, + "learning_rate": 6.175671923776529e-05, + "loss": 0.6158, + "num_tokens": 2606489187.0, + "step": 1116 + }, + { + "epoch": 1.1863117870722433, + "grad_norm": 0.630188092739292, + "learning_rate": 6.171619300772144e-05, + "loss": 0.6131, + "num_tokens": 2608781942.0, + "step": 1117 + }, + { + "epoch": 1.1866574490148634, + "grad_norm": 0.3777676008270153, + "learning_rate": 6.167563514564858e-05, + "loss": 0.6173, + "num_tokens": 2611062402.0, + "step": 1118 + }, + { + "epoch": 1.1870031109574837, + "grad_norm": 0.7305396771794315, + "learning_rate": 6.163504571062486e-05, + "loss": 0.639, + "num_tokens": 2613423153.0, + "step": 1119 + }, + { + "epoch": 1.1873487729001038, + "grad_norm": 0.5051106839106165, + "learning_rate": 6.15944247617743e-05, + "loss": 0.6186, + "num_tokens": 2615823720.0, + "step": 1120 + }, + { + "epoch": 1.1876944348427239, + "grad_norm": 0.6964356076027165, + "learning_rate": 6.155377235826693e-05, + "loss": 0.6229, + "num_tokens": 2618195407.0, + "step": 1121 + }, + { + "epoch": 1.188040096785344, + "grad_norm": 0.5790787138190756, + "learning_rate": 6.15130885593185e-05, + "loss": 0.6063, + "num_tokens": 2620449116.0, + "step": 1122 + }, + { + "epoch": 1.188385758727964, + "grad_norm": 0.5126447255480284, + "learning_rate": 6.147237342419056e-05, + "loss": 0.626, + "num_tokens": 2622699594.0, + "step": 1123 + }, + { + "epoch": 1.1887314206705841, + "grad_norm": 0.7871564434471875, + "learning_rate": 6.143162701219029e-05, + "loss": 0.6264, + "num_tokens": 2625092938.0, + "step": 1124 + }, + { + "epoch": 1.1890770826132042, + "grad_norm": 0.5209536353255501, + "learning_rate": 6.139084938267043e-05, + "loss": 0.6245, + "num_tokens": 2627432677.0, + "step": 1125 + }, + { + "epoch": 1.1894227445558243, + "grad_norm": 1.062400785366239, + "learning_rate": 6.135004059502917e-05, + "loss": 0.6469, + "num_tokens": 2629823448.0, + "step": 1126 + }, + { + "epoch": 1.1897684064984446, + "grad_norm": 1.048597547035341, + "learning_rate": 6.13092007087101e-05, + "loss": 0.6512, + "num_tokens": 2632276469.0, + "step": 1127 + }, + { + "epoch": 1.1901140684410647, + "grad_norm": 0.6534021430570458, + "learning_rate": 6.126832978320211e-05, + "loss": 0.6141, + "num_tokens": 2634686091.0, + "step": 1128 + }, + { + "epoch": 1.1904597303836848, + "grad_norm": 0.7216571533557882, + "learning_rate": 6.122742787803933e-05, + "loss": 0.6037, + "num_tokens": 2636910080.0, + "step": 1129 + }, + { + "epoch": 1.190805392326305, + "grad_norm": 0.7289522656813919, + "learning_rate": 6.118649505280098e-05, + "loss": 0.636, + "num_tokens": 2639278992.0, + "step": 1130 + }, + { + "epoch": 1.191151054268925, + "grad_norm": 0.4122211881725999, + "learning_rate": 6.114553136711132e-05, + "loss": 0.6114, + "num_tokens": 2641654613.0, + "step": 1131 + }, + { + "epoch": 1.191496716211545, + "grad_norm": 0.8526245898721758, + "learning_rate": 6.110453688063959e-05, + "loss": 0.6248, + "num_tokens": 2643911810.0, + "step": 1132 + }, + { + "epoch": 1.1918423781541652, + "grad_norm": 0.6359581949957432, + "learning_rate": 6.106351165309986e-05, + "loss": 0.6182, + "num_tokens": 2646095989.0, + "step": 1133 + }, + { + "epoch": 1.1921880400967853, + "grad_norm": 0.8869903807214052, + "learning_rate": 6.1022455744251006e-05, + "loss": 0.6263, + "num_tokens": 2648506830.0, + "step": 1134 + }, + { + "epoch": 1.1925337020394053, + "grad_norm": 0.6551349551445427, + "learning_rate": 6.0981369213896575e-05, + "loss": 0.6428, + "num_tokens": 2650875082.0, + "step": 1135 + }, + { + "epoch": 1.1928793639820257, + "grad_norm": 0.9109586939150391, + "learning_rate": 6.094025212188475e-05, + "loss": 0.6082, + "num_tokens": 2653130894.0, + "step": 1136 + }, + { + "epoch": 1.1932250259246457, + "grad_norm": 0.6835566588970783, + "learning_rate": 6.089910452810821e-05, + "loss": 0.6474, + "num_tokens": 2655388282.0, + "step": 1137 + }, + { + "epoch": 1.1935706878672658, + "grad_norm": 0.9333519384329954, + "learning_rate": 6.0857926492504065e-05, + "loss": 0.6302, + "num_tokens": 2657737132.0, + "step": 1138 + }, + { + "epoch": 1.193916349809886, + "grad_norm": 0.788240280625609, + "learning_rate": 6.081671807505373e-05, + "loss": 0.6455, + "num_tokens": 2660115732.0, + "step": 1139 + }, + { + "epoch": 1.194262011752506, + "grad_norm": 0.8040181859841573, + "learning_rate": 6.077547933578297e-05, + "loss": 0.6077, + "num_tokens": 2662456848.0, + "step": 1140 + }, + { + "epoch": 1.194607673695126, + "grad_norm": 0.7549739870035803, + "learning_rate": 6.0734210334761616e-05, + "loss": 0.6221, + "num_tokens": 2664682785.0, + "step": 1141 + }, + { + "epoch": 1.1949533356377462, + "grad_norm": 0.6216844832284941, + "learning_rate": 6.069291113210366e-05, + "loss": 0.6025, + "num_tokens": 2667009115.0, + "step": 1142 + }, + { + "epoch": 1.1952989975803665, + "grad_norm": 0.5381473921686684, + "learning_rate": 6.065158178796701e-05, + "loss": 0.6262, + "num_tokens": 2669356709.0, + "step": 1143 + }, + { + "epoch": 1.1956446595229866, + "grad_norm": 0.7727860444247312, + "learning_rate": 6.061022236255356e-05, + "loss": 0.6156, + "num_tokens": 2671661329.0, + "step": 1144 + }, + { + "epoch": 1.1959903214656067, + "grad_norm": 0.5719313181027021, + "learning_rate": 6.056883291610897e-05, + "loss": 0.6211, + "num_tokens": 2673931917.0, + "step": 1145 + }, + { + "epoch": 1.1963359834082268, + "grad_norm": 0.9713052484474615, + "learning_rate": 6.052741350892264e-05, + "loss": 0.619, + "num_tokens": 2676266149.0, + "step": 1146 + }, + { + "epoch": 1.1966816453508469, + "grad_norm": 0.9268812950739193, + "learning_rate": 6.048596420132759e-05, + "loss": 0.6137, + "num_tokens": 2678666910.0, + "step": 1147 + }, + { + "epoch": 1.197027307293467, + "grad_norm": 0.6141942310831742, + "learning_rate": 6.0444485053700465e-05, + "loss": 0.6054, + "num_tokens": 2680872223.0, + "step": 1148 + }, + { + "epoch": 1.197372969236087, + "grad_norm": 0.7691361745603013, + "learning_rate": 6.04029761264613e-05, + "loss": 0.6422, + "num_tokens": 2683283622.0, + "step": 1149 + }, + { + "epoch": 1.1977186311787071, + "grad_norm": 0.5370561173015453, + "learning_rate": 6.036143748007354e-05, + "loss": 0.612, + "num_tokens": 2685693129.0, + "step": 1150 + }, + { + "epoch": 1.1980642931213272, + "grad_norm": 0.46517937709460283, + "learning_rate": 6.03198691750439e-05, + "loss": 0.6258, + "num_tokens": 2687933966.0, + "step": 1151 + }, + { + "epoch": 1.1984099550639475, + "grad_norm": 0.6352010947591583, + "learning_rate": 6.027827127192235e-05, + "loss": 0.641, + "num_tokens": 2690272508.0, + "step": 1152 + }, + { + "epoch": 1.1987556170065676, + "grad_norm": 0.3622663900883658, + "learning_rate": 6.02366438313019e-05, + "loss": 0.6042, + "num_tokens": 2692637091.0, + "step": 1153 + }, + { + "epoch": 1.1991012789491877, + "grad_norm": 0.8088722125784332, + "learning_rate": 6.019498691381868e-05, + "loss": 0.6106, + "num_tokens": 2694981412.0, + "step": 1154 + }, + { + "epoch": 1.1994469408918078, + "grad_norm": 0.6217728090568126, + "learning_rate": 6.0153300580151614e-05, + "loss": 0.6392, + "num_tokens": 2697404110.0, + "step": 1155 + }, + { + "epoch": 1.199792602834428, + "grad_norm": 0.948503068732415, + "learning_rate": 6.011158489102264e-05, + "loss": 0.6301, + "num_tokens": 2699737033.0, + "step": 1156 + }, + { + "epoch": 2.00034566194262, + "grad_norm": 1.6983983184643514, + "learning_rate": 6.006983990719634e-05, + "loss": 1.2661, + "num_tokens": 2703542470.0, + "step": 1157 + }, + { + "epoch": 2.00069132388524, + "grad_norm": 0.7442873843666701, + "learning_rate": 6.0028065689480014e-05, + "loss": 0.6203, + "num_tokens": 2705960297.0, + "step": 1158 + }, + { + "epoch": 2.0010369858278603, + "grad_norm": 0.7246510291053152, + "learning_rate": 5.9986262298723524e-05, + "loss": 0.6221, + "num_tokens": 2708254510.0, + "step": 1159 + }, + { + "epoch": 2.0013826477704804, + "grad_norm": 0.7731935930078484, + "learning_rate": 5.994442979581924e-05, + "loss": 0.6179, + "num_tokens": 2710587145.0, + "step": 1160 + }, + { + "epoch": 2.0017283097131005, + "grad_norm": 0.6661950010458624, + "learning_rate": 5.990256824170196e-05, + "loss": 0.634, + "num_tokens": 2713023656.0, + "step": 1161 + }, + { + "epoch": 2.0020739716557205, + "grad_norm": 0.7262493115905476, + "learning_rate": 5.986067769734873e-05, + "loss": 0.6133, + "num_tokens": 2715262150.0, + "step": 1162 + }, + { + "epoch": 2.0024196335983406, + "grad_norm": 0.6876518363658152, + "learning_rate": 5.981875822377893e-05, + "loss": 0.6325, + "num_tokens": 2717621390.0, + "step": 1163 + }, + { + "epoch": 2.0027652955409607, + "grad_norm": 0.7469791631147007, + "learning_rate": 5.977680988205396e-05, + "loss": 0.6241, + "num_tokens": 2719919847.0, + "step": 1164 + }, + { + "epoch": 2.0031109574835813, + "grad_norm": 0.7209876680831472, + "learning_rate": 5.973483273327737e-05, + "loss": 0.6276, + "num_tokens": 2722153155.0, + "step": 1165 + }, + { + "epoch": 2.0034566194262013, + "grad_norm": 0.6539876570046835, + "learning_rate": 5.969282683859461e-05, + "loss": 0.6026, + "num_tokens": 2724415200.0, + "step": 1166 + }, + { + "epoch": 2.0038022813688214, + "grad_norm": 0.6410589835786973, + "learning_rate": 5.9650792259193044e-05, + "loss": 0.6098, + "num_tokens": 2726817372.0, + "step": 1167 + }, + { + "epoch": 2.0041479433114415, + "grad_norm": 0.7593877771878029, + "learning_rate": 5.960872905630177e-05, + "loss": 0.6596, + "num_tokens": 2729295162.0, + "step": 1168 + }, + { + "epoch": 2.0044936052540616, + "grad_norm": 0.6666599626164708, + "learning_rate": 5.9566637291191626e-05, + "loss": 0.6072, + "num_tokens": 2731497739.0, + "step": 1169 + }, + { + "epoch": 2.0048392671966817, + "grad_norm": 0.6753379543754168, + "learning_rate": 5.9524517025175034e-05, + "loss": 0.6028, + "num_tokens": 2733806000.0, + "step": 1170 + }, + { + "epoch": 2.005184929139302, + "grad_norm": 0.6089275348410033, + "learning_rate": 5.948236831960594e-05, + "loss": 0.5982, + "num_tokens": 2736124404.0, + "step": 1171 + }, + { + "epoch": 2.005530591081922, + "grad_norm": 0.7992910351209958, + "learning_rate": 5.9440191235879685e-05, + "loss": 0.6141, + "num_tokens": 2738545369.0, + "step": 1172 + }, + { + "epoch": 2.005876253024542, + "grad_norm": 0.717465856861861, + "learning_rate": 5.939798583543301e-05, + "loss": 0.6297, + "num_tokens": 2740941875.0, + "step": 1173 + }, + { + "epoch": 2.006221914967162, + "grad_norm": 0.6993038527846662, + "learning_rate": 5.935575217974383e-05, + "loss": 0.6004, + "num_tokens": 2743249517.0, + "step": 1174 + }, + { + "epoch": 2.006567576909782, + "grad_norm": 0.6527277762520203, + "learning_rate": 5.9313490330331296e-05, + "loss": 0.6162, + "num_tokens": 2745723971.0, + "step": 1175 + }, + { + "epoch": 2.0069132388524022, + "grad_norm": 0.8193547453031245, + "learning_rate": 5.9271200348755546e-05, + "loss": 0.6039, + "num_tokens": 2748023424.0, + "step": 1176 + }, + { + "epoch": 2.0072589007950223, + "grad_norm": 0.6282131087809707, + "learning_rate": 5.922888229661775e-05, + "loss": 0.5974, + "num_tokens": 2750262628.0, + "step": 1177 + }, + { + "epoch": 2.0076045627376424, + "grad_norm": 0.6884009746594285, + "learning_rate": 5.918653623555994e-05, + "loss": 0.5996, + "num_tokens": 2752620786.0, + "step": 1178 + }, + { + "epoch": 2.0079502246802625, + "grad_norm": 0.5969485312674665, + "learning_rate": 5.914416222726498e-05, + "loss": 0.6153, + "num_tokens": 2754938504.0, + "step": 1179 + }, + { + "epoch": 2.0082958866228826, + "grad_norm": 0.8639766578152747, + "learning_rate": 5.91017603334564e-05, + "loss": 0.6221, + "num_tokens": 2757255693.0, + "step": 1180 + }, + { + "epoch": 2.008641548565503, + "grad_norm": 0.908447107151901, + "learning_rate": 5.90593306158984e-05, + "loss": 0.613, + "num_tokens": 2759620378.0, + "step": 1181 + }, + { + "epoch": 2.0089872105081232, + "grad_norm": 0.5440290004596824, + "learning_rate": 5.901687313639563e-05, + "loss": 0.6293, + "num_tokens": 2761892250.0, + "step": 1182 + }, + { + "epoch": 2.0093328724507433, + "grad_norm": 0.5866501794465553, + "learning_rate": 5.8974387956793266e-05, + "loss": 0.6264, + "num_tokens": 2764251949.0, + "step": 1183 + }, + { + "epoch": 2.0096785343933634, + "grad_norm": 0.6313320404507824, + "learning_rate": 5.893187513897679e-05, + "loss": 0.6075, + "num_tokens": 2766587104.0, + "step": 1184 + }, + { + "epoch": 2.0100241963359835, + "grad_norm": 0.46116049618639426, + "learning_rate": 5.8889334744871936e-05, + "loss": 0.6189, + "num_tokens": 2768933002.0, + "step": 1185 + }, + { + "epoch": 2.0103698582786036, + "grad_norm": 0.9071841441363677, + "learning_rate": 5.884676683644463e-05, + "loss": 0.6273, + "num_tokens": 2771280788.0, + "step": 1186 + }, + { + "epoch": 2.0107155202212237, + "grad_norm": 0.7035937211960908, + "learning_rate": 5.880417147570086e-05, + "loss": 0.6107, + "num_tokens": 2773572746.0, + "step": 1187 + }, + { + "epoch": 2.0110611821638438, + "grad_norm": 0.7991766387080663, + "learning_rate": 5.876154872468661e-05, + "loss": 0.6234, + "num_tokens": 2775975921.0, + "step": 1188 + }, + { + "epoch": 2.011406844106464, + "grad_norm": 0.807391846706665, + "learning_rate": 5.8718898645487765e-05, + "loss": 0.6161, + "num_tokens": 2778337263.0, + "step": 1189 + }, + { + "epoch": 2.011752506049084, + "grad_norm": 0.5562165017013784, + "learning_rate": 5.867622130023e-05, + "loss": 0.6199, + "num_tokens": 2780668685.0, + "step": 1190 + }, + { + "epoch": 2.012098167991704, + "grad_norm": 0.525918324832085, + "learning_rate": 5.8633516751078715e-05, + "loss": 0.6054, + "num_tokens": 2782995071.0, + "step": 1191 + }, + { + "epoch": 2.012443829934324, + "grad_norm": 0.742850834984646, + "learning_rate": 5.8590785060238944e-05, + "loss": 0.594, + "num_tokens": 2785297846.0, + "step": 1192 + }, + { + "epoch": 2.0127894918769442, + "grad_norm": 0.5818736602944322, + "learning_rate": 5.8548026289955255e-05, + "loss": 0.6226, + "num_tokens": 2787606882.0, + "step": 1193 + }, + { + "epoch": 2.0131351538195643, + "grad_norm": 0.8146841490819778, + "learning_rate": 5.850524050251167e-05, + "loss": 0.5705, + "num_tokens": 2789917707.0, + "step": 1194 + }, + { + "epoch": 2.0134808157621844, + "grad_norm": 0.7799587675986269, + "learning_rate": 5.846242776023151e-05, + "loss": 0.5862, + "num_tokens": 2792208397.0, + "step": 1195 + }, + { + "epoch": 2.0138264777048045, + "grad_norm": 0.49756545870411284, + "learning_rate": 5.8419588125477454e-05, + "loss": 0.5858, + "num_tokens": 2794422122.0, + "step": 1196 + }, + { + "epoch": 2.014172139647425, + "grad_norm": 0.588581034799447, + "learning_rate": 5.837672166065128e-05, + "loss": 0.6036, + "num_tokens": 2796787647.0, + "step": 1197 + }, + { + "epoch": 2.014517801590045, + "grad_norm": 0.5794257519385543, + "learning_rate": 5.833382842819387e-05, + "loss": 0.5926, + "num_tokens": 2799008939.0, + "step": 1198 + }, + { + "epoch": 2.014863463532665, + "grad_norm": 0.37542314287494183, + "learning_rate": 5.8290908490585074e-05, + "loss": 0.6004, + "num_tokens": 2801343662.0, + "step": 1199 + }, + { + "epoch": 2.0152091254752853, + "grad_norm": 0.9164441781041319, + "learning_rate": 5.8247961910343695e-05, + "loss": 0.5918, + "num_tokens": 2803675383.0, + "step": 1200 + }, + { + "epoch": 2.0155547874179054, + "grad_norm": 0.8543498168754647, + "learning_rate": 5.820498875002731e-05, + "loss": 0.6219, + "num_tokens": 2806042191.0, + "step": 1201 + }, + { + "epoch": 2.0159004493605255, + "grad_norm": 0.6893260214351075, + "learning_rate": 5.8161989072232205e-05, + "loss": 0.6061, + "num_tokens": 2808320346.0, + "step": 1202 + }, + { + "epoch": 2.0162461113031456, + "grad_norm": 0.7544457260809125, + "learning_rate": 5.81189629395933e-05, + "loss": 0.6172, + "num_tokens": 2810811935.0, + "step": 1203 + }, + { + "epoch": 2.0165917732457657, + "grad_norm": 0.5104916803984129, + "learning_rate": 5.8075910414784084e-05, + "loss": 0.6265, + "num_tokens": 2813078604.0, + "step": 1204 + }, + { + "epoch": 2.0169374351883858, + "grad_norm": 0.6381132131624556, + "learning_rate": 5.8032831560516425e-05, + "loss": 0.619, + "num_tokens": 2815366899.0, + "step": 1205 + }, + { + "epoch": 2.017283097131006, + "grad_norm": 0.5284200882949209, + "learning_rate": 5.7989726439540605e-05, + "loss": 0.6269, + "num_tokens": 2817690994.0, + "step": 1206 + }, + { + "epoch": 2.017628759073626, + "grad_norm": 0.34542786689797705, + "learning_rate": 5.794659511464512e-05, + "loss": 0.6117, + "num_tokens": 2820055089.0, + "step": 1207 + }, + { + "epoch": 2.017974421016246, + "grad_norm": 0.7560090082336607, + "learning_rate": 5.7903437648656665e-05, + "loss": 0.6042, + "num_tokens": 2822288430.0, + "step": 1208 + }, + { + "epoch": 2.018320082958866, + "grad_norm": 0.5898726828853388, + "learning_rate": 5.786025410444002e-05, + "loss": 0.6203, + "num_tokens": 2824712012.0, + "step": 1209 + }, + { + "epoch": 2.018665744901486, + "grad_norm": 0.8370990498105547, + "learning_rate": 5.781704454489793e-05, + "loss": 0.621, + "num_tokens": 2827148049.0, + "step": 1210 + }, + { + "epoch": 2.0190114068441063, + "grad_norm": 0.7862899234850369, + "learning_rate": 5.7773809032971e-05, + "loss": 0.6311, + "num_tokens": 2829555653.0, + "step": 1211 + }, + { + "epoch": 2.0193570687867264, + "grad_norm": 0.7267285714096522, + "learning_rate": 5.7730547631637735e-05, + "loss": 0.6334, + "num_tokens": 2832078053.0, + "step": 1212 + }, + { + "epoch": 2.019702730729347, + "grad_norm": 0.8117802091918718, + "learning_rate": 5.7687260403914264e-05, + "loss": 0.6026, + "num_tokens": 2834508497.0, + "step": 1213 + }, + { + "epoch": 2.020048392671967, + "grad_norm": 0.5476173083610995, + "learning_rate": 5.764394741285438e-05, + "loss": 0.6262, + "num_tokens": 2836980452.0, + "step": 1214 + }, + { + "epoch": 2.020394054614587, + "grad_norm": 0.7998449760195921, + "learning_rate": 5.76006087215494e-05, + "loss": 0.6229, + "num_tokens": 2839284251.0, + "step": 1215 + }, + { + "epoch": 2.020739716557207, + "grad_norm": 0.6449016538857358, + "learning_rate": 5.7557244393128034e-05, + "loss": 0.627, + "num_tokens": 2841423648.0, + "step": 1216 + }, + { + "epoch": 2.0210853784998273, + "grad_norm": 0.6873619631462554, + "learning_rate": 5.751385449075641e-05, + "loss": 0.6065, + "num_tokens": 2843741178.0, + "step": 1217 + }, + { + "epoch": 2.0214310404424474, + "grad_norm": 0.8482070608413074, + "learning_rate": 5.7470439077637845e-05, + "loss": 0.6135, + "num_tokens": 2846061812.0, + "step": 1218 + }, + { + "epoch": 2.0217767023850675, + "grad_norm": 0.5494497236481478, + "learning_rate": 5.7426998217012835e-05, + "loss": 0.6222, + "num_tokens": 2848331389.0, + "step": 1219 + }, + { + "epoch": 2.0221223643276875, + "grad_norm": 0.6863107082502556, + "learning_rate": 5.738353197215897e-05, + "loss": 0.6322, + "num_tokens": 2850680680.0, + "step": 1220 + }, + { + "epoch": 2.0224680262703076, + "grad_norm": 0.6738447393787746, + "learning_rate": 5.734004040639076e-05, + "loss": 0.6238, + "num_tokens": 2853007171.0, + "step": 1221 + }, + { + "epoch": 2.0228136882129277, + "grad_norm": 0.5542657186036472, + "learning_rate": 5.7296523583059675e-05, + "loss": 0.6069, + "num_tokens": 2855360185.0, + "step": 1222 + }, + { + "epoch": 2.023159350155548, + "grad_norm": 0.6568104984881573, + "learning_rate": 5.7252981565553894e-05, + "loss": 0.6078, + "num_tokens": 2857628485.0, + "step": 1223 + }, + { + "epoch": 2.023505012098168, + "grad_norm": 0.5947952196949869, + "learning_rate": 5.7209414417298344e-05, + "loss": 0.6074, + "num_tokens": 2859902857.0, + "step": 1224 + }, + { + "epoch": 2.023850674040788, + "grad_norm": 0.5879663371033316, + "learning_rate": 5.716582220175456e-05, + "loss": 0.598, + "num_tokens": 2862224706.0, + "step": 1225 + }, + { + "epoch": 2.024196335983408, + "grad_norm": 0.5175978532057498, + "learning_rate": 5.712220498242057e-05, + "loss": 0.6244, + "num_tokens": 2864594415.0, + "step": 1226 + }, + { + "epoch": 2.024541997926028, + "grad_norm": 0.6423216752918504, + "learning_rate": 5.707856282283084e-05, + "loss": 0.6179, + "num_tokens": 2866860754.0, + "step": 1227 + }, + { + "epoch": 2.0248876598686483, + "grad_norm": 0.47903563464562704, + "learning_rate": 5.703489578655614e-05, + "loss": 0.5939, + "num_tokens": 2869075430.0, + "step": 1228 + }, + { + "epoch": 2.0252333218112684, + "grad_norm": 0.6224231697723156, + "learning_rate": 5.699120393720351e-05, + "loss": 0.6153, + "num_tokens": 2871371121.0, + "step": 1229 + }, + { + "epoch": 2.025578983753889, + "grad_norm": 0.5854130733860494, + "learning_rate": 5.6947487338416104e-05, + "loss": 0.5909, + "num_tokens": 2873664539.0, + "step": 1230 + }, + { + "epoch": 2.025924645696509, + "grad_norm": 0.4681324858119423, + "learning_rate": 5.690374605387318e-05, + "loss": 0.6033, + "num_tokens": 2875972761.0, + "step": 1231 + }, + { + "epoch": 2.026270307639129, + "grad_norm": 0.5066873916492088, + "learning_rate": 5.685998014728984e-05, + "loss": 0.621, + "num_tokens": 2878324473.0, + "step": 1232 + }, + { + "epoch": 2.026615969581749, + "grad_norm": 0.5165774101959238, + "learning_rate": 5.681618968241719e-05, + "loss": 0.6011, + "num_tokens": 2880743438.0, + "step": 1233 + }, + { + "epoch": 2.0269616315243693, + "grad_norm": 0.5462166677946223, + "learning_rate": 5.6772374723042016e-05, + "loss": 0.6066, + "num_tokens": 2883113562.0, + "step": 1234 + }, + { + "epoch": 2.0273072934669893, + "grad_norm": 0.4991318134459581, + "learning_rate": 5.672853533298683e-05, + "loss": 0.6083, + "num_tokens": 2885507881.0, + "step": 1235 + }, + { + "epoch": 2.0276529554096094, + "grad_norm": 0.5519840728737752, + "learning_rate": 5.668467157610968e-05, + "loss": 0.602, + "num_tokens": 2887789962.0, + "step": 1236 + }, + { + "epoch": 2.0279986173522295, + "grad_norm": 0.4820652100026768, + "learning_rate": 5.664078351630418e-05, + "loss": 0.5947, + "num_tokens": 2890127773.0, + "step": 1237 + }, + { + "epoch": 2.0283442792948496, + "grad_norm": 0.5218030585256224, + "learning_rate": 5.659687121749926e-05, + "loss": 0.59, + "num_tokens": 2892439637.0, + "step": 1238 + }, + { + "epoch": 2.0286899412374697, + "grad_norm": 0.44313267257206534, + "learning_rate": 5.655293474365925e-05, + "loss": 0.6108, + "num_tokens": 2894747159.0, + "step": 1239 + }, + { + "epoch": 2.02903560318009, + "grad_norm": 0.5371641585740026, + "learning_rate": 5.650897415878361e-05, + "loss": 0.6041, + "num_tokens": 2897077333.0, + "step": 1240 + }, + { + "epoch": 2.02938126512271, + "grad_norm": 0.46624849879625346, + "learning_rate": 5.6464989526906974e-05, + "loss": 0.5776, + "num_tokens": 2899461991.0, + "step": 1241 + }, + { + "epoch": 2.02972692706533, + "grad_norm": 0.5401807002198875, + "learning_rate": 5.642098091209899e-05, + "loss": 0.5753, + "num_tokens": 2901811919.0, + "step": 1242 + }, + { + "epoch": 2.03007258900795, + "grad_norm": 0.5472715875192805, + "learning_rate": 5.637694837846422e-05, + "loss": 0.6168, + "num_tokens": 2904210850.0, + "step": 1243 + }, + { + "epoch": 2.03041825095057, + "grad_norm": 0.6275862815911735, + "learning_rate": 5.633289199014211e-05, + "loss": 0.6089, + "num_tokens": 2906408823.0, + "step": 1244 + }, + { + "epoch": 2.0307639128931902, + "grad_norm": 0.5182927643911286, + "learning_rate": 5.6288811811306804e-05, + "loss": 0.5971, + "num_tokens": 2908814454.0, + "step": 1245 + }, + { + "epoch": 2.031109574835811, + "grad_norm": 0.615480497109353, + "learning_rate": 5.6244707906167136e-05, + "loss": 0.5966, + "num_tokens": 2911188765.0, + "step": 1246 + }, + { + "epoch": 2.031455236778431, + "grad_norm": 0.5654307300109646, + "learning_rate": 5.620058033896648e-05, + "loss": 0.5871, + "num_tokens": 2913544729.0, + "step": 1247 + }, + { + "epoch": 2.031800898721051, + "grad_norm": 0.6759285074914977, + "learning_rate": 5.615642917398271e-05, + "loss": 0.5915, + "num_tokens": 2915823840.0, + "step": 1248 + }, + { + "epoch": 2.032146560663671, + "grad_norm": 0.6397580415247495, + "learning_rate": 5.6112254475528006e-05, + "loss": 0.6063, + "num_tokens": 2918163854.0, + "step": 1249 + }, + { + "epoch": 2.032492222606291, + "grad_norm": 0.5278698967861356, + "learning_rate": 5.606805630794893e-05, + "loss": 0.602, + "num_tokens": 2920599492.0, + "step": 1250 + }, + { + "epoch": 2.0328378845489112, + "grad_norm": 0.6757133057033519, + "learning_rate": 5.6023834735626116e-05, + "loss": 0.5675, + "num_tokens": 2922914498.0, + "step": 1251 + }, + { + "epoch": 2.0331835464915313, + "grad_norm": 0.4395604949882517, + "learning_rate": 5.597958982297438e-05, + "loss": 0.6082, + "num_tokens": 2925267839.0, + "step": 1252 + }, + { + "epoch": 2.0335292084341514, + "grad_norm": 0.6407649357139156, + "learning_rate": 5.5935321634442474e-05, + "loss": 0.6004, + "num_tokens": 2927638944.0, + "step": 1253 + }, + { + "epoch": 2.0338748703767715, + "grad_norm": 0.5781375209109065, + "learning_rate": 5.5891030234513106e-05, + "loss": 0.6385, + "num_tokens": 2929963335.0, + "step": 1254 + }, + { + "epoch": 2.0342205323193916, + "grad_norm": 0.5346773012866548, + "learning_rate": 5.584671568770276e-05, + "loss": 0.6219, + "num_tokens": 2932283673.0, + "step": 1255 + }, + { + "epoch": 2.0345661942620117, + "grad_norm": 0.5634585941402117, + "learning_rate": 5.580237805856165e-05, + "loss": 0.6091, + "num_tokens": 2934624476.0, + "step": 1256 + }, + { + "epoch": 2.0349118562046318, + "grad_norm": 0.5223123448500595, + "learning_rate": 5.575801741167361e-05, + "loss": 0.6148, + "num_tokens": 2936811500.0, + "step": 1257 + }, + { + "epoch": 2.035257518147252, + "grad_norm": 0.6100766238181008, + "learning_rate": 5.5713633811656005e-05, + "loss": 0.5992, + "num_tokens": 2939199246.0, + "step": 1258 + }, + { + "epoch": 2.035603180089872, + "grad_norm": 0.6300319578822967, + "learning_rate": 5.566922732315962e-05, + "loss": 0.6082, + "num_tokens": 2941528529.0, + "step": 1259 + }, + { + "epoch": 2.035948842032492, + "grad_norm": 0.6029478230716395, + "learning_rate": 5.562479801086861e-05, + "loss": 0.5989, + "num_tokens": 2943818454.0, + "step": 1260 + }, + { + "epoch": 2.036294503975112, + "grad_norm": 0.5963737260826928, + "learning_rate": 5.5580345939500346e-05, + "loss": 0.6141, + "num_tokens": 2946126116.0, + "step": 1261 + }, + { + "epoch": 2.0366401659177327, + "grad_norm": 0.5068278020650865, + "learning_rate": 5.553587117380537e-05, + "loss": 0.6114, + "num_tokens": 2948525138.0, + "step": 1262 + }, + { + "epoch": 2.0369858278603528, + "grad_norm": 0.5568417324171215, + "learning_rate": 5.549137377856727e-05, + "loss": 0.6304, + "num_tokens": 2950884154.0, + "step": 1263 + }, + { + "epoch": 2.037331489802973, + "grad_norm": 0.5213384527120767, + "learning_rate": 5.5446853818602595e-05, + "loss": 0.6153, + "num_tokens": 2953141908.0, + "step": 1264 + }, + { + "epoch": 2.037677151745593, + "grad_norm": 0.5321980257371528, + "learning_rate": 5.540231135876077e-05, + "loss": 0.6063, + "num_tokens": 2955544112.0, + "step": 1265 + }, + { + "epoch": 2.038022813688213, + "grad_norm": 0.5015596988009549, + "learning_rate": 5.535774646392401e-05, + "loss": 0.6156, + "num_tokens": 2957931919.0, + "step": 1266 + }, + { + "epoch": 2.038368475630833, + "grad_norm": 0.5832821256270071, + "learning_rate": 5.531315919900717e-05, + "loss": 0.6166, + "num_tokens": 2960187122.0, + "step": 1267 + }, + { + "epoch": 2.038714137573453, + "grad_norm": 0.45580269126113726, + "learning_rate": 5.526854962895774e-05, + "loss": 0.5977, + "num_tokens": 2962475805.0, + "step": 1268 + }, + { + "epoch": 2.0390597995160733, + "grad_norm": 0.6142537898377721, + "learning_rate": 5.522391781875564e-05, + "loss": 0.6137, + "num_tokens": 2964868388.0, + "step": 1269 + }, + { + "epoch": 2.0394054614586934, + "grad_norm": 0.5333676282289138, + "learning_rate": 5.5179263833413234e-05, + "loss": 0.5995, + "num_tokens": 2967211344.0, + "step": 1270 + }, + { + "epoch": 2.0397511234013135, + "grad_norm": 0.5725340938172345, + "learning_rate": 5.513458773797519e-05, + "loss": 0.6112, + "num_tokens": 2969528142.0, + "step": 1271 + }, + { + "epoch": 2.0400967853439336, + "grad_norm": 0.6170193208053968, + "learning_rate": 5.5089889597518324e-05, + "loss": 0.6145, + "num_tokens": 2971957292.0, + "step": 1272 + }, + { + "epoch": 2.0404424472865537, + "grad_norm": 0.5682381648597012, + "learning_rate": 5.5045169477151645e-05, + "loss": 0.608, + "num_tokens": 2974134938.0, + "step": 1273 + }, + { + "epoch": 2.0407881092291738, + "grad_norm": 0.4827937183125581, + "learning_rate": 5.500042744201612e-05, + "loss": 0.6031, + "num_tokens": 2976572533.0, + "step": 1274 + }, + { + "epoch": 2.041133771171794, + "grad_norm": 0.5383409905181379, + "learning_rate": 5.495566355728465e-05, + "loss": 0.5952, + "num_tokens": 2978846275.0, + "step": 1275 + }, + { + "epoch": 2.041479433114414, + "grad_norm": 0.4977390274277039, + "learning_rate": 5.491087788816198e-05, + "loss": 0.5927, + "num_tokens": 2981218028.0, + "step": 1276 + }, + { + "epoch": 2.041825095057034, + "grad_norm": 0.5743564152603742, + "learning_rate": 5.4866070499884555e-05, + "loss": 0.607, + "num_tokens": 2983516829.0, + "step": 1277 + }, + { + "epoch": 2.0421707569996546, + "grad_norm": 0.6016395301475899, + "learning_rate": 5.482124145772051e-05, + "loss": 0.6235, + "num_tokens": 2985974853.0, + "step": 1278 + }, + { + "epoch": 2.0425164189422746, + "grad_norm": 0.4587878650251555, + "learning_rate": 5.477639082696947e-05, + "loss": 0.6134, + "num_tokens": 2988263957.0, + "step": 1279 + }, + { + "epoch": 2.0428620808848947, + "grad_norm": 0.5170040264186122, + "learning_rate": 5.473151867296254e-05, + "loss": 0.6036, + "num_tokens": 2990592023.0, + "step": 1280 + }, + { + "epoch": 2.043207742827515, + "grad_norm": 0.4548539404380404, + "learning_rate": 5.468662506106214e-05, + "loss": 0.6334, + "num_tokens": 2993000191.0, + "step": 1281 + }, + { + "epoch": 2.043553404770135, + "grad_norm": 0.5033124591021085, + "learning_rate": 5.464171005666198e-05, + "loss": 0.614, + "num_tokens": 2995344714.0, + "step": 1282 + }, + { + "epoch": 2.043899066712755, + "grad_norm": 0.44667062989295664, + "learning_rate": 5.459677372518692e-05, + "loss": 0.6137, + "num_tokens": 2997745826.0, + "step": 1283 + }, + { + "epoch": 2.044244728655375, + "grad_norm": 0.43621104229171326, + "learning_rate": 5.4551816132092876e-05, + "loss": 0.5943, + "num_tokens": 3000026030.0, + "step": 1284 + }, + { + "epoch": 2.044590390597995, + "grad_norm": 0.42703493409671434, + "learning_rate": 5.450683734286677e-05, + "loss": 0.5903, + "num_tokens": 3002321921.0, + "step": 1285 + }, + { + "epoch": 2.0449360525406153, + "grad_norm": 0.5395855525690519, + "learning_rate": 5.4461837423026355e-05, + "loss": 0.5898, + "num_tokens": 3004699940.0, + "step": 1286 + }, + { + "epoch": 2.0452817144832354, + "grad_norm": 0.50678507152604, + "learning_rate": 5.441681643812019e-05, + "loss": 0.6183, + "num_tokens": 3007112777.0, + "step": 1287 + }, + { + "epoch": 2.0456273764258555, + "grad_norm": 0.6019317612983736, + "learning_rate": 5.437177445372749e-05, + "loss": 0.602, + "num_tokens": 3009435776.0, + "step": 1288 + }, + { + "epoch": 2.0459730383684755, + "grad_norm": 0.5863072150441261, + "learning_rate": 5.432671153545811e-05, + "loss": 0.6276, + "num_tokens": 3011921411.0, + "step": 1289 + }, + { + "epoch": 2.0463187003110956, + "grad_norm": 0.6254608431749489, + "learning_rate": 5.428162774895234e-05, + "loss": 0.6186, + "num_tokens": 3014215887.0, + "step": 1290 + }, + { + "epoch": 2.0466643622537157, + "grad_norm": 0.47843745798493226, + "learning_rate": 5.423652315988093e-05, + "loss": 0.6249, + "num_tokens": 3016687372.0, + "step": 1291 + }, + { + "epoch": 2.047010024196336, + "grad_norm": 0.4141659775595344, + "learning_rate": 5.419139783394484e-05, + "loss": 0.5966, + "num_tokens": 3019034321.0, + "step": 1292 + }, + { + "epoch": 2.047355686138956, + "grad_norm": 0.510810522892774, + "learning_rate": 5.414625183687534e-05, + "loss": 0.6017, + "num_tokens": 3021292293.0, + "step": 1293 + }, + { + "epoch": 2.047701348081576, + "grad_norm": 0.41829526657986776, + "learning_rate": 5.4101085234433765e-05, + "loss": 0.6111, + "num_tokens": 3023533155.0, + "step": 1294 + }, + { + "epoch": 2.0480470100241965, + "grad_norm": 0.48062734874510304, + "learning_rate": 5.405589809241142e-05, + "loss": 0.5914, + "num_tokens": 3025891129.0, + "step": 1295 + }, + { + "epoch": 2.0483926719668166, + "grad_norm": 0.3875636403621201, + "learning_rate": 5.40106904766296e-05, + "loss": 0.5977, + "num_tokens": 3028170519.0, + "step": 1296 + }, + { + "epoch": 2.0487383339094367, + "grad_norm": 0.40725036120715, + "learning_rate": 5.39654624529394e-05, + "loss": 0.614, + "num_tokens": 3030533342.0, + "step": 1297 + }, + { + "epoch": 2.049083995852057, + "grad_norm": 0.48169707685500246, + "learning_rate": 5.392021408722161e-05, + "loss": 0.6029, + "num_tokens": 3032910008.0, + "step": 1298 + }, + { + "epoch": 2.049429657794677, + "grad_norm": 0.4082236704550609, + "learning_rate": 5.3874945445386706e-05, + "loss": 0.62, + "num_tokens": 3035353770.0, + "step": 1299 + }, + { + "epoch": 2.049775319737297, + "grad_norm": 0.4478889348463514, + "learning_rate": 5.3829656593374644e-05, + "loss": 0.6221, + "num_tokens": 3037661561.0, + "step": 1300 + }, + { + "epoch": 2.050120981679917, + "grad_norm": 0.34116851464317116, + "learning_rate": 5.3784347597154855e-05, + "loss": 0.5979, + "num_tokens": 3039995227.0, + "step": 1301 + }, + { + "epoch": 2.050466643622537, + "grad_norm": 0.40013225507271305, + "learning_rate": 5.373901852272611e-05, + "loss": 0.6253, + "num_tokens": 3042356211.0, + "step": 1302 + }, + { + "epoch": 2.0508123055651573, + "grad_norm": 0.44099928828290136, + "learning_rate": 5.369366943611641e-05, + "loss": 0.6133, + "num_tokens": 3044605964.0, + "step": 1303 + }, + { + "epoch": 2.0511579675077773, + "grad_norm": 0.4080138900079332, + "learning_rate": 5.364830040338293e-05, + "loss": 0.6037, + "num_tokens": 3046869637.0, + "step": 1304 + }, + { + "epoch": 2.0515036294503974, + "grad_norm": 0.5169473108300595, + "learning_rate": 5.3602911490611835e-05, + "loss": 0.6161, + "num_tokens": 3049203375.0, + "step": 1305 + }, + { + "epoch": 2.0518492913930175, + "grad_norm": 0.4449557408995129, + "learning_rate": 5.355750276391836e-05, + "loss": 0.6061, + "num_tokens": 3051445585.0, + "step": 1306 + }, + { + "epoch": 2.0521949533356376, + "grad_norm": 0.3921152430483812, + "learning_rate": 5.3512074289446514e-05, + "loss": 0.5812, + "num_tokens": 3053679376.0, + "step": 1307 + }, + { + "epoch": 2.0525406152782577, + "grad_norm": 0.36286474115355816, + "learning_rate": 5.34666261333691e-05, + "loss": 0.6156, + "num_tokens": 3056033295.0, + "step": 1308 + }, + { + "epoch": 2.052886277220878, + "grad_norm": 0.41416395772979153, + "learning_rate": 5.342115836188756e-05, + "loss": 0.5873, + "num_tokens": 3058237536.0, + "step": 1309 + }, + { + "epoch": 2.053231939163498, + "grad_norm": 0.33231583844955537, + "learning_rate": 5.3375671041231984e-05, + "loss": 0.5975, + "num_tokens": 3060555300.0, + "step": 1310 + }, + { + "epoch": 2.0535776011061184, + "grad_norm": 0.3800244093214951, + "learning_rate": 5.3330164237660844e-05, + "loss": 0.6233, + "num_tokens": 3062934789.0, + "step": 1311 + }, + { + "epoch": 2.0539232630487385, + "grad_norm": 0.43947625057591977, + "learning_rate": 5.328463801746108e-05, + "loss": 0.6088, + "num_tokens": 3065151430.0, + "step": 1312 + }, + { + "epoch": 2.0542689249913586, + "grad_norm": 0.4474886647128191, + "learning_rate": 5.323909244694782e-05, + "loss": 0.6006, + "num_tokens": 3067421239.0, + "step": 1313 + }, + { + "epoch": 2.0546145869339787, + "grad_norm": 0.40717613745553277, + "learning_rate": 5.319352759246447e-05, + "loss": 0.6142, + "num_tokens": 3069721674.0, + "step": 1314 + }, + { + "epoch": 2.054960248876599, + "grad_norm": 0.3519313502011318, + "learning_rate": 5.314794352038248e-05, + "loss": 0.5775, + "num_tokens": 3071984411.0, + "step": 1315 + }, + { + "epoch": 2.055305910819219, + "grad_norm": 0.400389918759006, + "learning_rate": 5.310234029710128e-05, + "loss": 0.622, + "num_tokens": 3074260919.0, + "step": 1316 + }, + { + "epoch": 2.055651572761839, + "grad_norm": 0.38289231712531946, + "learning_rate": 5.3056717989048236e-05, + "loss": 0.5894, + "num_tokens": 3076695076.0, + "step": 1317 + }, + { + "epoch": 2.055997234704459, + "grad_norm": 0.39825730402005594, + "learning_rate": 5.301107666267848e-05, + "loss": 0.6225, + "num_tokens": 3079048581.0, + "step": 1318 + }, + { + "epoch": 2.056342896647079, + "grad_norm": 0.4892193674826655, + "learning_rate": 5.2965416384474885e-05, + "loss": 0.6363, + "num_tokens": 3081481101.0, + "step": 1319 + }, + { + "epoch": 2.0566885585896992, + "grad_norm": 0.2918629069714954, + "learning_rate": 5.2919737220947876e-05, + "loss": 0.6083, + "num_tokens": 3083750248.0, + "step": 1320 + }, + { + "epoch": 2.0570342205323193, + "grad_norm": 0.5704164770851035, + "learning_rate": 5.28740392386354e-05, + "loss": 0.5997, + "num_tokens": 3086097092.0, + "step": 1321 + }, + { + "epoch": 2.0573798824749394, + "grad_norm": 0.3637904345056428, + "learning_rate": 5.2828322504102874e-05, + "loss": 0.6133, + "num_tokens": 3088445821.0, + "step": 1322 + }, + { + "epoch": 2.0577255444175595, + "grad_norm": 0.4299012223391985, + "learning_rate": 5.278258708394297e-05, + "loss": 0.5662, + "num_tokens": 3090655590.0, + "step": 1323 + }, + { + "epoch": 2.0580712063601796, + "grad_norm": 0.36114495013871306, + "learning_rate": 5.2736833044775595e-05, + "loss": 0.6008, + "num_tokens": 3093005486.0, + "step": 1324 + }, + { + "epoch": 2.0584168683027997, + "grad_norm": 0.4646665458202692, + "learning_rate": 5.269106045324778e-05, + "loss": 0.6004, + "num_tokens": 3095342757.0, + "step": 1325 + }, + { + "epoch": 2.0587625302454198, + "grad_norm": 0.36559405257066435, + "learning_rate": 5.264526937603358e-05, + "loss": 0.5876, + "num_tokens": 3097548940.0, + "step": 1326 + }, + { + "epoch": 2.0591081921880403, + "grad_norm": 0.5504694490825284, + "learning_rate": 5.259945987983397e-05, + "loss": 0.586, + "num_tokens": 3099855164.0, + "step": 1327 + }, + { + "epoch": 2.0594538541306604, + "grad_norm": 0.4343717465517934, + "learning_rate": 5.255363203137676e-05, + "loss": 0.5903, + "num_tokens": 3102151487.0, + "step": 1328 + }, + { + "epoch": 2.0597995160732805, + "grad_norm": 0.504407525936016, + "learning_rate": 5.25077858974165e-05, + "loss": 0.6026, + "num_tokens": 3104563621.0, + "step": 1329 + }, + { + "epoch": 2.0601451780159006, + "grad_norm": 0.4659436131566551, + "learning_rate": 5.246192154473436e-05, + "loss": 0.6019, + "num_tokens": 3106973953.0, + "step": 1330 + }, + { + "epoch": 2.0604908399585207, + "grad_norm": 0.46577612027413723, + "learning_rate": 5.241603904013806e-05, + "loss": 0.5681, + "num_tokens": 3109334071.0, + "step": 1331 + }, + { + "epoch": 2.0608365019011408, + "grad_norm": 0.4534767046573194, + "learning_rate": 5.237013845046175e-05, + "loss": 0.6072, + "num_tokens": 3111623711.0, + "step": 1332 + }, + { + "epoch": 2.061182163843761, + "grad_norm": 0.46583401123679735, + "learning_rate": 5.2324219842565955e-05, + "loss": 0.5823, + "num_tokens": 3113946994.0, + "step": 1333 + }, + { + "epoch": 2.061527825786381, + "grad_norm": 0.4975240998433116, + "learning_rate": 5.227828328333739e-05, + "loss": 0.619, + "num_tokens": 3116247769.0, + "step": 1334 + }, + { + "epoch": 2.061873487729001, + "grad_norm": 0.36868129528835913, + "learning_rate": 5.223232883968896e-05, + "loss": 0.6155, + "num_tokens": 3118478766.0, + "step": 1335 + }, + { + "epoch": 2.062219149671621, + "grad_norm": 0.4457842705164704, + "learning_rate": 5.218635657855961e-05, + "loss": 0.5931, + "num_tokens": 3120772785.0, + "step": 1336 + }, + { + "epoch": 2.062564811614241, + "grad_norm": 0.43226589878053656, + "learning_rate": 5.214036656691425e-05, + "loss": 0.6273, + "num_tokens": 3123260271.0, + "step": 1337 + }, + { + "epoch": 2.0629104735568613, + "grad_norm": 0.3710556079437764, + "learning_rate": 5.209435887174363e-05, + "loss": 0.5939, + "num_tokens": 3125735346.0, + "step": 1338 + }, + { + "epoch": 2.0632561354994814, + "grad_norm": 0.5689559844563727, + "learning_rate": 5.204833356006426e-05, + "loss": 0.613, + "num_tokens": 3127993693.0, + "step": 1339 + }, + { + "epoch": 2.0636017974421015, + "grad_norm": 0.50140646818568, + "learning_rate": 5.200229069891831e-05, + "loss": 0.6027, + "num_tokens": 3130303716.0, + "step": 1340 + }, + { + "epoch": 2.0639474593847216, + "grad_norm": 0.5337983095997584, + "learning_rate": 5.195623035537353e-05, + "loss": 0.5967, + "num_tokens": 3132513082.0, + "step": 1341 + }, + { + "epoch": 2.0642931213273417, + "grad_norm": 0.5868167137154597, + "learning_rate": 5.191015259652313e-05, + "loss": 0.5918, + "num_tokens": 3134785282.0, + "step": 1342 + }, + { + "epoch": 2.064638783269962, + "grad_norm": 0.5340252361151071, + "learning_rate": 5.186405748948566e-05, + "loss": 0.6076, + "num_tokens": 3137241546.0, + "step": 1343 + }, + { + "epoch": 2.0649844452125823, + "grad_norm": 0.638440713322839, + "learning_rate": 5.1817945101404976e-05, + "loss": 0.6011, + "num_tokens": 3139627948.0, + "step": 1344 + }, + { + "epoch": 2.0653301071552024, + "grad_norm": 0.5318842486596531, + "learning_rate": 5.177181549945009e-05, + "loss": 0.603, + "num_tokens": 3142033289.0, + "step": 1345 + }, + { + "epoch": 2.0656757690978225, + "grad_norm": 0.7233091124619869, + "learning_rate": 5.172566875081508e-05, + "loss": 0.6052, + "num_tokens": 3144284285.0, + "step": 1346 + }, + { + "epoch": 2.0660214310404426, + "grad_norm": 0.6217843162035248, + "learning_rate": 5.167950492271903e-05, + "loss": 0.5999, + "num_tokens": 3146659120.0, + "step": 1347 + }, + { + "epoch": 2.0663670929830626, + "grad_norm": 0.5013759738254941, + "learning_rate": 5.1633324082405855e-05, + "loss": 0.5952, + "num_tokens": 3149054559.0, + "step": 1348 + }, + { + "epoch": 2.0667127549256827, + "grad_norm": 0.5499401018826537, + "learning_rate": 5.1587126297144305e-05, + "loss": 0.5903, + "num_tokens": 3151378550.0, + "step": 1349 + }, + { + "epoch": 2.067058416868303, + "grad_norm": 0.4752547351654341, + "learning_rate": 5.1540911634227774e-05, + "loss": 0.5833, + "num_tokens": 3153763861.0, + "step": 1350 + }, + { + "epoch": 2.067404078810923, + "grad_norm": 0.4971518106908343, + "learning_rate": 5.149468016097426e-05, + "loss": 0.6177, + "num_tokens": 3156218779.0, + "step": 1351 + }, + { + "epoch": 2.067749740753543, + "grad_norm": 0.5237165597670185, + "learning_rate": 5.144843194472622e-05, + "loss": 0.6009, + "num_tokens": 3158730293.0, + "step": 1352 + }, + { + "epoch": 2.068095402696163, + "grad_norm": 0.35430393340268224, + "learning_rate": 5.140216705285054e-05, + "loss": 0.5819, + "num_tokens": 3161095133.0, + "step": 1353 + }, + { + "epoch": 2.068441064638783, + "grad_norm": 0.4699734048766594, + "learning_rate": 5.135588555273838e-05, + "loss": 0.6057, + "num_tokens": 3163412660.0, + "step": 1354 + }, + { + "epoch": 2.0687867265814033, + "grad_norm": 0.403406170186645, + "learning_rate": 5.130958751180508e-05, + "loss": 0.5547, + "num_tokens": 3165788784.0, + "step": 1355 + }, + { + "epoch": 2.0691323885240234, + "grad_norm": 0.5128392197639187, + "learning_rate": 5.126327299749008e-05, + "loss": 0.5981, + "num_tokens": 3168152449.0, + "step": 1356 + }, + { + "epoch": 2.0694780504666435, + "grad_norm": 0.49078424504025125, + "learning_rate": 5.1216942077256814e-05, + "loss": 0.5823, + "num_tokens": 3170440932.0, + "step": 1357 + }, + { + "epoch": 2.0698237124092635, + "grad_norm": 0.4623384473764788, + "learning_rate": 5.117059481859263e-05, + "loss": 0.5764, + "num_tokens": 3172676957.0, + "step": 1358 + }, + { + "epoch": 2.0701693743518836, + "grad_norm": 0.4749201080992479, + "learning_rate": 5.1124231289008664e-05, + "loss": 0.6076, + "num_tokens": 3175013843.0, + "step": 1359 + }, + { + "epoch": 2.070515036294504, + "grad_norm": 0.43539691480416165, + "learning_rate": 5.107785155603973e-05, + "loss": 0.5762, + "num_tokens": 3177306106.0, + "step": 1360 + }, + { + "epoch": 2.0708606982371243, + "grad_norm": 0.4321376119569733, + "learning_rate": 5.1031455687244286e-05, + "loss": 0.6027, + "num_tokens": 3179598307.0, + "step": 1361 + }, + { + "epoch": 2.0712063601797444, + "grad_norm": 0.3908657109199825, + "learning_rate": 5.0985043750204266e-05, + "loss": 0.5823, + "num_tokens": 3181990757.0, + "step": 1362 + }, + { + "epoch": 2.0715520221223644, + "grad_norm": 0.3959646593624416, + "learning_rate": 5.0938615812525037e-05, + "loss": 0.5983, + "num_tokens": 3184393405.0, + "step": 1363 + }, + { + "epoch": 2.0718976840649845, + "grad_norm": 0.33415542714869745, + "learning_rate": 5.089217194183523e-05, + "loss": 0.5887, + "num_tokens": 3186807958.0, + "step": 1364 + }, + { + "epoch": 2.0722433460076046, + "grad_norm": 0.4126534313619065, + "learning_rate": 5.08457122057867e-05, + "loss": 0.6033, + "num_tokens": 3189021466.0, + "step": 1365 + }, + { + "epoch": 2.0725890079502247, + "grad_norm": 0.3377253409568065, + "learning_rate": 5.079923667205445e-05, + "loss": 0.5959, + "num_tokens": 3191253057.0, + "step": 1366 + }, + { + "epoch": 2.072934669892845, + "grad_norm": 0.5145287857230376, + "learning_rate": 5.075274540833645e-05, + "loss": 0.6107, + "num_tokens": 3193667262.0, + "step": 1367 + }, + { + "epoch": 2.073280331835465, + "grad_norm": 0.46953729460069205, + "learning_rate": 5.07062384823536e-05, + "loss": 0.6133, + "num_tokens": 3196054273.0, + "step": 1368 + }, + { + "epoch": 2.073625993778085, + "grad_norm": 0.45222944451378894, + "learning_rate": 5.065971596184962e-05, + "loss": 0.5877, + "num_tokens": 3198364427.0, + "step": 1369 + }, + { + "epoch": 2.073971655720705, + "grad_norm": 0.5030619973621816, + "learning_rate": 5.0613177914590915e-05, + "loss": 0.5906, + "num_tokens": 3200742851.0, + "step": 1370 + }, + { + "epoch": 2.074317317663325, + "grad_norm": 0.4688222159601526, + "learning_rate": 5.056662440836654e-05, + "loss": 0.6092, + "num_tokens": 3203218957.0, + "step": 1371 + }, + { + "epoch": 2.0746629796059453, + "grad_norm": 0.49551805449835007, + "learning_rate": 5.052005551098808e-05, + "loss": 0.5844, + "num_tokens": 3205476093.0, + "step": 1372 + }, + { + "epoch": 2.0750086415485653, + "grad_norm": 0.5537450404177507, + "learning_rate": 5.0473471290289485e-05, + "loss": 0.5916, + "num_tokens": 3207792093.0, + "step": 1373 + }, + { + "epoch": 2.0753543034911854, + "grad_norm": 0.4241531512083838, + "learning_rate": 5.0426871814127077e-05, + "loss": 0.577, + "num_tokens": 3210086339.0, + "step": 1374 + }, + { + "epoch": 2.0756999654338055, + "grad_norm": 0.5404564702691456, + "learning_rate": 5.038025715037937e-05, + "loss": 0.59, + "num_tokens": 3212357764.0, + "step": 1375 + }, + { + "epoch": 2.076045627376426, + "grad_norm": 0.43073252489205993, + "learning_rate": 5.0333627366947015e-05, + "loss": 0.5672, + "num_tokens": 3214718419.0, + "step": 1376 + }, + { + "epoch": 2.076391289319046, + "grad_norm": 0.48967298286161537, + "learning_rate": 5.028698253175268e-05, + "loss": 0.6083, + "num_tokens": 3217184509.0, + "step": 1377 + }, + { + "epoch": 2.0767369512616662, + "grad_norm": 0.43217246640406853, + "learning_rate": 5.024032271274096e-05, + "loss": 0.6098, + "num_tokens": 3219412170.0, + "step": 1378 + }, + { + "epoch": 2.0770826132042863, + "grad_norm": 0.4047987816392751, + "learning_rate": 5.019364797787829e-05, + "loss": 0.6011, + "num_tokens": 3221700253.0, + "step": 1379 + }, + { + "epoch": 2.0774282751469064, + "grad_norm": 0.3769070366388324, + "learning_rate": 5.014695839515281e-05, + "loss": 0.6011, + "num_tokens": 3224131887.0, + "step": 1380 + }, + { + "epoch": 2.0777739370895265, + "grad_norm": 0.45695592521803, + "learning_rate": 5.0100254032574285e-05, + "loss": 0.5986, + "num_tokens": 3226482394.0, + "step": 1381 + }, + { + "epoch": 2.0781195990321466, + "grad_norm": 0.4135402265139777, + "learning_rate": 5.005353495817404e-05, + "loss": 0.6158, + "num_tokens": 3228795415.0, + "step": 1382 + }, + { + "epoch": 2.0784652609747667, + "grad_norm": 0.4158986146867342, + "learning_rate": 5.000680124000481e-05, + "loss": 0.6316, + "num_tokens": 3231197437.0, + "step": 1383 + }, + { + "epoch": 2.078810922917387, + "grad_norm": 0.3989372903988358, + "learning_rate": 4.996005294614064e-05, + "loss": 0.5867, + "num_tokens": 3233511135.0, + "step": 1384 + }, + { + "epoch": 2.079156584860007, + "grad_norm": 0.446182580372595, + "learning_rate": 4.991329014467685e-05, + "loss": 0.5949, + "num_tokens": 3235830712.0, + "step": 1385 + }, + { + "epoch": 2.079502246802627, + "grad_norm": 0.39912662186878356, + "learning_rate": 4.986651290372985e-05, + "loss": 0.6008, + "num_tokens": 3238161243.0, + "step": 1386 + }, + { + "epoch": 2.079847908745247, + "grad_norm": 0.534903402940179, + "learning_rate": 4.981972129143711e-05, + "loss": 0.5866, + "num_tokens": 3240351048.0, + "step": 1387 + }, + { + "epoch": 2.080193570687867, + "grad_norm": 0.3752422616304984, + "learning_rate": 4.9772915375957044e-05, + "loss": 0.5874, + "num_tokens": 3242642472.0, + "step": 1388 + }, + { + "epoch": 2.0805392326304872, + "grad_norm": 0.5699021025370616, + "learning_rate": 4.972609522546887e-05, + "loss": 0.6095, + "num_tokens": 3244951493.0, + "step": 1389 + }, + { + "epoch": 2.0808848945731073, + "grad_norm": 0.558996067689069, + "learning_rate": 4.967926090817253e-05, + "loss": 0.6075, + "num_tokens": 3247249640.0, + "step": 1390 + }, + { + "epoch": 2.0812305565157274, + "grad_norm": 0.4722460415367258, + "learning_rate": 4.963241249228867e-05, + "loss": 0.6016, + "num_tokens": 3249684782.0, + "step": 1391 + }, + { + "epoch": 2.081576218458348, + "grad_norm": 0.5845853284207462, + "learning_rate": 4.9585550046058404e-05, + "loss": 0.5765, + "num_tokens": 3251989661.0, + "step": 1392 + }, + { + "epoch": 2.081921880400968, + "grad_norm": 0.42409143982270275, + "learning_rate": 4.9538673637743324e-05, + "loss": 0.5693, + "num_tokens": 3254431747.0, + "step": 1393 + }, + { + "epoch": 2.082267542343588, + "grad_norm": 0.6343660772321286, + "learning_rate": 4.9491783335625326e-05, + "loss": 0.597, + "num_tokens": 3256803363.0, + "step": 1394 + }, + { + "epoch": 2.082613204286208, + "grad_norm": 0.488192163273591, + "learning_rate": 4.944487920800657e-05, + "loss": 0.5916, + "num_tokens": 3259174624.0, + "step": 1395 + }, + { + "epoch": 2.0829588662288283, + "grad_norm": 0.5034102009509795, + "learning_rate": 4.939796132320934e-05, + "loss": 0.5693, + "num_tokens": 3261486423.0, + "step": 1396 + }, + { + "epoch": 2.0833045281714484, + "grad_norm": 0.43475066943592616, + "learning_rate": 4.935102974957598e-05, + "loss": 0.5814, + "num_tokens": 3263831155.0, + "step": 1397 + }, + { + "epoch": 2.0836501901140685, + "grad_norm": 0.5919902106353729, + "learning_rate": 4.930408455546874e-05, + "loss": 0.5553, + "num_tokens": 3266193553.0, + "step": 1398 + }, + { + "epoch": 2.0839958520566886, + "grad_norm": 0.4188249775572424, + "learning_rate": 4.9257125809269753e-05, + "loss": 0.5639, + "num_tokens": 3268498134.0, + "step": 1399 + }, + { + "epoch": 2.0843415139993087, + "grad_norm": 0.5166414747770294, + "learning_rate": 4.9210153579380846e-05, + "loss": 0.5944, + "num_tokens": 3270823792.0, + "step": 1400 + }, + { + "epoch": 2.0846871759419288, + "grad_norm": 0.45806391476180136, + "learning_rate": 4.916316793422353e-05, + "loss": 0.5729, + "num_tokens": 3273050648.0, + "step": 1401 + }, + { + "epoch": 2.085032837884549, + "grad_norm": 0.3724607613035754, + "learning_rate": 4.9116168942238835e-05, + "loss": 0.6057, + "num_tokens": 3275356153.0, + "step": 1402 + }, + { + "epoch": 2.085378499827169, + "grad_norm": 0.6505604917192587, + "learning_rate": 4.9069156671887216e-05, + "loss": 0.6041, + "num_tokens": 3277732548.0, + "step": 1403 + }, + { + "epoch": 2.085724161769789, + "grad_norm": 0.48129075010108735, + "learning_rate": 4.902213119164851e-05, + "loss": 0.6157, + "num_tokens": 3280207621.0, + "step": 1404 + }, + { + "epoch": 2.086069823712409, + "grad_norm": 0.8202288157223512, + "learning_rate": 4.897509257002176e-05, + "loss": 0.5978, + "num_tokens": 3282480595.0, + "step": 1405 + }, + { + "epoch": 2.086415485655029, + "grad_norm": 0.8040169194748262, + "learning_rate": 4.8928040875525176e-05, + "loss": 0.6092, + "num_tokens": 3284840094.0, + "step": 1406 + }, + { + "epoch": 2.0867611475976493, + "grad_norm": 0.5836619310287828, + "learning_rate": 4.8880976176695995e-05, + "loss": 0.5913, + "num_tokens": 3287111698.0, + "step": 1407 + }, + { + "epoch": 2.08710680954027, + "grad_norm": 0.7486404750312236, + "learning_rate": 4.8833898542090395e-05, + "loss": 0.5991, + "num_tokens": 3289332013.0, + "step": 1408 + }, + { + "epoch": 2.08745247148289, + "grad_norm": 0.6290656818287904, + "learning_rate": 4.878680804028341e-05, + "loss": 0.6063, + "num_tokens": 3291628252.0, + "step": 1409 + }, + { + "epoch": 2.08779813342551, + "grad_norm": 0.6409232716485557, + "learning_rate": 4.873970473986882e-05, + "loss": 0.6024, + "num_tokens": 3294060415.0, + "step": 1410 + }, + { + "epoch": 2.08814379536813, + "grad_norm": 0.6547673585890731, + "learning_rate": 4.869258870945903e-05, + "loss": 0.609, + "num_tokens": 3296299276.0, + "step": 1411 + }, + { + "epoch": 2.08848945731075, + "grad_norm": 0.5486147956355585, + "learning_rate": 4.864546001768498e-05, + "loss": 0.5896, + "num_tokens": 3298647340.0, + "step": 1412 + }, + { + "epoch": 2.0888351192533703, + "grad_norm": 0.5052878907147205, + "learning_rate": 4.85983187331961e-05, + "loss": 0.5976, + "num_tokens": 3300966145.0, + "step": 1413 + }, + { + "epoch": 2.0891807811959904, + "grad_norm": 0.5805886398260992, + "learning_rate": 4.855116492466012e-05, + "loss": 0.6025, + "num_tokens": 3303379605.0, + "step": 1414 + }, + { + "epoch": 2.0895264431386105, + "grad_norm": 0.5012579039196586, + "learning_rate": 4.850399866076301e-05, + "loss": 0.5893, + "num_tokens": 3305761447.0, + "step": 1415 + }, + { + "epoch": 2.0898721050812306, + "grad_norm": 0.4913502668881687, + "learning_rate": 4.845682001020892e-05, + "loss": 0.5746, + "num_tokens": 3308107096.0, + "step": 1416 + }, + { + "epoch": 2.0902177670238506, + "grad_norm": 0.48518963539608134, + "learning_rate": 4.8409629041719995e-05, + "loss": 0.5926, + "num_tokens": 3310476165.0, + "step": 1417 + }, + { + "epoch": 2.0905634289664707, + "grad_norm": 0.5074854506829566, + "learning_rate": 4.8362425824036373e-05, + "loss": 0.5672, + "num_tokens": 3312834036.0, + "step": 1418 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 0.3805762297279639, + "learning_rate": 4.831521042591601e-05, + "loss": 0.6126, + "num_tokens": 3315239045.0, + "step": 1419 + }, + { + "epoch": 2.091254752851711, + "grad_norm": 0.44891464557705857, + "learning_rate": 4.82679829161346e-05, + "loss": 0.5839, + "num_tokens": 3317598412.0, + "step": 1420 + }, + { + "epoch": 2.091600414794331, + "grad_norm": 0.3691877616880071, + "learning_rate": 4.822074336348547e-05, + "loss": 0.5823, + "num_tokens": 3319951359.0, + "step": 1421 + }, + { + "epoch": 2.091946076736951, + "grad_norm": 0.5335786191913499, + "learning_rate": 4.8173491836779516e-05, + "loss": 0.5784, + "num_tokens": 3322246526.0, + "step": 1422 + }, + { + "epoch": 2.092291738679571, + "grad_norm": 0.45654871941771114, + "learning_rate": 4.8126228404845066e-05, + "loss": 0.5885, + "num_tokens": 3324533316.0, + "step": 1423 + }, + { + "epoch": 2.0926374006221913, + "grad_norm": 0.5336701829321912, + "learning_rate": 4.807895313652778e-05, + "loss": 0.5992, + "num_tokens": 3326841076.0, + "step": 1424 + }, + { + "epoch": 2.092983062564812, + "grad_norm": 0.48665791463487357, + "learning_rate": 4.803166610069057e-05, + "loss": 0.5765, + "num_tokens": 3329107465.0, + "step": 1425 + }, + { + "epoch": 2.093328724507432, + "grad_norm": 0.5608239329390864, + "learning_rate": 4.798436736621348e-05, + "loss": 0.6104, + "num_tokens": 3331391244.0, + "step": 1426 + }, + { + "epoch": 2.093674386450052, + "grad_norm": 0.5366131652356881, + "learning_rate": 4.793705700199362e-05, + "loss": 0.5859, + "num_tokens": 3333692925.0, + "step": 1427 + }, + { + "epoch": 2.094020048392672, + "grad_norm": 0.6052726358649559, + "learning_rate": 4.788973507694499e-05, + "loss": 0.5674, + "num_tokens": 3335996348.0, + "step": 1428 + }, + { + "epoch": 2.094365710335292, + "grad_norm": 0.5249649617991476, + "learning_rate": 4.784240165999847e-05, + "loss": 0.5682, + "num_tokens": 3338217982.0, + "step": 1429 + }, + { + "epoch": 2.0947113722779123, + "grad_norm": 0.6264789215811467, + "learning_rate": 4.779505682010168e-05, + "loss": 0.5833, + "num_tokens": 3340472885.0, + "step": 1430 + }, + { + "epoch": 2.0950570342205324, + "grad_norm": 0.47708969906447374, + "learning_rate": 4.774770062621886e-05, + "loss": 0.6063, + "num_tokens": 3342753091.0, + "step": 1431 + }, + { + "epoch": 2.0954026961631524, + "grad_norm": 0.6774414755916596, + "learning_rate": 4.770033314733081e-05, + "loss": 0.5766, + "num_tokens": 3345017940.0, + "step": 1432 + }, + { + "epoch": 2.0957483581057725, + "grad_norm": 0.5183321834799592, + "learning_rate": 4.765295445243472e-05, + "loss": 0.6124, + "num_tokens": 3347387473.0, + "step": 1433 + }, + { + "epoch": 2.0960940200483926, + "grad_norm": 0.630312611680502, + "learning_rate": 4.76055646105442e-05, + "loss": 0.6003, + "num_tokens": 3349724179.0, + "step": 1434 + }, + { + "epoch": 2.0964396819910127, + "grad_norm": 0.5415849422988853, + "learning_rate": 4.755816369068902e-05, + "loss": 0.5965, + "num_tokens": 3352053792.0, + "step": 1435 + }, + { + "epoch": 2.096785343933633, + "grad_norm": 0.5421067514682311, + "learning_rate": 4.751075176191513e-05, + "loss": 0.5963, + "num_tokens": 3354431365.0, + "step": 1436 + }, + { + "epoch": 2.097131005876253, + "grad_norm": 0.5416433545303763, + "learning_rate": 4.746332889328448e-05, + "loss": 0.5859, + "num_tokens": 3356695537.0, + "step": 1437 + }, + { + "epoch": 2.097476667818873, + "grad_norm": 0.5588170582375704, + "learning_rate": 4.7415895153875015e-05, + "loss": 0.5918, + "num_tokens": 3359086245.0, + "step": 1438 + }, + { + "epoch": 2.097822329761493, + "grad_norm": 0.5201846710638441, + "learning_rate": 4.736845061278044e-05, + "loss": 0.6001, + "num_tokens": 3361405739.0, + "step": 1439 + }, + { + "epoch": 2.098167991704113, + "grad_norm": 0.5859383484532398, + "learning_rate": 4.7320995339110273e-05, + "loss": 0.594, + "num_tokens": 3363689862.0, + "step": 1440 + }, + { + "epoch": 2.0985136536467337, + "grad_norm": 0.42113565050373614, + "learning_rate": 4.7273529401989585e-05, + "loss": 0.5798, + "num_tokens": 3366019511.0, + "step": 1441 + }, + { + "epoch": 2.098859315589354, + "grad_norm": 0.6237849130858489, + "learning_rate": 4.722605287055904e-05, + "loss": 0.584, + "num_tokens": 3368379525.0, + "step": 1442 + }, + { + "epoch": 2.099204977531974, + "grad_norm": 0.4586223582503875, + "learning_rate": 4.7178565813974715e-05, + "loss": 0.5975, + "num_tokens": 3370713258.0, + "step": 1443 + }, + { + "epoch": 2.099550639474594, + "grad_norm": 0.5422602907137821, + "learning_rate": 4.7131068301408e-05, + "loss": 0.5712, + "num_tokens": 3373040733.0, + "step": 1444 + }, + { + "epoch": 2.099896301417214, + "grad_norm": 0.46879892452079386, + "learning_rate": 4.708356040204556e-05, + "loss": 0.5768, + "num_tokens": 3375342076.0, + "step": 1445 + }, + { + "epoch": 2.100241963359834, + "grad_norm": 0.48930034636831554, + "learning_rate": 4.703604218508912e-05, + "loss": 0.5872, + "num_tokens": 3377632930.0, + "step": 1446 + }, + { + "epoch": 2.1005876253024542, + "grad_norm": 0.46962094420586087, + "learning_rate": 4.698851371975552e-05, + "loss": 0.5981, + "num_tokens": 3379981420.0, + "step": 1447 + }, + { + "epoch": 2.1009332872450743, + "grad_norm": 0.4846185853901907, + "learning_rate": 4.6940975075276463e-05, + "loss": 0.5945, + "num_tokens": 3382410671.0, + "step": 1448 + }, + { + "epoch": 2.1012789491876944, + "grad_norm": 0.41576146001612885, + "learning_rate": 4.689342632089851e-05, + "loss": 0.5858, + "num_tokens": 3384812643.0, + "step": 1449 + }, + { + "epoch": 2.1016246111303145, + "grad_norm": 0.3412626515858297, + "learning_rate": 4.6845867525882914e-05, + "loss": 0.5738, + "num_tokens": 3387285255.0, + "step": 1450 + }, + { + "epoch": 2.1019702730729346, + "grad_norm": 0.46776387820593257, + "learning_rate": 4.6798298759505614e-05, + "loss": 0.5681, + "num_tokens": 3389686868.0, + "step": 1451 + }, + { + "epoch": 2.1023159350155547, + "grad_norm": 0.3848216810159128, + "learning_rate": 4.6750720091057005e-05, + "loss": 0.5961, + "num_tokens": 3392083494.0, + "step": 1452 + }, + { + "epoch": 2.102661596958175, + "grad_norm": 0.5155178693498613, + "learning_rate": 4.670313158984197e-05, + "loss": 0.5741, + "num_tokens": 3394419830.0, + "step": 1453 + }, + { + "epoch": 2.103007258900795, + "grad_norm": 0.5443959647458134, + "learning_rate": 4.6655533325179666e-05, + "loss": 0.5752, + "num_tokens": 3396718212.0, + "step": 1454 + }, + { + "epoch": 2.103352920843415, + "grad_norm": 0.5041536646693336, + "learning_rate": 4.660792536640348e-05, + "loss": 0.5871, + "num_tokens": 3399018515.0, + "step": 1455 + }, + { + "epoch": 2.103698582786035, + "grad_norm": 0.5690919491663442, + "learning_rate": 4.656030778286096e-05, + "loss": 0.5843, + "num_tokens": 3401370094.0, + "step": 1456 + }, + { + "epoch": 2.1040442447286556, + "grad_norm": 0.518871558275388, + "learning_rate": 4.651268064391362e-05, + "loss": 0.5759, + "num_tokens": 3403662743.0, + "step": 1457 + }, + { + "epoch": 2.1043899066712757, + "grad_norm": 0.6898761905106405, + "learning_rate": 4.6465044018936916e-05, + "loss": 0.5806, + "num_tokens": 3405979837.0, + "step": 1458 + }, + { + "epoch": 2.1047355686138958, + "grad_norm": 0.4738716491350161, + "learning_rate": 4.641739797732012e-05, + "loss": 0.596, + "num_tokens": 3408291704.0, + "step": 1459 + }, + { + "epoch": 2.105081230556516, + "grad_norm": 0.5746644977651222, + "learning_rate": 4.6369742588466226e-05, + "loss": 0.5635, + "num_tokens": 3410550762.0, + "step": 1460 + }, + { + "epoch": 2.105426892499136, + "grad_norm": 0.5932643602677807, + "learning_rate": 4.632207792179187e-05, + "loss": 0.5917, + "num_tokens": 3412933686.0, + "step": 1461 + }, + { + "epoch": 2.105772554441756, + "grad_norm": 0.37588434761962064, + "learning_rate": 4.627440404672712e-05, + "loss": 0.6056, + "num_tokens": 3415267705.0, + "step": 1462 + }, + { + "epoch": 2.106118216384376, + "grad_norm": 0.4363904470182498, + "learning_rate": 4.622672103271553e-05, + "loss": 0.5937, + "num_tokens": 3417596121.0, + "step": 1463 + }, + { + "epoch": 2.106463878326996, + "grad_norm": 0.3447751095469651, + "learning_rate": 4.617902894921395e-05, + "loss": 0.6219, + "num_tokens": 3420052184.0, + "step": 1464 + }, + { + "epoch": 2.1068095402696163, + "grad_norm": 0.49389399695766245, + "learning_rate": 4.613132786569246e-05, + "loss": 0.5934, + "num_tokens": 3422400090.0, + "step": 1465 + }, + { + "epoch": 2.1071552022122364, + "grad_norm": 0.49247303494521105, + "learning_rate": 4.608361785163418e-05, + "loss": 0.6083, + "num_tokens": 3424791575.0, + "step": 1466 + }, + { + "epoch": 2.1075008641548565, + "grad_norm": 0.3629743537621346, + "learning_rate": 4.603589897653532e-05, + "loss": 0.6106, + "num_tokens": 3427277003.0, + "step": 1467 + }, + { + "epoch": 2.1078465260974766, + "grad_norm": 0.6773288249904736, + "learning_rate": 4.5988171309904936e-05, + "loss": 0.5976, + "num_tokens": 3429692183.0, + "step": 1468 + }, + { + "epoch": 2.1081921880400967, + "grad_norm": 0.49861825156068956, + "learning_rate": 4.594043492126494e-05, + "loss": 0.5674, + "num_tokens": 3432019803.0, + "step": 1469 + }, + { + "epoch": 2.1085378499827168, + "grad_norm": 0.6280652330983727, + "learning_rate": 4.58926898801499e-05, + "loss": 0.5832, + "num_tokens": 3434228306.0, + "step": 1470 + }, + { + "epoch": 2.108883511925337, + "grad_norm": 0.5762792896475177, + "learning_rate": 4.5844936256107036e-05, + "loss": 0.6049, + "num_tokens": 3436700598.0, + "step": 1471 + }, + { + "epoch": 2.109229173867957, + "grad_norm": 0.39382035499885637, + "learning_rate": 4.579717411869603e-05, + "loss": 0.5822, + "num_tokens": 3439032685.0, + "step": 1472 + }, + { + "epoch": 2.1095748358105775, + "grad_norm": 0.4473620336132255, + "learning_rate": 4.5749403537489e-05, + "loss": 0.6021, + "num_tokens": 3441374442.0, + "step": 1473 + }, + { + "epoch": 2.1099204977531976, + "grad_norm": 0.4585866444065292, + "learning_rate": 4.570162458207034e-05, + "loss": 0.5859, + "num_tokens": 3443709704.0, + "step": 1474 + }, + { + "epoch": 2.1102661596958177, + "grad_norm": 0.364684118166688, + "learning_rate": 4.565383732203662e-05, + "loss": 0.5883, + "num_tokens": 3446103480.0, + "step": 1475 + }, + { + "epoch": 2.1106118216384377, + "grad_norm": 0.5153032271117641, + "learning_rate": 4.560604182699656e-05, + "loss": 0.5614, + "num_tokens": 3448483544.0, + "step": 1476 + }, + { + "epoch": 2.110957483581058, + "grad_norm": 0.41422749477160964, + "learning_rate": 4.555823816657085e-05, + "loss": 0.5664, + "num_tokens": 3450755965.0, + "step": 1477 + }, + { + "epoch": 2.111303145523678, + "grad_norm": 0.5455588438015295, + "learning_rate": 4.551042641039208e-05, + "loss": 0.5919, + "num_tokens": 3453034301.0, + "step": 1478 + }, + { + "epoch": 2.111648807466298, + "grad_norm": 0.5401825233451718, + "learning_rate": 4.5462606628104594e-05, + "loss": 0.6011, + "num_tokens": 3455373690.0, + "step": 1479 + }, + { + "epoch": 2.111994469408918, + "grad_norm": 0.5583967317519248, + "learning_rate": 4.5414778889364494e-05, + "loss": 0.5847, + "num_tokens": 3457697808.0, + "step": 1480 + }, + { + "epoch": 2.112340131351538, + "grad_norm": 0.5334758538056495, + "learning_rate": 4.536694326383941e-05, + "loss": 0.5998, + "num_tokens": 3460091172.0, + "step": 1481 + }, + { + "epoch": 2.1126857932941583, + "grad_norm": 0.46818213520330704, + "learning_rate": 4.531909982120852e-05, + "loss": 0.5714, + "num_tokens": 3462415354.0, + "step": 1482 + }, + { + "epoch": 2.1130314552367784, + "grad_norm": 0.5792435318300084, + "learning_rate": 4.527124863116234e-05, + "loss": 0.5971, + "num_tokens": 3464709383.0, + "step": 1483 + }, + { + "epoch": 2.1133771171793985, + "grad_norm": 0.4709965304517349, + "learning_rate": 4.522338976340266e-05, + "loss": 0.5839, + "num_tokens": 3467093607.0, + "step": 1484 + }, + { + "epoch": 2.1137227791220186, + "grad_norm": 0.6020637019196002, + "learning_rate": 4.5175523287642513e-05, + "loss": 0.5815, + "num_tokens": 3469393160.0, + "step": 1485 + }, + { + "epoch": 2.1140684410646386, + "grad_norm": 0.5945573427376513, + "learning_rate": 4.512764927360597e-05, + "loss": 0.579, + "num_tokens": 3471780093.0, + "step": 1486 + }, + { + "epoch": 2.1144141030072587, + "grad_norm": 0.5332782847648369, + "learning_rate": 4.50797677910281e-05, + "loss": 0.596, + "num_tokens": 3474181577.0, + "step": 1487 + }, + { + "epoch": 2.114759764949879, + "grad_norm": 0.5511499382254517, + "learning_rate": 4.503187890965486e-05, + "loss": 0.5822, + "num_tokens": 3476605643.0, + "step": 1488 + }, + { + "epoch": 2.115105426892499, + "grad_norm": 0.6185706934473375, + "learning_rate": 4.498398269924291e-05, + "loss": 0.5918, + "num_tokens": 3478966828.0, + "step": 1489 + }, + { + "epoch": 2.1154510888351195, + "grad_norm": 0.44621909558067707, + "learning_rate": 4.493607922955971e-05, + "loss": 0.6, + "num_tokens": 3481151839.0, + "step": 1490 + }, + { + "epoch": 2.1157967507777395, + "grad_norm": 0.5135248871613949, + "learning_rate": 4.4888168570383226e-05, + "loss": 0.5915, + "num_tokens": 3483550123.0, + "step": 1491 + }, + { + "epoch": 2.1161424127203596, + "grad_norm": 0.43408892246476505, + "learning_rate": 4.484025079150186e-05, + "loss": 0.581, + "num_tokens": 3485866600.0, + "step": 1492 + }, + { + "epoch": 2.1164880746629797, + "grad_norm": 0.3633011465325275, + "learning_rate": 4.4792325962714436e-05, + "loss": 0.5935, + "num_tokens": 3488198570.0, + "step": 1493 + }, + { + "epoch": 2.1168337366056, + "grad_norm": 0.35940000396493293, + "learning_rate": 4.474439415383006e-05, + "loss": 0.5886, + "num_tokens": 3490537109.0, + "step": 1494 + }, + { + "epoch": 2.11717939854822, + "grad_norm": 0.3878789934424021, + "learning_rate": 4.469645543466797e-05, + "loss": 0.5964, + "num_tokens": 3492755477.0, + "step": 1495 + }, + { + "epoch": 2.11752506049084, + "grad_norm": 0.33345901092344943, + "learning_rate": 4.464850987505747e-05, + "loss": 0.5735, + "num_tokens": 3495098110.0, + "step": 1496 + }, + { + "epoch": 2.11787072243346, + "grad_norm": 0.34213964080475556, + "learning_rate": 4.4600557544837847e-05, + "loss": 0.5881, + "num_tokens": 3497386067.0, + "step": 1497 + }, + { + "epoch": 2.11821638437608, + "grad_norm": 0.357226310955312, + "learning_rate": 4.4552598513858235e-05, + "loss": 0.6108, + "num_tokens": 3499756171.0, + "step": 1498 + }, + { + "epoch": 2.1185620463187003, + "grad_norm": 0.27708489722957935, + "learning_rate": 4.450463285197755e-05, + "loss": 0.5772, + "num_tokens": 3502091701.0, + "step": 1499 + }, + { + "epoch": 2.1189077082613204, + "grad_norm": 0.41232959144279124, + "learning_rate": 4.4456660629064354e-05, + "loss": 0.5709, + "num_tokens": 3504410297.0, + "step": 1500 + }, + { + "epoch": 2.1192533702039404, + "grad_norm": 0.35667810338884975, + "learning_rate": 4.440868191499675e-05, + "loss": 0.5719, + "num_tokens": 3506620011.0, + "step": 1501 + }, + { + "epoch": 2.1195990321465605, + "grad_norm": 0.40654290165366097, + "learning_rate": 4.43606967796623e-05, + "loss": 0.5871, + "num_tokens": 3508884566.0, + "step": 1502 + }, + { + "epoch": 2.1199446940891806, + "grad_norm": 0.44387074653271946, + "learning_rate": 4.431270529295797e-05, + "loss": 0.5713, + "num_tokens": 3511263472.0, + "step": 1503 + }, + { + "epoch": 2.1202903560318007, + "grad_norm": 0.39723237218678403, + "learning_rate": 4.4264707524789924e-05, + "loss": 0.5752, + "num_tokens": 3513597448.0, + "step": 1504 + }, + { + "epoch": 2.120636017974421, + "grad_norm": 0.4459400326476763, + "learning_rate": 4.421670354507347e-05, + "loss": 0.6013, + "num_tokens": 3515905910.0, + "step": 1505 + }, + { + "epoch": 2.1209816799170413, + "grad_norm": 0.43501625717579756, + "learning_rate": 4.4168693423733e-05, + "loss": 0.5937, + "num_tokens": 3518346169.0, + "step": 1506 + }, + { + "epoch": 2.1213273418596614, + "grad_norm": 0.38917831788586815, + "learning_rate": 4.412067723070184e-05, + "loss": 0.5605, + "num_tokens": 3520633844.0, + "step": 1507 + }, + { + "epoch": 2.1216730038022815, + "grad_norm": 0.40692183654120423, + "learning_rate": 4.4072655035922145e-05, + "loss": 0.5819, + "num_tokens": 3522976219.0, + "step": 1508 + }, + { + "epoch": 2.1220186657449016, + "grad_norm": 0.376500991496363, + "learning_rate": 4.4024626909344834e-05, + "loss": 0.571, + "num_tokens": 3525411195.0, + "step": 1509 + }, + { + "epoch": 2.1223643276875217, + "grad_norm": 0.48864380071217056, + "learning_rate": 4.3976592920929436e-05, + "loss": 0.568, + "num_tokens": 3527751502.0, + "step": 1510 + }, + { + "epoch": 2.122709989630142, + "grad_norm": 0.3452412067287764, + "learning_rate": 4.392855314064408e-05, + "loss": 0.5531, + "num_tokens": 3530065862.0, + "step": 1511 + }, + { + "epoch": 2.123055651572762, + "grad_norm": 0.5304391072595286, + "learning_rate": 4.388050763846524e-05, + "loss": 0.5554, + "num_tokens": 3532296308.0, + "step": 1512 + }, + { + "epoch": 2.123401313515382, + "grad_norm": 0.5218528518368897, + "learning_rate": 4.3832456484377814e-05, + "loss": 0.6138, + "num_tokens": 3534765692.0, + "step": 1513 + }, + { + "epoch": 2.123746975458002, + "grad_norm": 0.444695742396165, + "learning_rate": 4.378439974837488e-05, + "loss": 0.5766, + "num_tokens": 3536962895.0, + "step": 1514 + }, + { + "epoch": 2.124092637400622, + "grad_norm": 0.5016819751222594, + "learning_rate": 4.373633750045765e-05, + "loss": 0.5882, + "num_tokens": 3539318128.0, + "step": 1515 + }, + { + "epoch": 2.1244382993432422, + "grad_norm": 0.4350757709078293, + "learning_rate": 4.36882698106354e-05, + "loss": 0.6014, + "num_tokens": 3541585063.0, + "step": 1516 + }, + { + "epoch": 2.1247839612858623, + "grad_norm": 0.3998049436175027, + "learning_rate": 4.3640196748925294e-05, + "loss": 0.5815, + "num_tokens": 3543977438.0, + "step": 1517 + }, + { + "epoch": 2.1251296232284824, + "grad_norm": 0.4314261366017844, + "learning_rate": 4.359211838535232e-05, + "loss": 0.5983, + "num_tokens": 3546286233.0, + "step": 1518 + }, + { + "epoch": 2.1254752851711025, + "grad_norm": 0.4476508542513684, + "learning_rate": 4.354403478994924e-05, + "loss": 0.5935, + "num_tokens": 3548541267.0, + "step": 1519 + }, + { + "epoch": 2.1258209471137226, + "grad_norm": 0.37433606778038175, + "learning_rate": 4.3495946032756374e-05, + "loss": 0.5864, + "num_tokens": 3550874391.0, + "step": 1520 + }, + { + "epoch": 2.1261666090563427, + "grad_norm": 0.46467280179911385, + "learning_rate": 4.34478521838216e-05, + "loss": 0.5745, + "num_tokens": 3553174508.0, + "step": 1521 + }, + { + "epoch": 2.126512270998963, + "grad_norm": 0.45372222407536217, + "learning_rate": 4.33997533132002e-05, + "loss": 0.5888, + "num_tokens": 3555537942.0, + "step": 1522 + }, + { + "epoch": 2.1268579329415833, + "grad_norm": 0.49096504342195496, + "learning_rate": 4.3351649490954764e-05, + "loss": 0.5842, + "num_tokens": 3557916966.0, + "step": 1523 + }, + { + "epoch": 2.1272035948842034, + "grad_norm": 0.39642832214995855, + "learning_rate": 4.330354078715512e-05, + "loss": 0.5732, + "num_tokens": 3560249867.0, + "step": 1524 + }, + { + "epoch": 2.1275492568268235, + "grad_norm": 0.44299922760462324, + "learning_rate": 4.3255427271878155e-05, + "loss": 0.5773, + "num_tokens": 3562557716.0, + "step": 1525 + }, + { + "epoch": 2.1278949187694436, + "grad_norm": 0.4306769491884419, + "learning_rate": 4.320730901520783e-05, + "loss": 0.5865, + "num_tokens": 3565015887.0, + "step": 1526 + }, + { + "epoch": 2.1282405807120637, + "grad_norm": 0.4663182269941624, + "learning_rate": 4.315918608723497e-05, + "loss": 0.5959, + "num_tokens": 3567296964.0, + "step": 1527 + }, + { + "epoch": 2.1285862426546838, + "grad_norm": 0.44856056495356916, + "learning_rate": 4.311105855805722e-05, + "loss": 0.5827, + "num_tokens": 3569632351.0, + "step": 1528 + }, + { + "epoch": 2.128931904597304, + "grad_norm": 0.4020148740100099, + "learning_rate": 4.3062926497778924e-05, + "loss": 0.5815, + "num_tokens": 3571894689.0, + "step": 1529 + }, + { + "epoch": 2.129277566539924, + "grad_norm": 0.3967396751993151, + "learning_rate": 4.301478997651101e-05, + "loss": 0.6162, + "num_tokens": 3574334376.0, + "step": 1530 + }, + { + "epoch": 2.129623228482544, + "grad_norm": 0.33280922514754935, + "learning_rate": 4.2966649064370937e-05, + "loss": 0.6012, + "num_tokens": 3576721263.0, + "step": 1531 + }, + { + "epoch": 2.129968890425164, + "grad_norm": 0.49112133215390136, + "learning_rate": 4.2918503831482534e-05, + "loss": 0.5908, + "num_tokens": 3579152413.0, + "step": 1532 + }, + { + "epoch": 2.130314552367784, + "grad_norm": 0.4029924244476603, + "learning_rate": 4.2870354347975923e-05, + "loss": 0.5715, + "num_tokens": 3581580238.0, + "step": 1533 + }, + { + "epoch": 2.1306602143104043, + "grad_norm": 0.5797849101549221, + "learning_rate": 4.2822200683987445e-05, + "loss": 0.6057, + "num_tokens": 3583885606.0, + "step": 1534 + }, + { + "epoch": 2.1310058762530244, + "grad_norm": 0.5545419293403632, + "learning_rate": 4.27740429096595e-05, + "loss": 0.5836, + "num_tokens": 3586094085.0, + "step": 1535 + }, + { + "epoch": 2.1313515381956445, + "grad_norm": 0.39544120446802755, + "learning_rate": 4.2725881095140494e-05, + "loss": 0.5931, + "num_tokens": 3588394128.0, + "step": 1536 + }, + { + "epoch": 2.1316972001382646, + "grad_norm": 0.5414136784004402, + "learning_rate": 4.267771531058471e-05, + "loss": 0.5696, + "num_tokens": 3590731600.0, + "step": 1537 + }, + { + "epoch": 2.132042862080885, + "grad_norm": 0.38650157575223854, + "learning_rate": 4.2629545626152205e-05, + "loss": 0.6047, + "num_tokens": 3593037684.0, + "step": 1538 + }, + { + "epoch": 2.132388524023505, + "grad_norm": 0.6449431876129881, + "learning_rate": 4.2581372112008725e-05, + "loss": 0.5775, + "num_tokens": 3595425955.0, + "step": 1539 + }, + { + "epoch": 2.1327341859661253, + "grad_norm": 0.6446301043232713, + "learning_rate": 4.2533194838325616e-05, + "loss": 0.5921, + "num_tokens": 3597824590.0, + "step": 1540 + }, + { + "epoch": 2.1330798479087454, + "grad_norm": 0.47956740063433745, + "learning_rate": 4.248501387527966e-05, + "loss": 0.613, + "num_tokens": 3600213089.0, + "step": 1541 + }, + { + "epoch": 2.1334255098513655, + "grad_norm": 0.5929778445476284, + "learning_rate": 4.243682929305306e-05, + "loss": 0.6075, + "num_tokens": 3602513874.0, + "step": 1542 + }, + { + "epoch": 2.1337711717939856, + "grad_norm": 0.4233463449994534, + "learning_rate": 4.238864116183322e-05, + "loss": 0.5964, + "num_tokens": 3604801371.0, + "step": 1543 + }, + { + "epoch": 2.1341168337366057, + "grad_norm": 0.4319308428438848, + "learning_rate": 4.234044955181281e-05, + "loss": 0.6005, + "num_tokens": 3607264893.0, + "step": 1544 + }, + { + "epoch": 2.1344624956792257, + "grad_norm": 0.4721185051044637, + "learning_rate": 4.2292254533189475e-05, + "loss": 0.586, + "num_tokens": 3609595515.0, + "step": 1545 + }, + { + "epoch": 2.134808157621846, + "grad_norm": 0.3626804909078634, + "learning_rate": 4.2244056176165896e-05, + "loss": 0.5545, + "num_tokens": 3611744975.0, + "step": 1546 + }, + { + "epoch": 2.135153819564466, + "grad_norm": 0.4367178541922521, + "learning_rate": 4.219585455094955e-05, + "loss": 0.5758, + "num_tokens": 3614154877.0, + "step": 1547 + }, + { + "epoch": 2.135499481507086, + "grad_norm": 0.4597158653426491, + "learning_rate": 4.2147649727752755e-05, + "loss": 0.5696, + "num_tokens": 3616446292.0, + "step": 1548 + }, + { + "epoch": 2.135845143449706, + "grad_norm": 0.48742549902524335, + "learning_rate": 4.20994417767924e-05, + "loss": 0.5887, + "num_tokens": 3618771550.0, + "step": 1549 + }, + { + "epoch": 2.136190805392326, + "grad_norm": 0.421195380403554, + "learning_rate": 4.205123076829001e-05, + "loss": 0.5643, + "num_tokens": 3621106294.0, + "step": 1550 + }, + { + "epoch": 2.1365364673349463, + "grad_norm": 0.4962367428200045, + "learning_rate": 4.2003016772471505e-05, + "loss": 0.5726, + "num_tokens": 3623422035.0, + "step": 1551 + }, + { + "epoch": 2.1368821292775664, + "grad_norm": 0.43707162666899874, + "learning_rate": 4.1954799859567176e-05, + "loss": 0.5717, + "num_tokens": 3625688520.0, + "step": 1552 + }, + { + "epoch": 2.1372277912201865, + "grad_norm": 0.42731496762786897, + "learning_rate": 4.190658009981158e-05, + "loss": 0.5736, + "num_tokens": 3628089484.0, + "step": 1553 + }, + { + "epoch": 2.1375734531628066, + "grad_norm": 0.48091479194805914, + "learning_rate": 4.1858357563443384e-05, + "loss": 0.585, + "num_tokens": 3630383875.0, + "step": 1554 + }, + { + "epoch": 2.137919115105427, + "grad_norm": 0.4403560738343109, + "learning_rate": 4.1810132320705365e-05, + "loss": 0.5719, + "num_tokens": 3632784416.0, + "step": 1555 + }, + { + "epoch": 2.138264777048047, + "grad_norm": 0.4760848017039719, + "learning_rate": 4.176190444184416e-05, + "loss": 0.5749, + "num_tokens": 3635039143.0, + "step": 1556 + }, + { + "epoch": 2.1386104389906673, + "grad_norm": 0.426180017500231, + "learning_rate": 4.1713673997110285e-05, + "loss": 0.5781, + "num_tokens": 3637356892.0, + "step": 1557 + }, + { + "epoch": 2.1389561009332874, + "grad_norm": 0.45476029314308675, + "learning_rate": 4.166544105675801e-05, + "loss": 0.5996, + "num_tokens": 3639682333.0, + "step": 1558 + }, + { + "epoch": 2.1393017628759075, + "grad_norm": 0.3592081374070471, + "learning_rate": 4.161720569104523e-05, + "loss": 0.5516, + "num_tokens": 3641984410.0, + "step": 1559 + }, + { + "epoch": 2.1396474248185275, + "grad_norm": 0.4009572816418318, + "learning_rate": 4.1568967970233325e-05, + "loss": 0.5788, + "num_tokens": 3644313717.0, + "step": 1560 + }, + { + "epoch": 2.1399930867611476, + "grad_norm": 0.35149549551799997, + "learning_rate": 4.15207279645872e-05, + "loss": 0.5586, + "num_tokens": 3646648804.0, + "step": 1561 + }, + { + "epoch": 2.1403387487037677, + "grad_norm": 0.5059356602348714, + "learning_rate": 4.1472485744375006e-05, + "loss": 0.5752, + "num_tokens": 3648990511.0, + "step": 1562 + }, + { + "epoch": 2.140684410646388, + "grad_norm": 0.41615788756120015, + "learning_rate": 4.142424137986816e-05, + "loss": 0.5638, + "num_tokens": 3651351991.0, + "step": 1563 + }, + { + "epoch": 2.141030072589008, + "grad_norm": 0.44677868070682053, + "learning_rate": 4.137599494134118e-05, + "loss": 0.5728, + "num_tokens": 3653679901.0, + "step": 1564 + }, + { + "epoch": 2.141375734531628, + "grad_norm": 0.5081129234433894, + "learning_rate": 4.132774649907162e-05, + "loss": 0.5838, + "num_tokens": 3656020743.0, + "step": 1565 + }, + { + "epoch": 2.141721396474248, + "grad_norm": 0.40991654980282255, + "learning_rate": 4.127949612333996e-05, + "loss": 0.5782, + "num_tokens": 3658276861.0, + "step": 1566 + }, + { + "epoch": 2.142067058416868, + "grad_norm": 0.4045643934993304, + "learning_rate": 4.1231243884429476e-05, + "loss": 0.5561, + "num_tokens": 3660554400.0, + "step": 1567 + }, + { + "epoch": 2.1424127203594883, + "grad_norm": 0.3315821456992701, + "learning_rate": 4.1182989852626175e-05, + "loss": 0.5721, + "num_tokens": 3662928870.0, + "step": 1568 + }, + { + "epoch": 2.1427583823021084, + "grad_norm": 0.349729997771642, + "learning_rate": 4.113473409821866e-05, + "loss": 0.5791, + "num_tokens": 3665234551.0, + "step": 1569 + }, + { + "epoch": 2.143104044244729, + "grad_norm": 0.3229124633389833, + "learning_rate": 4.108647669149804e-05, + "loss": 0.5539, + "num_tokens": 3667471908.0, + "step": 1570 + }, + { + "epoch": 2.143449706187349, + "grad_norm": 0.40862242253722186, + "learning_rate": 4.1038217702757876e-05, + "loss": 0.5742, + "num_tokens": 3669930740.0, + "step": 1571 + }, + { + "epoch": 2.143795368129969, + "grad_norm": 0.35957973163615636, + "learning_rate": 4.0989957202293966e-05, + "loss": 0.5708, + "num_tokens": 3672314150.0, + "step": 1572 + }, + { + "epoch": 2.144141030072589, + "grad_norm": 0.35633507648705237, + "learning_rate": 4.094169526040436e-05, + "loss": 0.5469, + "num_tokens": 3674632925.0, + "step": 1573 + }, + { + "epoch": 2.1444866920152093, + "grad_norm": 0.3303090242476595, + "learning_rate": 4.089343194738919e-05, + "loss": 0.5803, + "num_tokens": 3676986083.0, + "step": 1574 + }, + { + "epoch": 2.1448323539578293, + "grad_norm": 0.35448412932832873, + "learning_rate": 4.084516733355059e-05, + "loss": 0.5355, + "num_tokens": 3679279499.0, + "step": 1575 + }, + { + "epoch": 2.1451780159004494, + "grad_norm": 0.3909475116919189, + "learning_rate": 4.079690148919259e-05, + "loss": 0.5935, + "num_tokens": 3681707807.0, + "step": 1576 + }, + { + "epoch": 2.1455236778430695, + "grad_norm": 0.35393198281782284, + "learning_rate": 4.0748634484620995e-05, + "loss": 0.5517, + "num_tokens": 3684079934.0, + "step": 1577 + }, + { + "epoch": 2.1458693397856896, + "grad_norm": 0.42075058534895854, + "learning_rate": 4.0700366390143295e-05, + "loss": 0.5792, + "num_tokens": 3686404471.0, + "step": 1578 + }, + { + "epoch": 2.1462150017283097, + "grad_norm": 0.2995313447082663, + "learning_rate": 4.065209727606863e-05, + "loss": 0.5747, + "num_tokens": 3688806237.0, + "step": 1579 + }, + { + "epoch": 2.14656066367093, + "grad_norm": 0.5343983022659001, + "learning_rate": 4.060382721270755e-05, + "loss": 0.599, + "num_tokens": 3691135757.0, + "step": 1580 + }, + { + "epoch": 2.14690632561355, + "grad_norm": 0.4438786339745476, + "learning_rate": 4.055555627037203e-05, + "loss": 0.6006, + "num_tokens": 3693355136.0, + "step": 1581 + }, + { + "epoch": 2.14725198755617, + "grad_norm": 0.4941069824134752, + "learning_rate": 4.050728451937531e-05, + "loss": 0.5821, + "num_tokens": 3695714669.0, + "step": 1582 + }, + { + "epoch": 2.14759764949879, + "grad_norm": 0.4031208896621595, + "learning_rate": 4.04590120300318e-05, + "loss": 0.5731, + "num_tokens": 3697986604.0, + "step": 1583 + }, + { + "epoch": 2.14794331144141, + "grad_norm": 0.44305852363219267, + "learning_rate": 4.041073887265702e-05, + "loss": 0.5596, + "num_tokens": 3700330742.0, + "step": 1584 + }, + { + "epoch": 2.1482889733840302, + "grad_norm": 0.39280717857519387, + "learning_rate": 4.036246511756743e-05, + "loss": 0.581, + "num_tokens": 3702747047.0, + "step": 1585 + }, + { + "epoch": 2.1486346353266503, + "grad_norm": 0.36926333880870965, + "learning_rate": 4.031419083508037e-05, + "loss": 0.5768, + "num_tokens": 3705117151.0, + "step": 1586 + }, + { + "epoch": 2.1489802972692704, + "grad_norm": 0.3246583198619419, + "learning_rate": 4.0265916095513936e-05, + "loss": 0.5779, + "num_tokens": 3707455326.0, + "step": 1587 + }, + { + "epoch": 2.149325959211891, + "grad_norm": 0.368634262075389, + "learning_rate": 4.021764096918693e-05, + "loss": 0.5827, + "num_tokens": 3709742556.0, + "step": 1588 + }, + { + "epoch": 2.149671621154511, + "grad_norm": 0.30572720892886823, + "learning_rate": 4.016936552641868e-05, + "loss": 0.555, + "num_tokens": 3712072511.0, + "step": 1589 + }, + { + "epoch": 2.150017283097131, + "grad_norm": 0.49248626919039834, + "learning_rate": 4.012108983752897e-05, + "loss": 0.5662, + "num_tokens": 3714334512.0, + "step": 1590 + }, + { + "epoch": 2.1503629450397512, + "grad_norm": 0.4360364207942161, + "learning_rate": 4.007281397283796e-05, + "loss": 0.5945, + "num_tokens": 3716685869.0, + "step": 1591 + }, + { + "epoch": 2.1507086069823713, + "grad_norm": 0.522127775857114, + "learning_rate": 4.00245380026661e-05, + "loss": 0.5856, + "num_tokens": 3719018791.0, + "step": 1592 + }, + { + "epoch": 2.1510542689249914, + "grad_norm": 0.5594619646273812, + "learning_rate": 3.9976261997333914e-05, + "loss": 0.6005, + "num_tokens": 3721391823.0, + "step": 1593 + }, + { + "epoch": 2.1513999308676115, + "grad_norm": 0.38787599697682235, + "learning_rate": 3.992798602716204e-05, + "loss": 0.5752, + "num_tokens": 3723691361.0, + "step": 1594 + }, + { + "epoch": 2.1517455928102316, + "grad_norm": 0.4459806993929073, + "learning_rate": 3.9879710162471054e-05, + "loss": 0.5693, + "num_tokens": 3726088092.0, + "step": 1595 + }, + { + "epoch": 2.1520912547528517, + "grad_norm": 0.34125732773785955, + "learning_rate": 3.983143447358134e-05, + "loss": 0.5805, + "num_tokens": 3728511986.0, + "step": 1596 + }, + { + "epoch": 2.1524369166954718, + "grad_norm": 0.4854586263264401, + "learning_rate": 3.978315903081308e-05, + "loss": 0.5759, + "num_tokens": 3730839307.0, + "step": 1597 + }, + { + "epoch": 2.152782578638092, + "grad_norm": 0.386244758997106, + "learning_rate": 3.9734883904486065e-05, + "loss": 0.5767, + "num_tokens": 3733220980.0, + "step": 1598 + }, + { + "epoch": 2.153128240580712, + "grad_norm": 0.5040568212989219, + "learning_rate": 3.968660916491966e-05, + "loss": 0.546, + "num_tokens": 3735510614.0, + "step": 1599 + }, + { + "epoch": 2.153473902523332, + "grad_norm": 0.4522781544588036, + "learning_rate": 3.9638334882432585e-05, + "loss": 0.5969, + "num_tokens": 3737849023.0, + "step": 1600 + }, + { + "epoch": 2.153819564465952, + "grad_norm": 0.5147815585535901, + "learning_rate": 3.959006112734299e-05, + "loss": 0.574, + "num_tokens": 3740134532.0, + "step": 1601 + }, + { + "epoch": 2.154165226408572, + "grad_norm": 0.4672815081830139, + "learning_rate": 3.9541787969968205e-05, + "loss": 0.5934, + "num_tokens": 3742446477.0, + "step": 1602 + }, + { + "epoch": 2.1545108883511928, + "grad_norm": 0.42399242565611145, + "learning_rate": 3.9493515480624714e-05, + "loss": 0.5505, + "num_tokens": 3744841618.0, + "step": 1603 + }, + { + "epoch": 2.154856550293813, + "grad_norm": 0.40969670996276963, + "learning_rate": 3.9445243729627985e-05, + "loss": 0.5677, + "num_tokens": 3747208815.0, + "step": 1604 + }, + { + "epoch": 2.155202212236433, + "grad_norm": 0.4552923328944928, + "learning_rate": 3.939697278729246e-05, + "loss": 0.5727, + "num_tokens": 3749583666.0, + "step": 1605 + }, + { + "epoch": 2.155547874179053, + "grad_norm": 0.35678686798820974, + "learning_rate": 3.9348702723931376e-05, + "loss": 0.5982, + "num_tokens": 3751880865.0, + "step": 1606 + }, + { + "epoch": 2.155893536121673, + "grad_norm": 0.47752650155402254, + "learning_rate": 3.930043360985671e-05, + "loss": 0.5815, + "num_tokens": 3754180557.0, + "step": 1607 + }, + { + "epoch": 2.156239198064293, + "grad_norm": 0.3887701798950676, + "learning_rate": 3.925216551537903e-05, + "loss": 0.5726, + "num_tokens": 3756398897.0, + "step": 1608 + }, + { + "epoch": 2.1565848600069133, + "grad_norm": 0.39243930632609825, + "learning_rate": 3.920389851080742e-05, + "loss": 0.5639, + "num_tokens": 3758706586.0, + "step": 1609 + }, + { + "epoch": 2.1569305219495334, + "grad_norm": 0.42972319302645756, + "learning_rate": 3.9155632666449415e-05, + "loss": 0.5796, + "num_tokens": 3761140111.0, + "step": 1610 + }, + { + "epoch": 2.1572761838921535, + "grad_norm": 0.3470801351212925, + "learning_rate": 3.910736805261082e-05, + "loss": 0.5635, + "num_tokens": 3763460414.0, + "step": 1611 + }, + { + "epoch": 2.1576218458347736, + "grad_norm": 0.5074819719261473, + "learning_rate": 3.905910473959564e-05, + "loss": 0.5658, + "num_tokens": 3765800756.0, + "step": 1612 + }, + { + "epoch": 2.1579675077773937, + "grad_norm": 0.40571157567100113, + "learning_rate": 3.9010842797706035e-05, + "loss": 0.596, + "num_tokens": 3768207619.0, + "step": 1613 + }, + { + "epoch": 2.1583131697200137, + "grad_norm": 0.5015041501523927, + "learning_rate": 3.896258229724214e-05, + "loss": 0.5691, + "num_tokens": 3770527547.0, + "step": 1614 + }, + { + "epoch": 2.158658831662634, + "grad_norm": 0.5003281357026551, + "learning_rate": 3.891432330850197e-05, + "loss": 0.5558, + "num_tokens": 3772852814.0, + "step": 1615 + }, + { + "epoch": 2.159004493605254, + "grad_norm": 0.39819019361785885, + "learning_rate": 3.886606590178135e-05, + "loss": 0.5498, + "num_tokens": 3775145704.0, + "step": 1616 + }, + { + "epoch": 2.159350155547874, + "grad_norm": 0.4382841730052522, + "learning_rate": 3.8817810147373826e-05, + "loss": 0.5813, + "num_tokens": 3777547542.0, + "step": 1617 + }, + { + "epoch": 2.159695817490494, + "grad_norm": 0.35900507831841894, + "learning_rate": 3.876955611557053e-05, + "loss": 0.5639, + "num_tokens": 3779747682.0, + "step": 1618 + }, + { + "epoch": 2.160041479433114, + "grad_norm": 0.4589987629771109, + "learning_rate": 3.8721303876660045e-05, + "loss": 0.5932, + "num_tokens": 3782057777.0, + "step": 1619 + }, + { + "epoch": 2.1603871413757347, + "grad_norm": 0.3620142433657542, + "learning_rate": 3.867305350092838e-05, + "loss": 0.6081, + "num_tokens": 3784387192.0, + "step": 1620 + }, + { + "epoch": 2.160732803318355, + "grad_norm": 0.4420137108586777, + "learning_rate": 3.8624805058658824e-05, + "loss": 0.603, + "num_tokens": 3786728617.0, + "step": 1621 + }, + { + "epoch": 2.161078465260975, + "grad_norm": 0.36168094848982324, + "learning_rate": 3.857655862013185e-05, + "loss": 0.5828, + "num_tokens": 3789097835.0, + "step": 1622 + }, + { + "epoch": 2.161424127203595, + "grad_norm": 0.5168728609871764, + "learning_rate": 3.852831425562501e-05, + "loss": 0.5956, + "num_tokens": 3791356470.0, + "step": 1623 + }, + { + "epoch": 2.161769789146215, + "grad_norm": 0.37723357438553606, + "learning_rate": 3.84800720354128e-05, + "loss": 0.5684, + "num_tokens": 3793747531.0, + "step": 1624 + }, + { + "epoch": 2.162115451088835, + "grad_norm": 0.5464799620004885, + "learning_rate": 3.843183202976667e-05, + "loss": 0.615, + "num_tokens": 3796165964.0, + "step": 1625 + }, + { + "epoch": 2.1624611130314553, + "grad_norm": 0.4108637334345988, + "learning_rate": 3.838359430895479e-05, + "loss": 0.5737, + "num_tokens": 3798367885.0, + "step": 1626 + }, + { + "epoch": 2.1628067749740754, + "grad_norm": 0.46423095918540663, + "learning_rate": 3.833535894324201e-05, + "loss": 0.5843, + "num_tokens": 3800727725.0, + "step": 1627 + }, + { + "epoch": 2.1631524369166955, + "grad_norm": 0.4715351834469286, + "learning_rate": 3.828712600288972e-05, + "loss": 0.5855, + "num_tokens": 3803174059.0, + "step": 1628 + }, + { + "epoch": 2.1634980988593155, + "grad_norm": 0.4608463076050439, + "learning_rate": 3.8238895558155854e-05, + "loss": 0.598, + "num_tokens": 3805513338.0, + "step": 1629 + }, + { + "epoch": 2.1638437608019356, + "grad_norm": 0.5289605500181319, + "learning_rate": 3.819066767929466e-05, + "loss": 0.5896, + "num_tokens": 3807807850.0, + "step": 1630 + }, + { + "epoch": 2.1641894227445557, + "grad_norm": 0.6843097301058589, + "learning_rate": 3.8142442436556625e-05, + "loss": 0.5935, + "num_tokens": 3810121801.0, + "step": 1631 + }, + { + "epoch": 2.164535084687176, + "grad_norm": 0.6028055748980918, + "learning_rate": 3.809421990018843e-05, + "loss": 0.5851, + "num_tokens": 3812446360.0, + "step": 1632 + }, + { + "epoch": 2.164880746629796, + "grad_norm": 0.4922345520406235, + "learning_rate": 3.8046000140432826e-05, + "loss": 0.5867, + "num_tokens": 3814740571.0, + "step": 1633 + }, + { + "epoch": 2.165226408572416, + "grad_norm": 0.6687640322126353, + "learning_rate": 3.799778322752851e-05, + "loss": 0.5949, + "num_tokens": 3817120162.0, + "step": 1634 + }, + { + "epoch": 2.1655720705150365, + "grad_norm": 0.5579615685628413, + "learning_rate": 3.794956923171e-05, + "loss": 0.5881, + "num_tokens": 3819363439.0, + "step": 1635 + }, + { + "epoch": 2.1659177324576566, + "grad_norm": 0.5151441528085586, + "learning_rate": 3.790135822320761e-05, + "loss": 0.5668, + "num_tokens": 3821594912.0, + "step": 1636 + }, + { + "epoch": 2.1662633944002767, + "grad_norm": 0.6955999134965759, + "learning_rate": 3.785315027224727e-05, + "loss": 0.5885, + "num_tokens": 3823831911.0, + "step": 1637 + }, + { + "epoch": 2.166609056342897, + "grad_norm": 0.5911351599728437, + "learning_rate": 3.7804945449050456e-05, + "loss": 0.5931, + "num_tokens": 3826240000.0, + "step": 1638 + }, + { + "epoch": 2.166954718285517, + "grad_norm": 0.47681908925145744, + "learning_rate": 3.775674382383412e-05, + "loss": 0.5926, + "num_tokens": 3828554280.0, + "step": 1639 + }, + { + "epoch": 2.167300380228137, + "grad_norm": 0.5637941079453287, + "learning_rate": 3.770854546681053e-05, + "loss": 0.5633, + "num_tokens": 3830736107.0, + "step": 1640 + }, + { + "epoch": 2.167646042170757, + "grad_norm": 0.6560763691253044, + "learning_rate": 3.7660350448187216e-05, + "loss": 0.5758, + "num_tokens": 3833114775.0, + "step": 1641 + }, + { + "epoch": 2.167991704113377, + "grad_norm": 0.37057116614678604, + "learning_rate": 3.7612158838166786e-05, + "loss": 0.5662, + "num_tokens": 3835453534.0, + "step": 1642 + }, + { + "epoch": 2.1683373660559973, + "grad_norm": 0.5788086706023449, + "learning_rate": 3.756397070694696e-05, + "loss": 0.5542, + "num_tokens": 3837662669.0, + "step": 1643 + }, + { + "epoch": 2.1686830279986173, + "grad_norm": 0.6129402349533551, + "learning_rate": 3.751578612472035e-05, + "loss": 0.5677, + "num_tokens": 3840013052.0, + "step": 1644 + }, + { + "epoch": 2.1690286899412374, + "grad_norm": 0.371156729607469, + "learning_rate": 3.7467605161674406e-05, + "loss": 0.5875, + "num_tokens": 3842461152.0, + "step": 1645 + }, + { + "epoch": 2.1693743518838575, + "grad_norm": 0.5392213992727422, + "learning_rate": 3.741942788799129e-05, + "loss": 0.5697, + "num_tokens": 3844716205.0, + "step": 1646 + }, + { + "epoch": 2.1697200138264776, + "grad_norm": 0.48273504754983154, + "learning_rate": 3.737125437384781e-05, + "loss": 0.5981, + "num_tokens": 3847095801.0, + "step": 1647 + }, + { + "epoch": 2.1700656757690977, + "grad_norm": 0.40866544936507615, + "learning_rate": 3.7323084689415306e-05, + "loss": 0.6022, + "num_tokens": 3849384483.0, + "step": 1648 + }, + { + "epoch": 2.170411337711718, + "grad_norm": 0.47430210882285406, + "learning_rate": 3.727491890485953e-05, + "loss": 0.588, + "num_tokens": 3851658649.0, + "step": 1649 + }, + { + "epoch": 2.170756999654338, + "grad_norm": 0.5029182108552628, + "learning_rate": 3.722675709034051e-05, + "loss": 0.551, + "num_tokens": 3853946535.0, + "step": 1650 + }, + { + "epoch": 2.171102661596958, + "grad_norm": 0.47222407166530844, + "learning_rate": 3.717859931601256e-05, + "loss": 0.5765, + "num_tokens": 3856287417.0, + "step": 1651 + }, + { + "epoch": 2.171448323539578, + "grad_norm": 0.4248156581917221, + "learning_rate": 3.7130445652024085e-05, + "loss": 0.5584, + "num_tokens": 3858679688.0, + "step": 1652 + }, + { + "epoch": 2.1717939854821986, + "grad_norm": 0.4535261692200449, + "learning_rate": 3.7082296168517494e-05, + "loss": 0.5773, + "num_tokens": 3861121000.0, + "step": 1653 + }, + { + "epoch": 2.1721396474248187, + "grad_norm": 0.312373249306112, + "learning_rate": 3.703415093562908e-05, + "loss": 0.5669, + "num_tokens": 3863403886.0, + "step": 1654 + }, + { + "epoch": 2.1724853093674388, + "grad_norm": 0.4621334089991505, + "learning_rate": 3.6986010023489e-05, + "loss": 0.5301, + "num_tokens": 3865679721.0, + "step": 1655 + }, + { + "epoch": 2.172830971310059, + "grad_norm": 0.38147592404275305, + "learning_rate": 3.693787350222109e-05, + "loss": 0.5527, + "num_tokens": 3867933649.0, + "step": 1656 + }, + { + "epoch": 2.173176633252679, + "grad_norm": 0.5125259932427738, + "learning_rate": 3.68897414419428e-05, + "loss": 0.5699, + "num_tokens": 3870159012.0, + "step": 1657 + }, + { + "epoch": 2.173522295195299, + "grad_norm": 0.4348251802703402, + "learning_rate": 3.684161391276505e-05, + "loss": 0.555, + "num_tokens": 3872523399.0, + "step": 1658 + }, + { + "epoch": 2.173867957137919, + "grad_norm": 0.41104162167740904, + "learning_rate": 3.6793490984792175e-05, + "loss": 0.5866, + "num_tokens": 3874887786.0, + "step": 1659 + }, + { + "epoch": 2.1742136190805392, + "grad_norm": 0.4844079362517981, + "learning_rate": 3.674537272812185e-05, + "loss": 0.5539, + "num_tokens": 3877132256.0, + "step": 1660 + }, + { + "epoch": 2.1745592810231593, + "grad_norm": 0.4312784903033744, + "learning_rate": 3.6697259212844905e-05, + "loss": 0.5668, + "num_tokens": 3879437798.0, + "step": 1661 + }, + { + "epoch": 2.1749049429657794, + "grad_norm": 0.4481543658200724, + "learning_rate": 3.6649150509045244e-05, + "loss": 0.5516, + "num_tokens": 3881767743.0, + "step": 1662 + }, + { + "epoch": 2.1752506049083995, + "grad_norm": 0.4448007881232388, + "learning_rate": 3.660104668679981e-05, + "loss": 0.5893, + "num_tokens": 3884097463.0, + "step": 1663 + }, + { + "epoch": 2.1755962668510196, + "grad_norm": 0.29651920272278304, + "learning_rate": 3.655294781617841e-05, + "loss": 0.5417, + "num_tokens": 3886439225.0, + "step": 1664 + }, + { + "epoch": 2.1759419287936397, + "grad_norm": 0.3052619543124406, + "learning_rate": 3.6504853967243634e-05, + "loss": 0.5542, + "num_tokens": 3888704797.0, + "step": 1665 + }, + { + "epoch": 2.1762875907362598, + "grad_norm": 0.3547086546099954, + "learning_rate": 3.645676521005077e-05, + "loss": 0.5778, + "num_tokens": 3890992988.0, + "step": 1666 + }, + { + "epoch": 2.17663325267888, + "grad_norm": 0.2762549379423686, + "learning_rate": 3.640868161464768e-05, + "loss": 0.5723, + "num_tokens": 3893294065.0, + "step": 1667 + }, + { + "epoch": 2.1769789146215004, + "grad_norm": 0.4059613017175796, + "learning_rate": 3.636060325107473e-05, + "loss": 0.5764, + "num_tokens": 3895769991.0, + "step": 1668 + }, + { + "epoch": 2.1773245765641205, + "grad_norm": 0.3474366918655778, + "learning_rate": 3.6312530189364614e-05, + "loss": 0.5679, + "num_tokens": 3898051577.0, + "step": 1669 + }, + { + "epoch": 2.1776702385067406, + "grad_norm": 0.39132664501943865, + "learning_rate": 3.626446249954236e-05, + "loss": 0.5719, + "num_tokens": 3900484293.0, + "step": 1670 + }, + { + "epoch": 2.1780159004493607, + "grad_norm": 0.3113923774174792, + "learning_rate": 3.6216400251625124e-05, + "loss": 0.556, + "num_tokens": 3902863416.0, + "step": 1671 + }, + { + "epoch": 2.1783615623919808, + "grad_norm": 0.30332646255432144, + "learning_rate": 3.61683435156222e-05, + "loss": 0.5612, + "num_tokens": 3905076450.0, + "step": 1672 + }, + { + "epoch": 2.178707224334601, + "grad_norm": 0.4113288561124863, + "learning_rate": 3.612029236153476e-05, + "loss": 0.5696, + "num_tokens": 3907417790.0, + "step": 1673 + }, + { + "epoch": 2.179052886277221, + "grad_norm": 0.35240037904750227, + "learning_rate": 3.607224685935594e-05, + "loss": 0.5611, + "num_tokens": 3909804374.0, + "step": 1674 + }, + { + "epoch": 2.179398548219841, + "grad_norm": 0.48687355247249303, + "learning_rate": 3.602420707907056e-05, + "loss": 0.5792, + "num_tokens": 3912203016.0, + "step": 1675 + }, + { + "epoch": 2.179744210162461, + "grad_norm": 0.4098875075701958, + "learning_rate": 3.597617309065519e-05, + "loss": 0.5496, + "num_tokens": 3914502362.0, + "step": 1676 + }, + { + "epoch": 2.180089872105081, + "grad_norm": 0.4304335772544368, + "learning_rate": 3.5928144964077864e-05, + "loss": 0.5962, + "num_tokens": 3916882888.0, + "step": 1677 + }, + { + "epoch": 2.1804355340477013, + "grad_norm": 0.4785925068848899, + "learning_rate": 3.5880122769298175e-05, + "loss": 0.5948, + "num_tokens": 3919174544.0, + "step": 1678 + }, + { + "epoch": 2.1807811959903214, + "grad_norm": 0.46746268909684413, + "learning_rate": 3.5832106576266995e-05, + "loss": 0.6129, + "num_tokens": 3921622264.0, + "step": 1679 + }, + { + "epoch": 2.1811268579329415, + "grad_norm": 0.4163477885566145, + "learning_rate": 3.578409645492655e-05, + "loss": 0.5856, + "num_tokens": 3923875335.0, + "step": 1680 + }, + { + "epoch": 2.1814725198755616, + "grad_norm": 0.4532640252382832, + "learning_rate": 3.5736092475210084e-05, + "loss": 0.6094, + "num_tokens": 3926146162.0, + "step": 1681 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.46035438824564406, + "learning_rate": 3.5688094707042035e-05, + "loss": 0.6068, + "num_tokens": 3928536514.0, + "step": 1682 + }, + { + "epoch": 2.1821638437608017, + "grad_norm": 0.4353662263060845, + "learning_rate": 3.564010322033769e-05, + "loss": 0.6003, + "num_tokens": 3930858866.0, + "step": 1683 + }, + { + "epoch": 2.182509505703422, + "grad_norm": 0.4850276992486815, + "learning_rate": 3.559211808500327e-05, + "loss": 0.6219, + "num_tokens": 3933245557.0, + "step": 1684 + }, + { + "epoch": 2.1828551676460424, + "grad_norm": 0.5131446191738124, + "learning_rate": 3.554413937093566e-05, + "loss": 0.5848, + "num_tokens": 3935545307.0, + "step": 1685 + }, + { + "epoch": 2.1832008295886625, + "grad_norm": 0.4677052907972313, + "learning_rate": 3.549616714802246e-05, + "loss": 0.6118, + "num_tokens": 3937967699.0, + "step": 1686 + }, + { + "epoch": 2.1835464915312826, + "grad_norm": 0.44589653162288206, + "learning_rate": 3.544820148614177e-05, + "loss": 0.5742, + "num_tokens": 3940325335.0, + "step": 1687 + }, + { + "epoch": 2.1838921534739026, + "grad_norm": 0.49780096945699487, + "learning_rate": 3.5400242455162175e-05, + "loss": 0.5932, + "num_tokens": 3942675501.0, + "step": 1688 + }, + { + "epoch": 2.1842378154165227, + "grad_norm": 0.4936410748096351, + "learning_rate": 3.5352290124942547e-05, + "loss": 0.5968, + "num_tokens": 3945052081.0, + "step": 1689 + }, + { + "epoch": 2.184583477359143, + "grad_norm": 0.45272207924575136, + "learning_rate": 3.530434456533204e-05, + "loss": 0.5893, + "num_tokens": 3947538611.0, + "step": 1690 + }, + { + "epoch": 2.184929139301763, + "grad_norm": 0.4872721715786065, + "learning_rate": 3.525640584616995e-05, + "loss": 0.5719, + "num_tokens": 3949946390.0, + "step": 1691 + }, + { + "epoch": 2.185274801244383, + "grad_norm": 0.36201367992005706, + "learning_rate": 3.520847403728558e-05, + "loss": 0.5747, + "num_tokens": 3952399105.0, + "step": 1692 + }, + { + "epoch": 2.185620463187003, + "grad_norm": 0.463602572864178, + "learning_rate": 3.516054920849815e-05, + "loss": 0.5674, + "num_tokens": 3954799132.0, + "step": 1693 + }, + { + "epoch": 2.185966125129623, + "grad_norm": 0.36234238128339163, + "learning_rate": 3.511263142961678e-05, + "loss": 0.5583, + "num_tokens": 3957086139.0, + "step": 1694 + }, + { + "epoch": 2.1863117870722433, + "grad_norm": 0.5317641049169286, + "learning_rate": 3.5064720770440296e-05, + "loss": 0.5742, + "num_tokens": 3959378894.0, + "step": 1695 + }, + { + "epoch": 2.1866574490148634, + "grad_norm": 0.471573403778105, + "learning_rate": 3.501681730075709e-05, + "loss": 0.5505, + "num_tokens": 3961659354.0, + "step": 1696 + }, + { + "epoch": 2.1870031109574835, + "grad_norm": 0.41216022549670006, + "learning_rate": 3.4968921090345165e-05, + "loss": 0.6026, + "num_tokens": 3964020105.0, + "step": 1697 + }, + { + "epoch": 2.1873487729001035, + "grad_norm": 0.42323122533725727, + "learning_rate": 3.49210322089719e-05, + "loss": 0.5675, + "num_tokens": 3966420672.0, + "step": 1698 + }, + { + "epoch": 2.1876944348427236, + "grad_norm": 0.3138653948971231, + "learning_rate": 3.487315072639404e-05, + "loss": 0.5869, + "num_tokens": 3968792359.0, + "step": 1699 + }, + { + "epoch": 2.188040096785344, + "grad_norm": 0.40319030343127277, + "learning_rate": 3.48252767123575e-05, + "loss": 0.5596, + "num_tokens": 3971046068.0, + "step": 1700 + }, + { + "epoch": 2.1883857587279643, + "grad_norm": 0.3610504504125991, + "learning_rate": 3.4777410236597353e-05, + "loss": 0.5726, + "num_tokens": 3973296546.0, + "step": 1701 + }, + { + "epoch": 2.1887314206705843, + "grad_norm": 0.3554967056008757, + "learning_rate": 3.472955136883768e-05, + "loss": 0.5923, + "num_tokens": 3975689890.0, + "step": 1702 + }, + { + "epoch": 2.1890770826132044, + "grad_norm": 0.39904146644921934, + "learning_rate": 3.4681700178791495e-05, + "loss": 0.5721, + "num_tokens": 3978029629.0, + "step": 1703 + }, + { + "epoch": 2.1894227445558245, + "grad_norm": 0.3755173027738497, + "learning_rate": 3.46338567361606e-05, + "loss": 0.6228, + "num_tokens": 3980420400.0, + "step": 1704 + }, + { + "epoch": 2.1897684064984446, + "grad_norm": 0.497751920434996, + "learning_rate": 3.4586021110635515e-05, + "loss": 0.6244, + "num_tokens": 3982873421.0, + "step": 1705 + }, + { + "epoch": 2.1901140684410647, + "grad_norm": 0.5148781680410864, + "learning_rate": 3.453819337189541e-05, + "loss": 0.5741, + "num_tokens": 3985283043.0, + "step": 1706 + }, + { + "epoch": 2.190459730383685, + "grad_norm": 0.4066198393868349, + "learning_rate": 3.449037358960794e-05, + "loss": 0.5666, + "num_tokens": 3987507032.0, + "step": 1707 + }, + { + "epoch": 2.190805392326305, + "grad_norm": 0.3836308625845691, + "learning_rate": 3.4442561833429166e-05, + "loss": 0.5972, + "num_tokens": 3989875944.0, + "step": 1708 + }, + { + "epoch": 2.191151054268925, + "grad_norm": 0.35836301484995864, + "learning_rate": 3.439475817300345e-05, + "loss": 0.5462, + "num_tokens": 3992251565.0, + "step": 1709 + }, + { + "epoch": 2.191496716211545, + "grad_norm": 0.4104946221458571, + "learning_rate": 3.4346962677963386e-05, + "loss": 0.5931, + "num_tokens": 3994508762.0, + "step": 1710 + }, + { + "epoch": 2.191842378154165, + "grad_norm": 0.4124014388600809, + "learning_rate": 3.429917541792969e-05, + "loss": 0.5722, + "num_tokens": 3996692941.0, + "step": 1711 + }, + { + "epoch": 2.1921880400967853, + "grad_norm": 0.4503361710607658, + "learning_rate": 3.425139646251102e-05, + "loss": 0.5987, + "num_tokens": 3999103782.0, + "step": 1712 + }, + { + "epoch": 2.1925337020394053, + "grad_norm": 0.5352192987620799, + "learning_rate": 3.4203625881303976e-05, + "loss": 0.6011, + "num_tokens": 4001472034.0, + "step": 1713 + }, + { + "epoch": 2.1928793639820254, + "grad_norm": 0.4382444346671446, + "learning_rate": 3.415586374389297e-05, + "loss": 0.5808, + "num_tokens": 4003727846.0, + "step": 1714 + }, + { + "epoch": 2.1932250259246455, + "grad_norm": 0.46278263438291717, + "learning_rate": 3.4108110119850114e-05, + "loss": 0.6062, + "num_tokens": 4005985234.0, + "step": 1715 + }, + { + "epoch": 2.1935706878672656, + "grad_norm": 0.4514643181419856, + "learning_rate": 3.406036507873508e-05, + "loss": 0.602, + "num_tokens": 4008334084.0, + "step": 1716 + }, + { + "epoch": 2.1939163498098857, + "grad_norm": 0.5221191447664183, + "learning_rate": 3.401262869009508e-05, + "loss": 0.6126, + "num_tokens": 4010712684.0, + "step": 1717 + }, + { + "epoch": 2.1942620117525062, + "grad_norm": 0.42153524973538975, + "learning_rate": 3.3964901023464697e-05, + "loss": 0.5748, + "num_tokens": 4013053800.0, + "step": 1718 + }, + { + "epoch": 2.1946076736951263, + "grad_norm": 0.45106538070198826, + "learning_rate": 3.3917182148365836e-05, + "loss": 0.5883, + "num_tokens": 4015279737.0, + "step": 1719 + }, + { + "epoch": 2.1949533356377464, + "grad_norm": 0.40253657755700084, + "learning_rate": 3.386947213430755e-05, + "loss": 0.5622, + "num_tokens": 4017606067.0, + "step": 1720 + }, + { + "epoch": 2.1952989975803665, + "grad_norm": 0.5191393403861773, + "learning_rate": 3.3821771050786045e-05, + "loss": 0.5771, + "num_tokens": 4019953661.0, + "step": 1721 + }, + { + "epoch": 2.1956446595229866, + "grad_norm": 0.5145188361850536, + "learning_rate": 3.3774078967284486e-05, + "loss": 0.5803, + "num_tokens": 4022258281.0, + "step": 1722 + }, + { + "epoch": 2.1959903214656067, + "grad_norm": 0.4371924657081148, + "learning_rate": 3.37263959532729e-05, + "loss": 0.5759, + "num_tokens": 4024528869.0, + "step": 1723 + }, + { + "epoch": 2.1963359834082268, + "grad_norm": 0.5693070825878815, + "learning_rate": 3.367872207820815e-05, + "loss": 0.5923, + "num_tokens": 4026863101.0, + "step": 1724 + }, + { + "epoch": 2.196681645350847, + "grad_norm": 0.35133547147608174, + "learning_rate": 3.363105741153377e-05, + "loss": 0.5869, + "num_tokens": 4029263862.0, + "step": 1725 + }, + { + "epoch": 2.197027307293467, + "grad_norm": 0.6060415450474594, + "learning_rate": 3.3583402022679897e-05, + "loss": 0.5601, + "num_tokens": 4031469175.0, + "step": 1726 + }, + { + "epoch": 2.197372969236087, + "grad_norm": 0.46437751488956786, + "learning_rate": 3.3535755981063106e-05, + "loss": 0.6094, + "num_tokens": 4033880574.0, + "step": 1727 + }, + { + "epoch": 2.197718631178707, + "grad_norm": 0.5097295330864767, + "learning_rate": 3.34881193560864e-05, + "loss": 0.5665, + "num_tokens": 4036290081.0, + "step": 1728 + }, + { + "epoch": 2.1980642931213272, + "grad_norm": 0.5628972758761455, + "learning_rate": 3.3440492217139055e-05, + "loss": 0.5714, + "num_tokens": 4038530918.0, + "step": 1729 + }, + { + "epoch": 2.1984099550639473, + "grad_norm": 0.4055640721344835, + "learning_rate": 3.3392874633596534e-05, + "loss": 0.6007, + "num_tokens": 4040869460.0, + "step": 1730 + }, + { + "epoch": 2.1987556170065674, + "grad_norm": 0.353572067518476, + "learning_rate": 3.334526667482035e-05, + "loss": 0.535, + "num_tokens": 4043234043.0, + "step": 1731 + }, + { + "epoch": 2.1991012789491875, + "grad_norm": 0.44273212246122495, + "learning_rate": 3.3297668410158036e-05, + "loss": 0.5799, + "num_tokens": 4045578364.0, + "step": 1732 + }, + { + "epoch": 2.199446940891808, + "grad_norm": 0.3080731708087997, + "learning_rate": 3.3250079908943e-05, + "loss": 0.5986, + "num_tokens": 4048001062.0, + "step": 1733 + }, + { + "epoch": 2.199792602834428, + "grad_norm": 0.5738468906760364, + "learning_rate": 3.3202501240494414e-05, + "loss": 0.6045, + "num_tokens": 4050333985.0, + "step": 1734 + } + ], + "logging_steps": 1, + "max_steps": 2893, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4440977488812704e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}