diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,10034 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 100000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001, - "grad_norm": 3.9445953369140625, - "learning_rate": 2.97e-05, - "loss": 6.7008, - "num_input_tokens_seen": 6553600, - "step": 100, - "train_runtime": 61.1942, - "train_tokens_per_second": 107095.166 - }, - { - "epoch": 0.002, - "grad_norm": 0.6828203797340393, - "learning_rate": 5.97e-05, - "loss": 3.3177, - "num_input_tokens_seen": 13107200, - "step": 200, - "train_runtime": 107.6856, - "train_tokens_per_second": 121717.274 - }, - { - "epoch": 0.003, - "grad_norm": 16.05720329284668, - "learning_rate": 8.969999999999998e-05, - "loss": 3.0024, - "num_input_tokens_seen": 19660800, - "step": 300, - "train_runtime": 154.3564, - "train_tokens_per_second": 127372.748 - }, - { - "epoch": 0.004, - "grad_norm": 13.74783706665039, - "learning_rate": 0.0001197, - "loss": 2.6797, - "num_input_tokens_seen": 26214400, - "step": 400, - "train_runtime": 200.698, - "train_tokens_per_second": 130616.167 - }, - { - "epoch": 0.005, - "grad_norm": 12.893468856811523, - "learning_rate": 0.00014969999999999998, - "loss": 2.4588, - "num_input_tokens_seen": 32768000, - "step": 500, - "train_runtime": 252.1632, - "train_tokens_per_second": 129947.566 - }, - { - "epoch": 0.006, - "grad_norm": 9.021939277648926, - "learning_rate": 0.00017969999999999998, - "loss": 2.276, - "num_input_tokens_seen": 39321600, - "step": 600, - "train_runtime": 299.2712, - "train_tokens_per_second": 131391.184 - }, - { - "epoch": 0.007, - "grad_norm": 8.669090270996094, - "learning_rate": 0.00020969999999999997, - "loss": 2.1203, - "num_input_tokens_seen": 45875200, - "step": 700, - "train_runtime": 346.3366, - "train_tokens_per_second": 132458.429 - }, - { - "epoch": 0.008, - "grad_norm": 7.335177898406982, - "learning_rate": 0.0002397, - "loss": 1.9886, - "num_input_tokens_seen": 52428800, - "step": 800, - "train_runtime": 393.5299, - "train_tokens_per_second": 133226.965 - }, - { - "epoch": 0.009, - "grad_norm": 6.051175117492676, - "learning_rate": 0.0002697, - "loss": 1.9128, - "num_input_tokens_seen": 58982400, - "step": 900, - "train_runtime": 440.0136, - "train_tokens_per_second": 134046.765 - }, - { - "epoch": 0.01, - "grad_norm": 5.503482818603516, - "learning_rate": 0.00029969999999999997, - "loss": 1.8296, - "num_input_tokens_seen": 65536000, - "step": 1000, - "train_runtime": 492.2662, - "train_tokens_per_second": 133131.222 - }, - { - "epoch": 0.011, - "grad_norm": 2.8459227085113525, - "learning_rate": 0.00029999925978027874, - "loss": 1.779, - "num_input_tokens_seen": 72089600, - "step": 1100, - "train_runtime": 538.0301, - "train_tokens_per_second": 133988.032 - }, - { - "epoch": 0.012, - "grad_norm": 2.292707920074463, - "learning_rate": 0.0002999970091452017, - "loss": 1.7037, - "num_input_tokens_seen": 78643200, - "step": 1200, - "train_runtime": 585.618, - "train_tokens_per_second": 134290.951 - }, - { - "epoch": 0.013, - "grad_norm": 3.362025737762451, - "learning_rate": 0.00029999324804190795, - "loss": 1.6688, - "num_input_tokens_seen": 85196800, - "step": 1300, - "train_runtime": 632.1008, - "train_tokens_per_second": 134783.565 - }, - { - "epoch": 0.014, - "grad_norm": 2.2756998538970947, - "learning_rate": 0.0002999879765082716, - "loss": 1.6397, - "num_input_tokens_seen": 91750400, - "step": 1400, - "train_runtime": 684.3545, - "train_tokens_per_second": 134068.525 - }, - { - "epoch": 0.015, - "grad_norm": 2.5730831623077393, - "learning_rate": 0.000299981194597377, - "loss": 1.605, - "num_input_tokens_seen": 98304000, - "step": 1500, - "train_runtime": 730.5087, - "train_tokens_per_second": 134569.247 - }, - { - "epoch": 0.016, - "grad_norm": 1.7514433860778809, - "learning_rate": 0.0002999729023775179, - "loss": 1.5838, - "num_input_tokens_seen": 104857600, - "step": 1600, - "train_runtime": 781.9407, - "train_tokens_per_second": 134099.179 - }, - { - "epoch": 0.017, - "grad_norm": 1.8343929052352905, - "learning_rate": 0.0002999630999321969, - "loss": 1.6037, - "num_input_tokens_seen": 111411200, - "step": 1700, - "train_runtime": 824.7241, - "train_tokens_per_second": 135089.057 - }, - { - "epoch": 0.018, - "grad_norm": 1.5672227144241333, - "learning_rate": 0.00029995178736012443, - "loss": 1.5627, - "num_input_tokens_seen": 117964800, - "step": 1800, - "train_runtime": 871.9564, - "train_tokens_per_second": 135287.497 - }, - { - "epoch": 0.019, - "grad_norm": 1.6202061176300049, - "learning_rate": 0.0002999389647752181, - "loss": 1.5398, - "num_input_tokens_seen": 124518400, - "step": 1900, - "train_runtime": 923.402, - "train_tokens_per_second": 134847.439 - }, - { - "epoch": 0.02, - "grad_norm": 1.5145666599273682, - "learning_rate": 0.00029992463230660104, - "loss": 1.5389, - "num_input_tokens_seen": 131072000, - "step": 2000, - "train_runtime": 968.9283, - "train_tokens_per_second": 135275.229 - }, - { - "epoch": 0.021, - "grad_norm": 1.0306257009506226, - "learning_rate": 0.00029990879009860117, - "loss": 1.5098, - "num_input_tokens_seen": 137625600, - "step": 2100, - "train_runtime": 1020.8371, - "train_tokens_per_second": 134816.412 - }, - { - "epoch": 0.022, - "grad_norm": 2.0710599422454834, - "learning_rate": 0.0002998914383107493, - "loss": 1.5081, - "num_input_tokens_seen": 144179200, - "step": 2200, - "train_runtime": 1067.2796, - "train_tokens_per_second": 135090.368 - }, - { - "epoch": 0.023, - "grad_norm": 1.4022581577301025, - "learning_rate": 0.0002998725771177778, - "loss": 1.521, - "num_input_tokens_seen": 150732800, - "step": 2300, - "train_runtime": 1114.7094, - "train_tokens_per_second": 135221.616 - }, - { - "epoch": 0.024, - "grad_norm": 1.4328904151916504, - "learning_rate": 0.00029985220670961847, - "loss": 1.4855, - "num_input_tokens_seen": 157286400, - "step": 2400, - "train_runtime": 1160.6217, - "train_tokens_per_second": 135519.092 - }, - { - "epoch": 0.025, - "grad_norm": 1.3760366439819336, - "learning_rate": 0.0002998303272914014, - "loss": 1.4966, - "num_input_tokens_seen": 163840000, - "step": 2500, - "train_runtime": 1212.6489, - "train_tokens_per_second": 135109.18 - }, - { - "epoch": 0.026, - "grad_norm": 0.9530190825462341, - "learning_rate": 0.00029980693908345185, - "loss": 1.4795, - "num_input_tokens_seen": 170393600, - "step": 2600, - "train_runtime": 1258.3106, - "train_tokens_per_second": 135414.576 - }, - { - "epoch": 0.027, - "grad_norm": 0.8715839385986328, - "learning_rate": 0.00029978204232128895, - "loss": 1.4601, - "num_input_tokens_seen": 176947200, - "step": 2700, - "train_runtime": 1304.6837, - "train_tokens_per_second": 135624.597 - }, - { - "epoch": 0.028, - "grad_norm": 1.1879854202270508, - "learning_rate": 0.0002997556372556227, - "loss": 1.487, - "num_input_tokens_seen": 183500800, - "step": 2800, - "train_runtime": 1358.2195, - "train_tokens_per_second": 135103.938 - }, - { - "epoch": 0.029, - "grad_norm": 1.0949848890304565, - "learning_rate": 0.0002997277241523519, - "loss": 1.4658, - "num_input_tokens_seen": 190054400, - "step": 2900, - "train_runtime": 1404.4203, - "train_tokens_per_second": 135325.869 - }, - { - "epoch": 0.03, - "grad_norm": 1.465809941291809, - "learning_rate": 0.00029969830329256125, - "loss": 1.4463, - "num_input_tokens_seen": 196608000, - "step": 3000, - "train_runtime": 1451.3838, - "train_tokens_per_second": 135462.45 - }, - { - "epoch": 0.031, - "grad_norm": 0.9500088095664978, - "learning_rate": 0.00029966737497251836, - "loss": 1.4533, - "num_input_tokens_seen": 203161600, - "step": 3100, - "train_runtime": 1496.7114, - "train_tokens_per_second": 135738.657 - }, - { - "epoch": 0.032, - "grad_norm": 1.3393683433532715, - "learning_rate": 0.0002996349395036711, - "loss": 1.4402, - "num_input_tokens_seen": 209715200, - "step": 3200, - "train_runtime": 1549.2536, - "train_tokens_per_second": 135365.316 - }, - { - "epoch": 0.033, - "grad_norm": 0.7998270988464355, - "learning_rate": 0.00029960099721264435, - "loss": 1.4467, - "num_input_tokens_seen": 216268800, - "step": 3300, - "train_runtime": 1596.5035, - "train_tokens_per_second": 135464.03 - }, - { - "epoch": 0.034, - "grad_norm": 0.8441318273544312, - "learning_rate": 0.0002995655484412365, - "loss": 1.4353, - "num_input_tokens_seen": 222822400, - "step": 3400, - "train_runtime": 1642.6114, - "train_tokens_per_second": 135651.317 - }, - { - "epoch": 0.035, - "grad_norm": 0.7577129006385803, - "learning_rate": 0.00029952859354641636, - "loss": 1.4253, - "num_input_tokens_seen": 229376000, - "step": 3500, - "train_runtime": 1690.0779, - "train_tokens_per_second": 135719.187 - }, - { - "epoch": 0.036, - "grad_norm": 0.8359817862510681, - "learning_rate": 0.00029949013290031924, - "loss": 1.4348, - "num_input_tokens_seen": 235929600, - "step": 3600, - "train_runtime": 1736.0232, - "train_tokens_per_second": 135902.33 - }, - { - "epoch": 0.037, - "grad_norm": 0.7565376162528992, - "learning_rate": 0.00029945016689024353, - "loss": 1.4114, - "num_input_tokens_seen": 242483200, - "step": 3700, - "train_runtime": 1788.0113, - "train_tokens_per_second": 135616.148 - }, - { - "epoch": 0.038, - "grad_norm": 0.9537010788917542, - "learning_rate": 0.0002994086959186464, - "loss": 1.4134, - "num_input_tokens_seen": 249036800, - "step": 3800, - "train_runtime": 1835.9254, - "train_tokens_per_second": 135646.47 - }, - { - "epoch": 0.039, - "grad_norm": 0.8911266922950745, - "learning_rate": 0.00029936572040314014, - "loss": 1.4224, - "num_input_tokens_seen": 255590400, - "step": 3900, - "train_runtime": 1882.537, - "train_tokens_per_second": 135769.123 - }, - { - "epoch": 0.04, - "grad_norm": 0.7832906246185303, - "learning_rate": 0.0002993212407764877, - "loss": 1.4177, - "num_input_tokens_seen": 262144000, - "step": 4000, - "train_runtime": 1928.8118, - "train_tokens_per_second": 135909.579 - }, - { - "epoch": 0.041, - "grad_norm": 0.8426671624183655, - "learning_rate": 0.00029927525748659834, - "loss": 1.4194, - "num_input_tokens_seen": 268697600, - "step": 4100, - "train_runtime": 1981.7143, - "train_tokens_per_second": 135588.467 - }, - { - "epoch": 0.042, - "grad_norm": 0.9675344824790955, - "learning_rate": 0.0002992277709965234, - "loss": 1.4059, - "num_input_tokens_seen": 275251200, - "step": 4200, - "train_runtime": 2027.927, - "train_tokens_per_second": 135730.33 - }, - { - "epoch": 0.043, - "grad_norm": 1.1866440773010254, - "learning_rate": 0.0002991787817844513, - "loss": 1.4065, - "num_input_tokens_seen": 281804800, - "step": 4300, - "train_runtime": 2074.708, - "train_tokens_per_second": 135828.659 - }, - { - "epoch": 0.044, - "grad_norm": 0.8417257070541382, - "learning_rate": 0.0002991282903437028, - "loss": 1.397, - "num_input_tokens_seen": 288358400, - "step": 4400, - "train_runtime": 2126.0513, - "train_tokens_per_second": 135630.972 - }, - { - "epoch": 0.045, - "grad_norm": 0.8226633071899414, - "learning_rate": 0.0002990762971827262, - "loss": 1.3996, - "num_input_tokens_seen": 294912000, - "step": 4500, - "train_runtime": 2172.3837, - "train_tokens_per_second": 135755.024 - }, - { - "epoch": 0.046, - "grad_norm": 0.8411224484443665, - "learning_rate": 0.00029902280282509197, - "loss": 1.4002, - "num_input_tokens_seen": 301465600, - "step": 4600, - "train_runtime": 2220.1775, - "train_tokens_per_second": 135784.456 - }, - { - "epoch": 0.047, - "grad_norm": 0.7082719802856445, - "learning_rate": 0.0002989678078094878, - "loss": 1.3804, - "num_input_tokens_seen": 308019200, - "step": 4700, - "train_runtime": 2266.6848, - "train_tokens_per_second": 135889.739 - }, - { - "epoch": 0.048, - "grad_norm": 0.7628137469291687, - "learning_rate": 0.00029891131268971284, - "loss": 1.3795, - "num_input_tokens_seen": 314572800, - "step": 4800, - "train_runtime": 2318.5885, - "train_tokens_per_second": 135674.269 - }, - { - "epoch": 0.049, - "grad_norm": 0.7231079936027527, - "learning_rate": 0.0002988533180346723, - "loss": 1.3789, - "num_input_tokens_seen": 321126400, - "step": 4900, - "train_runtime": 2364.3453, - "train_tokens_per_second": 135820.432 - }, - { - "epoch": 0.05, - "grad_norm": 0.7210503816604614, - "learning_rate": 0.0002987938244283717, - "loss": 1.3641, - "num_input_tokens_seen": 327680000, - "step": 5000, - "train_runtime": 2410.3286, - "train_tokens_per_second": 135948.267 - }, - { - "epoch": 0.051, - "grad_norm": 0.729364275932312, - "learning_rate": 0.00029873283246991105, - "loss": 1.3756, - "num_input_tokens_seen": 334233600, - "step": 5100, - "train_runtime": 2458.4762, - "train_tokens_per_second": 135951.532 - }, - { - "epoch": 0.052, - "grad_norm": 0.7513293027877808, - "learning_rate": 0.0002986703427734787, - "loss": 1.3778, - "num_input_tokens_seen": 340787200, - "step": 5200, - "train_runtime": 2506.9032, - "train_tokens_per_second": 135939.511 - }, - { - "epoch": 0.053, - "grad_norm": 0.7382386326789856, - "learning_rate": 0.00029860635596834517, - "loss": 1.3807, - "num_input_tokens_seen": 347340800, - "step": 5300, - "train_runtime": 2559.5035, - "train_tokens_per_second": 135706.321 - }, - { - "epoch": 0.054, - "grad_norm": 0.5869194269180298, - "learning_rate": 0.0002985408726988569, - "loss": 1.3695, - "num_input_tokens_seen": 353894400, - "step": 5400, - "train_runtime": 2605.4484, - "train_tokens_per_second": 135828.598 - }, - { - "epoch": 0.055, - "grad_norm": 0.7805973291397095, - "learning_rate": 0.0002984738936244296, - "loss": 1.3746, - "num_input_tokens_seen": 360448000, - "step": 5500, - "train_runtime": 2655.8515, - "train_tokens_per_second": 135718.431 - }, - { - "epoch": 0.056, - "grad_norm": 0.6918448209762573, - "learning_rate": 0.0002984054194195419, - "loss": 1.3855, - "num_input_tokens_seen": 367001600, - "step": 5600, - "train_runtime": 2703.0299, - "train_tokens_per_second": 135774.155 - }, - { - "epoch": 0.057, - "grad_norm": 0.6129201054573059, - "learning_rate": 0.0002983354507737283, - "loss": 1.3816, - "num_input_tokens_seen": 373555200, - "step": 5700, - "train_runtime": 2750.071, - "train_tokens_per_second": 135834.747 - }, - { - "epoch": 0.058, - "grad_norm": 0.7457948923110962, - "learning_rate": 0.00029826398839157215, - "loss": 1.3748, - "num_input_tokens_seen": 380108800, - "step": 5800, - "train_runtime": 2795.4164, - "train_tokens_per_second": 135975.735 - }, - { - "epoch": 0.059, - "grad_norm": 0.6171481013298035, - "learning_rate": 0.000298191032992699, - "loss": 1.3725, - "num_input_tokens_seen": 386662400, - "step": 5900, - "train_runtime": 2842.5021, - "train_tokens_per_second": 136028.889 - }, - { - "epoch": 0.06, - "grad_norm": 0.6233596205711365, - "learning_rate": 0.0002981165853117688, - "loss": 1.3624, - "num_input_tokens_seen": 393216000, - "step": 6000, - "train_runtime": 2892.8273, - "train_tokens_per_second": 135927.922 - }, - { - "epoch": 0.061, - "grad_norm": 0.5645745396614075, - "learning_rate": 0.000298040646098469, - "loss": 1.356, - "num_input_tokens_seen": 399769600, - "step": 6100, - "train_runtime": 2940.1153, - "train_tokens_per_second": 135970.721 - }, - { - "epoch": 0.062, - "grad_norm": 0.6580554246902466, - "learning_rate": 0.0002979632161175064, - "loss": 1.3627, - "num_input_tokens_seen": 406323200, - "step": 6200, - "train_runtime": 2986.9073, - "train_tokens_per_second": 136034.754 - }, - { - "epoch": 0.063, - "grad_norm": 0.6815545558929443, - "learning_rate": 0.0002978842961486003, - "loss": 1.3562, - "num_input_tokens_seen": 412876800, - "step": 6300, - "train_runtime": 3038.4238, - "train_tokens_per_second": 135885.191 - }, - { - "epoch": 0.064, - "grad_norm": 0.9602898955345154, - "learning_rate": 0.0002978038869864738, - "loss": 1.3562, - "num_input_tokens_seen": 419430400, - "step": 6400, - "train_runtime": 3085.1228, - "train_tokens_per_second": 135952.578 - }, - { - "epoch": 0.065, - "grad_norm": 0.7086384892463684, - "learning_rate": 0.0002977219894408463, - "loss": 1.3579, - "num_input_tokens_seen": 425984000, - "step": 6500, - "train_runtime": 3130.8346, - "train_tokens_per_second": 136060.844 - }, - { - "epoch": 0.066, - "grad_norm": 0.5864439010620117, - "learning_rate": 0.0002976386043364251, - "loss": 1.3563, - "num_input_tokens_seen": 432537600, - "step": 6600, - "train_runtime": 3182.4893, - "train_tokens_per_second": 135911.72 - }, - { - "epoch": 0.067, - "grad_norm": 0.6041991114616394, - "learning_rate": 0.00029755373251289733, - "loss": 1.3753, - "num_input_tokens_seen": 439091200, - "step": 6700, - "train_runtime": 3229.4118, - "train_tokens_per_second": 135966.308 - }, - { - "epoch": 0.068, - "grad_norm": 0.7153160572052002, - "learning_rate": 0.0002974673748249213, - "loss": 1.3475, - "num_input_tokens_seen": 445644800, - "step": 6800, - "train_runtime": 3276.7034, - "train_tokens_per_second": 136004.008 - }, - { - "epoch": 0.069, - "grad_norm": 0.5409119725227356, - "learning_rate": 0.00029737953214211804, - "loss": 1.3464, - "num_input_tokens_seen": 452198400, - "step": 6900, - "train_runtime": 3324.3119, - "train_tokens_per_second": 136027.67 - }, - { - "epoch": 0.07, - "grad_norm": 0.6369441151618958, - "learning_rate": 0.0002972902053490623, - "loss": 1.3546, - "num_input_tokens_seen": 458752000, - "step": 7000, - "train_runtime": 3370.6322, - "train_tokens_per_second": 136102.657 - }, - { - "epoch": 0.071, - "grad_norm": 0.8589248061180115, - "learning_rate": 0.00029719939534527393, - "loss": 1.3479, - "num_input_tokens_seen": 465305600, - "step": 7100, - "train_runtime": 3424.7139, - "train_tokens_per_second": 135867.0 - }, - { - "epoch": 0.072, - "grad_norm": 0.8014613389968872, - "learning_rate": 0.00029710710304520866, - "loss": 1.3667, - "num_input_tokens_seen": 471859200, - "step": 7200, - "train_runtime": 3472.985, - "train_tokens_per_second": 135865.601 - }, - { - "epoch": 0.073, - "grad_norm": 0.5970280766487122, - "learning_rate": 0.00029701332937824885, - "loss": 1.3423, - "num_input_tokens_seen": 478412800, - "step": 7300, - "train_runtime": 3519.3052, - "train_tokens_per_second": 135939.558 - }, - { - "epoch": 0.074, - "grad_norm": 0.6963617205619812, - "learning_rate": 0.0002969180752886944, - "loss": 1.3443, - "num_input_tokens_seen": 484966400, - "step": 7400, - "train_runtime": 3565.8739, - "train_tokens_per_second": 136002.118 - }, - { - "epoch": 0.075, - "grad_norm": 0.5769393444061279, - "learning_rate": 0.0002968213417357529, - "loss": 1.3576, - "num_input_tokens_seen": 491520000, - "step": 7500, - "train_runtime": 3611.5043, - "train_tokens_per_second": 136098.411 - }, - { - "epoch": 0.076, - "grad_norm": 0.5492929816246033, - "learning_rate": 0.00029672312969353015, - "loss": 1.3422, - "num_input_tokens_seen": 498073600, - "step": 7600, - "train_runtime": 3664.3633, - "train_tokens_per_second": 135923.642 - }, - { - "epoch": 0.077, - "grad_norm": 0.8065637946128845, - "learning_rate": 0.00029662344015102027, - "loss": 1.3395, - "num_input_tokens_seen": 504627200, - "step": 7700, - "train_runtime": 3711.2689, - "train_tokens_per_second": 135971.608 - }, - { - "epoch": 0.078, - "grad_norm": 0.552871584892273, - "learning_rate": 0.00029652227411209594, - "loss": 1.3427, - "num_input_tokens_seen": 511180800, - "step": 7800, - "train_runtime": 3758.1209, - "train_tokens_per_second": 136020.319 - }, - { - "epoch": 0.079, - "grad_norm": 0.6378001570701599, - "learning_rate": 0.0002964196325954979, - "loss": 1.3339, - "num_input_tokens_seen": 517734400, - "step": 7900, - "train_runtime": 3804.2295, - "train_tokens_per_second": 136094.417 - }, - { - "epoch": 0.08, - "grad_norm": 0.6196131706237793, - "learning_rate": 0.0002963155166348253, - "loss": 1.341, - "num_input_tokens_seen": 524288000, - "step": 8000, - "train_runtime": 3855.6562, - "train_tokens_per_second": 135978.93 - }, - { - "epoch": 0.081, - "grad_norm": 0.5841253399848938, - "learning_rate": 0.0002962099272785246, - "loss": 1.3366, - "num_input_tokens_seen": 530841600, - "step": 8100, - "train_runtime": 3903.5348, - "train_tokens_per_second": 135989.977 - }, - { - "epoch": 0.082, - "grad_norm": 0.5912770628929138, - "learning_rate": 0.0002961028655898794, - "loss": 1.3417, - "num_input_tokens_seen": 537395200, - "step": 8200, - "train_runtime": 3951.3698, - "train_tokens_per_second": 136002.255 - }, - { - "epoch": 0.083, - "grad_norm": 0.5480249524116516, - "learning_rate": 0.0002959943326469998, - "loss": 1.3419, - "num_input_tokens_seen": 543948800, - "step": 8300, - "train_runtime": 3997.3554, - "train_tokens_per_second": 136077.166 - }, - { - "epoch": 0.084, - "grad_norm": 0.49880343675613403, - "learning_rate": 0.0002958843295428112, - "loss": 1.3165, - "num_input_tokens_seen": 550502400, - "step": 8400, - "train_runtime": 4044.3967, - "train_tokens_per_second": 136114.838 - }, - { - "epoch": 0.085, - "grad_norm": 0.5670176148414612, - "learning_rate": 0.0002957728573850438, - "loss": 1.3314, - "num_input_tokens_seen": 557056000, - "step": 8500, - "train_runtime": 4095.7201, - "train_tokens_per_second": 136009.294 - }, - { - "epoch": 0.086, - "grad_norm": 2.3274426460266113, - "learning_rate": 0.0002956599172962209, - "loss": 1.3323, - "num_input_tokens_seen": 563609600, - "step": 8600, - "train_runtime": 4143.1443, - "train_tokens_per_second": 136034.268 - }, - { - "epoch": 0.087, - "grad_norm": 0.7660558819770813, - "learning_rate": 0.0002955455104136479, - "loss": 1.3382, - "num_input_tokens_seen": 570163200, - "step": 8700, - "train_runtime": 4190.7065, - "train_tokens_per_second": 136054.194 - }, - { - "epoch": 0.088, - "grad_norm": 0.5114762783050537, - "learning_rate": 0.00029542963788940096, - "loss": 1.3252, - "num_input_tokens_seen": 576716800, - "step": 8800, - "train_runtime": 4237.8545, - "train_tokens_per_second": 136086.974 - }, - { - "epoch": 0.089, - "grad_norm": 0.6698548197746277, - "learning_rate": 0.00029531230089031505, - "loss": 1.3449, - "num_input_tokens_seen": 583270400, - "step": 8900, - "train_runtime": 4285.2299, - "train_tokens_per_second": 136111.81 - }, - { - "epoch": 0.09, - "grad_norm": 0.5562598705291748, - "learning_rate": 0.0002951935005979724, - "loss": 1.3204, - "num_input_tokens_seen": 589824000, - "step": 9000, - "train_runtime": 4336.4907, - "train_tokens_per_second": 136014.126 - }, - { - "epoch": 0.091, - "grad_norm": 0.6327181458473206, - "learning_rate": 0.0002950732382086907, - "loss": 1.3178, - "num_input_tokens_seen": 596377600, - "step": 9100, - "train_runtime": 4383.0811, - "train_tokens_per_second": 136063.555 - }, - { - "epoch": 0.092, - "grad_norm": 0.6857426166534424, - "learning_rate": 0.0002949515149335108, - "loss": 1.3332, - "num_input_tokens_seen": 602931200, - "step": 9200, - "train_runtime": 4431.4231, - "train_tokens_per_second": 136058.142 - }, - { - "epoch": 0.093, - "grad_norm": 0.6040679812431335, - "learning_rate": 0.0002948283319981848, - "loss": 1.307, - "num_input_tokens_seen": 609484800, - "step": 9300, - "train_runtime": 4478.1663, - "train_tokens_per_second": 136101.423 - }, - { - "epoch": 0.094, - "grad_norm": 1.0060901641845703, - "learning_rate": 0.00029470369064316354, - "loss": 1.3108, - "num_input_tokens_seen": 616038400, - "step": 9400, - "train_runtime": 4524.7167, - "train_tokens_per_second": 136149.607 - }, - { - "epoch": 0.095, - "grad_norm": 0.504460871219635, - "learning_rate": 0.00029457759212358397, - "loss": 1.3169, - "num_input_tokens_seen": 622592000, - "step": 9500, - "train_runtime": 4575.869, - "train_tokens_per_second": 136059.84 - }, - { - "epoch": 0.096, - "grad_norm": 0.5062097907066345, - "learning_rate": 0.00029445003770925686, - "loss": 1.3137, - "num_input_tokens_seen": 629145600, - "step": 9600, - "train_runtime": 4621.4422, - "train_tokens_per_second": 136136.203 - }, - { - "epoch": 0.097, - "grad_norm": 0.5388786792755127, - "learning_rate": 0.00029432102868465367, - "loss": 1.3128, - "num_input_tokens_seen": 635699200, - "step": 9700, - "train_runtime": 4668.6149, - "train_tokens_per_second": 136164.411 - }, - { - "epoch": 0.098, - "grad_norm": 0.5705980062484741, - "learning_rate": 0.0002941905663488939, - "loss": 1.3065, - "num_input_tokens_seen": 642252800, - "step": 9800, - "train_runtime": 4715.2389, - "train_tokens_per_second": 136207.903 - }, - { - "epoch": 0.099, - "grad_norm": 0.5500839352607727, - "learning_rate": 0.0002940586520157318, - "loss": 1.3222, - "num_input_tokens_seen": 648806400, - "step": 9900, - "train_runtime": 4767.1995, - "train_tokens_per_second": 136098.019 - }, - { - "epoch": 0.1, - "grad_norm": 0.5740068554878235, - "learning_rate": 0.00029392528701354325, - "loss": 1.3173, - "num_input_tokens_seen": 655360000, - "step": 10000, - "train_runtime": 4814.2762, - "train_tokens_per_second": 136128.458 - }, - { - "epoch": 0.101, - "grad_norm": 0.47691279649734497, - "learning_rate": 0.00029379047268531243, - "loss": 1.3084, - "num_input_tokens_seen": 661913600, - "step": 10100, - "train_runtime": 4861.0919, - "train_tokens_per_second": 136165.622 - }, - { - "epoch": 0.102, - "grad_norm": 0.5993319153785706, - "learning_rate": 0.00029365421038861795, - "loss": 1.3299, - "num_input_tokens_seen": 668467200, - "step": 10200, - "train_runtime": 4908.6949, - "train_tokens_per_second": 136180.229 - }, - { - "epoch": 0.103, - "grad_norm": 0.556516170501709, - "learning_rate": 0.0002935165014956198, - "loss": 1.316, - "num_input_tokens_seen": 675020800, - "step": 10300, - "train_runtime": 4956.5309, - "train_tokens_per_second": 136188.156 - }, - { - "epoch": 0.104, - "grad_norm": 0.6757346391677856, - "learning_rate": 0.0002933773473930448, - "loss": 1.3048, - "num_input_tokens_seen": 681574400, - "step": 10400, - "train_runtime": 5003.7965, - "train_tokens_per_second": 136211.454 - }, - { - "epoch": 0.105, - "grad_norm": 0.9610360860824585, - "learning_rate": 0.0002932367494821734, - "loss": 1.3043, - "num_input_tokens_seen": 688128000, - "step": 10500, - "train_runtime": 5050.8058, - "train_tokens_per_second": 136241.232 - }, - { - "epoch": 0.106, - "grad_norm": 0.5780071020126343, - "learning_rate": 0.00029309470917882497, - "loss": 1.3015, - "num_input_tokens_seen": 694681600, - "step": 10600, - "train_runtime": 5104.0171, - "train_tokens_per_second": 136104.873 - }, - { - "epoch": 0.107, - "grad_norm": 0.6387894749641418, - "learning_rate": 0.0002929512279133437, - "loss": 1.3342, - "num_input_tokens_seen": 701235200, - "step": 10700, - "train_runtime": 5151.2508, - "train_tokens_per_second": 136129.112 - }, - { - "epoch": 0.108, - "grad_norm": 0.48744165897369385, - "learning_rate": 0.0002928063071305844, - "loss": 1.2999, - "num_input_tokens_seen": 707788800, - "step": 10800, - "train_runtime": 5198.4813, - "train_tokens_per_second": 136152.995 - }, - { - "epoch": 0.109, - "grad_norm": 0.5223510265350342, - "learning_rate": 0.0002926599482898978, - "loss": 1.2996, - "num_input_tokens_seen": 714342400, - "step": 10900, - "train_runtime": 5244.0735, - "train_tokens_per_second": 136218.99 - }, - { - "epoch": 0.11, - "grad_norm": 0.6020687222480774, - "learning_rate": 0.00029251215286511573, - "loss": 1.3029, - "num_input_tokens_seen": 720896000, - "step": 11000, - "train_runtime": 5291.0983, - "train_tokens_per_second": 136246.948 - }, - { - "epoch": 0.111, - "grad_norm": 0.5317751169204712, - "learning_rate": 0.00029236292234453647, - "loss": 1.316, - "num_input_tokens_seen": 727449600, - "step": 11100, - "train_runtime": 5342.4851, - "train_tokens_per_second": 136163.15 - }, - { - "epoch": 0.112, - "grad_norm": 1.2369730472564697, - "learning_rate": 0.0002922122582309097, - "loss": 1.298, - "num_input_tokens_seen": 734003200, - "step": 11200, - "train_runtime": 5391.0041, - "train_tokens_per_second": 136153.338 - }, - { - "epoch": 0.113, - "grad_norm": 0.5294257998466492, - "learning_rate": 0.0002920601620414215, - "loss": 1.316, - "num_input_tokens_seen": 740556800, - "step": 11300, - "train_runtime": 5437.8422, - "train_tokens_per_second": 136185.784 - }, - { - "epoch": 0.114, - "grad_norm": 0.5318885445594788, - "learning_rate": 0.0002919066353076786, - "loss": 1.2993, - "num_input_tokens_seen": 747110400, - "step": 11400, - "train_runtime": 5484.1183, - "train_tokens_per_second": 136231.635 - }, - { - "epoch": 0.115, - "grad_norm": 0.5208443403244019, - "learning_rate": 0.00029175167957569366, - "loss": 1.3066, - "num_input_tokens_seen": 753664000, - "step": 11500, - "train_runtime": 5531.5155, - "train_tokens_per_second": 136249.099 - }, - { - "epoch": 0.116, - "grad_norm": 0.5068408250808716, - "learning_rate": 0.0002915952964058691, - "loss": 1.3041, - "num_input_tokens_seen": 760217600, - "step": 11600, - "train_runtime": 5578.6188, - "train_tokens_per_second": 136273.445 - }, - { - "epoch": 0.117, - "grad_norm": 0.6206523776054382, - "learning_rate": 0.00029143748737298173, - "loss": 1.3061, - "num_input_tokens_seen": 766771200, - "step": 11700, - "train_runtime": 5631.31, - "train_tokens_per_second": 136162.136 - }, - { - "epoch": 0.118, - "grad_norm": 0.5741725564002991, - "learning_rate": 0.00029127825406616677, - "loss": 1.3097, - "num_input_tokens_seen": 773324800, - "step": 11800, - "train_runtime": 5678.817, - "train_tokens_per_second": 136177.096 - }, - { - "epoch": 0.119, - "grad_norm": 0.5251154899597168, - "learning_rate": 0.0002911175980889019, - "loss": 1.3054, - "num_input_tokens_seen": 779878400, - "step": 11900, - "train_runtime": 5725.8659, - "train_tokens_per_second": 136202.701 - }, - { - "epoch": 0.12, - "grad_norm": 0.4509083032608032, - "learning_rate": 0.00029095552105899095, - "loss": 1.301, - "num_input_tokens_seen": 786432000, - "step": 12000, - "train_runtime": 5772.0962, - "train_tokens_per_second": 136247.211 - }, - { - "epoch": 0.121, - "grad_norm": 0.4560108184814453, - "learning_rate": 0.0002907920246085478, - "loss": 1.2981, - "num_input_tokens_seen": 792985600, - "step": 12100, - "train_runtime": 5817.8977, - "train_tokens_per_second": 136301.056 - }, - { - "epoch": 0.122, - "grad_norm": 1.227121114730835, - "learning_rate": 0.00029062711038397996, - "loss": 1.302, - "num_input_tokens_seen": 799539200, - "step": 12200, - "train_runtime": 5870.3451, - "train_tokens_per_second": 136199.693 - }, - { - "epoch": 0.123, - "grad_norm": 0.4861258864402771, - "learning_rate": 0.00029046078004597175, - "loss": 1.318, - "num_input_tokens_seen": 806092800, - "step": 12300, - "train_runtime": 5916.8489, - "train_tokens_per_second": 136236.84 - }, - { - "epoch": 0.124, - "grad_norm": 0.9702387452125549, - "learning_rate": 0.00029029303526946796, - "loss": 1.2869, - "num_input_tokens_seen": 812646400, - "step": 12400, - "train_runtime": 5964.0243, - "train_tokens_per_second": 136258.063 - }, - { - "epoch": 0.125, - "grad_norm": 0.4712119400501251, - "learning_rate": 0.0002901238777436565, - "loss": 1.2924, - "num_input_tokens_seen": 819200000, - "step": 12500, - "train_runtime": 6009.6089, - "train_tokens_per_second": 136315.026 - }, - { - "epoch": 0.126, - "grad_norm": 0.4670332372188568, - "learning_rate": 0.00028995330917195184, - "loss": 1.2942, - "num_input_tokens_seen": 825753600, - "step": 12600, - "train_runtime": 6061.3166, - "train_tokens_per_second": 136233.371 - }, - { - "epoch": 0.127, - "grad_norm": 0.4821685552597046, - "learning_rate": 0.00028978133127197765, - "loss": 1.2856, - "num_input_tokens_seen": 832307200, - "step": 12700, - "train_runtime": 6108.5206, - "train_tokens_per_second": 136253.481 - }, - { - "epoch": 0.128, - "grad_norm": 0.5634518265724182, - "learning_rate": 0.0002896079457755493, - "loss": 1.2982, - "num_input_tokens_seen": 838860800, - "step": 12800, - "train_runtime": 6155.2503, - "train_tokens_per_second": 136283.785 - }, - { - "epoch": 0.129, - "grad_norm": 0.45673057436943054, - "learning_rate": 0.000289433154428657, - "loss": 1.2997, - "num_input_tokens_seen": 845414400, - "step": 12900, - "train_runtime": 6202.1106, - "train_tokens_per_second": 136310.758 - }, - { - "epoch": 0.13, - "grad_norm": 0.4386661648750305, - "learning_rate": 0.0002892569589914476, - "loss": 1.2985, - "num_input_tokens_seen": 851968000, - "step": 13000, - "train_runtime": 6249.4681, - "train_tokens_per_second": 136326.482 - }, - { - "epoch": 0.131, - "grad_norm": 0.4749270975589752, - "learning_rate": 0.0002890793612382072, - "loss": 1.2946, - "num_input_tokens_seen": 858521600, - "step": 13100, - "train_runtime": 6301.6638, - "train_tokens_per_second": 136237.291 - }, - { - "epoch": 0.132, - "grad_norm": 0.5405780673027039, - "learning_rate": 0.0002889003629573432, - "loss": 1.2857, - "num_input_tokens_seen": 865075200, - "step": 13200, - "train_runtime": 6349.664, - "train_tokens_per_second": 136239.523 - }, - { - "epoch": 0.133, - "grad_norm": 0.4045722782611847, - "learning_rate": 0.00028871996595136626, - "loss": 1.3009, - "num_input_tokens_seen": 871628800, - "step": 13300, - "train_runtime": 6396.2349, - "train_tokens_per_second": 136272.169 - }, - { - "epoch": 0.134, - "grad_norm": 0.5851114392280579, - "learning_rate": 0.0002885381720368723, - "loss": 1.3026, - "num_input_tokens_seen": 878182400, - "step": 13400, - "train_runtime": 6442.8884, - "train_tokens_per_second": 136302.594 - }, - { - "epoch": 0.135, - "grad_norm": 0.5135608315467834, - "learning_rate": 0.000288354983044524, - "loss": 1.2778, - "num_input_tokens_seen": 884736000, - "step": 13500, - "train_runtime": 6489.2417, - "train_tokens_per_second": 136338.889 - }, - { - "epoch": 0.136, - "grad_norm": 0.4828953742980957, - "learning_rate": 0.00028817040081903245, - "loss": 1.2864, - "num_input_tokens_seen": 891289600, - "step": 13600, - "train_runtime": 6540.9813, - "train_tokens_per_second": 136262.368 - }, - { - "epoch": 0.137, - "grad_norm": 0.5756350755691528, - "learning_rate": 0.00028798442721913867, - "loss": 1.2858, - "num_input_tokens_seen": 897843200, - "step": 13700, - "train_runtime": 6588.3179, - "train_tokens_per_second": 136278.063 - }, - { - "epoch": 0.138, - "grad_norm": 0.5231483578681946, - "learning_rate": 0.00028779706411759465, - "loss": 1.282, - "num_input_tokens_seen": 904396800, - "step": 13800, - "train_runtime": 6635.0521, - "train_tokens_per_second": 136305.909 - }, - { - "epoch": 0.139, - "grad_norm": 0.5475858449935913, - "learning_rate": 0.00028760831340114484, - "loss": 1.2797, - "num_input_tokens_seen": 910950400, - "step": 13900, - "train_runtime": 6681.4731, - "train_tokens_per_second": 136339.754 - }, - { - "epoch": 0.14, - "grad_norm": 0.7064163684844971, - "learning_rate": 0.00028741817697050683, - "loss": 1.2927, - "num_input_tokens_seen": 917504000, - "step": 14000, - "train_runtime": 6730.4553, - "train_tokens_per_second": 136321.238 - }, - { - "epoch": 0.141, - "grad_norm": 0.5267386436462402, - "learning_rate": 0.00028722665674035233, - "loss": 1.2815, - "num_input_tokens_seen": 924057600, - "step": 14100, - "train_runtime": 6782.7717, - "train_tokens_per_second": 136235.987 - }, - { - "epoch": 0.142, - "grad_norm": 0.5816136598587036, - "learning_rate": 0.0002870337546392879, - "loss": 1.2983, - "num_input_tokens_seen": 930611200, - "step": 14200, - "train_runtime": 6829.7567, - "train_tokens_per_second": 136258.323 - }, - { - "epoch": 0.143, - "grad_norm": 0.4982451796531677, - "learning_rate": 0.00028683947260983576, - "loss": 1.3026, - "num_input_tokens_seen": 937164800, - "step": 14300, - "train_runtime": 6877.8163, - "train_tokens_per_second": 136259.063 - }, - { - "epoch": 0.144, - "grad_norm": 0.49408379197120667, - "learning_rate": 0.00028664381260841356, - "loss": 1.2869, - "num_input_tokens_seen": 943718400, - "step": 14400, - "train_runtime": 6923.5994, - "train_tokens_per_second": 136304.593 - }, - { - "epoch": 0.145, - "grad_norm": 0.4885796904563904, - "learning_rate": 0.0002864467766053154, - "loss": 1.2768, - "num_input_tokens_seen": 950272000, - "step": 14500, - "train_runtime": 6969.9199, - "train_tokens_per_second": 136339.014 - }, - { - "epoch": 0.146, - "grad_norm": 0.5424348711967468, - "learning_rate": 0.00028624836658469165, - "loss": 1.2806, - "num_input_tokens_seen": 956825600, - "step": 14600, - "train_runtime": 7020.7829, - "train_tokens_per_second": 136284.743 - }, - { - "epoch": 0.147, - "grad_norm": 0.4333992898464203, - "learning_rate": 0.00028604858454452906, - "loss": 1.2776, - "num_input_tokens_seen": 963379200, - "step": 14700, - "train_runtime": 7066.7012, - "train_tokens_per_second": 136326.58 - }, - { - "epoch": 0.148, - "grad_norm": 1.3118066787719727, - "learning_rate": 0.00028584743249663057, - "loss": 1.3039, - "num_input_tokens_seen": 969932800, - "step": 14800, - "train_runtime": 7115.8691, - "train_tokens_per_second": 136305.6 - }, - { - "epoch": 0.149, - "grad_norm": 0.5320950150489807, - "learning_rate": 0.000285644912466595, - "loss": 1.2801, - "num_input_tokens_seen": 976486400, - "step": 14900, - "train_runtime": 7162.6662, - "train_tokens_per_second": 136330.016 - }, - { - "epoch": 0.15, - "grad_norm": 0.6902542114257812, - "learning_rate": 0.00028544102649379684, - "loss": 1.2832, - "num_input_tokens_seen": 983040000, - "step": 15000, - "train_runtime": 7209.6657, - "train_tokens_per_second": 136350.29 - }, - { - "epoch": 0.151, - "grad_norm": 0.544683039188385, - "learning_rate": 0.00028523577663136556, - "loss": 1.2948, - "num_input_tokens_seen": 989593600, - "step": 15100, - "train_runtime": 7261.0326, - "train_tokens_per_second": 136288.275 - }, - { - "epoch": 0.152, - "grad_norm": 0.500091552734375, - "learning_rate": 0.000285029164946165, - "loss": 1.2746, - "num_input_tokens_seen": 996147200, - "step": 15200, - "train_runtime": 7306.6445, - "train_tokens_per_second": 136334.427 - }, - { - "epoch": 0.153, - "grad_norm": 0.4995329678058624, - "learning_rate": 0.0002848211935187725, - "loss": 1.2893, - "num_input_tokens_seen": 1002700800, - "step": 15300, - "train_runtime": 7353.2711, - "train_tokens_per_second": 136361.19 - }, - { - "epoch": 0.154, - "grad_norm": 0.42985284328460693, - "learning_rate": 0.0002846118644434581, - "loss": 1.3077, - "num_input_tokens_seen": 1009254400, - "step": 15400, - "train_runtime": 7400.7889, - "train_tokens_per_second": 136371.192 - }, - { - "epoch": 0.155, - "grad_norm": 0.4847468137741089, - "learning_rate": 0.00028440117982816326, - "loss": 1.2723, - "num_input_tokens_seen": 1015808000, - "step": 15500, - "train_runtime": 7452.7433, - "train_tokens_per_second": 136299.877 - }, - { - "epoch": 0.156, - "grad_norm": 0.47867411375045776, - "learning_rate": 0.0002841891417944796, - "loss": 1.2754, - "num_input_tokens_seen": 1022361600, - "step": 15600, - "train_runtime": 7498.8195, - "train_tokens_per_second": 136336.339 - }, - { - "epoch": 0.157, - "grad_norm": 0.43365904688835144, - "learning_rate": 0.0002839757524776279, - "loss": 1.2737, - "num_input_tokens_seen": 1028915200, - "step": 15700, - "train_runtime": 7545.0284, - "train_tokens_per_second": 136369.957 - }, - { - "epoch": 0.158, - "grad_norm": 0.5739541053771973, - "learning_rate": 0.0002837610140264361, - "loss": 1.286, - "num_input_tokens_seen": 1035468800, - "step": 15800, - "train_runtime": 7597.8039, - "train_tokens_per_second": 136285.275 - }, - { - "epoch": 0.159, - "grad_norm": 0.4836307168006897, - "learning_rate": 0.0002835449286033182, - "loss": 1.2779, - "num_input_tokens_seen": 1042022400, - "step": 15900, - "train_runtime": 7643.6023, - "train_tokens_per_second": 136326.088 - }, - { - "epoch": 0.16, - "grad_norm": 0.5712729692459106, - "learning_rate": 0.0002833274983842518, - "loss": 1.2702, - "num_input_tokens_seen": 1048576000, - "step": 16000, - "train_runtime": 7691.0096, - "train_tokens_per_second": 136337.887 - }, - { - "epoch": 0.161, - "grad_norm": 0.48568034172058105, - "learning_rate": 0.0002831087255587569, - "loss": 1.2696, - "num_input_tokens_seen": 1055129600, - "step": 16100, - "train_runtime": 7737.6132, - "train_tokens_per_second": 136363.705 - }, - { - "epoch": 0.162, - "grad_norm": 0.5240116715431213, - "learning_rate": 0.0002828886123298734, - "loss": 1.2636, - "num_input_tokens_seen": 1061683200, - "step": 16200, - "train_runtime": 7790.0975, - "train_tokens_per_second": 136286.253 - }, - { - "epoch": 0.163, - "grad_norm": 0.4505080580711365, - "learning_rate": 0.00028266716091413906, - "loss": 1.2679, - "num_input_tokens_seen": 1068236800, - "step": 16300, - "train_runtime": 7837.0156, - "train_tokens_per_second": 136306.581 - }, - { - "epoch": 0.164, - "grad_norm": 0.38184958696365356, - "learning_rate": 0.0002824443735415673, - "loss": 1.2801, - "num_input_tokens_seen": 1074790400, - "step": 16400, - "train_runtime": 7884.0198, - "train_tokens_per_second": 136325.178 - }, - { - "epoch": 0.165, - "grad_norm": 0.860382616519928, - "learning_rate": 0.0002822202524556243, - "loss": 1.2737, - "num_input_tokens_seen": 1081344000, - "step": 16500, - "train_runtime": 7930.486, - "train_tokens_per_second": 136352.803 - }, - { - "epoch": 0.166, - "grad_norm": 0.771594226360321, - "learning_rate": 0.00028199479991320695, - "loss": 1.2876, - "num_input_tokens_seen": 1087897600, - "step": 16600, - "train_runtime": 7977.0943, - "train_tokens_per_second": 136377.678 - }, - { - "epoch": 0.167, - "grad_norm": 0.4533759653568268, - "learning_rate": 0.00028176801818461994, - "loss": 1.2769, - "num_input_tokens_seen": 1094451200, - "step": 16700, - "train_runtime": 8024.6165, - "train_tokens_per_second": 136386.73 - }, - { - "epoch": 0.168, - "grad_norm": 0.548772394657135, - "learning_rate": 0.00028153990955355273, - "loss": 1.2647, - "num_input_tokens_seen": 1101004800, - "step": 16800, - "train_runtime": 8077.0632, - "train_tokens_per_second": 136312.515 - }, - { - "epoch": 0.169, - "grad_norm": 0.5390068888664246, - "learning_rate": 0.00028131047631705665, - "loss": 1.2799, - "num_input_tokens_seen": 1107558400, - "step": 16900, - "train_runtime": 8123.3347, - "train_tokens_per_second": 136342.824 - }, - { - "epoch": 0.17, - "grad_norm": 0.4429817795753479, - "learning_rate": 0.00028107972078552187, - "loss": 1.2727, - "num_input_tokens_seen": 1114112000, - "step": 17000, - "train_runtime": 8169.0719, - "train_tokens_per_second": 136381.71 - }, - { - "epoch": 0.171, - "grad_norm": 0.6212127208709717, - "learning_rate": 0.0002808476452826541, - "loss": 1.2743, - "num_input_tokens_seen": 1120665600, - "step": 17100, - "train_runtime": 8217.1136, - "train_tokens_per_second": 136381.904 - }, - { - "epoch": 0.172, - "grad_norm": 0.44569867849349976, - "learning_rate": 0.00028061425214545094, - "loss": 1.2628, - "num_input_tokens_seen": 1127219200, - "step": 17200, - "train_runtime": 8268.2495, - "train_tokens_per_second": 136331.057 - }, - { - "epoch": 0.173, - "grad_norm": 0.5025371313095093, - "learning_rate": 0.00028037954372417883, - "loss": 1.2651, - "num_input_tokens_seen": 1133772800, - "step": 17300, - "train_runtime": 8315.4333, - "train_tokens_per_second": 136345.607 - }, - { - "epoch": 0.174, - "grad_norm": 0.5257975459098816, - "learning_rate": 0.0002801435223823488, - "loss": 1.2701, - "num_input_tokens_seen": 1140326400, - "step": 17400, - "train_runtime": 8361.8666, - "train_tokens_per_second": 136372.23 - }, - { - "epoch": 0.175, - "grad_norm": 0.6858969926834106, - "learning_rate": 0.00027990619049669336, - "loss": 1.2759, - "num_input_tokens_seen": 1146880000, - "step": 17500, - "train_runtime": 8408.7431, - "train_tokens_per_second": 136391.371 - }, - { - "epoch": 0.176, - "grad_norm": 0.5586578845977783, - "learning_rate": 0.00027966755045714177, - "loss": 1.2782, - "num_input_tokens_seen": 1153433600, - "step": 17600, - "train_runtime": 8455.5155, - "train_tokens_per_second": 136411.978 - }, - { - "epoch": 0.177, - "grad_norm": 0.583242654800415, - "learning_rate": 0.00027942760466679673, - "loss": 1.287, - "num_input_tokens_seen": 1159987200, - "step": 17700, - "train_runtime": 8508.2754, - "train_tokens_per_second": 136336.349 - }, - { - "epoch": 0.178, - "grad_norm": 0.5521747469902039, - "learning_rate": 0.00027918635554190956, - "loss": 1.2704, - "num_input_tokens_seen": 1166540800, - "step": 17800, - "train_runtime": 8555.5497, - "train_tokens_per_second": 136349.018 - }, - { - "epoch": 0.179, - "grad_norm": 0.6325215697288513, - "learning_rate": 0.00027894380551185636, - "loss": 1.2912, - "num_input_tokens_seen": 1173094400, - "step": 17900, - "train_runtime": 8602.3857, - "train_tokens_per_second": 136368.495 - }, - { - "epoch": 0.18, - "grad_norm": 0.44643789529800415, - "learning_rate": 0.00027869995701911314, - "loss": 1.2762, - "num_input_tokens_seen": 1179648000, - "step": 18000, - "train_runtime": 8649.7648, - "train_tokens_per_second": 136379.2 - }, - { - "epoch": 0.181, - "grad_norm": 0.49556615948677063, - "learning_rate": 0.0002784548125192316, - "loss": 1.2577, - "num_input_tokens_seen": 1186201600, - "step": 18100, - "train_runtime": 8701.0558, - "train_tokens_per_second": 136328.467 - }, - { - "epoch": 0.182, - "grad_norm": 0.5336231589317322, - "learning_rate": 0.0002782083744808141, - "loss": 1.2629, - "num_input_tokens_seen": 1192755200, - "step": 18200, - "train_runtime": 8748.3794, - "train_tokens_per_second": 136340.131 - }, - { - "epoch": 0.183, - "grad_norm": 0.3993295431137085, - "learning_rate": 0.000277960645385489, - "loss": 1.2621, - "num_input_tokens_seen": 1199308800, - "step": 18300, - "train_runtime": 8795.9903, - "train_tokens_per_second": 136347.217 - }, - { - "epoch": 0.184, - "grad_norm": 0.5608197450637817, - "learning_rate": 0.00027771162772788544, - "loss": 1.2746, - "num_input_tokens_seen": 1205862400, - "step": 18400, - "train_runtime": 8844.0918, - "train_tokens_per_second": 136346.663 - }, - { - "epoch": 0.185, - "grad_norm": 0.5299677848815918, - "learning_rate": 0.00027746132401560857, - "loss": 1.2608, - "num_input_tokens_seen": 1212416000, - "step": 18500, - "train_runtime": 8890.974, - "train_tokens_per_second": 136364.812 - }, - { - "epoch": 0.186, - "grad_norm": 0.5247559547424316, - "learning_rate": 0.0002772097367692139, - "loss": 1.2628, - "num_input_tokens_seen": 1218969600, - "step": 18600, - "train_runtime": 8937.3092, - "train_tokens_per_second": 136391.119 - }, - { - "epoch": 0.187, - "grad_norm": 0.4991471469402313, - "learning_rate": 0.00027695686852218226, - "loss": 1.2617, - "num_input_tokens_seen": 1225523200, - "step": 18700, - "train_runtime": 8984.1463, - "train_tokens_per_second": 136409.532 - }, - { - "epoch": 0.188, - "grad_norm": 0.4922790229320526, - "learning_rate": 0.00027670272182089416, - "loss": 1.277, - "num_input_tokens_seen": 1232076800, - "step": 18800, - "train_runtime": 9036.4876, - "train_tokens_per_second": 136344.656 - }, - { - "epoch": 0.189, - "grad_norm": 0.49377188086509705, - "learning_rate": 0.0002764472992246039, - "loss": 1.2767, - "num_input_tokens_seen": 1238630400, - "step": 18900, - "train_runtime": 9084.3866, - "train_tokens_per_second": 136347.169 - }, - { - "epoch": 0.19, - "grad_norm": 0.6417357921600342, - "learning_rate": 0.0002761906033054143, - "loss": 1.2616, - "num_input_tokens_seen": 1245184000, - "step": 19000, - "train_runtime": 9130.7221, - "train_tokens_per_second": 136373.004 - }, - { - "epoch": 0.191, - "grad_norm": 0.44580140709877014, - "learning_rate": 0.00027593263664825045, - "loss": 1.2686, - "num_input_tokens_seen": 1251737600, - "step": 19100, - "train_runtime": 9176.6051, - "train_tokens_per_second": 136405.303 - }, - { - "epoch": 0.192, - "grad_norm": 0.5867856740951538, - "learning_rate": 0.00027567340185083363, - "loss": 1.2638, - "num_input_tokens_seen": 1258291200, - "step": 19200, - "train_runtime": 9229.719, - "train_tokens_per_second": 136330.391 - }, - { - "epoch": 0.193, - "grad_norm": 0.4900195896625519, - "learning_rate": 0.00027541290152365537, - "loss": 1.263, - "num_input_tokens_seen": 1264844800, - "step": 19300, - "train_runtime": 9276.2421, - "train_tokens_per_second": 136353.147 - }, - { - "epoch": 0.194, - "grad_norm": 0.49572521448135376, - "learning_rate": 0.00027515113828995117, - "loss": 1.273, - "num_input_tokens_seen": 1271398400, - "step": 19400, - "train_runtime": 9323.5363, - "train_tokens_per_second": 136364.396 - }, - { - "epoch": 0.195, - "grad_norm": 0.440213680267334, - "learning_rate": 0.00027488811478567374, - "loss": 1.2657, - "num_input_tokens_seen": 1277952000, - "step": 19500, - "train_runtime": 9371.4717, - "train_tokens_per_second": 136366.201 - }, - { - "epoch": 0.196, - "grad_norm": 0.5604475736618042, - "learning_rate": 0.0002746238336594671, - "loss": 1.2619, - "num_input_tokens_seen": 1284505600, - "step": 19600, - "train_runtime": 9417.129, - "train_tokens_per_second": 136400.978 - }, - { - "epoch": 0.197, - "grad_norm": 0.45344123244285583, - "learning_rate": 0.00027435829757263894, - "loss": 1.2573, - "num_input_tokens_seen": 1291059200, - "step": 19700, - "train_runtime": 9468.5748, - "train_tokens_per_second": 136352.009 - }, - { - "epoch": 0.198, - "grad_norm": 0.7260287404060364, - "learning_rate": 0.0002740915091991349, - "loss": 1.2668, - "num_input_tokens_seen": 1297612800, - "step": 19800, - "train_runtime": 9515.3702, - "train_tokens_per_second": 136370.186 - }, - { - "epoch": 0.199, - "grad_norm": 0.47865310311317444, - "learning_rate": 0.0002738234712255109, - "loss": 1.2674, - "num_input_tokens_seen": 1304166400, - "step": 19900, - "train_runtime": 9562.0606, - "train_tokens_per_second": 136389.682 - }, - { - "epoch": 0.2, - "grad_norm": 0.8422930240631104, - "learning_rate": 0.00027355418635090635, - "loss": 1.2671, - "num_input_tokens_seen": 1310720000, - "step": 20000, - "train_runtime": 9614.8867, - "train_tokens_per_second": 136321.939 - }, - { - "epoch": 0.201, - "grad_norm": 0.8500565886497498, - "learning_rate": 0.000273283657287017, - "loss": 1.2722, - "num_input_tokens_seen": 1317273600, - "step": 20100, - "train_runtime": 9662.5316, - "train_tokens_per_second": 136327.999 - }, - { - "epoch": 0.202, - "grad_norm": 0.4511219263076782, - "learning_rate": 0.00027301188675806745, - "loss": 1.257, - "num_input_tokens_seen": 1323827200, - "step": 20200, - "train_runtime": 9710.3614, - "train_tokens_per_second": 136331.404 - }, - { - "epoch": 0.203, - "grad_norm": 0.6040441393852234, - "learning_rate": 0.0002727388775007839, - "loss": 1.2787, - "num_input_tokens_seen": 1330380800, - "step": 20300, - "train_runtime": 9757.2415, - "train_tokens_per_second": 136348.045 - }, - { - "epoch": 0.204, - "grad_norm": 0.531548798084259, - "learning_rate": 0.0002724646322643666, - "loss": 1.2567, - "num_input_tokens_seen": 1336934400, - "step": 20400, - "train_runtime": 9803.907, - "train_tokens_per_second": 136367.512 - }, - { - "epoch": 0.205, - "grad_norm": 0.5128377079963684, - "learning_rate": 0.000272189153810462, - "loss": 1.2634, - "num_input_tokens_seen": 1343488000, - "step": 20500, - "train_runtime": 9849.6975, - "train_tokens_per_second": 136398.909 - }, - { - "epoch": 0.206, - "grad_norm": 0.5763120651245117, - "learning_rate": 0.0002719124449131351, - "loss": 1.2708, - "num_input_tokens_seen": 1350041600, - "step": 20600, - "train_runtime": 9902.5747, - "train_tokens_per_second": 136332.382 - }, - { - "epoch": 0.207, - "grad_norm": 0.5266316533088684, - "learning_rate": 0.00027163450835884144, - "loss": 1.2579, - "num_input_tokens_seen": 1356595200, - "step": 20700, - "train_runtime": 9950.4471, - "train_tokens_per_second": 136335.1 - }, - { - "epoch": 0.208, - "grad_norm": 0.6279749274253845, - "learning_rate": 0.00027135534694639894, - "loss": 1.2566, - "num_input_tokens_seen": 1363148800, - "step": 20800, - "train_runtime": 9997.0613, - "train_tokens_per_second": 136354.951 - }, - { - "epoch": 0.209, - "grad_norm": 0.5421542525291443, - "learning_rate": 0.00027107496348696003, - "loss": 1.2687, - "num_input_tokens_seen": 1369702400, - "step": 20900, - "train_runtime": 10044.3146, - "train_tokens_per_second": 136365.939 - }, - { - "epoch": 0.21, - "grad_norm": 0.5376498699188232, - "learning_rate": 0.00027079336080398296, - "loss": 1.2772, - "num_input_tokens_seen": 1376256000, - "step": 21000, - "train_runtime": 10090.6051, - "train_tokens_per_second": 136389.839 - }, - { - "epoch": 0.211, - "grad_norm": 0.41719597578048706, - "learning_rate": 0.00027051054173320366, - "loss": 1.2502, - "num_input_tokens_seen": 1382809600, - "step": 21100, - "train_runtime": 10143.3243, - "train_tokens_per_second": 136327.063 - }, - { - "epoch": 0.212, - "grad_norm": 0.4714694321155548, - "learning_rate": 0.000270226509122607, - "loss": 1.2537, - "num_input_tokens_seen": 1389363200, - "step": 21200, - "train_runtime": 10188.8874, - "train_tokens_per_second": 136360.639 - }, - { - "epoch": 0.213, - "grad_norm": 0.4616274833679199, - "learning_rate": 0.0002699412658323983, - "loss": 1.2571, - "num_input_tokens_seen": 1395916800, - "step": 21300, - "train_runtime": 10236.5378, - "train_tokens_per_second": 136366.107 - }, - { - "epoch": 0.214, - "grad_norm": 0.4215717911720276, - "learning_rate": 0.00026965481473497423, - "loss": 1.2687, - "num_input_tokens_seen": 1402470400, - "step": 21400, - "train_runtime": 10282.9404, - "train_tokens_per_second": 136388.071 - }, - { - "epoch": 0.215, - "grad_norm": 0.5976271033287048, - "learning_rate": 0.0002693671587148942, - "loss": 1.2573, - "num_input_tokens_seen": 1409024000, - "step": 21500, - "train_runtime": 10329.955, - "train_tokens_per_second": 136401.756 - }, - { - "epoch": 0.216, - "grad_norm": 0.5200098752975464, - "learning_rate": 0.0002690783006688511, - "loss": 1.247, - "num_input_tokens_seen": 1415577600, - "step": 21600, - "train_runtime": 10382.0767, - "train_tokens_per_second": 136348.213 - }, - { - "epoch": 0.217, - "grad_norm": 0.8170623779296875, - "learning_rate": 0.0002687882435056423, - "loss": 1.2562, - "num_input_tokens_seen": 1422131200, - "step": 21700, - "train_runtime": 10429.827, - "train_tokens_per_second": 136352.329 - }, - { - "epoch": 0.218, - "grad_norm": 0.52497398853302, - "learning_rate": 0.0002684969901461402, - "loss": 1.2533, - "num_input_tokens_seen": 1428684800, - "step": 21800, - "train_runtime": 10476.8104, - "train_tokens_per_second": 136366.388 - }, - { - "epoch": 0.219, - "grad_norm": 0.4417087137699127, - "learning_rate": 0.000268204543523263, - "loss": 1.2721, - "num_input_tokens_seen": 1435238400, - "step": 21900, - "train_runtime": 10524.1028, - "train_tokens_per_second": 136376.319 - }, - { - "epoch": 0.22, - "grad_norm": 0.5729189515113831, - "learning_rate": 0.0002679109065819447, - "loss": 1.2654, - "num_input_tokens_seen": 1441792000, - "step": 22000, - "train_runtime": 10572.3447, - "train_tokens_per_second": 136373.911 - }, - { - "epoch": 0.221, - "grad_norm": 0.5111753940582275, - "learning_rate": 0.0002676160822791062, - "loss": 1.2581, - "num_input_tokens_seen": 1448345600, - "step": 22100, - "train_runtime": 10619.3771, - "train_tokens_per_second": 136387.057 - }, - { - "epoch": 0.222, - "grad_norm": 0.4302677512168884, - "learning_rate": 0.00026732007358362496, - "loss": 1.2581, - "num_input_tokens_seen": 1454899200, - "step": 22200, - "train_runtime": 10666.0714, - "train_tokens_per_second": 136404.413 - }, - { - "epoch": 0.223, - "grad_norm": 3.9242477416992188, - "learning_rate": 0.0002670228834763052, - "loss": 1.2872, - "num_input_tokens_seen": 1461452800, - "step": 22300, - "train_runtime": 10719.3985, - "train_tokens_per_second": 136337.203 - }, - { - "epoch": 0.224, - "grad_norm": 0.7662601470947266, - "learning_rate": 0.00026672451494984804, - "loss": 1.2602, - "num_input_tokens_seen": 1468006400, - "step": 22400, - "train_runtime": 10767.2807, - "train_tokens_per_second": 136339.568 - }, - { - "epoch": 0.225, - "grad_norm": 0.48544740676879883, - "learning_rate": 0.0002664249710088213, - "loss": 1.257, - "num_input_tokens_seen": 1474560000, - "step": 22500, - "train_runtime": 10813.982, - "train_tokens_per_second": 136356.802 - }, - { - "epoch": 0.226, - "grad_norm": 0.4495686888694763, - "learning_rate": 0.00026612425466962893, - "loss": 1.2552, - "num_input_tokens_seen": 1481113600, - "step": 22600, - "train_runtime": 10860.2948, - "train_tokens_per_second": 136378.766 - }, - { - "epoch": 0.227, - "grad_norm": 0.5733143091201782, - "learning_rate": 0.00026582236896048134, - "loss": 1.2403, - "num_input_tokens_seen": 1487667200, - "step": 22700, - "train_runtime": 10907.2107, - "train_tokens_per_second": 136393.001 - }, - { - "epoch": 0.228, - "grad_norm": 0.7318263649940491, - "learning_rate": 0.00026551931692136413, - "loss": 1.2468, - "num_input_tokens_seen": 1494220800, - "step": 22800, - "train_runtime": 10953.9499, - "train_tokens_per_second": 136409.315 - }, - { - "epoch": 0.229, - "grad_norm": 0.5192084312438965, - "learning_rate": 0.00026521510160400804, - "loss": 1.2458, - "num_input_tokens_seen": 1500774400, - "step": 22900, - "train_runtime": 11006.6198, - "train_tokens_per_second": 136351.98 - }, - { - "epoch": 0.23, - "grad_norm": 0.4651305079460144, - "learning_rate": 0.00026490972607185793, - "loss": 1.2601, - "num_input_tokens_seen": 1507328000, - "step": 23000, - "train_runtime": 11053.8305, - "train_tokens_per_second": 136362.504 - }, - { - "epoch": 0.231, - "grad_norm": 0.5470275282859802, - "learning_rate": 0.0002646031934000421, - "loss": 1.2405, - "num_input_tokens_seen": 1513881600, - "step": 23100, - "train_runtime": 11099.6418, - "train_tokens_per_second": 136390.132 - }, - { - "epoch": 0.232, - "grad_norm": 0.519235372543335, - "learning_rate": 0.00026429550667534095, - "loss": 1.2586, - "num_input_tokens_seen": 1520435200, - "step": 23200, - "train_runtime": 11152.1986, - "train_tokens_per_second": 136335.018 - }, - { - "epoch": 0.233, - "grad_norm": 0.4892626404762268, - "learning_rate": 0.0002639866689961565, - "loss": 1.2595, - "num_input_tokens_seen": 1526988800, - "step": 23300, - "train_runtime": 11199.2653, - "train_tokens_per_second": 136347.23 - }, - { - "epoch": 0.234, - "grad_norm": 0.4089221656322479, - "learning_rate": 0.00026367668347248083, - "loss": 1.2393, - "num_input_tokens_seen": 1533542400, - "step": 23400, - "train_runtime": 11247.6635, - "train_tokens_per_second": 136343.196 - }, - { - "epoch": 0.235, - "grad_norm": 0.467582106590271, - "learning_rate": 0.0002633655532258646, - "loss": 1.2534, - "num_input_tokens_seen": 1540096000, - "step": 23500, - "train_runtime": 11294.1646, - "train_tokens_per_second": 136362.099 - }, - { - "epoch": 0.236, - "grad_norm": 0.48117080330848694, - "learning_rate": 0.000263053281389386, - "loss": 1.2644, - "num_input_tokens_seen": 1546649600, - "step": 23600, - "train_runtime": 11340.9021, - "train_tokens_per_second": 136378.004 - }, - { - "epoch": 0.237, - "grad_norm": 0.4495629072189331, - "learning_rate": 0.0002627398711076189, - "loss": 1.2442, - "num_input_tokens_seen": 1553203200, - "step": 23700, - "train_runtime": 11387.7566, - "train_tokens_per_second": 136392.377 - }, - { - "epoch": 0.238, - "grad_norm": 0.4376384913921356, - "learning_rate": 0.0002624253255366014, - "loss": 1.2489, - "num_input_tokens_seen": 1559756800, - "step": 23800, - "train_runtime": 11439.8893, - "train_tokens_per_second": 136343.696 - }, - { - "epoch": 0.239, - "grad_norm": 0.4419648349285126, - "learning_rate": 0.0002621096478438039, - "loss": 1.2353, - "num_input_tokens_seen": 1566310400, - "step": 23900, - "train_runtime": 11486.001, - "train_tokens_per_second": 136366.904 - }, - { - "epoch": 0.24, - "grad_norm": 0.669739305973053, - "learning_rate": 0.00026179284120809727, - "loss": 1.2528, - "num_input_tokens_seen": 1572864000, - "step": 24000, - "train_runtime": 11533.9608, - "train_tokens_per_second": 136368.072 - }, - { - "epoch": 0.241, - "grad_norm": 0.4047415554523468, - "learning_rate": 0.0002614749088197208, - "loss": 1.2679, - "num_input_tokens_seen": 1579417600, - "step": 24100, - "train_runtime": 11582.9583, - "train_tokens_per_second": 136357.013 - }, - { - "epoch": 0.242, - "grad_norm": 0.5224933624267578, - "learning_rate": 0.00026115585388025015, - "loss": 1.2425, - "num_input_tokens_seen": 1585971200, - "step": 24200, - "train_runtime": 11630.022, - "train_tokens_per_second": 136368.719 - }, - { - "epoch": 0.243, - "grad_norm": 0.5125856399536133, - "learning_rate": 0.00026083567960256493, - "loss": 1.2423, - "num_input_tokens_seen": 1592524800, - "step": 24300, - "train_runtime": 11677.13, - "train_tokens_per_second": 136379.813 - }, - { - "epoch": 0.244, - "grad_norm": 0.5344144701957703, - "learning_rate": 0.00026051438921081667, - "loss": 1.2431, - "num_input_tokens_seen": 1599078400, - "step": 24400, - "train_runtime": 11723.5349, - "train_tokens_per_second": 136398.997 - }, - { - "epoch": 0.245, - "grad_norm": 0.4386890232563019, - "learning_rate": 0.00026019198594039595, - "loss": 1.2426, - "num_input_tokens_seen": 1605632000, - "step": 24500, - "train_runtime": 11773.1296, - "train_tokens_per_second": 136381.069 - }, - { - "epoch": 0.246, - "grad_norm": 0.4986630082130432, - "learning_rate": 0.00025986847303790026, - "loss": 1.2531, - "num_input_tokens_seen": 1612185600, - "step": 24600, - "train_runtime": 11820.6579, - "train_tokens_per_second": 136387.13 - }, - { - "epoch": 0.247, - "grad_norm": 0.5271715521812439, - "learning_rate": 0.00025954385376110076, - "loss": 1.249, - "num_input_tokens_seen": 1618739200, - "step": 24700, - "train_runtime": 11867.4874, - "train_tokens_per_second": 136401.172 - }, - { - "epoch": 0.248, - "grad_norm": 0.45263609290122986, - "learning_rate": 0.00025921813137891005, - "loss": 1.2507, - "num_input_tokens_seen": 1625292800, - "step": 24800, - "train_runtime": 11919.9131, - "train_tokens_per_second": 136351.061 - }, - { - "epoch": 0.249, - "grad_norm": 0.5932081937789917, - "learning_rate": 0.000258891309171349, - "loss": 1.2438, - "num_input_tokens_seen": 1631846400, - "step": 24900, - "train_runtime": 11962.6395, - "train_tokens_per_second": 136411.902 - }, - { - "epoch": 0.25, - "grad_norm": 0.5539859533309937, - "learning_rate": 0.00025856339042951344, - "loss": 1.2548, - "num_input_tokens_seen": 1638400000, - "step": 25000, - "train_runtime": 12014.9411, - "train_tokens_per_second": 136363.548 - }, - { - "epoch": 0.251, - "grad_norm": 0.5236772298812866, - "learning_rate": 0.0002582343784555415, - "loss": 1.2386, - "num_input_tokens_seen": 1644953600, - "step": 25100, - "train_runtime": 12062.3997, - "train_tokens_per_second": 136370.344 - }, - { - "epoch": 0.252, - "grad_norm": 0.5913048982620239, - "learning_rate": 0.00025790427656258017, - "loss": 1.2354, - "num_input_tokens_seen": 1651507200, - "step": 25200, - "train_runtime": 12108.5333, - "train_tokens_per_second": 136392.01 - }, - { - "epoch": 0.253, - "grad_norm": 0.5929732322692871, - "learning_rate": 0.00025757308807475185, - "loss": 1.2582, - "num_input_tokens_seen": 1658060800, - "step": 25300, - "train_runtime": 12154.8252, - "train_tokens_per_second": 136411.736 - }, - { - "epoch": 0.254, - "grad_norm": 0.4542764723300934, - "learning_rate": 0.00025724081632712086, - "loss": 1.2488, - "num_input_tokens_seen": 1664614400, - "step": 25400, - "train_runtime": 12207.8935, - "train_tokens_per_second": 136355.58 - }, - { - "epoch": 0.255, - "grad_norm": 1.0848513841629028, - "learning_rate": 0.0002569074646656601, - "loss": 1.2375, - "num_input_tokens_seen": 1671168000, - "step": 25500, - "train_runtime": 12254.3162, - "train_tokens_per_second": 136373.827 - }, - { - "epoch": 0.256, - "grad_norm": 0.5190780162811279, - "learning_rate": 0.00025657303644721695, - "loss": 1.236, - "num_input_tokens_seen": 1677721600, - "step": 25600, - "train_runtime": 12301.2378, - "train_tokens_per_second": 136386.405 - }, - { - "epoch": 0.257, - "grad_norm": 0.43418362736701965, - "learning_rate": 0.00025623753503948004, - "loss": 1.2484, - "num_input_tokens_seen": 1684275200, - "step": 25700, - "train_runtime": 12347.684, - "train_tokens_per_second": 136404.138 - }, - { - "epoch": 0.258, - "grad_norm": 0.4586409032344818, - "learning_rate": 0.00025590096382094475, - "loss": 1.2674, - "num_input_tokens_seen": 1690828800, - "step": 25800, - "train_runtime": 12394.5809, - "train_tokens_per_second": 136416.778 - }, - { - "epoch": 0.259, - "grad_norm": 0.5069702863693237, - "learning_rate": 0.00025556332618087945, - "loss": 1.2428, - "num_input_tokens_seen": 1697382400, - "step": 25900, - "train_runtime": 12447.2116, - "train_tokens_per_second": 136366.478 - }, - { - "epoch": 0.26, - "grad_norm": 0.591788649559021, - "learning_rate": 0.00025522462551929155, - "loss": 1.2417, - "num_input_tokens_seen": 1703936000, - "step": 26000, - "train_runtime": 12492.8891, - "train_tokens_per_second": 136392.47 - }, - { - "epoch": 0.261, - "grad_norm": 0.6001791954040527, - "learning_rate": 0.00025488486524689283, - "loss": 1.2407, - "num_input_tokens_seen": 1710489600, - "step": 26100, - "train_runtime": 12539.4548, - "train_tokens_per_second": 136408.61 - }, - { - "epoch": 0.262, - "grad_norm": 0.47005897760391235, - "learning_rate": 0.00025454404878506555, - "loss": 1.2558, - "num_input_tokens_seen": 1717043200, - "step": 26200, - "train_runtime": 12587.1655, - "train_tokens_per_second": 136412.221 - }, - { - "epoch": 0.263, - "grad_norm": 0.42708972096443176, - "learning_rate": 0.0002542021795658276, - "loss": 1.2445, - "num_input_tokens_seen": 1723596800, - "step": 26300, - "train_runtime": 12634.1294, - "train_tokens_per_second": 136423.868 - }, - { - "epoch": 0.264, - "grad_norm": 0.48100486397743225, - "learning_rate": 0.0002538592610317984, - "loss": 1.2416, - "num_input_tokens_seen": 1730150400, - "step": 26400, - "train_runtime": 12686.5075, - "train_tokens_per_second": 136377.202 - }, - { - "epoch": 0.265, - "grad_norm": 0.5689502954483032, - "learning_rate": 0.00025351529663616355, - "loss": 1.2476, - "num_input_tokens_seen": 1736704000, - "step": 26500, - "train_runtime": 12733.1403, - "train_tokens_per_second": 136392.435 - }, - { - "epoch": 0.266, - "grad_norm": 0.3999510705471039, - "learning_rate": 0.00025317028984264087, - "loss": 1.2507, - "num_input_tokens_seen": 1743257600, - "step": 26600, - "train_runtime": 12780.4326, - "train_tokens_per_second": 136400.515 - }, - { - "epoch": 0.267, - "grad_norm": 0.4349440336227417, - "learning_rate": 0.0002528242441254448, - "loss": 1.2359, - "num_input_tokens_seen": 1749811200, - "step": 26700, - "train_runtime": 12826.6298, - "train_tokens_per_second": 136420.184 - }, - { - "epoch": 0.268, - "grad_norm": 0.40468648076057434, - "learning_rate": 0.000252477162969252, - "loss": 1.2463, - "num_input_tokens_seen": 1756364800, - "step": 26800, - "train_runtime": 12873.4848, - "train_tokens_per_second": 136432.739 - }, - { - "epoch": 0.269, - "grad_norm": 0.5858653783798218, - "learning_rate": 0.00025212904986916584, - "loss": 1.2385, - "num_input_tokens_seen": 1762918400, - "step": 26900, - "train_runtime": 12926.2009, - "train_tokens_per_second": 136383.336 - }, - { - "epoch": 0.27, - "grad_norm": 0.4621046483516693, - "learning_rate": 0.00025177990833068133, - "loss": 1.2366, - "num_input_tokens_seen": 1769472000, - "step": 27000, - "train_runtime": 12973.4952, - "train_tokens_per_second": 136391.31 - }, - { - "epoch": 0.271, - "grad_norm": 0.4884892404079437, - "learning_rate": 0.0002514297418696499, - "loss": 1.2436, - "num_input_tokens_seen": 1776025600, - "step": 27100, - "train_runtime": 13021.2871, - "train_tokens_per_second": 136394.013 - }, - { - "epoch": 0.272, - "grad_norm": 0.5108981132507324, - "learning_rate": 0.0002510785540122439, - "loss": 1.2423, - "num_input_tokens_seen": 1782579200, - "step": 27200, - "train_runtime": 13068.0423, - "train_tokens_per_second": 136407.517 - }, - { - "epoch": 0.273, - "grad_norm": 0.3898067772388458, - "learning_rate": 0.0002507263482949212, - "loss": 1.2415, - "num_input_tokens_seen": 1789132800, - "step": 27300, - "train_runtime": 13113.8421, - "train_tokens_per_second": 136430.864 - }, - { - "epoch": 0.274, - "grad_norm": 0.5622383952140808, - "learning_rate": 0.0002503731282643894, - "loss": 1.2378, - "num_input_tokens_seen": 1795686400, - "step": 27400, - "train_runtime": 13161.1635, - "train_tokens_per_second": 136438.272 - }, - { - "epoch": 0.275, - "grad_norm": 0.7748796343803406, - "learning_rate": 0.0002500188974775704, - "loss": 1.248, - "num_input_tokens_seen": 1802240000, - "step": 27500, - "train_runtime": 13209.4471, - "train_tokens_per_second": 136435.688 - }, - { - "epoch": 0.276, - "grad_norm": 0.8867826461791992, - "learning_rate": 0.00024966365950156416, - "loss": 1.2409, - "num_input_tokens_seen": 1808793600, - "step": 27600, - "train_runtime": 13256.4066, - "train_tokens_per_second": 136446.751 - }, - { - "epoch": 0.277, - "grad_norm": 0.49997836351394653, - "learning_rate": 0.00024930741791361326, - "loss": 1.2382, - "num_input_tokens_seen": 1815347200, - "step": 27700, - "train_runtime": 13309.6196, - "train_tokens_per_second": 136393.62 - }, - { - "epoch": 0.278, - "grad_norm": 0.5048521161079407, - "learning_rate": 0.0002489501763010664, - "loss": 1.2351, - "num_input_tokens_seen": 1821900800, - "step": 27800, - "train_runtime": 13356.706, - "train_tokens_per_second": 136403.451 - }, - { - "epoch": 0.279, - "grad_norm": 0.5528578162193298, - "learning_rate": 0.00024859193826134285, - "loss": 1.2404, - "num_input_tokens_seen": 1828454400, - "step": 27900, - "train_runtime": 13405.5813, - "train_tokens_per_second": 136395.01 - }, - { - "epoch": 0.28, - "grad_norm": 0.44376805424690247, - "learning_rate": 0.00024823270740189556, - "loss": 1.2461, - "num_input_tokens_seen": 1835008000, - "step": 28000, - "train_runtime": 13452.7686, - "train_tokens_per_second": 136403.743 - }, - { - "epoch": 0.281, - "grad_norm": 0.5072674751281738, - "learning_rate": 0.00024787248734017527, - "loss": 1.2301, - "num_input_tokens_seen": 1841561600, - "step": 28100, - "train_runtime": 13501.0413, - "train_tokens_per_second": 136401.449 - }, - { - "epoch": 0.282, - "grad_norm": 0.46835577487945557, - "learning_rate": 0.0002475112817035941, - "loss": 1.237, - "num_input_tokens_seen": 1848115200, - "step": 28200, - "train_runtime": 13547.4814, - "train_tokens_per_second": 136417.622 - }, - { - "epoch": 0.283, - "grad_norm": 0.4893036186695099, - "learning_rate": 0.0002471490941294887, - "loss": 1.2612, - "num_input_tokens_seen": 1854668800, - "step": 28300, - "train_runtime": 13593.9904, - "train_tokens_per_second": 136432.993 - }, - { - "epoch": 0.284, - "grad_norm": 0.66542649269104, - "learning_rate": 0.000246785928265084, - "loss": 1.2405, - "num_input_tokens_seen": 1861222400, - "step": 28400, - "train_runtime": 13646.3147, - "train_tokens_per_second": 136390.113 - }, - { - "epoch": 0.285, - "grad_norm": 0.669306755065918, - "learning_rate": 0.0002464217877674562, - "loss": 1.2409, - "num_input_tokens_seen": 1867776000, - "step": 28500, - "train_runtime": 13692.502, - "train_tokens_per_second": 136408.671 - }, - { - "epoch": 0.286, - "grad_norm": 0.43464845418930054, - "learning_rate": 0.0002460566763034961, - "loss": 1.2435, - "num_input_tokens_seen": 1874329600, - "step": 28600, - "train_runtime": 13738.7564, - "train_tokens_per_second": 136426.438 - }, - { - "epoch": 0.287, - "grad_norm": 0.5084187388420105, - "learning_rate": 0.00024569059754987196, - "loss": 1.2572, - "num_input_tokens_seen": 1880883200, - "step": 28700, - "train_runtime": 13785.6191, - "train_tokens_per_second": 136438.065 - }, - { - "epoch": 0.288, - "grad_norm": 0.473603755235672, - "learning_rate": 0.00024532355519299296, - "loss": 1.2459, - "num_input_tokens_seen": 1887436800, - "step": 28800, - "train_runtime": 13838.5181, - "train_tokens_per_second": 136390.095 - }, - { - "epoch": 0.289, - "grad_norm": 0.493012011051178, - "learning_rate": 0.0002449555529289714, - "loss": 1.243, - "num_input_tokens_seen": 1893990400, - "step": 28900, - "train_runtime": 13886.1283, - "train_tokens_per_second": 136394.419 - }, - { - "epoch": 0.29, - "grad_norm": 0.7421333193778992, - "learning_rate": 0.0002445865944635861, - "loss": 1.2455, - "num_input_tokens_seen": 1900544000, - "step": 29000, - "train_runtime": 13931.9406, - "train_tokens_per_second": 136416.315 - }, - { - "epoch": 0.291, - "grad_norm": 0.5027185678482056, - "learning_rate": 0.0002442166835122446, - "loss": 1.2686, - "num_input_tokens_seen": 1907097600, - "step": 29100, - "train_runtime": 13980.446, - "train_tokens_per_second": 136411.785 - }, - { - "epoch": 0.292, - "grad_norm": 0.48427557945251465, - "learning_rate": 0.00024384582379994614, - "loss": 1.2369, - "num_input_tokens_seen": 1913651200, - "step": 29200, - "train_runtime": 14028.0456, - "train_tokens_per_second": 136416.095 - }, - { - "epoch": 0.293, - "grad_norm": 0.6620755195617676, - "learning_rate": 0.00024347401906124388, - "loss": 1.2317, - "num_input_tokens_seen": 1920204800, - "step": 29300, - "train_runtime": 14074.3372, - "train_tokens_per_second": 136433.054 - }, - { - "epoch": 0.294, - "grad_norm": 0.5745883584022522, - "learning_rate": 0.0002431012730402075, - "loss": 1.2443, - "num_input_tokens_seen": 1926758400, - "step": 29400, - "train_runtime": 14125.645, - "train_tokens_per_second": 136401.446 - }, - { - "epoch": 0.295, - "grad_norm": 0.441680908203125, - "learning_rate": 0.00024272758949038517, - "loss": 1.2393, - "num_input_tokens_seen": 1933312000, - "step": 29500, - "train_runtime": 14172.5336, - "train_tokens_per_second": 136412.588 - }, - { - "epoch": 0.296, - "grad_norm": 0.4417046904563904, - "learning_rate": 0.00024235297217476616, - "loss": 1.2371, - "num_input_tokens_seen": 1939865600, - "step": 29600, - "train_runtime": 14220.1572, - "train_tokens_per_second": 136416.608 - }, - { - "epoch": 0.297, - "grad_norm": 0.5888639688491821, - "learning_rate": 0.00024197742486574268, - "loss": 1.2344, - "num_input_tokens_seen": 1946419200, - "step": 29700, - "train_runtime": 14267.366, - "train_tokens_per_second": 136424.565 - }, - { - "epoch": 0.298, - "grad_norm": 0.4625283479690552, - "learning_rate": 0.0002416009513450719, - "loss": 1.2373, - "num_input_tokens_seen": 1952972800, - "step": 29800, - "train_runtime": 14318.8989, - "train_tokens_per_second": 136391.27 - }, - { - "epoch": 0.299, - "grad_norm": 0.47661375999450684, - "learning_rate": 0.00024122355540383806, - "loss": 1.2454, - "num_input_tokens_seen": 1959526400, - "step": 29900, - "train_runtime": 14365.8797, - "train_tokens_per_second": 136401.42 - }, - { - "epoch": 0.3, - "grad_norm": 0.727032482624054, - "learning_rate": 0.00024084524084241405, - "loss": 1.2379, - "num_input_tokens_seen": 1966080000, - "step": 30000, - "train_runtime": 14415.1273, - "train_tokens_per_second": 136390.055 - }, - { - "epoch": 0.301, - "grad_norm": 0.45500555634498596, - "learning_rate": 0.00024046601147042332, - "loss": 1.2358, - "num_input_tokens_seen": 1972633600, - "step": 30100, - "train_runtime": 14461.5845, - "train_tokens_per_second": 136405.08 - }, - { - "epoch": 0.302, - "grad_norm": 0.44596830010414124, - "learning_rate": 0.0002400858711067015, - "loss": 1.2301, - "num_input_tokens_seen": 1979187200, - "step": 30200, - "train_runtime": 14508.0707, - "train_tokens_per_second": 136419.737 - }, - { - "epoch": 0.303, - "grad_norm": 0.4207491874694824, - "learning_rate": 0.00023970482357925772, - "loss": 1.2441, - "num_input_tokens_seen": 1985740800, - "step": 30300, - "train_runtime": 14555.5751, - "train_tokens_per_second": 136424.757 - }, - { - "epoch": 0.304, - "grad_norm": 0.4833202064037323, - "learning_rate": 0.00023932287272523646, - "loss": 1.2351, - "num_input_tokens_seen": 1992294400, - "step": 30400, - "train_runtime": 14601.9546, - "train_tokens_per_second": 136440.255 - }, - { - "epoch": 0.305, - "grad_norm": 0.5268282294273376, - "learning_rate": 0.00023894002239087847, - "loss": 1.2384, - "num_input_tokens_seen": 1998848000, - "step": 30500, - "train_runtime": 14654.2539, - "train_tokens_per_second": 136400.53 - }, - { - "epoch": 0.306, - "grad_norm": 0.4639832377433777, - "learning_rate": 0.0002385562764314825, - "loss": 1.3007, - "num_input_tokens_seen": 2005401600, - "step": 30600, - "train_runtime": 14702.026, - "train_tokens_per_second": 136403.078 - }, - { - "epoch": 0.307, - "grad_norm": 0.526703953742981, - "learning_rate": 0.00023817163871136596, - "loss": 1.2481, - "num_input_tokens_seen": 2011955200, - "step": 30700, - "train_runtime": 14749.4458, - "train_tokens_per_second": 136408.868 - }, - { - "epoch": 0.308, - "grad_norm": 0.43404075503349304, - "learning_rate": 0.00023778611310382652, - "loss": 1.2273, - "num_input_tokens_seen": 2018508800, - "step": 30800, - "train_runtime": 14796.5936, - "train_tokens_per_second": 136417.128 - }, - { - "epoch": 0.309, - "grad_norm": 0.39956456422805786, - "learning_rate": 0.0002373997034911027, - "loss": 1.2275, - "num_input_tokens_seen": 2025062400, - "step": 30900, - "train_runtime": 14843.3887, - "train_tokens_per_second": 136428.578 - }, - { - "epoch": 0.31, - "grad_norm": 0.46024298667907715, - "learning_rate": 0.00023701241376433506, - "loss": 1.2353, - "num_input_tokens_seen": 2031616000, - "step": 31000, - "train_runtime": 14890.8282, - "train_tokens_per_second": 136434.05 - }, - { - "epoch": 0.311, - "grad_norm": 0.38429203629493713, - "learning_rate": 0.0002366242478235268, - "loss": 1.2403, - "num_input_tokens_seen": 2038169600, - "step": 31100, - "train_runtime": 14937.8781, - "train_tokens_per_second": 136443.047 - }, - { - "epoch": 0.312, - "grad_norm": 0.5401485562324524, - "learning_rate": 0.00023623520957750471, - "loss": 1.2273, - "num_input_tokens_seen": 2044723200, - "step": 31200, - "train_runtime": 14990.0842, - "train_tokens_per_second": 136405.051 - }, - { - "epoch": 0.313, - "grad_norm": 0.5360187888145447, - "learning_rate": 0.00023584530294387953, - "loss": 1.2312, - "num_input_tokens_seen": 2051276800, - "step": 31300, - "train_runtime": 15037.4257, - "train_tokens_per_second": 136411.434 - }, - { - "epoch": 0.314, - "grad_norm": 0.4468795359134674, - "learning_rate": 0.00023545453184900682, - "loss": 1.2383, - "num_input_tokens_seen": 2057830400, - "step": 31400, - "train_runtime": 15083.4771, - "train_tokens_per_second": 136429.444 - }, - { - "epoch": 0.315, - "grad_norm": 0.4575517177581787, - "learning_rate": 0.00023506290022794706, - "loss": 1.2354, - "num_input_tokens_seen": 2064384000, - "step": 31500, - "train_runtime": 15131.2692, - "train_tokens_per_second": 136431.648 - }, - { - "epoch": 0.316, - "grad_norm": 0.7983475923538208, - "learning_rate": 0.00023467041202442643, - "loss": 1.2309, - "num_input_tokens_seen": 2070937600, - "step": 31600, - "train_runtime": 15178.6218, - "train_tokens_per_second": 136437.789 - }, - { - "epoch": 0.317, - "grad_norm": 0.4316498339176178, - "learning_rate": 0.00023427707119079669, - "loss": 1.2462, - "num_input_tokens_seen": 2077491200, - "step": 31700, - "train_runtime": 15225.1881, - "train_tokens_per_second": 136450.938 - }, - { - "epoch": 0.318, - "grad_norm": 0.5765666365623474, - "learning_rate": 0.0002338828816879957, - "loss": 1.2367, - "num_input_tokens_seen": 2084044800, - "step": 31800, - "train_runtime": 15277.5735, - "train_tokens_per_second": 136412.029 - }, - { - "epoch": 0.319, - "grad_norm": 0.44825831055641174, - "learning_rate": 0.00023348784748550744, - "loss": 1.2354, - "num_input_tokens_seen": 2090598400, - "step": 31900, - "train_runtime": 15324.8285, - "train_tokens_per_second": 136419.04 - }, - { - "epoch": 0.32, - "grad_norm": 0.5602436661720276, - "learning_rate": 0.00023309197256132184, - "loss": 1.2324, - "num_input_tokens_seen": 2097152000, - "step": 32000, - "train_runtime": 15371.4775, - "train_tokens_per_second": 136431.387 - }, - { - "epoch": 0.321, - "grad_norm": 0.4002476930618286, - "learning_rate": 0.00023269526090189505, - "loss": 1.2396, - "num_input_tokens_seen": 2103705600, - "step": 32100, - "train_runtime": 15419.2672, - "train_tokens_per_second": 136433.565 - }, - { - "epoch": 0.322, - "grad_norm": 0.4306688606739044, - "learning_rate": 0.00023229771650210907, - "loss": 1.2468, - "num_input_tokens_seen": 2110259200, - "step": 32200, - "train_runtime": 15466.1068, - "train_tokens_per_second": 136444.111 - }, - { - "epoch": 0.323, - "grad_norm": 0.584658145904541, - "learning_rate": 0.00023189934336523163, - "loss": 1.2459, - "num_input_tokens_seen": 2116812800, - "step": 32300, - "train_runtime": 15513.277, - "train_tokens_per_second": 136451.686 - }, - { - "epoch": 0.324, - "grad_norm": 0.4049496352672577, - "learning_rate": 0.00023150014550287574, - "loss": 1.2455, - "num_input_tokens_seen": 2123366400, - "step": 32400, - "train_runtime": 15565.7808, - "train_tokens_per_second": 136412.456 - }, - { - "epoch": 0.325, - "grad_norm": 0.45713433623313904, - "learning_rate": 0.00023110012693495943, - "loss": 1.2308, - "num_input_tokens_seen": 2129920000, - "step": 32500, - "train_runtime": 15610.6324, - "train_tokens_per_second": 136440.341 - }, - { - "epoch": 0.326, - "grad_norm": 0.5710960030555725, - "learning_rate": 0.00023069929168966527, - "loss": 1.2434, - "num_input_tokens_seen": 2136473600, - "step": 32600, - "train_runtime": 15657.7335, - "train_tokens_per_second": 136448.458 - }, - { - "epoch": 0.327, - "grad_norm": 0.5807371735572815, - "learning_rate": 0.0002302976438033997, - "loss": 1.2292, - "num_input_tokens_seen": 2143027200, - "step": 32700, - "train_runtime": 15710.1819, - "train_tokens_per_second": 136410.082 - }, - { - "epoch": 0.328, - "grad_norm": 0.4462313652038574, - "learning_rate": 0.0002298951873207525, - "loss": 1.2427, - "num_input_tokens_seen": 2149580800, - "step": 32800, - "train_runtime": 15757.3708, - "train_tokens_per_second": 136417.479 - }, - { - "epoch": 0.329, - "grad_norm": 0.6099971532821655, - "learning_rate": 0.00022949192629445606, - "loss": 1.2313, - "num_input_tokens_seen": 2156134400, - "step": 32900, - "train_runtime": 15804.1823, - "train_tokens_per_second": 136428.089 - }, - { - "epoch": 0.33, - "grad_norm": 0.8630947470664978, - "learning_rate": 0.0002290878647853443, - "loss": 1.247, - "num_input_tokens_seen": 2162688000, - "step": 33000, - "train_runtime": 15852.2039, - "train_tokens_per_second": 136428.223 - }, - { - "epoch": 0.331, - "grad_norm": 0.5154317021369934, - "learning_rate": 0.00022868300686231224, - "loss": 1.2246, - "num_input_tokens_seen": 2169241600, - "step": 33100, - "train_runtime": 15899.5617, - "train_tokens_per_second": 136434.05 - }, - { - "epoch": 0.332, - "grad_norm": 0.5033185482025146, - "learning_rate": 0.00022827735660227457, - "loss": 1.2271, - "num_input_tokens_seen": 2175795200, - "step": 33200, - "train_runtime": 15947.1716, - "train_tokens_per_second": 136437.686 - }, - { - "epoch": 0.333, - "grad_norm": 0.7760284543037415, - "learning_rate": 0.000227870918090125, - "loss": 1.2445, - "num_input_tokens_seen": 2182348800, - "step": 33300, - "train_runtime": 16000.1889, - "train_tokens_per_second": 136395.189 - }, - { - "epoch": 0.334, - "grad_norm": 0.5042400360107422, - "learning_rate": 0.00022746369541869476, - "loss": 1.223, - "num_input_tokens_seen": 2188902400, - "step": 33400, - "train_runtime": 16047.8873, - "train_tokens_per_second": 136398.166 - }, - { - "epoch": 0.335, - "grad_norm": 0.421273410320282, - "learning_rate": 0.00022705569268871163, - "loss": 1.2222, - "num_input_tokens_seen": 2195456000, - "step": 33500, - "train_runtime": 16094.6711, - "train_tokens_per_second": 136408.876 - }, - { - "epoch": 0.336, - "grad_norm": 0.48292359709739685, - "learning_rate": 0.00022664691400875865, - "loss": 1.222, - "num_input_tokens_seen": 2202009600, - "step": 33600, - "train_runtime": 16143.6943, - "train_tokens_per_second": 136400.601 - }, - { - "epoch": 0.337, - "grad_norm": 0.4301004409790039, - "learning_rate": 0.00022623736349523254, - "loss": 1.2308, - "num_input_tokens_seen": 2208563200, - "step": 33700, - "train_runtime": 16189.7469, - "train_tokens_per_second": 136417.401 - }, - { - "epoch": 0.338, - "grad_norm": 0.6592893600463867, - "learning_rate": 0.00022582704527230238, - "loss": 1.2401, - "num_input_tokens_seen": 2215116800, - "step": 33800, - "train_runtime": 16235.6512, - "train_tokens_per_second": 136435.353 - }, - { - "epoch": 0.339, - "grad_norm": 0.6183221340179443, - "learning_rate": 0.0002254159634718682, - "loss": 1.2364, - "num_input_tokens_seen": 2221670400, - "step": 33900, - "train_runtime": 16283.1306, - "train_tokens_per_second": 136440.003 - }, - { - "epoch": 0.34, - "grad_norm": 0.529971182346344, - "learning_rate": 0.00022500412223351915, - "loss": 1.2222, - "num_input_tokens_seen": 2228224000, - "step": 34000, - "train_runtime": 16330.1955, - "train_tokens_per_second": 136448.091 - }, - { - "epoch": 0.341, - "grad_norm": 0.41906896233558655, - "learning_rate": 0.0002245915257044919, - "loss": 1.2261, - "num_input_tokens_seen": 2234777600, - "step": 34100, - "train_runtime": 16381.7912, - "train_tokens_per_second": 136418.391 - }, - { - "epoch": 0.342, - "grad_norm": 0.4326164722442627, - "learning_rate": 0.00022417817803962892, - "loss": 1.2452, - "num_input_tokens_seen": 2241331200, - "step": 34200, - "train_runtime": 16429.3997, - "train_tokens_per_second": 136421.978 - }, - { - "epoch": 0.343, - "grad_norm": 0.8329346179962158, - "learning_rate": 0.0002237640834013366, - "loss": 1.2197, - "num_input_tokens_seen": 2247884800, - "step": 34300, - "train_runtime": 16476.2139, - "train_tokens_per_second": 136432.121 - }, - { - "epoch": 0.344, - "grad_norm": 0.4649752378463745, - "learning_rate": 0.0002233492459595434, - "loss": 1.2255, - "num_input_tokens_seen": 2254438400, - "step": 34400, - "train_runtime": 16523.092, - "train_tokens_per_second": 136441.678 - }, - { - "epoch": 0.345, - "grad_norm": 0.5218563675880432, - "learning_rate": 0.00022293366989165772, - "loss": 1.2365, - "num_input_tokens_seen": 2260992000, - "step": 34500, - "train_runtime": 16575.1624, - "train_tokens_per_second": 136408.437 - }, - { - "epoch": 0.346, - "grad_norm": 0.8002403974533081, - "learning_rate": 0.00022251735938252587, - "loss": 1.2179, - "num_input_tokens_seen": 2267545600, - "step": 34600, - "train_runtime": 16622.274, - "train_tokens_per_second": 136416.088 - }, - { - "epoch": 0.347, - "grad_norm": 0.5648475289344788, - "learning_rate": 0.0002221003186243902, - "loss": 1.2301, - "num_input_tokens_seen": 2274099200, - "step": 34700, - "train_runtime": 16668.9107, - "train_tokens_per_second": 136427.583 - }, - { - "epoch": 0.348, - "grad_norm": 0.4631340801715851, - "learning_rate": 0.00022168255181684643, - "loss": 1.2292, - "num_input_tokens_seen": 2280652800, - "step": 34800, - "train_runtime": 16715.4649, - "train_tokens_per_second": 136439.687 - }, - { - "epoch": 0.349, - "grad_norm": 0.4492770731449127, - "learning_rate": 0.00022126406316680172, - "loss": 1.226, - "num_input_tokens_seen": 2287206400, - "step": 34900, - "train_runtime": 16761.744, - "train_tokens_per_second": 136453.963 - }, - { - "epoch": 0.35, - "grad_norm": 0.5984812378883362, - "learning_rate": 0.00022084485688843208, - "loss": 1.2332, - "num_input_tokens_seen": 2293760000, - "step": 35000, - "train_runtime": 16816.4332, - "train_tokens_per_second": 136399.912 - }, - { - "epoch": 0.351, - "grad_norm": 0.6245887875556946, - "learning_rate": 0.00022042493720314003, - "loss": 1.2324, - "num_input_tokens_seen": 2300313600, - "step": 35100, - "train_runtime": 16864.2018, - "train_tokens_per_second": 136402.163 - }, - { - "epoch": 0.352, - "grad_norm": 0.6719664335250854, - "learning_rate": 0.00022000430833951228, - "loss": 1.2272, - "num_input_tokens_seen": 2306867200, - "step": 35200, - "train_runtime": 16910.313, - "train_tokens_per_second": 136417.77 - }, - { - "epoch": 0.353, - "grad_norm": 0.43880173563957214, - "learning_rate": 0.00021958297453327673, - "loss": 1.2572, - "num_input_tokens_seen": 2313420800, - "step": 35300, - "train_runtime": 16958.9376, - "train_tokens_per_second": 136413.073 - }, - { - "epoch": 0.354, - "grad_norm": 0.6195557713508606, - "learning_rate": 0.00021916094002726012, - "loss": 1.2299, - "num_input_tokens_seen": 2319974400, - "step": 35400, - "train_runtime": 17005.9814, - "train_tokens_per_second": 136421.083 - }, - { - "epoch": 0.355, - "grad_norm": 0.5288188457489014, - "learning_rate": 0.00021873820907134534, - "loss": 1.2157, - "num_input_tokens_seen": 2326528000, - "step": 35500, - "train_runtime": 17053.3579, - "train_tokens_per_second": 136426.387 - }, - { - "epoch": 0.356, - "grad_norm": 0.4962466061115265, - "learning_rate": 0.0002183147859224283, - "loss": 1.2282, - "num_input_tokens_seen": 2333081600, - "step": 35600, - "train_runtime": 17099.0541, - "train_tokens_per_second": 136445.068 - }, - { - "epoch": 0.357, - "grad_norm": 0.4940129518508911, - "learning_rate": 0.00021789067484437544, - "loss": 1.2349, - "num_input_tokens_seen": 2339635200, - "step": 35700, - "train_runtime": 17146.892, - "train_tokens_per_second": 136446.605 - }, - { - "epoch": 0.358, - "grad_norm": 0.5929033160209656, - "learning_rate": 0.00021746588010798068, - "loss": 1.2368, - "num_input_tokens_seen": 2346188800, - "step": 35800, - "train_runtime": 17199.6266, - "train_tokens_per_second": 136409.287 - }, - { - "epoch": 0.359, - "grad_norm": 0.4825666546821594, - "learning_rate": 0.00021704040599092216, - "loss": 1.2215, - "num_input_tokens_seen": 2352742400, - "step": 35900, - "train_runtime": 17246.2748, - "train_tokens_per_second": 136420.324 - }, - { - "epoch": 0.36, - "grad_norm": 0.4572449028491974, - "learning_rate": 0.00021661425677771965, - "loss": 1.2291, - "num_input_tokens_seen": 2359296000, - "step": 36000, - "train_runtime": 17292.1332, - "train_tokens_per_second": 136437.533 - }, - { - "epoch": 0.361, - "grad_norm": 0.467132568359375, - "learning_rate": 0.00021618743675969095, - "loss": 1.2295, - "num_input_tokens_seen": 2365849600, - "step": 36100, - "train_runtime": 17339.1599, - "train_tokens_per_second": 136445.457 - }, - { - "epoch": 0.362, - "grad_norm": 0.4863705635070801, - "learning_rate": 0.0002157599502349089, - "loss": 1.2154, - "num_input_tokens_seen": 2372403200, - "step": 36200, - "train_runtime": 17386.7454, - "train_tokens_per_second": 136448.952 - }, - { - "epoch": 0.363, - "grad_norm": 0.43923652172088623, - "learning_rate": 0.00021533180150815802, - "loss": 1.2268, - "num_input_tokens_seen": 2378956800, - "step": 36300, - "train_runtime": 17439.0785, - "train_tokens_per_second": 136415.282 - }, - { - "epoch": 0.364, - "grad_norm": 0.5028465390205383, - "learning_rate": 0.00021490299489089132, - "loss": 1.2293, - "num_input_tokens_seen": 2385510400, - "step": 36400, - "train_runtime": 17485.9662, - "train_tokens_per_second": 136424.283 - }, - { - "epoch": 0.365, - "grad_norm": 0.4366530478000641, - "learning_rate": 0.00021447353470118656, - "loss": 1.2276, - "num_input_tokens_seen": 2392064000, - "step": 36500, - "train_runtime": 17533.3809, - "train_tokens_per_second": 136429.136 - }, - { - "epoch": 0.366, - "grad_norm": 0.46415793895721436, - "learning_rate": 0.00021404342526370326, - "loss": 1.2227, - "num_input_tokens_seen": 2398617600, - "step": 36600, - "train_runtime": 17580.8443, - "train_tokens_per_second": 136433.584 - }, - { - "epoch": 0.367, - "grad_norm": 0.6382859349250793, - "learning_rate": 0.00021361267090963846, - "loss": 1.2212, - "num_input_tokens_seen": 2405171200, - "step": 36700, - "train_runtime": 17626.7905, - "train_tokens_per_second": 136449.753 - }, - { - "epoch": 0.368, - "grad_norm": 0.6642177700996399, - "learning_rate": 0.0002131812759766839, - "loss": 1.2317, - "num_input_tokens_seen": 2411724800, - "step": 36800, - "train_runtime": 17679.381, - "train_tokens_per_second": 136414.55 - }, - { - "epoch": 0.369, - "grad_norm": 0.4071521461009979, - "learning_rate": 0.00021274924480898169, - "loss": 1.2262, - "num_input_tokens_seen": 2418278400, - "step": 36900, - "train_runtime": 17726.5473, - "train_tokens_per_second": 136421.288 - }, - { - "epoch": 0.37, - "grad_norm": 0.5301467776298523, - "learning_rate": 0.00021231658175708087, - "loss": 1.2192, - "num_input_tokens_seen": 2424832000, - "step": 37000, - "train_runtime": 17772.7667, - "train_tokens_per_second": 136435.258 - }, - { - "epoch": 0.371, - "grad_norm": 0.5216257572174072, - "learning_rate": 0.00021188329117789357, - "loss": 1.213, - "num_input_tokens_seen": 2431385600, - "step": 37100, - "train_runtime": 17824.6083, - "train_tokens_per_second": 136406.116 - }, - { - "epoch": 0.372, - "grad_norm": 0.5098195672035217, - "learning_rate": 0.0002114493774346512, - "loss": 1.2311, - "num_input_tokens_seen": 2437939200, - "step": 37200, - "train_runtime": 17870.9901, - "train_tokens_per_second": 136418.81 - }, - { - "epoch": 0.373, - "grad_norm": 0.47295039892196655, - "learning_rate": 0.00021101484489686025, - "loss": 1.2211, - "num_input_tokens_seen": 2444492800, - "step": 37300, - "train_runtime": 17918.4906, - "train_tokens_per_second": 136422.919 - }, - { - "epoch": 0.374, - "grad_norm": 0.49752944707870483, - "learning_rate": 0.00021057969794025866, - "loss": 1.2292, - "num_input_tokens_seen": 2451046400, - "step": 37400, - "train_runtime": 17965.5373, - "train_tokens_per_second": 136430.453 - }, - { - "epoch": 0.375, - "grad_norm": 0.9500930905342102, - "learning_rate": 0.00021014394094677128, - "loss": 1.2187, - "num_input_tokens_seen": 2457600000, - "step": 37500, - "train_runtime": 18012.267, - "train_tokens_per_second": 136440.349 - }, - { - "epoch": 0.376, - "grad_norm": 0.4800110459327698, - "learning_rate": 0.00020970757830446633, - "loss": 1.2336, - "num_input_tokens_seen": 2464153600, - "step": 37600, - "train_runtime": 18059.6653, - "train_tokens_per_second": 136445.143 - }, - { - "epoch": 0.377, - "grad_norm": 0.48905813694000244, - "learning_rate": 0.00020927061440751072, - "loss": 1.2189, - "num_input_tokens_seen": 2470707200, - "step": 37700, - "train_runtime": 18111.7548, - "train_tokens_per_second": 136414.567 - }, - { - "epoch": 0.378, - "grad_norm": 0.593604564666748, - "learning_rate": 0.00020883305365612602, - "loss": 1.2178, - "num_input_tokens_seen": 2477260800, - "step": 37800, - "train_runtime": 18157.6424, - "train_tokens_per_second": 136430.751 - }, - { - "epoch": 0.379, - "grad_norm": 0.46399399638175964, - "learning_rate": 0.00020839490045654425, - "loss": 1.2141, - "num_input_tokens_seen": 2483814400, - "step": 37900, - "train_runtime": 18204.4326, - "train_tokens_per_second": 136440.089 - }, - { - "epoch": 0.38, - "grad_norm": 0.5679593086242676, - "learning_rate": 0.00020795615922096313, - "loss": 1.2332, - "num_input_tokens_seen": 2490368000, - "step": 38000, - "train_runtime": 18252.6627, - "train_tokens_per_second": 136438.614 - }, - { - "epoch": 0.381, - "grad_norm": 0.48073315620422363, - "learning_rate": 0.00020751683436750207, - "loss": 1.2369, - "num_input_tokens_seen": 2496921600, - "step": 38100, - "train_runtime": 18300.6025, - "train_tokens_per_second": 136439.311 - }, - { - "epoch": 0.382, - "grad_norm": 0.4134567677974701, - "learning_rate": 0.00020707693032015752, - "loss": 1.2168, - "num_input_tokens_seen": 2503475200, - "step": 38200, - "train_runtime": 18351.6848, - "train_tokens_per_second": 136416.641 - }, - { - "epoch": 0.383, - "grad_norm": 0.4675845503807068, - "learning_rate": 0.00020663645150875834, - "loss": 1.2272, - "num_input_tokens_seen": 2510028800, - "step": 38300, - "train_runtime": 18398.2852, - "train_tokens_per_second": 136427.323 - }, - { - "epoch": 0.384, - "grad_norm": 0.4632211923599243, - "learning_rate": 0.00020619540236892125, - "loss": 1.2444, - "num_input_tokens_seen": 2516582400, - "step": 38400, - "train_runtime": 18445.2271, - "train_tokens_per_second": 136435.425 - }, - { - "epoch": 0.385, - "grad_norm": 0.5543389916419983, - "learning_rate": 0.00020575378734200616, - "loss": 1.22, - "num_input_tokens_seen": 2523136000, - "step": 38500, - "train_runtime": 18492.3307, - "train_tokens_per_second": 136442.292 - }, - { - "epoch": 0.386, - "grad_norm": 0.5775281190872192, - "learning_rate": 0.0002053116108750715, - "loss": 1.2277, - "num_input_tokens_seen": 2529689600, - "step": 38600, - "train_runtime": 18544.2017, - "train_tokens_per_second": 136414.047 - }, - { - "epoch": 0.387, - "grad_norm": 0.5202789306640625, - "learning_rate": 0.0002048688774208294, - "loss": 1.2203, - "num_input_tokens_seen": 2536243200, - "step": 38700, - "train_runtime": 18591.8641, - "train_tokens_per_second": 136416.832 - }, - { - "epoch": 0.388, - "grad_norm": 0.44833704829216003, - "learning_rate": 0.0002044255914376009, - "loss": 1.2209, - "num_input_tokens_seen": 2542796800, - "step": 38800, - "train_runtime": 18637.8905, - "train_tokens_per_second": 136431.577 - }, - { - "epoch": 0.389, - "grad_norm": 0.5180789828300476, - "learning_rate": 0.00020398175738927082, - "loss": 1.2105, - "num_input_tokens_seen": 2549350400, - "step": 38900, - "train_runtime": 18684.0663, - "train_tokens_per_second": 136445.159 - }, - { - "epoch": 0.39, - "grad_norm": 0.6083468794822693, - "learning_rate": 0.00020353737974524312, - "loss": 1.2136, - "num_input_tokens_seen": 2555904000, - "step": 39000, - "train_runtime": 18730.572, - "train_tokens_per_second": 136456.27 - }, - { - "epoch": 0.391, - "grad_norm": 0.39693883061408997, - "learning_rate": 0.00020309246298039584, - "loss": 1.2285, - "num_input_tokens_seen": 2562457600, - "step": 39100, - "train_runtime": 18784.1544, - "train_tokens_per_second": 136415.914 - }, - { - "epoch": 0.392, - "grad_norm": 0.5166248679161072, - "learning_rate": 0.0002026470115750357, - "loss": 1.223, - "num_input_tokens_seen": 2569011200, - "step": 39200, - "train_runtime": 18830.687, - "train_tokens_per_second": 136426.844 - }, - { - "epoch": 0.393, - "grad_norm": 0.4967111051082611, - "learning_rate": 0.0002022010300148535, - "loss": 1.2163, - "num_input_tokens_seen": 2575564800, - "step": 39300, - "train_runtime": 18876.8963, - "train_tokens_per_second": 136440.057 - }, - { - "epoch": 0.394, - "grad_norm": 0.627816915512085, - "learning_rate": 0.0002017545227908786, - "loss": 1.2328, - "num_input_tokens_seen": 2582118400, - "step": 39400, - "train_runtime": 18923.6736, - "train_tokens_per_second": 136449.109 - }, - { - "epoch": 0.395, - "grad_norm": 0.489969938993454, - "learning_rate": 0.00020130749439943376, - "loss": 1.224, - "num_input_tokens_seen": 2588672000, - "step": 39500, - "train_runtime": 18970.0964, - "train_tokens_per_second": 136460.666 - }, - { - "epoch": 0.396, - "grad_norm": 0.6713995933532715, - "learning_rate": 0.00020085994934208998, - "loss": 1.2156, - "num_input_tokens_seen": 2595225600, - "step": 39600, - "train_runtime": 19023.1241, - "train_tokens_per_second": 136424.784 - }, - { - "epoch": 0.397, - "grad_norm": 0.4549367427825928, - "learning_rate": 0.00020041189212562094, - "loss": 1.2094, - "num_input_tokens_seen": 2601779200, - "step": 39700, - "train_runtime": 19070.6234, - "train_tokens_per_second": 136428.639 - }, - { - "epoch": 0.398, - "grad_norm": 0.47548773884773254, - "learning_rate": 0.0001999633272619579, - "loss": 1.2244, - "num_input_tokens_seen": 2608332800, - "step": 39800, - "train_runtime": 19117.4992, - "train_tokens_per_second": 136436.925 - }, - { - "epoch": 0.399, - "grad_norm": 0.46569159626960754, - "learning_rate": 0.00019951425926814404, - "loss": 1.2189, - "num_input_tokens_seen": 2614886400, - "step": 39900, - "train_runtime": 19164.3173, - "train_tokens_per_second": 136445.581 - }, - { - "epoch": 0.4, - "grad_norm": 0.5518438220024109, - "learning_rate": 0.00019906469266628904, - "loss": 1.2097, - "num_input_tokens_seen": 2621440000, - "step": 40000, - "train_runtime": 19211.1586, - "train_tokens_per_second": 136454.029 - }, - { - "epoch": 0.401, - "grad_norm": 0.4615115821361542, - "learning_rate": 0.0001986146319835236, - "loss": 1.2177, - "num_input_tokens_seen": 2627993600, - "step": 40100, - "train_runtime": 19263.5816, - "train_tokens_per_second": 136422.897 - }, - { - "epoch": 0.402, - "grad_norm": 0.4154411554336548, - "learning_rate": 0.00019816408175195383, - "loss": 1.2262, - "num_input_tokens_seen": 2634547200, - "step": 40200, - "train_runtime": 19310.6242, - "train_tokens_per_second": 136429.935 - }, - { - "epoch": 0.403, - "grad_norm": 0.48504838347435, - "learning_rate": 0.0001977130465086155, - "loss": 1.2205, - "num_input_tokens_seen": 2641100800, - "step": 40300, - "train_runtime": 19356.9428, - "train_tokens_per_second": 136442.042 - }, - { - "epoch": 0.404, - "grad_norm": 0.477006196975708, - "learning_rate": 0.0001972615307954286, - "loss": 1.2099, - "num_input_tokens_seen": 2647654400, - "step": 40400, - "train_runtime": 19403.4467, - "train_tokens_per_second": 136452.788 - }, - { - "epoch": 0.405, - "grad_norm": 0.46401214599609375, - "learning_rate": 0.00019680953915915124, - "loss": 1.2142, - "num_input_tokens_seen": 2654208000, - "step": 40500, - "train_runtime": 19456.0604, - "train_tokens_per_second": 136420.629 - }, - { - "epoch": 0.406, - "grad_norm": 0.4205267131328583, - "learning_rate": 0.00019635707615133427, - "loss": 1.2233, - "num_input_tokens_seen": 2660761600, - "step": 40600, - "train_runtime": 19503.129, - "train_tokens_per_second": 136427.422 - }, - { - "epoch": 0.407, - "grad_norm": 0.7298253178596497, - "learning_rate": 0.00019590414632827513, - "loss": 1.2143, - "num_input_tokens_seen": 2667315200, - "step": 40700, - "train_runtime": 19550.1113, - "train_tokens_per_second": 136434.783 - }, - { - "epoch": 0.408, - "grad_norm": 0.47734642028808594, - "learning_rate": 0.00019545075425097204, - "loss": 1.222, - "num_input_tokens_seen": 2673868800, - "step": 40800, - "train_runtime": 19596.9887, - "train_tokens_per_second": 136442.84 - }, - { - "epoch": 0.409, - "grad_norm": 0.4535351097583771, - "learning_rate": 0.00019499690448507827, - "loss": 1.2373, - "num_input_tokens_seen": 2680422400, - "step": 40900, - "train_runtime": 19649.1805, - "train_tokens_per_second": 136413.954 - }, - { - "epoch": 0.41, - "grad_norm": 0.572079062461853, - "learning_rate": 0.00019454260160085588, - "loss": 1.2125, - "num_input_tokens_seen": 2686976000, - "step": 41000, - "train_runtime": 19697.7854, - "train_tokens_per_second": 136410.056 - }, - { - "epoch": 0.411, - "grad_norm": 0.4487378001213074, - "learning_rate": 0.0001940878501731299, - "loss": 1.2124, - "num_input_tokens_seen": 2693529600, - "step": 41100, - "train_runtime": 19744.9135, - "train_tokens_per_second": 136416.379 - }, - { - "epoch": 0.412, - "grad_norm": 0.47419917583465576, - "learning_rate": 0.00019363265478124214, - "loss": 1.2037, - "num_input_tokens_seen": 2700083200, - "step": 41200, - "train_runtime": 19791.8314, - "train_tokens_per_second": 136424.121 - }, - { - "epoch": 0.413, - "grad_norm": 0.6295040845870972, - "learning_rate": 0.00019317702000900516, - "loss": 1.2246, - "num_input_tokens_seen": 2706636800, - "step": 41300, - "train_runtime": 19838.5236, - "train_tokens_per_second": 136433.379 - }, - { - "epoch": 0.414, - "grad_norm": 0.53326016664505, - "learning_rate": 0.000192720950444656, - "loss": 1.2192, - "num_input_tokens_seen": 2713190400, - "step": 41400, - "train_runtime": 19885.4264, - "train_tokens_per_second": 136441.147 - }, - { - "epoch": 0.415, - "grad_norm": 0.49727046489715576, - "learning_rate": 0.00019226445068081018, - "loss": 1.2279, - "num_input_tokens_seen": 2719744000, - "step": 41500, - "train_runtime": 19937.4737, - "train_tokens_per_second": 136413.672 - }, - { - "epoch": 0.416, - "grad_norm": 0.47963398694992065, - "learning_rate": 0.00019180752531441523, - "loss": 1.2226, - "num_input_tokens_seen": 2726297600, - "step": 41600, - "train_runtime": 19984.6667, - "train_tokens_per_second": 136419.468 - }, - { - "epoch": 0.417, - "grad_norm": 0.4789304733276367, - "learning_rate": 0.00019135017894670456, - "loss": 1.2222, - "num_input_tokens_seen": 2732851200, - "step": 41700, - "train_runtime": 20032.7071, - "train_tokens_per_second": 136419.465 - }, - { - "epoch": 0.418, - "grad_norm": 0.6693325638771057, - "learning_rate": 0.0001908924161831509, - "loss": 1.2366, - "num_input_tokens_seen": 2739404800, - "step": 41800, - "train_runtime": 20078.7138, - "train_tokens_per_second": 136433.281 - }, - { - "epoch": 0.419, - "grad_norm": 0.41989439725875854, - "learning_rate": 0.0001904342416334203, - "loss": 1.2212, - "num_input_tokens_seen": 2745958400, - "step": 41900, - "train_runtime": 20125.0521, - "train_tokens_per_second": 136444.785 - }, - { - "epoch": 0.42, - "grad_norm": 0.5444014072418213, - "learning_rate": 0.00018997565991132532, - "loss": 1.2164, - "num_input_tokens_seen": 2752512000, - "step": 42000, - "train_runtime": 20177.4596, - "train_tokens_per_second": 136415.191 - }, - { - "epoch": 0.421, - "grad_norm": 0.5790873169898987, - "learning_rate": 0.0001895166756347789, - "loss": 1.215, - "num_input_tokens_seen": 2759065600, - "step": 42100, - "train_runtime": 20224.878, - "train_tokens_per_second": 136419.394 - }, - { - "epoch": 0.422, - "grad_norm": 0.4666343927383423, - "learning_rate": 0.0001890572934257475, - "loss": 1.2229, - "num_input_tokens_seen": 2765619200, - "step": 42200, - "train_runtime": 20270.922, - "train_tokens_per_second": 136432.827 - }, - { - "epoch": 0.423, - "grad_norm": 0.4322357177734375, - "learning_rate": 0.00018859751791020497, - "loss": 1.2258, - "num_input_tokens_seen": 2772172800, - "step": 42300, - "train_runtime": 20317.4494, - "train_tokens_per_second": 136442.954 - }, - { - "epoch": 0.424, - "grad_norm": 0.6240208148956299, - "learning_rate": 0.0001881373537180856, - "loss": 1.221, - "num_input_tokens_seen": 2778726400, - "step": 42400, - "train_runtime": 20364.5753, - "train_tokens_per_second": 136449.023 - }, - { - "epoch": 0.425, - "grad_norm": 0.5865579843521118, - "learning_rate": 0.00018767680548323766, - "loss": 1.2244, - "num_input_tokens_seen": 2785280000, - "step": 42500, - "train_runtime": 20417.9029, - "train_tokens_per_second": 136413.617 - }, - { - "epoch": 0.426, - "grad_norm": 0.5201649069786072, - "learning_rate": 0.0001872158778433768, - "loss": 1.2076, - "num_input_tokens_seen": 2791833600, - "step": 42600, - "train_runtime": 20464.7135, - "train_tokens_per_second": 136421.827 - }, - { - "epoch": 0.427, - "grad_norm": 0.5092735290527344, - "learning_rate": 0.0001867545754400392, - "loss": 1.2057, - "num_input_tokens_seen": 2798387200, - "step": 42700, - "train_runtime": 20511.0273, - "train_tokens_per_second": 136433.303 - }, - { - "epoch": 0.428, - "grad_norm": 0.4439486265182495, - "learning_rate": 0.000186292902918535, - "loss": 1.209, - "num_input_tokens_seen": 2804940800, - "step": 42800, - "train_runtime": 20558.3684, - "train_tokens_per_second": 136437.909 - }, - { - "epoch": 0.429, - "grad_norm": 0.4466177225112915, - "learning_rate": 0.00018583086492790136, - "loss": 1.218, - "num_input_tokens_seen": 2811494400, - "step": 42900, - "train_runtime": 20605.5543, - "train_tokens_per_second": 136443.522 - }, - { - "epoch": 0.43, - "grad_norm": 0.5813594460487366, - "learning_rate": 0.00018536846612085566, - "loss": 1.2161, - "num_input_tokens_seen": 2818048000, - "step": 43000, - "train_runtime": 20658.6134, - "train_tokens_per_second": 136410.317 - }, - { - "epoch": 0.431, - "grad_norm": 0.49140629172325134, - "learning_rate": 0.00018490571115374878, - "loss": 1.227, - "num_input_tokens_seen": 2824601600, - "step": 43100, - "train_runtime": 20705.6255, - "train_tokens_per_second": 136417.11 - }, - { - "epoch": 0.432, - "grad_norm": 0.4938826858997345, - "learning_rate": 0.00018444260468651816, - "loss": 1.2252, - "num_input_tokens_seen": 2831155200, - "step": 43200, - "train_runtime": 20752.3571, - "train_tokens_per_second": 136425.717 - }, - { - "epoch": 0.433, - "grad_norm": 0.5228791832923889, - "learning_rate": 0.00018397915138264068, - "loss": 1.2274, - "num_input_tokens_seen": 2837708800, - "step": 43300, - "train_runtime": 20799.4436, - "train_tokens_per_second": 136431.957 - }, - { - "epoch": 0.434, - "grad_norm": 0.46896296739578247, - "learning_rate": 0.00018351535590908606, - "loss": 1.2043, - "num_input_tokens_seen": 2844262400, - "step": 43400, - "train_runtime": 20845.6184, - "train_tokens_per_second": 136444.137 - }, - { - "epoch": 0.435, - "grad_norm": 0.4269004464149475, - "learning_rate": 0.00018305122293626948, - "loss": 1.2213, - "num_input_tokens_seen": 2850816000, - "step": 43500, - "train_runtime": 20897.7485, - "train_tokens_per_second": 136417.375 - }, - { - "epoch": 0.436, - "grad_norm": 0.6213890314102173, - "learning_rate": 0.00018258675713800492, - "loss": 1.2096, - "num_input_tokens_seen": 2857369600, - "step": 43600, - "train_runtime": 20944.9642, - "train_tokens_per_second": 136422.749 - }, - { - "epoch": 0.437, - "grad_norm": 0.4281384348869324, - "learning_rate": 0.00018212196319145773, - "loss": 1.2111, - "num_input_tokens_seen": 2863923200, - "step": 43700, - "train_runtime": 20992.0443, - "train_tokens_per_second": 136428.98 - }, - { - "epoch": 0.438, - "grad_norm": 1.044310212135315, - "learning_rate": 0.00018165684577709778, - "loss": 1.2142, - "num_input_tokens_seen": 2870476800, - "step": 43800, - "train_runtime": 21039.718, - "train_tokens_per_second": 136431.334 - }, - { - "epoch": 0.439, - "grad_norm": 0.445425808429718, - "learning_rate": 0.0001811914095786524, - "loss": 1.218, - "num_input_tokens_seen": 2877030400, - "step": 43900, - "train_runtime": 21088.215, - "train_tokens_per_second": 136428.351 - }, - { - "epoch": 0.44, - "grad_norm": 0.43947216868400574, - "learning_rate": 0.0001807256592830588, - "loss": 1.2124, - "num_input_tokens_seen": 2883584000, - "step": 44000, - "train_runtime": 21136.0286, - "train_tokens_per_second": 136429.793 - }, - { - "epoch": 0.441, - "grad_norm": 0.5147203803062439, - "learning_rate": 0.00018025959958041732, - "loss": 1.2227, - "num_input_tokens_seen": 2890137600, - "step": 44100, - "train_runtime": 21182.9913, - "train_tokens_per_second": 136436.708 - }, - { - "epoch": 0.442, - "grad_norm": 0.473652184009552, - "learning_rate": 0.00017979323516394407, - "loss": 1.2277, - "num_input_tokens_seen": 2896691200, - "step": 44200, - "train_runtime": 21236.5796, - "train_tokens_per_second": 136401.024 - }, - { - "epoch": 0.443, - "grad_norm": 0.4356568157672882, - "learning_rate": 0.00017932657072992344, - "loss": 1.2018, - "num_input_tokens_seen": 2903244800, - "step": 44300, - "train_runtime": 21282.9387, - "train_tokens_per_second": 136411.838 - }, - { - "epoch": 0.444, - "grad_norm": 0.4458017647266388, - "learning_rate": 0.00017885961097766117, - "loss": 1.2124, - "num_input_tokens_seen": 2909798400, - "step": 44400, - "train_runtime": 21331.1223, - "train_tokens_per_second": 136410.938 - }, - { - "epoch": 0.445, - "grad_norm": 0.5065773725509644, - "learning_rate": 0.00017839236060943674, - "loss": 1.2262, - "num_input_tokens_seen": 2916352000, - "step": 44500, - "train_runtime": 21377.5493, - "train_tokens_per_second": 136421.25 - }, - { - "epoch": 0.446, - "grad_norm": 0.5424425601959229, - "learning_rate": 0.0001779248243304562, - "loss": 1.2171, - "num_input_tokens_seen": 2922905600, - "step": 44600, - "train_runtime": 21424.9021, - "train_tokens_per_second": 136425.622 - }, - { - "epoch": 0.447, - "grad_norm": 0.4595748484134674, - "learning_rate": 0.00017745700684880465, - "loss": 1.2039, - "num_input_tokens_seen": 2929459200, - "step": 44700, - "train_runtime": 21472.2167, - "train_tokens_per_second": 136430.218 - }, - { - "epoch": 0.448, - "grad_norm": 0.5353960990905762, - "learning_rate": 0.000176988912875399, - "loss": 1.2075, - "num_input_tokens_seen": 2936012800, - "step": 44800, - "train_runtime": 21524.5148, - "train_tokens_per_second": 136403.205 - }, - { - "epoch": 0.449, - "grad_norm": 0.4949302673339844, - "learning_rate": 0.00017652054712394028, - "loss": 1.2174, - "num_input_tokens_seen": 2942566400, - "step": 44900, - "train_runtime": 21571.6626, - "train_tokens_per_second": 136408.883 - }, - { - "epoch": 0.45, - "grad_norm": 0.5596060752868652, - "learning_rate": 0.0001760519143108665, - "loss": 1.2178, - "num_input_tokens_seen": 2949120000, - "step": 45000, - "train_runtime": 21618.3195, - "train_tokens_per_second": 136417.634 - }, - { - "epoch": 0.451, - "grad_norm": 0.5348083972930908, - "learning_rate": 0.00017558301915530483, - "loss": 1.215, - "num_input_tokens_seen": 2955673600, - "step": 45100, - "train_runtime": 21666.1069, - "train_tokens_per_second": 136419.229 - }, - { - "epoch": 0.452, - "grad_norm": 0.46748441457748413, - "learning_rate": 0.00017511386637902428, - "loss": 1.2104, - "num_input_tokens_seen": 2962227200, - "step": 45200, - "train_runtime": 21713.1957, - "train_tokens_per_second": 136425.206 - }, - { - "epoch": 0.453, - "grad_norm": 0.47188806533813477, - "learning_rate": 0.00017464446070638814, - "loss": 1.213, - "num_input_tokens_seen": 2968780800, - "step": 45300, - "train_runtime": 21760.1393, - "train_tokens_per_second": 136432.068 - }, - { - "epoch": 0.454, - "grad_norm": 0.5225762128829956, - "learning_rate": 0.00017417480686430622, - "loss": 1.2152, - "num_input_tokens_seen": 2975334400, - "step": 45400, - "train_runtime": 21812.7666, - "train_tokens_per_second": 136403.348 - }, - { - "epoch": 0.455, - "grad_norm": 0.5889186263084412, - "learning_rate": 0.00017370490958218765, - "loss": 1.2214, - "num_input_tokens_seen": 2981888000, - "step": 45500, - "train_runtime": 21859.0263, - "train_tokens_per_second": 136414.493 - }, - { - "epoch": 0.456, - "grad_norm": 0.6613258719444275, - "learning_rate": 0.00017323477359189272, - "loss": 1.2334, - "num_input_tokens_seen": 2988441600, - "step": 45600, - "train_runtime": 21905.9003, - "train_tokens_per_second": 136421.766 - }, - { - "epoch": 0.457, - "grad_norm": 0.4657646715641022, - "learning_rate": 0.00017276440362768564, - "loss": 1.2132, - "num_input_tokens_seen": 2994995200, - "step": 45700, - "train_runtime": 21952.9851, - "train_tokens_per_second": 136427.697 - }, - { - "epoch": 0.458, - "grad_norm": 0.8410550355911255, - "learning_rate": 0.0001722938044261868, - "loss": 1.2073, - "num_input_tokens_seen": 3001548800, - "step": 45800, - "train_runtime": 22005.352, - "train_tokens_per_second": 136400.854 - }, - { - "epoch": 0.459, - "grad_norm": 0.7687750458717346, - "learning_rate": 0.0001718229807263249, - "loss": 1.2116, - "num_input_tokens_seen": 3008102400, - "step": 45900, - "train_runtime": 22051.2762, - "train_tokens_per_second": 136413.982 - }, - { - "epoch": 0.46, - "grad_norm": 0.40700653195381165, - "learning_rate": 0.0001713519372692894, - "loss": 1.2082, - "num_input_tokens_seen": 3014656000, - "step": 46000, - "train_runtime": 22102.8898, - "train_tokens_per_second": 136391.939 - }, - { - "epoch": 0.461, - "grad_norm": 0.44239944219589233, - "learning_rate": 0.0001708806787984826, - "loss": 1.2177, - "num_input_tokens_seen": 3021209600, - "step": 46100, - "train_runtime": 22149.1222, - "train_tokens_per_second": 136403.13 - }, - { - "epoch": 0.462, - "grad_norm": 0.4981868267059326, - "learning_rate": 0.00017040921005947212, - "loss": 1.2073, - "num_input_tokens_seen": 3027763200, - "step": 46200, - "train_runtime": 22195.5009, - "train_tokens_per_second": 136413.376 - }, - { - "epoch": 0.463, - "grad_norm": 0.5651112198829651, - "learning_rate": 0.0001699375357999429, - "loss": 1.2098, - "num_input_tokens_seen": 3034316800, - "step": 46300, - "train_runtime": 22241.367, - "train_tokens_per_second": 136426.722 - }, - { - "epoch": 0.464, - "grad_norm": 1.1314237117767334, - "learning_rate": 0.0001694656607696496, - "loss": 1.2335, - "num_input_tokens_seen": 3040870400, - "step": 46400, - "train_runtime": 22294.8896, - "train_tokens_per_second": 136393.158 - }, - { - "epoch": 0.465, - "grad_norm": 0.568980872631073, - "learning_rate": 0.0001689935897203684, - "loss": 1.2096, - "num_input_tokens_seen": 3047424000, - "step": 46500, - "train_runtime": 22342.7849, - "train_tokens_per_second": 136394.098 - }, - { - "epoch": 0.466, - "grad_norm": 0.7110226154327393, - "learning_rate": 0.0001685213274058496, - "loss": 1.2136, - "num_input_tokens_seen": 3053977600, - "step": 46600, - "train_runtime": 22393.3193, - "train_tokens_per_second": 136378.96 - }, - { - "epoch": 0.467, - "grad_norm": 0.5052018761634827, - "learning_rate": 0.00016804887858176944, - "loss": 1.2237, - "num_input_tokens_seen": 3060531200, - "step": 46700, - "train_runtime": 22441.2606, - "train_tokens_per_second": 136379.647 - }, - { - "epoch": 0.468, - "grad_norm": 0.4663156270980835, - "learning_rate": 0.00016757624800568238, - "loss": 1.2071, - "num_input_tokens_seen": 3067084800, - "step": 46800, - "train_runtime": 22487.9084, - "train_tokens_per_second": 136388.175 - }, - { - "epoch": 0.469, - "grad_norm": 0.5441033840179443, - "learning_rate": 0.00016710344043697301, - "loss": 1.2078, - "num_input_tokens_seen": 3073638400, - "step": 46900, - "train_runtime": 22534.6023, - "train_tokens_per_second": 136396.39 - }, - { - "epoch": 0.47, - "grad_norm": 0.4578142464160919, - "learning_rate": 0.0001666304606368083, - "loss": 1.1956, - "num_input_tokens_seen": 3080192000, - "step": 47000, - "train_runtime": 22587.0441, - "train_tokens_per_second": 136369.858 - }, - { - "epoch": 0.471, - "grad_norm": 0.6252749562263489, - "learning_rate": 0.00016615731336808962, - "loss": 1.1911, - "num_input_tokens_seen": 3086745600, - "step": 47100, - "train_runtime": 22634.7186, - "train_tokens_per_second": 136372.166 - }, - { - "epoch": 0.472, - "grad_norm": 0.45418813824653625, - "learning_rate": 0.0001656840033954047, - "loss": 1.22, - "num_input_tokens_seen": 3093299200, - "step": 47200, - "train_runtime": 22681.221, - "train_tokens_per_second": 136381.511 - }, - { - "epoch": 0.473, - "grad_norm": 0.55946284532547, - "learning_rate": 0.00016521053548497973, - "loss": 1.2073, - "num_input_tokens_seen": 3099852800, - "step": 47300, - "train_runtime": 22728.7635, - "train_tokens_per_second": 136384.577 - }, - { - "epoch": 0.474, - "grad_norm": 0.508859395980835, - "learning_rate": 0.0001647369144046313, - "loss": 1.1957, - "num_input_tokens_seen": 3106406400, - "step": 47400, - "train_runtime": 22775.8652, - "train_tokens_per_second": 136390.27 - }, - { - "epoch": 0.475, - "grad_norm": 0.5557622313499451, - "learning_rate": 0.00016426314492371842, - "loss": 1.1996, - "num_input_tokens_seen": 3112960000, - "step": 47500, - "train_runtime": 22823.5391, - "train_tokens_per_second": 136392.519 - }, - { - "epoch": 0.476, - "grad_norm": 0.5686858296394348, - "learning_rate": 0.0001637892318130945, - "loss": 1.201, - "num_input_tokens_seen": 3119513600, - "step": 47600, - "train_runtime": 22875.0526, - "train_tokens_per_second": 136371.866 - }, - { - "epoch": 0.477, - "grad_norm": 0.47568413615226746, - "learning_rate": 0.00016331517984505934, - "loss": 1.2132, - "num_input_tokens_seen": 3126067200, - "step": 47700, - "train_runtime": 22923.2754, - "train_tokens_per_second": 136370.878 - }, - { - "epoch": 0.478, - "grad_norm": 0.40612325072288513, - "learning_rate": 0.00016284099379331092, - "loss": 1.2085, - "num_input_tokens_seen": 3132620800, - "step": 47800, - "train_runtime": 22970.2831, - "train_tokens_per_second": 136377.109 - }, - { - "epoch": 0.479, - "grad_norm": 0.491755872964859, - "learning_rate": 0.00016236667843289759, - "loss": 1.206, - "num_input_tokens_seen": 3139174400, - "step": 47900, - "train_runtime": 23016.8676, - "train_tokens_per_second": 136385.821 - }, - { - "epoch": 0.48, - "grad_norm": 1.2421867847442627, - "learning_rate": 0.00016189223854016973, - "loss": 1.1991, - "num_input_tokens_seen": 3145728000, - "step": 48000, - "train_runtime": 23070.1067, - "train_tokens_per_second": 136355.156 - }, - { - "epoch": 0.481, - "grad_norm": 0.44709935784339905, - "learning_rate": 0.00016141767889273182, - "loss": 1.1987, - "num_input_tokens_seen": 3152281600, - "step": 48100, - "train_runtime": 23117.6704, - "train_tokens_per_second": 136358.1 - }, - { - "epoch": 0.482, - "grad_norm": 0.6956078410148621, - "learning_rate": 0.00016094300426939417, - "loss": 1.206, - "num_input_tokens_seen": 3158835200, - "step": 48200, - "train_runtime": 23164.6084, - "train_tokens_per_second": 136364.714 - }, - { - "epoch": 0.483, - "grad_norm": 0.4756148159503937, - "learning_rate": 0.00016046821945012505, - "loss": 1.213, - "num_input_tokens_seen": 3165388800, - "step": 48300, - "train_runtime": 23212.4256, - "train_tokens_per_second": 136366.137 - }, - { - "epoch": 0.484, - "grad_norm": 0.4668136239051819, - "learning_rate": 0.00015999332921600226, - "loss": 1.2027, - "num_input_tokens_seen": 3171942400, - "step": 48400, - "train_runtime": 23260.1957, - "train_tokens_per_second": 136367.829 - }, - { - "epoch": 0.485, - "grad_norm": 0.48166415095329285, - "learning_rate": 0.00015951833834916532, - "loss": 1.1885, - "num_input_tokens_seen": 3178496000, - "step": 48500, - "train_runtime": 23308.4042, - "train_tokens_per_second": 136366.951 - }, - { - "epoch": 0.486, - "grad_norm": 1.4835230112075806, - "learning_rate": 0.00015904325163276672, - "loss": 1.2144, - "num_input_tokens_seen": 3185049600, - "step": 48600, - "train_runtime": 23355.0119, - "train_tokens_per_second": 136375.422 - }, - { - "epoch": 0.487, - "grad_norm": 0.47993043065071106, - "learning_rate": 0.00015856807385092466, - "loss": 1.2092, - "num_input_tokens_seen": 3191603200, - "step": 48700, - "train_runtime": 23408.2289, - "train_tokens_per_second": 136345.352 - }, - { - "epoch": 0.488, - "grad_norm": 0.4617721736431122, - "learning_rate": 0.00015809280978867405, - "loss": 1.2079, - "num_input_tokens_seen": 3198156800, - "step": 48800, - "train_runtime": 23456.3091, - "train_tokens_per_second": 136345.27 - }, - { - "epoch": 0.489, - "grad_norm": 0.4698822796344757, - "learning_rate": 0.0001576174642319187, - "loss": 1.2221, - "num_input_tokens_seen": 3204710400, - "step": 48900, - "train_runtime": 23502.92, - "train_tokens_per_second": 136353.713 - }, - { - "epoch": 0.49, - "grad_norm": 0.5454009771347046, - "learning_rate": 0.0001571420419673831, - "loss": 1.201, - "num_input_tokens_seen": 3211264000, - "step": 49000, - "train_runtime": 23550.5868, - "train_tokens_per_second": 136356.008 - }, - { - "epoch": 0.491, - "grad_norm": 0.9021556973457336, - "learning_rate": 0.0001566665477825642, - "loss": 1.2047, - "num_input_tokens_seen": 3217817600, - "step": 49100, - "train_runtime": 23597.4655, - "train_tokens_per_second": 136362.848 - }, - { - "epoch": 0.492, - "grad_norm": 0.4959240257740021, - "learning_rate": 0.0001561909864656831, - "loss": 1.2042, - "num_input_tokens_seen": 3224371200, - "step": 49200, - "train_runtime": 23650.6048, - "train_tokens_per_second": 136333.562 - }, - { - "epoch": 0.493, - "grad_norm": 0.554251492023468, - "learning_rate": 0.00015571536280563705, - "loss": 1.2163, - "num_input_tokens_seen": 3230924800, - "step": 49300, - "train_runtime": 23697.3685, - "train_tokens_per_second": 136341.079 - }, - { - "epoch": 0.494, - "grad_norm": 0.5000952482223511, - "learning_rate": 0.000155239681591951, - "loss": 1.2086, - "num_input_tokens_seen": 3237478400, - "step": 49400, - "train_runtime": 23745.4988, - "train_tokens_per_second": 136340.72 - }, - { - "epoch": 0.495, - "grad_norm": 0.7438832521438599, - "learning_rate": 0.00015476394761472953, - "loss": 1.1999, - "num_input_tokens_seen": 3244032000, - "step": 49500, - "train_runtime": 23793.3349, - "train_tokens_per_second": 136342.048 - }, - { - "epoch": 0.496, - "grad_norm": 0.4872761368751526, - "learning_rate": 0.00015428816566460843, - "loss": 1.194, - "num_input_tokens_seen": 3250585600, - "step": 49600, - "train_runtime": 23839.649, - "train_tokens_per_second": 136352.074 - }, - { - "epoch": 0.497, - "grad_norm": 0.48635321855545044, - "learning_rate": 0.00015381234053270669, - "loss": 1.1957, - "num_input_tokens_seen": 3257139200, - "step": 49700, - "train_runtime": 23886.4418, - "train_tokens_per_second": 136359.33 - }, - { - "epoch": 0.498, - "grad_norm": 0.899361252784729, - "learning_rate": 0.0001533364770105781, - "loss": 1.201, - "num_input_tokens_seen": 3263692800, - "step": 49800, - "train_runtime": 23933.6337, - "train_tokens_per_second": 136364.283 - }, - { - "epoch": 0.499, - "grad_norm": 0.5460925698280334, - "learning_rate": 0.0001528605798901631, - "loss": 1.2086, - "num_input_tokens_seen": 3270246400, - "step": 49900, - "train_runtime": 23985.6033, - "train_tokens_per_second": 136342.053 - }, - { - "epoch": 0.5, - "grad_norm": 0.4763907194137573, - "learning_rate": 0.00015238465396374027, - "loss": 1.1987, - "num_input_tokens_seen": 3276800000, - "step": 50000, - "train_runtime": 24033.0829, - "train_tokens_per_second": 136345.388 - }, - { - "epoch": 0.501, - "grad_norm": 0.4716530442237854, - "learning_rate": 0.00015190870402387858, - "loss": 1.2083, - "num_input_tokens_seen": 3283353600, - "step": 50100, - "train_runtime": 24080.0017, - "train_tokens_per_second": 136351.884 - }, - { - "epoch": 0.502, - "grad_norm": 0.65655517578125, - "learning_rate": 0.00015143273486338857, - "loss": 1.2026, - "num_input_tokens_seen": 3289907200, - "step": 50200, - "train_runtime": 24132.759, - "train_tokens_per_second": 136325.366 - }, - { - "epoch": 0.503, - "grad_norm": 0.494205117225647, - "learning_rate": 0.00015095675127527438, - "loss": 1.208, - "num_input_tokens_seen": 3296460800, - "step": 50300, - "train_runtime": 24179.9126, - "train_tokens_per_second": 136330.551 - }, - { - "epoch": 0.504, - "grad_norm": 0.485307902097702, - "learning_rate": 0.00015048075805268547, - "loss": 1.1965, - "num_input_tokens_seen": 3303014400, - "step": 50400, - "train_runtime": 24227.2685, - "train_tokens_per_second": 136334.577 - }, - { - "epoch": 0.505, - "grad_norm": 0.4843132793903351, - "learning_rate": 0.00015000475998886825, - "loss": 1.2028, - "num_input_tokens_seen": 3309568000, - "step": 50500, - "train_runtime": 24274.7092, - "train_tokens_per_second": 136338.111 - }, - { - "epoch": 0.506, - "grad_norm": 0.4654887020587921, - "learning_rate": 0.00014952876187711804, - "loss": 1.2151, - "num_input_tokens_seen": 3316121600, - "step": 50600, - "train_runtime": 24321.273, - "train_tokens_per_second": 136346.547 - }, - { - "epoch": 0.507, - "grad_norm": 0.4625457525253296, - "learning_rate": 0.00014905276851073053, - "loss": 1.209, - "num_input_tokens_seen": 3322675200, - "step": 50700, - "train_runtime": 24374.7609, - "train_tokens_per_second": 136316.217 - }, - { - "epoch": 0.508, - "grad_norm": 0.527594268321991, - "learning_rate": 0.00014857678468295352, - "loss": 1.2043, - "num_input_tokens_seen": 3329228800, - "step": 50800, - "train_runtime": 24422.59, - "train_tokens_per_second": 136317.598 - }, - { - "epoch": 0.509, - "grad_norm": 0.4604775011539459, - "learning_rate": 0.00014810081518693902, - "loss": 1.1895, - "num_input_tokens_seen": 3335782400, - "step": 50900, - "train_runtime": 24468.7673, - "train_tokens_per_second": 136328.176 - }, - { - "epoch": 0.51, - "grad_norm": 0.4973219335079193, - "learning_rate": 0.0001476248648156945, - "loss": 1.1977, - "num_input_tokens_seen": 3342336000, - "step": 51000, - "train_runtime": 24516.9703, - "train_tokens_per_second": 136327.448 - }, - { - "epoch": 0.511, - "grad_norm": 0.42552006244659424, - "learning_rate": 0.00014714893836203485, - "loss": 1.2109, - "num_input_tokens_seen": 3348889600, - "step": 51100, - "train_runtime": 24564.5614, - "train_tokens_per_second": 136330.12 - }, - { - "epoch": 0.512, - "grad_norm": 0.5027197003364563, - "learning_rate": 0.0001466730406185343, - "loss": 1.1949, - "num_input_tokens_seen": 3355443200, - "step": 51200, - "train_runtime": 24611.9784, - "train_tokens_per_second": 136333.745 - }, - { - "epoch": 0.513, - "grad_norm": 0.6097121238708496, - "learning_rate": 0.0001461971763774778, - "loss": 1.2, - "num_input_tokens_seen": 3361996800, - "step": 51300, - "train_runtime": 24665.0046, - "train_tokens_per_second": 136306.352 - }, - { - "epoch": 0.514, - "grad_norm": 0.9953346848487854, - "learning_rate": 0.0001457213504308129, - "loss": 1.1919, - "num_input_tokens_seen": 3368550400, - "step": 51400, - "train_runtime": 24711.3817, - "train_tokens_per_second": 136315.745 - }, - { - "epoch": 0.515, - "grad_norm": 0.5582478642463684, - "learning_rate": 0.00014524556757010177, - "loss": 1.1924, - "num_input_tokens_seen": 3375104000, - "step": 51500, - "train_runtime": 24758.0554, - "train_tokens_per_second": 136323.469 - }, - { - "epoch": 0.516, - "grad_norm": 0.5084798336029053, - "learning_rate": 0.00014476983258647234, - "loss": 1.2068, - "num_input_tokens_seen": 3381657600, - "step": 51600, - "train_runtime": 24807.6959, - "train_tokens_per_second": 136314.86 - }, - { - "epoch": 0.517, - "grad_norm": 0.6907379627227783, - "learning_rate": 0.0001442941502705707, - "loss": 1.1945, - "num_input_tokens_seen": 3388211200, - "step": 51700, - "train_runtime": 24855.3849, - "train_tokens_per_second": 136316.988 - }, - { - "epoch": 0.518, - "grad_norm": 0.6037150025367737, - "learning_rate": 0.0001438185254125125, - "loss": 1.2053, - "num_input_tokens_seen": 3394764800, - "step": 51800, - "train_runtime": 24901.8712, - "train_tokens_per_second": 136325.691 - }, - { - "epoch": 0.519, - "grad_norm": 0.6816796064376831, - "learning_rate": 0.00014334296280183473, - "loss": 1.2019, - "num_input_tokens_seen": 3401318400, - "step": 51900, - "train_runtime": 24955.4949, - "train_tokens_per_second": 136295.37 - }, - { - "epoch": 0.52, - "grad_norm": 0.5201036930084229, - "learning_rate": 0.00014286746722744768, - "loss": 1.206, - "num_input_tokens_seen": 3407872000, - "step": 52000, - "train_runtime": 25002.9753, - "train_tokens_per_second": 136298.659 - }, - { - "epoch": 0.521, - "grad_norm": 0.5104642510414124, - "learning_rate": 0.00014239204347758647, - "loss": 1.2029, - "num_input_tokens_seen": 3414425600, - "step": 52100, - "train_runtime": 25051.9745, - "train_tokens_per_second": 136293.672 - }, - { - "epoch": 0.522, - "grad_norm": 0.4965505003929138, - "learning_rate": 0.00014191669633976294, - "loss": 1.1961, - "num_input_tokens_seen": 3420979200, - "step": 52200, - "train_runtime": 25099.3949, - "train_tokens_per_second": 136297.278 - }, - { - "epoch": 0.523, - "grad_norm": 0.5390327572822571, - "learning_rate": 0.00014144143060071756, - "loss": 1.194, - "num_input_tokens_seen": 3427532800, - "step": 52300, - "train_runtime": 25146.6291, - "train_tokens_per_second": 136301.879 - }, - { - "epoch": 0.524, - "grad_norm": 2.647089719772339, - "learning_rate": 0.000140966251046371, - "loss": 1.2006, - "num_input_tokens_seen": 3434086400, - "step": 52400, - "train_runtime": 25194.2742, - "train_tokens_per_second": 136304.24 - }, - { - "epoch": 0.525, - "grad_norm": 0.46030643582344055, - "learning_rate": 0.0001404911624617761, - "loss": 1.2071, - "num_input_tokens_seen": 3440640000, - "step": 52500, - "train_runtime": 25247.7567, - "train_tokens_per_second": 136275.077 - }, - { - "epoch": 0.526, - "grad_norm": 0.487699031829834, - "learning_rate": 0.00014001616963106966, - "loss": 1.2046, - "num_input_tokens_seen": 3447193600, - "step": 52600, - "train_runtime": 25295.5054, - "train_tokens_per_second": 136276.921 - }, - { - "epoch": 0.527, - "grad_norm": 0.4782906472682953, - "learning_rate": 0.00013954127733742416, - "loss": 1.1891, - "num_input_tokens_seen": 3453747200, - "step": 52700, - "train_runtime": 25344.1317, - "train_tokens_per_second": 136274.039 - }, - { - "epoch": 0.528, - "grad_norm": 0.595632016658783, - "learning_rate": 0.0001390664903629998, - "loss": 1.1867, - "num_input_tokens_seen": 3460300800, - "step": 52800, - "train_runtime": 25391.6777, - "train_tokens_per_second": 136276.966 - }, - { - "epoch": 0.529, - "grad_norm": 0.5201537609100342, - "learning_rate": 0.0001385918134888961, - "loss": 1.1955, - "num_input_tokens_seen": 3466854400, - "step": 52900, - "train_runtime": 25439.3874, - "train_tokens_per_second": 136279.005 - }, - { - "epoch": 0.53, - "grad_norm": 0.4726644456386566, - "learning_rate": 0.00013811725149510387, - "loss": 1.206, - "num_input_tokens_seen": 3473408000, - "step": 53000, - "train_runtime": 25492.0415, - "train_tokens_per_second": 136254.603 - }, - { - "epoch": 0.531, - "grad_norm": 0.5846008062362671, - "learning_rate": 0.0001376428091604572, - "loss": 1.2117, - "num_input_tokens_seen": 3479961600, - "step": 53100, - "train_runtime": 25540.3083, - "train_tokens_per_second": 136253.704 - }, - { - "epoch": 0.532, - "grad_norm": 0.4758647382259369, - "learning_rate": 0.00013716849126258512, - "loss": 1.2042, - "num_input_tokens_seen": 3486515200, - "step": 53200, - "train_runtime": 25589.0853, - "train_tokens_per_second": 136250.091 - }, - { - "epoch": 0.533, - "grad_norm": 0.4607105255126953, - "learning_rate": 0.00013669430257786354, - "loss": 1.1992, - "num_input_tokens_seen": 3493068800, - "step": 53300, - "train_runtime": 25636.4376, - "train_tokens_per_second": 136254.063 - }, - { - "epoch": 0.534, - "grad_norm": 0.6885077357292175, - "learning_rate": 0.00013622024788136728, - "loss": 1.2006, - "num_input_tokens_seen": 3499622400, - "step": 53400, - "train_runtime": 25684.4816, - "train_tokens_per_second": 136254.352 - }, - { - "epoch": 0.535, - "grad_norm": 0.6578366160392761, - "learning_rate": 0.00013574633194682185, - "loss": 1.1948, - "num_input_tokens_seen": 3506176000, - "step": 53500, - "train_runtime": 25730.7322, - "train_tokens_per_second": 136264.136 - }, - { - "epoch": 0.536, - "grad_norm": 0.4718693196773529, - "learning_rate": 0.0001352725595465555, - "loss": 1.2, - "num_input_tokens_seen": 3512729600, - "step": 53600, - "train_runtime": 25783.9922, - "train_tokens_per_second": 136236.839 - }, - { - "epoch": 0.537, - "grad_norm": 0.5561531186103821, - "learning_rate": 0.000134798935451451, - "loss": 1.2052, - "num_input_tokens_seen": 3519283200, - "step": 53700, - "train_runtime": 25832.8858, - "train_tokens_per_second": 136232.677 - }, - { - "epoch": 0.538, - "grad_norm": 0.5250628590583801, - "learning_rate": 0.00013432546443089768, - "loss": 1.2, - "num_input_tokens_seen": 3525836800, - "step": 53800, - "train_runtime": 25880.0084, - "train_tokens_per_second": 136237.854 - }, - { - "epoch": 0.539, - "grad_norm": 0.5457636117935181, - "learning_rate": 0.0001338521512527436, - "loss": 1.1944, - "num_input_tokens_seen": 3532390400, - "step": 53900, - "train_runtime": 25927.8228, - "train_tokens_per_second": 136239.376 - }, - { - "epoch": 0.54, - "grad_norm": 0.4437522292137146, - "learning_rate": 0.00013337900068324712, - "loss": 1.1912, - "num_input_tokens_seen": 3538944000, - "step": 54000, - "train_runtime": 25975.6777, - "train_tokens_per_second": 136240.68 - }, - { - "epoch": 0.541, - "grad_norm": 0.5343025326728821, - "learning_rate": 0.00013290601748702918, - "loss": 1.188, - "num_input_tokens_seen": 3545497600, - "step": 54100, - "train_runtime": 26027.6243, - "train_tokens_per_second": 136220.562 - }, - { - "epoch": 0.542, - "grad_norm": 0.4907335042953491, - "learning_rate": 0.00013243320642702543, - "loss": 1.1909, - "num_input_tokens_seen": 3552051200, - "step": 54200, - "train_runtime": 26075.5648, - "train_tokens_per_second": 136221.448 - }, - { - "epoch": 0.543, - "grad_norm": 0.7268043160438538, - "learning_rate": 0.0001319605722644379, - "loss": 1.1911, - "num_input_tokens_seen": 3558604800, - "step": 54300, - "train_runtime": 26122.2114, - "train_tokens_per_second": 136229.079 - }, - { - "epoch": 0.544, - "grad_norm": 1.3769776821136475, - "learning_rate": 0.0001314881197586874, - "loss": 1.224, - "num_input_tokens_seen": 3565158400, - "step": 54400, - "train_runtime": 26170.2324, - "train_tokens_per_second": 136229.528 - }, - { - "epoch": 0.545, - "grad_norm": 0.7141419649124146, - "learning_rate": 0.0001310158536673654, - "loss": 1.2025, - "num_input_tokens_seen": 3571712000, - "step": 54500, - "train_runtime": 26217.6992, - "train_tokens_per_second": 136232.854 - }, - { - "epoch": 0.546, - "grad_norm": 0.5124280452728271, - "learning_rate": 0.0001305437787461862, - "loss": 1.1972, - "num_input_tokens_seen": 3578265600, - "step": 54600, - "train_runtime": 26264.9719, - "train_tokens_per_second": 136237.176 - }, - { - "epoch": 0.547, - "grad_norm": 0.5609524250030518, - "learning_rate": 0.00013007189974893903, - "loss": 1.1924, - "num_input_tokens_seen": 3584819200, - "step": 54700, - "train_runtime": 26319.2824, - "train_tokens_per_second": 136205.051 - }, - { - "epoch": 0.548, - "grad_norm": 0.5220986604690552, - "learning_rate": 0.00012960022142744016, - "loss": 1.188, - "num_input_tokens_seen": 3591372800, - "step": 54800, - "train_runtime": 26367.119, - "train_tokens_per_second": 136206.493 - }, - { - "epoch": 0.549, - "grad_norm": 0.5159165263175964, - "learning_rate": 0.00012912874853148506, - "loss": 1.1891, - "num_input_tokens_seen": 3597926400, - "step": 54900, - "train_runtime": 26415.2651, - "train_tokens_per_second": 136206.333 - }, - { - "epoch": 0.55, - "grad_norm": 0.5019519925117493, - "learning_rate": 0.00012865748580880053, - "loss": 1.1827, - "num_input_tokens_seen": 3604480000, - "step": 55000, - "train_runtime": 26462.5595, - "train_tokens_per_second": 136210.558 - }, - { - "epoch": 0.551, - "grad_norm": 0.5309172868728638, - "learning_rate": 0.0001281864380049969, - "loss": 1.1876, - "num_input_tokens_seen": 3611033600, - "step": 55100, - "train_runtime": 26514.9513, - "train_tokens_per_second": 136188.581 - }, - { - "epoch": 0.552, - "grad_norm": 0.5431755781173706, - "learning_rate": 0.00012771560986352042, - "loss": 1.2038, - "num_input_tokens_seen": 3617587200, - "step": 55200, - "train_runtime": 26562.7975, - "train_tokens_per_second": 136189.993 - }, - { - "epoch": 0.553, - "grad_norm": 0.5063371658325195, - "learning_rate": 0.0001272450061256052, - "loss": 1.1837, - "num_input_tokens_seen": 3624140800, - "step": 55300, - "train_runtime": 26609.2594, - "train_tokens_per_second": 136198.484 - }, - { - "epoch": 0.554, - "grad_norm": 0.502314567565918, - "learning_rate": 0.00012677463153022565, - "loss": 1.1988, - "num_input_tokens_seen": 3630694400, - "step": 55400, - "train_runtime": 26655.8656, - "train_tokens_per_second": 136206.209 - }, - { - "epoch": 0.555, - "grad_norm": 0.5824739336967468, - "learning_rate": 0.0001263044908140488, - "loss": 1.1917, - "num_input_tokens_seen": 3637248000, - "step": 55500, - "train_runtime": 26707.6694, - "train_tokens_per_second": 136187.398 - }, - { - "epoch": 0.556, - "grad_norm": 0.5498598217964172, - "learning_rate": 0.00012583458871138632, - "loss": 1.1908, - "num_input_tokens_seen": 3643801600, - "step": 55600, - "train_runtime": 26755.8413, - "train_tokens_per_second": 136187.144 - }, - { - "epoch": 0.557, - "grad_norm": 0.5867239832878113, - "learning_rate": 0.00012536492995414723, - "loss": 1.193, - "num_input_tokens_seen": 3650355200, - "step": 55700, - "train_runtime": 26804.5182, - "train_tokens_per_second": 136184.324 - }, - { - "epoch": 0.558, - "grad_norm": 0.5584626197814941, - "learning_rate": 0.00012489551927179007, - "loss": 1.1833, - "num_input_tokens_seen": 3656908800, - "step": 55800, - "train_runtime": 26850.8981, - "train_tokens_per_second": 136193.165 - }, - { - "epoch": 0.559, - "grad_norm": 0.48578086495399475, - "learning_rate": 0.00012442636139127508, - "loss": 1.1919, - "num_input_tokens_seen": 3663462400, - "step": 55900, - "train_runtime": 26898.2376, - "train_tokens_per_second": 136197.116 - }, - { - "epoch": 0.56, - "grad_norm": 0.5344805121421814, - "learning_rate": 0.00012395746103701695, - "loss": 1.1978, - "num_input_tokens_seen": 3670016000, - "step": 56000, - "train_runtime": 26951.1383, - "train_tokens_per_second": 136172.95 - }, - { - "epoch": 0.561, - "grad_norm": 0.5378079414367676, - "learning_rate": 0.00012348882293083708, - "loss": 1.192, - "num_input_tokens_seen": 3676569600, - "step": 56100, - "train_runtime": 26999.7429, - "train_tokens_per_second": 136170.541 - }, - { - "epoch": 0.562, - "grad_norm": 0.6195780038833618, - "learning_rate": 0.00012302045179191594, - "loss": 1.1919, - "num_input_tokens_seen": 3683123200, - "step": 56200, - "train_runtime": 27047.827, - "train_tokens_per_second": 136170.761 - }, - { - "epoch": 0.563, - "grad_norm": 0.5348559617996216, - "learning_rate": 0.00012255235233674572, - "loss": 1.1875, - "num_input_tokens_seen": 3689676800, - "step": 56300, - "train_runtime": 27094.1422, - "train_tokens_per_second": 136179.871 - }, - { - "epoch": 0.564, - "grad_norm": 0.48098888993263245, - "learning_rate": 0.00012208452927908278, - "loss": 1.1818, - "num_input_tokens_seen": 3696230400, - "step": 56400, - "train_runtime": 27141.6856, - "train_tokens_per_second": 136182.787 - }, - { - "epoch": 0.565, - "grad_norm": 0.585021436214447, - "learning_rate": 0.00012161698732990003, - "loss": 1.1887, - "num_input_tokens_seen": 3702784000, - "step": 56500, - "train_runtime": 27194.4825, - "train_tokens_per_second": 136159.385 - }, - { - "epoch": 0.566, - "grad_norm": 0.5269266963005066, - "learning_rate": 0.00012114973119733987, - "loss": 1.187, - "num_input_tokens_seen": 3709337600, - "step": 56600, - "train_runtime": 27242.6521, - "train_tokens_per_second": 136159.196 - }, - { - "epoch": 0.567, - "grad_norm": 0.5563040971755981, - "learning_rate": 0.00012068276558666616, - "loss": 1.1996, - "num_input_tokens_seen": 3715891200, - "step": 56700, - "train_runtime": 27290.3101, - "train_tokens_per_second": 136161.56 - }, - { - "epoch": 0.568, - "grad_norm": 0.6131460666656494, - "learning_rate": 0.00012021609520021752, - "loss": 1.195, - "num_input_tokens_seen": 3722444800, - "step": 56800, - "train_runtime": 27337.7804, - "train_tokens_per_second": 136164.851 - }, - { - "epoch": 0.569, - "grad_norm": 0.5921023488044739, - "learning_rate": 0.00011974972473735957, - "loss": 1.2018, - "num_input_tokens_seen": 3728998400, - "step": 56900, - "train_runtime": 27384.9126, - "train_tokens_per_second": 136169.812 - }, - { - "epoch": 0.57, - "grad_norm": 0.4582422375679016, - "learning_rate": 0.00011928365889443764, - "loss": 1.1914, - "num_input_tokens_seen": 3735552000, - "step": 57000, - "train_runtime": 27436.2125, - "train_tokens_per_second": 136154.07 - }, - { - "epoch": 0.571, - "grad_norm": 0.6521887183189392, - "learning_rate": 0.00011881790236472966, - "loss": 1.2041, - "num_input_tokens_seen": 3742105600, - "step": 57100, - "train_runtime": 27484.9505, - "train_tokens_per_second": 136151.076 - }, - { - "epoch": 0.572, - "grad_norm": 0.5971055030822754, - "learning_rate": 0.00011835245983839869, - "loss": 1.1992, - "num_input_tokens_seen": 3748659200, - "step": 57200, - "train_runtime": 27531.7756, - "train_tokens_per_second": 136157.553 - }, - { - "epoch": 0.573, - "grad_norm": 0.5187013745307922, - "learning_rate": 0.00011788733600244575, - "loss": 1.193, - "num_input_tokens_seen": 3755212800, - "step": 57300, - "train_runtime": 27579.3239, - "train_tokens_per_second": 136160.437 - }, - { - "epoch": 0.574, - "grad_norm": 0.5805628299713135, - "learning_rate": 0.00011742253554066278, - "loss": 1.1925, - "num_input_tokens_seen": 3761766400, - "step": 57400, - "train_runtime": 27633.4529, - "train_tokens_per_second": 136130.885 - }, - { - "epoch": 0.575, - "grad_norm": 0.5242844223976135, - "learning_rate": 0.00011695806313358523, - "loss": 1.1991, - "num_input_tokens_seen": 3768320000, - "step": 57500, - "train_runtime": 27681.3237, - "train_tokens_per_second": 136132.218 - }, - { - "epoch": 0.576, - "grad_norm": 0.7652018666267395, - "learning_rate": 0.00011649392345844506, - "loss": 1.192, - "num_input_tokens_seen": 3774873600, - "step": 57600, - "train_runtime": 27728.8266, - "train_tokens_per_second": 136135.353 - }, - { - "epoch": 0.577, - "grad_norm": 0.5232011675834656, - "learning_rate": 0.00011603012118912372, - "loss": 1.2019, - "num_input_tokens_seen": 3781427200, - "step": 57700, - "train_runtime": 27778.1555, - "train_tokens_per_second": 136129.528 - }, - { - "epoch": 0.578, - "grad_norm": 0.5537053942680359, - "learning_rate": 0.00011556666099610485, - "loss": 1.1948, - "num_input_tokens_seen": 3787980800, - "step": 57800, - "train_runtime": 27824.9287, - "train_tokens_per_second": 136136.227 - }, - { - "epoch": 0.579, - "grad_norm": 0.6031852960586548, - "learning_rate": 0.00011510354754642745, - "loss": 1.1888, - "num_input_tokens_seen": 3794534400, - "step": 57900, - "train_runtime": 27872.2044, - "train_tokens_per_second": 136140.448 - }, - { - "epoch": 0.58, - "grad_norm": 0.5748854875564575, - "learning_rate": 0.00011464078550363887, - "loss": 1.1921, - "num_input_tokens_seen": 3801088000, - "step": 58000, - "train_runtime": 27925.2055, - "train_tokens_per_second": 136116.742 - }, - { - "epoch": 0.581, - "grad_norm": 0.5586141347885132, - "learning_rate": 0.0001141783795277477, - "loss": 1.2024, - "num_input_tokens_seen": 3807641600, - "step": 58100, - "train_runtime": 27972.7534, - "train_tokens_per_second": 136119.657 - }, - { - "epoch": 0.582, - "grad_norm": 0.4893476366996765, - "learning_rate": 0.00011371633427517696, - "loss": 1.2034, - "num_input_tokens_seen": 3814195200, - "step": 58200, - "train_runtime": 28020.2529, - "train_tokens_per_second": 136122.797 - }, - { - "epoch": 0.583, - "grad_norm": 0.5007518529891968, - "learning_rate": 0.00011325465439871731, - "loss": 1.1885, - "num_input_tokens_seen": 3820748800, - "step": 58300, - "train_runtime": 28067.154, - "train_tokens_per_second": 136128.829 - }, - { - "epoch": 0.584, - "grad_norm": 0.5260310769081116, - "learning_rate": 0.00011279334454747989, - "loss": 1.1931, - "num_input_tokens_seen": 3827302400, - "step": 58400, - "train_runtime": 28120.6157, - "train_tokens_per_second": 136103.08 - }, - { - "epoch": 0.585, - "grad_norm": 0.5364392399787903, - "learning_rate": 0.00011233240936684981, - "loss": 1.1928, - "num_input_tokens_seen": 3833856000, - "step": 58500, - "train_runtime": 28168.5149, - "train_tokens_per_second": 136104.3 - }, - { - "epoch": 0.586, - "grad_norm": 0.49333399534225464, - "learning_rate": 0.00011187185349843916, - "loss": 1.1935, - "num_input_tokens_seen": 3840409600, - "step": 58600, - "train_runtime": 28215.0596, - "train_tokens_per_second": 136112.05 - }, - { - "epoch": 0.587, - "grad_norm": 0.5711957216262817, - "learning_rate": 0.00011141168158004053, - "loss": 1.1812, - "num_input_tokens_seen": 3846963200, - "step": 58700, - "train_runtime": 28264.2863, - "train_tokens_per_second": 136106.858 - }, - { - "epoch": 0.588, - "grad_norm": 1.0157184600830078, - "learning_rate": 0.00011095189824557998, - "loss": 1.1929, - "num_input_tokens_seen": 3853516800, - "step": 58800, - "train_runtime": 28311.6057, - "train_tokens_per_second": 136110.853 - }, - { - "epoch": 0.589, - "grad_norm": 0.552700936794281, - "learning_rate": 0.00011049250812507054, - "loss": 1.1909, - "num_input_tokens_seen": 3860070400, - "step": 58900, - "train_runtime": 28359.0956, - "train_tokens_per_second": 136114.016 - }, - { - "epoch": 0.59, - "grad_norm": 0.46860748529434204, - "learning_rate": 0.00011003351584456571, - "loss": 1.1972, - "num_input_tokens_seen": 3866624000, - "step": 59000, - "train_runtime": 28412.8978, - "train_tokens_per_second": 136086.929 - }, - { - "epoch": 0.591, - "grad_norm": 0.5399055480957031, - "learning_rate": 0.0001095749260261126, - "loss": 1.1895, - "num_input_tokens_seen": 3873177600, - "step": 59100, - "train_runtime": 28462.0603, - "train_tokens_per_second": 136082.123 - }, - { - "epoch": 0.592, - "grad_norm": 0.49921005964279175, - "learning_rate": 0.00010911674328770559, - "loss": 1.1968, - "num_input_tokens_seen": 3879731200, - "step": 59200, - "train_runtime": 28510.9551, - "train_tokens_per_second": 136078.612 - }, - { - "epoch": 0.593, - "grad_norm": 0.5357686877250671, - "learning_rate": 0.00010865897224323979, - "loss": 1.1889, - "num_input_tokens_seen": 3886284800, - "step": 59300, - "train_runtime": 28558.3344, - "train_tokens_per_second": 136082.334 - }, - { - "epoch": 0.594, - "grad_norm": 0.5710283517837524, - "learning_rate": 0.00010820161750246453, - "loss": 1.1864, - "num_input_tokens_seen": 3892838400, - "step": 59400, - "train_runtime": 28606.454, - "train_tokens_per_second": 136082.522 - }, - { - "epoch": 0.595, - "grad_norm": 0.6333475112915039, - "learning_rate": 0.00010774468367093696, - "loss": 1.2009, - "num_input_tokens_seen": 3899392000, - "step": 59500, - "train_runtime": 28653.986, - "train_tokens_per_second": 136085.5 - }, - { - "epoch": 0.596, - "grad_norm": 0.5585243701934814, - "learning_rate": 0.00010728817534997573, - "loss": 1.1877, - "num_input_tokens_seen": 3905945600, - "step": 59600, - "train_runtime": 28701.832, - "train_tokens_per_second": 136086.979 - }, - { - "epoch": 0.597, - "grad_norm": 0.5805736184120178, - "learning_rate": 0.00010683209713661453, - "loss": 1.211, - "num_input_tokens_seen": 3912499200, - "step": 59700, - "train_runtime": 28751.7229, - "train_tokens_per_second": 136078.774 - }, - { - "epoch": 0.598, - "grad_norm": 0.5607670545578003, - "learning_rate": 0.00010637645362355589, - "loss": 1.196, - "num_input_tokens_seen": 3919052800, - "step": 59800, - "train_runtime": 28798.1873, - "train_tokens_per_second": 136086.788 - }, - { - "epoch": 0.599, - "grad_norm": 0.4962175488471985, - "learning_rate": 0.00010592124939912497, - "loss": 1.1889, - "num_input_tokens_seen": 3925606400, - "step": 59900, - "train_runtime": 28852.3337, - "train_tokens_per_second": 136058.54 - }, - { - "epoch": 0.6, - "grad_norm": 0.6488810777664185, - "learning_rate": 0.00010546648904722326, - "loss": 1.1968, - "num_input_tokens_seen": 3932160000, - "step": 60000, - "train_runtime": 28898.713, - "train_tokens_per_second": 136066.959 - }, - { - "epoch": 0.601, - "grad_norm": 0.9370976686477661, - "learning_rate": 0.0001050121771472824, - "loss": 1.183, - "num_input_tokens_seen": 3938713600, - "step": 60100, - "train_runtime": 28946.5523, - "train_tokens_per_second": 136068.488 - }, - { - "epoch": 0.602, - "grad_norm": 0.5040610432624817, - "learning_rate": 0.0001045583182742182, - "loss": 1.2023, - "num_input_tokens_seen": 3945267200, - "step": 60200, - "train_runtime": 28994.2594, - "train_tokens_per_second": 136070.632 - }, - { - "epoch": 0.603, - "grad_norm": 0.5120612382888794, - "learning_rate": 0.00010410491699838448, - "loss": 1.1865, - "num_input_tokens_seen": 3951820800, - "step": 60300, - "train_runtime": 29042.095, - "train_tokens_per_second": 136072.167 - }, - { - "epoch": 0.604, - "grad_norm": 0.8983064889907837, - "learning_rate": 0.00010365197788552707, - "loss": 1.1734, - "num_input_tokens_seen": 3958374400, - "step": 60400, - "train_runtime": 29090.1772, - "train_tokens_per_second": 136072.543 - }, - { - "epoch": 0.605, - "grad_norm": 0.5155735015869141, - "learning_rate": 0.00010319950549673778, - "loss": 1.1923, - "num_input_tokens_seen": 3964928000, - "step": 60500, - "train_runtime": 29143.642, - "train_tokens_per_second": 136047.787 - }, - { - "epoch": 0.606, - "grad_norm": 1.5562913417816162, - "learning_rate": 0.00010274750438840855, - "loss": 1.1877, - "num_input_tokens_seen": 3971481600, - "step": 60600, - "train_runtime": 29191.8256, - "train_tokens_per_second": 136047.73 - }, - { - "epoch": 0.607, - "grad_norm": 0.5603190064430237, - "learning_rate": 0.00010229597911218554, - "loss": 1.1862, - "num_input_tokens_seen": 3978035200, - "step": 60700, - "train_runtime": 29240.4534, - "train_tokens_per_second": 136045.606 - }, - { - "epoch": 0.608, - "grad_norm": 0.550956130027771, - "learning_rate": 0.00010184493421492324, - "loss": 1.1869, - "num_input_tokens_seen": 3984588800, - "step": 60800, - "train_runtime": 29287.1822, - "train_tokens_per_second": 136052.31 - }, - { - "epoch": 0.609, - "grad_norm": 0.5152813196182251, - "learning_rate": 0.0001013943742386388, - "loss": 1.1902, - "num_input_tokens_seen": 3991142400, - "step": 60900, - "train_runtime": 29335.0152, - "train_tokens_per_second": 136053.872 - }, - { - "epoch": 0.61, - "grad_norm": 0.5258508324623108, - "learning_rate": 0.00010094430372046616, - "loss": 1.1843, - "num_input_tokens_seen": 3997696000, - "step": 61000, - "train_runtime": 29387.778, - "train_tokens_per_second": 136032.605 - }, - { - "epoch": 0.611, - "grad_norm": 0.5804030895233154, - "learning_rate": 0.0001004947271926104, - "loss": 1.1872, - "num_input_tokens_seen": 4004249600, - "step": 61100, - "train_runtime": 29435.5024, - "train_tokens_per_second": 136034.695 - }, - { - "epoch": 0.612, - "grad_norm": 0.5679774284362793, - "learning_rate": 0.00010004564918230222, - "loss": 1.1933, - "num_input_tokens_seen": 4010803200, - "step": 61200, - "train_runtime": 29483.504, - "train_tokens_per_second": 136035.5 - }, - { - "epoch": 0.613, - "grad_norm": 0.611191987991333, - "learning_rate": 9.959707421175217e-05, - "loss": 1.1926, - "num_input_tokens_seen": 4017356800, - "step": 61300, - "train_runtime": 29529.7223, - "train_tokens_per_second": 136044.517 - }, - { - "epoch": 0.614, - "grad_norm": 0.5725626945495605, - "learning_rate": 9.914900679810522e-05, - "loss": 1.1812, - "num_input_tokens_seen": 4023910400, - "step": 61400, - "train_runtime": 29577.4052, - "train_tokens_per_second": 136046.769 - }, - { - "epoch": 0.615, - "grad_norm": 0.6058773398399353, - "learning_rate": 9.870145145339529e-05, - "loss": 1.1904, - "num_input_tokens_seen": 4030464000, - "step": 61500, - "train_runtime": 29630.1636, - "train_tokens_per_second": 136025.708 - }, - { - "epoch": 0.616, - "grad_norm": 0.5151665806770325, - "learning_rate": 9.825441268449969e-05, - "loss": 1.1783, - "num_input_tokens_seen": 4037017600, - "step": 61600, - "train_runtime": 29677.4813, - "train_tokens_per_second": 136029.657 - }, - { - "epoch": 0.617, - "grad_norm": 0.5461622476577759, - "learning_rate": 9.780789499309391e-05, - "loss": 1.1825, - "num_input_tokens_seen": 4043571200, - "step": 61700, - "train_runtime": 29725.432, - "train_tokens_per_second": 136030.696 - }, - { - "epoch": 0.618, - "grad_norm": 0.8243169784545898, - "learning_rate": 9.736190287560608e-05, - "loss": 1.1933, - "num_input_tokens_seen": 4050124800, - "step": 61800, - "train_runtime": 29772.1739, - "train_tokens_per_second": 136037.255 - }, - { - "epoch": 0.619, - "grad_norm": 0.4877258539199829, - "learning_rate": 9.691644082317186e-05, - "loss": 1.1881, - "num_input_tokens_seen": 4056678400, - "step": 61900, - "train_runtime": 29825.721, - "train_tokens_per_second": 136012.752 - }, - { - "epoch": 0.62, - "grad_norm": 0.5376379489898682, - "learning_rate": 9.647151332158926e-05, - "loss": 1.1812, - "num_input_tokens_seen": 4063232000, - "step": 62000, - "train_runtime": 29872.1612, - "train_tokens_per_second": 136020.691 - }, - { - "epoch": 0.621, - "grad_norm": 0.5128985643386841, - "learning_rate": 9.60271248512732e-05, - "loss": 1.1719, - "num_input_tokens_seen": 4069785600, - "step": 62100, - "train_runtime": 29919.8698, - "train_tokens_per_second": 136022.838 - }, - { - "epoch": 0.622, - "grad_norm": 0.6911051273345947, - "learning_rate": 9.558327988721068e-05, - "loss": 1.199, - "num_input_tokens_seen": 4076339200, - "step": 62200, - "train_runtime": 29967.7263, - "train_tokens_per_second": 136024.307 - }, - { - "epoch": 0.623, - "grad_norm": 0.5334423184394836, - "learning_rate": 9.513998289891559e-05, - "loss": 1.1922, - "num_input_tokens_seen": 4082892800, - "step": 62300, - "train_runtime": 30014.7483, - "train_tokens_per_second": 136029.553 - }, - { - "epoch": 0.624, - "grad_norm": 0.47934290766716003, - "learning_rate": 9.469723835038361e-05, - "loss": 1.1864, - "num_input_tokens_seen": 4089446400, - "step": 62400, - "train_runtime": 30062.3944, - "train_tokens_per_second": 136031.959 - }, - { - "epoch": 0.625, - "grad_norm": 0.6690011620521545, - "learning_rate": 9.42550507000475e-05, - "loss": 1.1887, - "num_input_tokens_seen": 4096000000, - "step": 62500, - "train_runtime": 30115.1503, - "train_tokens_per_second": 136011.275 - }, - { - "epoch": 0.626, - "grad_norm": 0.5379562973976135, - "learning_rate": 9.381342440073194e-05, - "loss": 1.1873, - "num_input_tokens_seen": 4102553600, - "step": 62600, - "train_runtime": 30162.8214, - "train_tokens_per_second": 136013.589 - }, - { - "epoch": 0.627, - "grad_norm": 0.5619449615478516, - "learning_rate": 9.337236389960886e-05, - "loss": 1.184, - "num_input_tokens_seen": 4109107200, - "step": 62700, - "train_runtime": 30211.3171, - "train_tokens_per_second": 136012.183 - }, - { - "epoch": 0.628, - "grad_norm": 0.9017994999885559, - "learning_rate": 9.293187363815265e-05, - "loss": 1.1869, - "num_input_tokens_seen": 4115660800, - "step": 62800, - "train_runtime": 30263.5761, - "train_tokens_per_second": 135993.869 - }, - { - "epoch": 0.629, - "grad_norm": 0.6502019762992859, - "learning_rate": 9.249195805209533e-05, - "loss": 1.1944, - "num_input_tokens_seen": 4122214400, - "step": 62900, - "train_runtime": 30310.6247, - "train_tokens_per_second": 135998.992 - }, - { - "epoch": 0.63, - "grad_norm": 0.5749123096466064, - "learning_rate": 9.205262157138192e-05, - "loss": 1.1896, - "num_input_tokens_seen": 4128768000, - "step": 63000, - "train_runtime": 30359.0787, - "train_tokens_per_second": 135997.803 - }, - { - "epoch": 0.631, - "grad_norm": 0.4843611419200897, - "learning_rate": 9.161386862012601e-05, - "loss": 1.1932, - "num_input_tokens_seen": 4135321600, - "step": 63100, - "train_runtime": 30406.8492, - "train_tokens_per_second": 135999.675 - }, - { - "epoch": 0.632, - "grad_norm": 0.634504497051239, - "learning_rate": 9.11757036165649e-05, - "loss": 1.181, - "num_input_tokens_seen": 4141875200, - "step": 63200, - "train_runtime": 30453.794, - "train_tokens_per_second": 136005.228 - }, - { - "epoch": 0.633, - "grad_norm": 0.605948269367218, - "learning_rate": 9.073813097301521e-05, - "loss": 1.1742, - "num_input_tokens_seen": 4148428800, - "step": 63300, - "train_runtime": 30506.719, - "train_tokens_per_second": 135984.102 - }, - { - "epoch": 0.634, - "grad_norm": 0.5731847882270813, - "learning_rate": 9.030115509582883e-05, - "loss": 1.1809, - "num_input_tokens_seen": 4154982400, - "step": 63400, - "train_runtime": 30554.7018, - "train_tokens_per_second": 135985.042 - }, - { - "epoch": 0.635, - "grad_norm": 0.9707246422767639, - "learning_rate": 8.986478038534775e-05, - "loss": 1.1981, - "num_input_tokens_seen": 4161536000, - "step": 63500, - "train_runtime": 30602.1945, - "train_tokens_per_second": 135988.156 - }, - { - "epoch": 0.636, - "grad_norm": 0.7120965719223022, - "learning_rate": 8.942901123586059e-05, - "loss": 1.1816, - "num_input_tokens_seen": 4168089600, - "step": 63600, - "train_runtime": 30649.6499, - "train_tokens_per_second": 135991.426 - }, - { - "epoch": 0.637, - "grad_norm": 0.5136720538139343, - "learning_rate": 8.899385203555781e-05, - "loss": 1.177, - "num_input_tokens_seen": 4174643200, - "step": 63700, - "train_runtime": 30696.6221, - "train_tokens_per_second": 135996.827 - }, - { - "epoch": 0.638, - "grad_norm": 0.5284336805343628, - "learning_rate": 8.855930716648774e-05, - "loss": 1.184, - "num_input_tokens_seen": 4181196800, - "step": 63800, - "train_runtime": 30745.5123, - "train_tokens_per_second": 135993.727 - }, - { - "epoch": 0.639, - "grad_norm": 0.5269259810447693, - "learning_rate": 8.812538100451239e-05, - "loss": 1.2174, - "num_input_tokens_seen": 4187750400, - "step": 63900, - "train_runtime": 30792.1632, - "train_tokens_per_second": 136000.526 - }, - { - "epoch": 0.64, - "grad_norm": 0.5354572534561157, - "learning_rate": 8.769207791926338e-05, - "loss": 1.1771, - "num_input_tokens_seen": 4194304000, - "step": 64000, - "train_runtime": 30846.5823, - "train_tokens_per_second": 135973.054 - }, - { - "epoch": 0.641, - "grad_norm": 0.7058772444725037, - "learning_rate": 8.725940227409797e-05, - "loss": 1.179, - "num_input_tokens_seen": 4200857600, - "step": 64100, - "train_runtime": 30893.4429, - "train_tokens_per_second": 135978.94 - }, - { - "epoch": 0.642, - "grad_norm": 0.5777366161346436, - "learning_rate": 8.682735842605509e-05, - "loss": 1.182, - "num_input_tokens_seen": 4207411200, - "step": 64200, - "train_runtime": 30940.3826, - "train_tokens_per_second": 135984.459 - }, - { - "epoch": 0.643, - "grad_norm": 0.5608710646629333, - "learning_rate": 8.639595072581158e-05, - "loss": 1.1904, - "num_input_tokens_seen": 4213964800, - "step": 64300, - "train_runtime": 30988.4894, - "train_tokens_per_second": 135984.841 - }, - { - "epoch": 0.644, - "grad_norm": 0.6048064231872559, - "learning_rate": 8.596518351763806e-05, - "loss": 1.1851, - "num_input_tokens_seen": 4220518400, - "step": 64400, - "train_runtime": 31041.3711, - "train_tokens_per_second": 135964.304 - }, - { - "epoch": 0.645, - "grad_norm": 0.47835734486579895, - "learning_rate": 8.553506113935561e-05, - "loss": 1.1803, - "num_input_tokens_seen": 4227072000, - "step": 64500, - "train_runtime": 31089.9624, - "train_tokens_per_second": 135962.596 - }, - { - "epoch": 0.646, - "grad_norm": 1.1150704622268677, - "learning_rate": 8.510558792229183e-05, - "loss": 1.1878, - "num_input_tokens_seen": 4233625600, - "step": 64600, - "train_runtime": 31137.4325, - "train_tokens_per_second": 135965.79 - }, - { - "epoch": 0.647, - "grad_norm": 0.6650880575180054, - "learning_rate": 8.467676819123716e-05, - "loss": 1.1951, - "num_input_tokens_seen": 4240179200, - "step": 64700, - "train_runtime": 31185.0957, - "train_tokens_per_second": 135968.132 - }, - { - "epoch": 0.648, - "grad_norm": 0.7750310897827148, - "learning_rate": 8.424860626440158e-05, - "loss": 1.1829, - "num_input_tokens_seen": 4246732800, - "step": 64800, - "train_runtime": 31237.5852, - "train_tokens_per_second": 135949.459 - }, - { - "epoch": 0.649, - "grad_norm": 0.595783531665802, - "learning_rate": 8.382110645337102e-05, - "loss": 1.1856, - "num_input_tokens_seen": 4253286400, - "step": 64900, - "train_runtime": 31285.0064, - "train_tokens_per_second": 135952.87 - }, - { - "epoch": 0.65, - "grad_norm": 0.6093938946723938, - "learning_rate": 8.339427306306365e-05, - "loss": 1.1842, - "num_input_tokens_seen": 4259840000, - "step": 65000, - "train_runtime": 31332.1176, - "train_tokens_per_second": 135957.615 - }, - { - "epoch": 0.651, - "grad_norm": 0.6823499798774719, - "learning_rate": 8.296811039168716e-05, - "loss": 1.1818, - "num_input_tokens_seen": 4266393600, - "step": 65100, - "train_runtime": 31381.0925, - "train_tokens_per_second": 135954.273 - }, - { - "epoch": 0.652, - "grad_norm": 0.5052744746208191, - "learning_rate": 8.254262273069477e-05, - "loss": 1.2034, - "num_input_tokens_seen": 4272947200, - "step": 65200, - "train_runtime": 31428.8012, - "train_tokens_per_second": 135956.417 - }, - { - "epoch": 0.653, - "grad_norm": 0.5003641247749329, - "learning_rate": 8.211781436474263e-05, - "loss": 1.177, - "num_input_tokens_seen": 4279500800, - "step": 65300, - "train_runtime": 31476.0702, - "train_tokens_per_second": 135960.454 - }, - { - "epoch": 0.654, - "grad_norm": 0.5675527453422546, - "learning_rate": 8.169368957164613e-05, - "loss": 1.1707, - "num_input_tokens_seen": 4286054400, - "step": 65400, - "train_runtime": 31524.8831, - "train_tokens_per_second": 135957.82 - }, - { - "epoch": 0.655, - "grad_norm": 0.5109818577766418, - "learning_rate": 8.127025262233731e-05, - "loss": 1.187, - "num_input_tokens_seen": 4292608000, - "step": 65500, - "train_runtime": 31578.0721, - "train_tokens_per_second": 135936.354 - }, - { - "epoch": 0.656, - "grad_norm": 0.6228885054588318, - "learning_rate": 8.084750778082159e-05, - "loss": 1.1944, - "num_input_tokens_seen": 4299161600, - "step": 65600, - "train_runtime": 31626.6624, - "train_tokens_per_second": 135934.723 - }, - { - "epoch": 0.657, - "grad_norm": 0.6139951348304749, - "learning_rate": 8.042545930413473e-05, - "loss": 1.1788, - "num_input_tokens_seen": 4305715200, - "step": 65700, - "train_runtime": 31673.2442, - "train_tokens_per_second": 135941.717 - }, - { - "epoch": 0.658, - "grad_norm": 0.6792371273040771, - "learning_rate": 8.000411144230025e-05, - "loss": 1.2019, - "num_input_tokens_seen": 4312268800, - "step": 65800, - "train_runtime": 31721.455, - "train_tokens_per_second": 135941.709 - }, - { - "epoch": 0.659, - "grad_norm": 0.546470582485199, - "learning_rate": 7.95834684382865e-05, - "loss": 1.1905, - "num_input_tokens_seen": 4318822400, - "step": 65900, - "train_runtime": 31770.1998, - "train_tokens_per_second": 135939.416 - }, - { - "epoch": 0.66, - "grad_norm": 0.5273057818412781, - "learning_rate": 7.916353452796378e-05, - "loss": 1.1769, - "num_input_tokens_seen": 4325376000, - "step": 66000, - "train_runtime": 31818.123, - "train_tokens_per_second": 135940.64 - }, - { - "epoch": 0.661, - "grad_norm": 0.5213398933410645, - "learning_rate": 7.874431394006188e-05, - "loss": 1.1834, - "num_input_tokens_seen": 4331929600, - "step": 66100, - "train_runtime": 31870.8187, - "train_tokens_per_second": 135921.504 - }, - { - "epoch": 0.662, - "grad_norm": 0.5762707591056824, - "learning_rate": 7.832581089612762e-05, - "loss": 1.1875, - "num_input_tokens_seen": 4338483200, - "step": 66200, - "train_runtime": 31918.6258, - "train_tokens_per_second": 135923.245 - }, - { - "epoch": 0.663, - "grad_norm": 0.6153529286384583, - "learning_rate": 7.790802961048183e-05, - "loss": 1.1895, - "num_input_tokens_seen": 4345036800, - "step": 66300, - "train_runtime": 31967.5441, - "train_tokens_per_second": 135920.257 - }, - { - "epoch": 0.664, - "grad_norm": 0.6668293476104736, - "learning_rate": 7.749097429017749e-05, - "loss": 1.1835, - "num_input_tokens_seen": 4351590400, - "step": 66400, - "train_runtime": 32014.502, - "train_tokens_per_second": 135925.6 - }, - { - "epoch": 0.665, - "grad_norm": 0.49117180705070496, - "learning_rate": 7.70746491349571e-05, - "loss": 1.1762, - "num_input_tokens_seen": 4358144000, - "step": 66500, - "train_runtime": 32062.234, - "train_tokens_per_second": 135927.646 - }, - { - "epoch": 0.666, - "grad_norm": 0.5580335259437561, - "learning_rate": 7.665905833721025e-05, - "loss": 1.1751, - "num_input_tokens_seen": 4364697600, - "step": 66600, - "train_runtime": 32116.4057, - "train_tokens_per_second": 135902.431 - }, - { - "epoch": 0.667, - "grad_norm": 0.4941908121109009, - "learning_rate": 7.624420608193171e-05, - "loss": 1.1991, - "num_input_tokens_seen": 4371251200, - "step": 66700, - "train_runtime": 32164.7962, - "train_tokens_per_second": 135901.722 - }, - { - "epoch": 0.668, - "grad_norm": 0.5203377604484558, - "learning_rate": 7.583009654667912e-05, - "loss": 1.1892, - "num_input_tokens_seen": 4377804800, - "step": 66800, - "train_runtime": 32211.7614, - "train_tokens_per_second": 135907.029 - }, - { - "epoch": 0.669, - "grad_norm": 0.5924380421638489, - "learning_rate": 7.541673390153087e-05, - "loss": 1.1749, - "num_input_tokens_seen": 4384358400, - "step": 66900, - "train_runtime": 32259.5523, - "train_tokens_per_second": 135908.842 - }, - { - "epoch": 0.67, - "grad_norm": 0.5180861353874207, - "learning_rate": 7.500412230904416e-05, - "loss": 1.1833, - "num_input_tokens_seen": 4390912000, - "step": 67000, - "train_runtime": 32305.7062, - "train_tokens_per_second": 135917.536 - }, - { - "epoch": 0.671, - "grad_norm": 0.5575404167175293, - "learning_rate": 7.459226592421318e-05, - "loss": 1.1908, - "num_input_tokens_seen": 4397465600, - "step": 67100, - "train_runtime": 32353.5616, - "train_tokens_per_second": 135919.058 - }, - { - "epoch": 0.672, - "grad_norm": 0.519868016242981, - "learning_rate": 7.418116889442721e-05, - "loss": 1.191, - "num_input_tokens_seen": 4404019200, - "step": 67200, - "train_runtime": 32407.2129, - "train_tokens_per_second": 135896.265 - }, - { - "epoch": 0.673, - "grad_norm": 0.5036019086837769, - "learning_rate": 7.377083535942868e-05, - "loss": 1.1771, - "num_input_tokens_seen": 4410572800, - "step": 67300, - "train_runtime": 32454.4825, - "train_tokens_per_second": 135900.266 - }, - { - "epoch": 0.674, - "grad_norm": 0.5349675416946411, - "learning_rate": 7.336126945127178e-05, - "loss": 1.1834, - "num_input_tokens_seen": 4417126400, - "step": 67400, - "train_runtime": 32501.8427, - "train_tokens_per_second": 135903.876 - }, - { - "epoch": 0.675, - "grad_norm": 0.675538957118988, - "learning_rate": 7.29524752942807e-05, - "loss": 1.1852, - "num_input_tokens_seen": 4423680000, - "step": 67500, - "train_runtime": 32550.3797, - "train_tokens_per_second": 135902.562 - }, - { - "epoch": 0.676, - "grad_norm": 0.5116747617721558, - "learning_rate": 7.254445700500798e-05, - "loss": 1.1816, - "num_input_tokens_seen": 4430233600, - "step": 67600, - "train_runtime": 32598.0387, - "train_tokens_per_second": 135904.913 - }, - { - "epoch": 0.677, - "grad_norm": 0.5892815589904785, - "learning_rate": 7.213721869219329e-05, - "loss": 1.1827, - "num_input_tokens_seen": 4436787200, - "step": 67700, - "train_runtime": 32650.3715, - "train_tokens_per_second": 135887.802 - }, - { - "epoch": 0.678, - "grad_norm": 0.6862092614173889, - "learning_rate": 7.173076445672198e-05, - "loss": 1.1801, - "num_input_tokens_seen": 4443340800, - "step": 67800, - "train_runtime": 32698.6817, - "train_tokens_per_second": 135887.46 - }, - { - "epoch": 0.679, - "grad_norm": 0.8308249115943909, - "learning_rate": 7.132509839158359e-05, - "loss": 1.1887, - "num_input_tokens_seen": 4449894400, - "step": 67900, - "train_runtime": 32745.9782, - "train_tokens_per_second": 135891.326 - }, - { - "epoch": 0.68, - "grad_norm": 0.5063105225563049, - "learning_rate": 7.092022458183096e-05, - "loss": 1.1949, - "num_input_tokens_seen": 4456448000, - "step": 68000, - "train_runtime": 32794.3077, - "train_tokens_per_second": 135890.9 - }, - { - "epoch": 0.681, - "grad_norm": 0.6090216040611267, - "learning_rate": 7.051614710453888e-05, - "loss": 1.1827, - "num_input_tokens_seen": 4463001600, - "step": 68100, - "train_runtime": 32841.7871, - "train_tokens_per_second": 135893.993 - }, - { - "epoch": 0.682, - "grad_norm": 0.5802315473556519, - "learning_rate": 7.011287002876296e-05, - "loss": 1.1808, - "num_input_tokens_seen": 4469555200, - "step": 68200, - "train_runtime": 32889.3297, - "train_tokens_per_second": 135896.816 - }, - { - "epoch": 0.683, - "grad_norm": 0.5431249141693115, - "learning_rate": 6.971039741549894e-05, - "loss": 1.1872, - "num_input_tokens_seen": 4476108800, - "step": 68300, - "train_runtime": 32943.0615, - "train_tokens_per_second": 135874.099 - }, - { - "epoch": 0.684, - "grad_norm": 0.8621413111686707, - "learning_rate": 6.930873331764162e-05, - "loss": 1.1776, - "num_input_tokens_seen": 4482662400, - "step": 68400, - "train_runtime": 32991.0019, - "train_tokens_per_second": 135875.304 - }, - { - "epoch": 0.685, - "grad_norm": 0.6102387309074402, - "learning_rate": 6.890788177994391e-05, - "loss": 1.18, - "num_input_tokens_seen": 4489216000, - "step": 68500, - "train_runtime": 33039.2288, - "train_tokens_per_second": 135875.326 - }, - { - "epoch": 0.686, - "grad_norm": 0.5266649723052979, - "learning_rate": 6.850784683897641e-05, - "loss": 1.1743, - "num_input_tokens_seen": 4495769600, - "step": 68600, - "train_runtime": 33086.8363, - "train_tokens_per_second": 135877.893 - }, - { - "epoch": 0.687, - "grad_norm": 0.5879511833190918, - "learning_rate": 6.810863252308653e-05, - "loss": 1.1803, - "num_input_tokens_seen": 4502323200, - "step": 68700, - "train_runtime": 33133.6328, - "train_tokens_per_second": 135883.778 - }, - { - "epoch": 0.688, - "grad_norm": 0.5183672308921814, - "learning_rate": 6.771024285235792e-05, - "loss": 1.1834, - "num_input_tokens_seen": 4508876800, - "step": 68800, - "train_runtime": 33182.6281, - "train_tokens_per_second": 135880.642 - }, - { - "epoch": 0.689, - "grad_norm": 0.5091114640235901, - "learning_rate": 6.73126818385702e-05, - "loss": 1.1913, - "num_input_tokens_seen": 4515430400, - "step": 68900, - "train_runtime": 33236.4019, - "train_tokens_per_second": 135857.979 - }, - { - "epoch": 0.69, - "grad_norm": 0.7696628570556641, - "learning_rate": 6.691595348515837e-05, - "loss": 1.1786, - "num_input_tokens_seen": 4521984000, - "step": 69000, - "train_runtime": 33285.7582, - "train_tokens_per_second": 135853.417 - }, - { - "epoch": 0.691, - "grad_norm": 0.5338857769966125, - "learning_rate": 6.65200617871726e-05, - "loss": 1.1832, - "num_input_tokens_seen": 4528537600, - "step": 69100, - "train_runtime": 33332.8826, - "train_tokens_per_second": 135857.965 - }, - { - "epoch": 0.692, - "grad_norm": 0.7705228328704834, - "learning_rate": 6.612501073123775e-05, - "loss": 1.1762, - "num_input_tokens_seen": 4535091200, - "step": 69200, - "train_runtime": 33380.8611, - "train_tokens_per_second": 135859.024 - }, - { - "epoch": 0.693, - "grad_norm": 0.5423911213874817, - "learning_rate": 6.573080429551368e-05, - "loss": 1.19, - "num_input_tokens_seen": 4541644800, - "step": 69300, - "train_runtime": 33429.7481, - "train_tokens_per_second": 135856.387 - }, - { - "epoch": 0.694, - "grad_norm": 0.5332856774330139, - "learning_rate": 6.533744644965482e-05, - "loss": 1.1753, - "num_input_tokens_seen": 4548198400, - "step": 69400, - "train_runtime": 33476.6955, - "train_tokens_per_second": 135861.629 - }, - { - "epoch": 0.695, - "grad_norm": 0.5862846970558167, - "learning_rate": 6.494494115477023e-05, - "loss": 1.1799, - "num_input_tokens_seen": 4554752000, - "step": 69500, - "train_runtime": 33523.7618, - "train_tokens_per_second": 135866.375 - }, - { - "epoch": 0.696, - "grad_norm": 0.658592164516449, - "learning_rate": 6.455329236338394e-05, - "loss": 1.1846, - "num_input_tokens_seen": 4561305600, - "step": 69600, - "train_runtime": 33571.8888, - "train_tokens_per_second": 135866.815 - }, - { - "epoch": 0.697, - "grad_norm": 0.5558256506919861, - "learning_rate": 6.416250401939496e-05, - "loss": 1.1873, - "num_input_tokens_seen": 4567859200, - "step": 69700, - "train_runtime": 33620.7189, - "train_tokens_per_second": 135864.412 - }, - { - "epoch": 0.698, - "grad_norm": 0.5283026099205017, - "learning_rate": 6.377258005803746e-05, - "loss": 1.1743, - "num_input_tokens_seen": 4574412800, - "step": 69800, - "train_runtime": 33674.4741, - "train_tokens_per_second": 135842.145 - }, - { - "epoch": 0.699, - "grad_norm": 0.802412211894989, - "learning_rate": 6.338352440584149e-05, - "loss": 1.1782, - "num_input_tokens_seen": 4580966400, - "step": 69900, - "train_runtime": 33722.7187, - "train_tokens_per_second": 135842.144 - }, - { - "epoch": 0.7, - "grad_norm": 0.5585867762565613, - "learning_rate": 6.299534098059318e-05, - "loss": 1.1809, - "num_input_tokens_seen": 4587520000, - "step": 70000, - "train_runtime": 33770.2671, - "train_tokens_per_second": 135844.943 - }, - { - "epoch": 0.701, - "grad_norm": 0.6285941004753113, - "learning_rate": 6.260803369129522e-05, - "loss": 1.1807, - "num_input_tokens_seen": 4594073600, - "step": 70100, - "train_runtime": 33818.7011, - "train_tokens_per_second": 135844.176 - }, - { - "epoch": 0.702, - "grad_norm": 0.9580085277557373, - "learning_rate": 6.222160643812774e-05, - "loss": 1.1802, - "num_input_tokens_seen": 4600627200, - "step": 70200, - "train_runtime": 33866.618, - "train_tokens_per_second": 135845.487 - }, - { - "epoch": 0.703, - "grad_norm": 0.6520081162452698, - "learning_rate": 6.183606311240901e-05, - "loss": 1.1879, - "num_input_tokens_seen": 4607180800, - "step": 70300, - "train_runtime": 33915.2388, - "train_tokens_per_second": 135843.973 - }, - { - "epoch": 0.704, - "grad_norm": 0.520710289478302, - "learning_rate": 6.145140759655585e-05, - "loss": 1.179, - "num_input_tokens_seen": 4613734400, - "step": 70400, - "train_runtime": 33968.6026, - "train_tokens_per_second": 135823.497 - }, - { - "epoch": 0.705, - "grad_norm": 0.5945906639099121, - "learning_rate": 6.10676437640451e-05, - "loss": 1.192, - "num_input_tokens_seen": 4620288000, - "step": 70500, - "train_runtime": 34016.7254, - "train_tokens_per_second": 135824.008 - }, - { - "epoch": 0.706, - "grad_norm": 0.5285692811012268, - "learning_rate": 6.068477547937436e-05, - "loss": 1.1855, - "num_input_tokens_seen": 4626841600, - "step": 70600, - "train_runtime": 34064.6033, - "train_tokens_per_second": 135825.495 - }, - { - "epoch": 0.707, - "grad_norm": 0.6492000222206116, - "learning_rate": 6.030280659802294e-05, - "loss": 1.192, - "num_input_tokens_seen": 4633395200, - "step": 70700, - "train_runtime": 34111.1694, - "train_tokens_per_second": 135832.2 - }, - { - "epoch": 0.708, - "grad_norm": 0.5521112084388733, - "learning_rate": 5.9921740966413204e-05, - "loss": 1.1781, - "num_input_tokens_seen": 4639948800, - "step": 70800, - "train_runtime": 34162.8893, - "train_tokens_per_second": 135818.395 - }, - { - "epoch": 0.709, - "grad_norm": 0.9012600183486938, - "learning_rate": 5.954158242187197e-05, - "loss": 1.1748, - "num_input_tokens_seen": 4646502400, - "step": 70900, - "train_runtime": 34211.5739, - "train_tokens_per_second": 135816.68 - }, - { - "epoch": 0.71, - "grad_norm": 0.4976861774921417, - "learning_rate": 5.91623347925914e-05, - "loss": 1.1902, - "num_input_tokens_seen": 4653056000, - "step": 71000, - "train_runtime": 34258.4131, - "train_tokens_per_second": 135822.287 - }, - { - "epoch": 0.711, - "grad_norm": 0.5690837502479553, - "learning_rate": 5.8784001897590996e-05, - "loss": 1.1767, - "num_input_tokens_seen": 4659609600, - "step": 71100, - "train_runtime": 34307.7023, - "train_tokens_per_second": 135818.177 - }, - { - "epoch": 0.712, - "grad_norm": 0.5648302435874939, - "learning_rate": 5.840658754667877e-05, - "loss": 1.182, - "num_input_tokens_seen": 4666163200, - "step": 71200, - "train_runtime": 34355.8058, - "train_tokens_per_second": 135818.768 - }, - { - "epoch": 0.713, - "grad_norm": 0.5309351086616516, - "learning_rate": 5.8030095540413144e-05, - "loss": 1.1755, - "num_input_tokens_seen": 4672716800, - "step": 71300, - "train_runtime": 34402.7961, - "train_tokens_per_second": 135823.751 - }, - { - "epoch": 0.714, - "grad_norm": 1.0066486597061157, - "learning_rate": 5.7654529670064326e-05, - "loss": 1.2073, - "num_input_tokens_seen": 4679270400, - "step": 71400, - "train_runtime": 34458.8447, - "train_tokens_per_second": 135793.015 - }, - { - "epoch": 0.715, - "grad_norm": 0.625823974609375, - "learning_rate": 5.7279893717576485e-05, - "loss": 1.2012, - "num_input_tokens_seen": 4685824000, - "step": 71500, - "train_runtime": 34506.5957, - "train_tokens_per_second": 135795.024 - }, - { - "epoch": 0.716, - "grad_norm": 0.512055516242981, - "learning_rate": 5.690619145552958e-05, - "loss": 1.1702, - "num_input_tokens_seen": 4692377600, - "step": 71600, - "train_runtime": 34554.5393, - "train_tokens_per_second": 135796.271 - }, - { - "epoch": 0.717, - "grad_norm": 0.749454915523529, - "learning_rate": 5.6533426647101135e-05, - "loss": 1.1812, - "num_input_tokens_seen": 4698931200, - "step": 71700, - "train_runtime": 34601.4153, - "train_tokens_per_second": 135801.705 - }, - { - "epoch": 0.718, - "grad_norm": 0.5417782068252563, - "learning_rate": 5.6161603046028674e-05, - "loss": 1.1681, - "num_input_tokens_seen": 4705484800, - "step": 71800, - "train_runtime": 34650.0822, - "train_tokens_per_second": 135800.105 - }, - { - "epoch": 0.719, - "grad_norm": 0.7127480506896973, - "learning_rate": 5.579072439657179e-05, - "loss": 1.1946, - "num_input_tokens_seen": 4712038400, - "step": 71900, - "train_runtime": 34698.539, - "train_tokens_per_second": 135799.331 - }, - { - "epoch": 0.72, - "grad_norm": 0.5434790253639221, - "learning_rate": 5.542079443347431e-05, - "loss": 1.1761, - "num_input_tokens_seen": 4718592000, - "step": 72000, - "train_runtime": 34745.7766, - "train_tokens_per_second": 135803.325 - }, - { - "epoch": 0.721, - "grad_norm": 0.5872786045074463, - "learning_rate": 5.505181688192682e-05, - "loss": 1.1758, - "num_input_tokens_seen": 4725145600, - "step": 72100, - "train_runtime": 34797.942, - "train_tokens_per_second": 135788.076 - }, - { - "epoch": 0.722, - "grad_norm": 0.5440493822097778, - "learning_rate": 5.468379545752925e-05, - "loss": 1.2086, - "num_input_tokens_seen": 4731699200, - "step": 72200, - "train_runtime": 34846.6082, - "train_tokens_per_second": 135786.507 - }, - { - "epoch": 0.723, - "grad_norm": 0.5699992775917053, - "learning_rate": 5.4316733866253166e-05, - "loss": 1.1705, - "num_input_tokens_seen": 4738252800, - "step": 72300, - "train_runtime": 34894.2941, - "train_tokens_per_second": 135788.756 - }, - { - "epoch": 0.724, - "grad_norm": 0.7067492604255676, - "learning_rate": 5.3950635804404754e-05, - "loss": 1.1788, - "num_input_tokens_seen": 4744806400, - "step": 72400, - "train_runtime": 34943.1279, - "train_tokens_per_second": 135786.539 - }, - { - "epoch": 0.725, - "grad_norm": 0.4926595389842987, - "learning_rate": 5.358550495858751e-05, - "loss": 1.1712, - "num_input_tokens_seen": 4751360000, - "step": 72500, - "train_runtime": 34988.8033, - "train_tokens_per_second": 135796.585 - }, - { - "epoch": 0.726, - "grad_norm": 0.6217764616012573, - "learning_rate": 5.322134500566487e-05, - "loss": 1.199, - "num_input_tokens_seen": 4757913600, - "step": 72600, - "train_runtime": 35043.098, - "train_tokens_per_second": 135773.201 - }, - { - "epoch": 0.727, - "grad_norm": 0.5704054236412048, - "learning_rate": 5.285815961272359e-05, - "loss": 1.1782, - "num_input_tokens_seen": 4764467200, - "step": 72700, - "train_runtime": 35090.0359, - "train_tokens_per_second": 135778.351 - }, - { - "epoch": 0.728, - "grad_norm": 0.6081520915031433, - "learning_rate": 5.249595243703658e-05, - "loss": 1.1679, - "num_input_tokens_seen": 4771020800, - "step": 72800, - "train_runtime": 35136.6254, - "train_tokens_per_second": 135784.833 - }, - { - "epoch": 0.729, - "grad_norm": 0.6235555410385132, - "learning_rate": 5.213472712602598e-05, - "loss": 1.1707, - "num_input_tokens_seen": 4777574400, - "step": 72900, - "train_runtime": 35185.4188, - "train_tokens_per_second": 135782.792 - }, - { - "epoch": 0.73, - "grad_norm": 0.5777461528778076, - "learning_rate": 5.17744873172267e-05, - "loss": 1.1816, - "num_input_tokens_seen": 4784128000, - "step": 73000, - "train_runtime": 35238.2318, - "train_tokens_per_second": 135765.268 - }, - { - "epoch": 0.731, - "grad_norm": 0.569218635559082, - "learning_rate": 5.1415236638249694e-05, - "loss": 1.1757, - "num_input_tokens_seen": 4790681600, - "step": 73100, - "train_runtime": 35286.0257, - "train_tokens_per_second": 135767.106 - }, - { - "epoch": 0.732, - "grad_norm": 1.2679173946380615, - "learning_rate": 5.105697870674519e-05, - "loss": 1.1686, - "num_input_tokens_seen": 4797235200, - "step": 73200, - "train_runtime": 35333.5517, - "train_tokens_per_second": 135769.969 - }, - { - "epoch": 0.733, - "grad_norm": 0.5663115382194519, - "learning_rate": 5.069971713036664e-05, - "loss": 1.1699, - "num_input_tokens_seen": 4803788800, - "step": 73300, - "train_runtime": 35380.3642, - "train_tokens_per_second": 135775.561 - }, - { - "epoch": 0.734, - "grad_norm": 0.5404617190361023, - "learning_rate": 5.034345550673415e-05, - "loss": 1.1916, - "num_input_tokens_seen": 4810342400, - "step": 73400, - "train_runtime": 35434.8234, - "train_tokens_per_second": 135751.838 - }, - { - "epoch": 0.735, - "grad_norm": 0.7994534373283386, - "learning_rate": 4.998819742339835e-05, - "loss": 1.1842, - "num_input_tokens_seen": 4816896000, - "step": 73500, - "train_runtime": 35482.3263, - "train_tokens_per_second": 135754.797 - }, - { - "epoch": 0.736, - "grad_norm": 0.6482565402984619, - "learning_rate": 4.963394645780411e-05, - "loss": 1.1789, - "num_input_tokens_seen": 4823449600, - "step": 73600, - "train_runtime": 35530.782, - "train_tokens_per_second": 135754.107 - }, - { - "epoch": 0.737, - "grad_norm": 0.5401994585990906, - "learning_rate": 4.928070617725482e-05, - "loss": 1.1832, - "num_input_tokens_seen": 4830003200, - "step": 73700, - "train_runtime": 35578.1016, - "train_tokens_per_second": 135757.755 - }, - { - "epoch": 0.738, - "grad_norm": 0.5170857906341553, - "learning_rate": 4.892848013887613e-05, - "loss": 1.1804, - "num_input_tokens_seen": 4836556800, - "step": 73800, - "train_runtime": 35625.1017, - "train_tokens_per_second": 135762.61 - }, - { - "epoch": 0.739, - "grad_norm": 0.5744811296463013, - "learning_rate": 4.857727188958031e-05, - "loss": 1.181, - "num_input_tokens_seen": 4843110400, - "step": 73900, - "train_runtime": 35672.7413, - "train_tokens_per_second": 135765.019 - }, - { - "epoch": 0.74, - "grad_norm": 0.6613340377807617, - "learning_rate": 4.822708496603052e-05, - "loss": 1.1879, - "num_input_tokens_seen": 4849664000, - "step": 74000, - "train_runtime": 35721.0554, - "train_tokens_per_second": 135764.858 - }, - { - "epoch": 0.741, - "grad_norm": 0.5571849346160889, - "learning_rate": 4.7877922894605304e-05, - "loss": 1.1781, - "num_input_tokens_seen": 4856217600, - "step": 74100, - "train_runtime": 35771.1997, - "train_tokens_per_second": 135757.75 - }, - { - "epoch": 0.742, - "grad_norm": 0.6960323452949524, - "learning_rate": 4.752978919136273e-05, - "loss": 1.1702, - "num_input_tokens_seen": 4862771200, - "step": 74200, - "train_runtime": 35823.4168, - "train_tokens_per_second": 135742.808 - }, - { - "epoch": 0.743, - "grad_norm": 0.5823075175285339, - "learning_rate": 4.7182687362005337e-05, - "loss": 1.1762, - "num_input_tokens_seen": 4869324800, - "step": 74300, - "train_runtime": 35872.0393, - "train_tokens_per_second": 135741.511 - }, - { - "epoch": 0.744, - "grad_norm": 0.5310567021369934, - "learning_rate": 4.6836620901844794e-05, - "loss": 1.1737, - "num_input_tokens_seen": 4875878400, - "step": 74400, - "train_runtime": 35918.2124, - "train_tokens_per_second": 135749.473 - }, - { - "epoch": 0.745, - "grad_norm": 0.560118556022644, - "learning_rate": 4.64915932957664e-05, - "loss": 1.1746, - "num_input_tokens_seen": 4882432000, - "step": 74500, - "train_runtime": 35972.3831, - "train_tokens_per_second": 135727.232 - }, - { - "epoch": 0.746, - "grad_norm": 0.5729120969772339, - "learning_rate": 4.614760801819433e-05, - "loss": 1.1729, - "num_input_tokens_seen": 4888985600, - "step": 74600, - "train_runtime": 36018.4093, - "train_tokens_per_second": 135735.744 - }, - { - "epoch": 0.747, - "grad_norm": 0.5329717397689819, - "learning_rate": 4.58046685330566e-05, - "loss": 1.1969, - "num_input_tokens_seen": 4895539200, - "step": 74700, - "train_runtime": 36066.8487, - "train_tokens_per_second": 135735.152 - }, - { - "epoch": 0.748, - "grad_norm": 0.5714908838272095, - "learning_rate": 4.546277829374993e-05, - "loss": 1.172, - "num_input_tokens_seen": 4902092800, - "step": 74800, - "train_runtime": 36115.3648, - "train_tokens_per_second": 135734.273 - }, - { - "epoch": 0.749, - "grad_norm": 0.5672817826271057, - "learning_rate": 4.5121940743105246e-05, - "loss": 1.1813, - "num_input_tokens_seen": 4908646400, - "step": 74900, - "train_runtime": 36164.0493, - "train_tokens_per_second": 135732.765 - }, - { - "epoch": 0.75, - "grad_norm": 0.5890370607376099, - "learning_rate": 4.478215931335295e-05, - "loss": 1.1667, - "num_input_tokens_seen": 4915200000, - "step": 75000, - "train_runtime": 36215.8524, - "train_tokens_per_second": 135719.572 - }, - { - "epoch": 0.751, - "grad_norm": 0.6215245127677917, - "learning_rate": 4.4443437426088205e-05, - "loss": 1.179, - "num_input_tokens_seen": 4921753600, - "step": 75100, - "train_runtime": 36264.1849, - "train_tokens_per_second": 135719.405 - }, - { - "epoch": 0.752, - "grad_norm": 1.4719446897506714, - "learning_rate": 4.410577849223666e-05, - "loss": 1.1847, - "num_input_tokens_seen": 4928307200, - "step": 75200, - "train_runtime": 36312.9781, - "train_tokens_per_second": 135717.516 - }, - { - "epoch": 0.753, - "grad_norm": 1.3475043773651123, - "learning_rate": 4.376918591202006e-05, - "loss": 1.1745, - "num_input_tokens_seen": 4934860800, - "step": 75300, - "train_runtime": 36359.7761, - "train_tokens_per_second": 135723.08 - }, - { - "epoch": 0.754, - "grad_norm": 0.9558594822883606, - "learning_rate": 4.3433663074922046e-05, - "loss": 1.181, - "num_input_tokens_seen": 4941414400, - "step": 75400, - "train_runtime": 36406.8385, - "train_tokens_per_second": 135727.644 - }, - { - "epoch": 0.755, - "grad_norm": 0.5916360020637512, - "learning_rate": 4.309921335965367e-05, - "loss": 1.1706, - "num_input_tokens_seen": 4947968000, - "step": 75500, - "train_runtime": 36460.2599, - "train_tokens_per_second": 135708.522 - }, - { - "epoch": 0.756, - "grad_norm": 0.5985275506973267, - "learning_rate": 4.276584013411992e-05, - "loss": 1.1758, - "num_input_tokens_seen": 4954521600, - "step": 75600, - "train_runtime": 36507.6786, - "train_tokens_per_second": 135711.768 - }, - { - "epoch": 0.757, - "grad_norm": 0.5550095438957214, - "learning_rate": 4.243354675538555e-05, - "loss": 1.1705, - "num_input_tokens_seen": 4961075200, - "step": 75700, - "train_runtime": 36554.9962, - "train_tokens_per_second": 135715.38 - }, - { - "epoch": 0.758, - "grad_norm": 0.5496001243591309, - "learning_rate": 4.210233656964111e-05, - "loss": 1.1746, - "num_input_tokens_seen": 4967628800, - "step": 75800, - "train_runtime": 36602.3493, - "train_tokens_per_second": 135718.851 - }, - { - "epoch": 0.759, - "grad_norm": 0.570070743560791, - "learning_rate": 4.1772212912169516e-05, - "loss": 1.1771, - "num_input_tokens_seen": 4974182400, - "step": 75900, - "train_runtime": 36656.3482, - "train_tokens_per_second": 135697.707 - }, - { - "epoch": 0.76, - "grad_norm": 0.7570028305053711, - "learning_rate": 4.14431791073124e-05, - "loss": 1.1756, - "num_input_tokens_seen": 4980736000, - "step": 76000, - "train_runtime": 36704.1036, - "train_tokens_per_second": 135699.704 - }, - { - "epoch": 0.761, - "grad_norm": 0.6243161559104919, - "learning_rate": 4.111523846843639e-05, - "loss": 1.1667, - "num_input_tokens_seen": 4987289600, - "step": 76100, - "train_runtime": 36753.037, - "train_tokens_per_second": 135697.347 - }, - { - "epoch": 0.762, - "grad_norm": 0.5531216263771057, - "learning_rate": 4.078839429790019e-05, - "loss": 1.1755, - "num_input_tokens_seen": 4993843200, - "step": 76200, - "train_runtime": 36800.3039, - "train_tokens_per_second": 135701.14 - }, - { - "epoch": 0.763, - "grad_norm": 0.5894837379455566, - "learning_rate": 4.046264988702097e-05, - "loss": 1.1778, - "num_input_tokens_seen": 5000396800, - "step": 76300, - "train_runtime": 36847.8696, - "train_tokens_per_second": 135703.824 - }, - { - "epoch": 0.764, - "grad_norm": 0.6210083365440369, - "learning_rate": 4.013800851604123e-05, - "loss": 1.1729, - "num_input_tokens_seen": 5006950400, - "step": 76400, - "train_runtime": 36901.2456, - "train_tokens_per_second": 135685.133 - }, - { - "epoch": 0.765, - "grad_norm": 0.5929700136184692, - "learning_rate": 3.981447345409606e-05, - "loss": 1.171, - "num_input_tokens_seen": 5013504000, - "step": 76500, - "train_runtime": 36949.2788, - "train_tokens_per_second": 135686.113 - }, - { - "epoch": 0.766, - "grad_norm": 0.5809143781661987, - "learning_rate": 3.949204795917995e-05, - "loss": 1.1775, - "num_input_tokens_seen": 5020057600, - "step": 76600, - "train_runtime": 36996.6957, - "train_tokens_per_second": 135689.35 - }, - { - "epoch": 0.767, - "grad_norm": 0.5398791432380676, - "learning_rate": 3.917073527811399e-05, - "loss": 1.1765, - "num_input_tokens_seen": 5026611200, - "step": 76700, - "train_runtime": 37044.9859, - "train_tokens_per_second": 135689.381 - }, - { - "epoch": 0.768, - "grad_norm": 0.8559983372688293, - "learning_rate": 3.885053864651334e-05, - "loss": 1.1661, - "num_input_tokens_seen": 5033164800, - "step": 76800, - "train_runtime": 37092.5707, - "train_tokens_per_second": 135691.992 - }, - { - "epoch": 0.769, - "grad_norm": 1.0961577892303467, - "learning_rate": 3.8531461288754564e-05, - "loss": 1.1734, - "num_input_tokens_seen": 5039718400, - "step": 76900, - "train_runtime": 37145.642, - "train_tokens_per_second": 135674.554 - }, - { - "epoch": 0.77, - "grad_norm": 0.5564078688621521, - "learning_rate": 3.821350641794305e-05, - "loss": 1.1783, - "num_input_tokens_seen": 5046272000, - "step": 77000, - "train_runtime": 37194.2194, - "train_tokens_per_second": 135673.556 - }, - { - "epoch": 0.771, - "grad_norm": 0.6036384701728821, - "learning_rate": 3.789667723588087e-05, - "loss": 1.1651, - "num_input_tokens_seen": 5052825600, - "step": 77100, - "train_runtime": 37242.6728, - "train_tokens_per_second": 135673.012 - }, - { - "epoch": 0.772, - "grad_norm": 1.4465519189834595, - "learning_rate": 3.758097693303431e-05, - "loss": 1.1783, - "num_input_tokens_seen": 5059379200, - "step": 77200, - "train_runtime": 37290.7014, - "train_tokens_per_second": 135674.015 - }, - { - "epoch": 0.773, - "grad_norm": 0.5566693544387817, - "learning_rate": 3.7266408688502005e-05, - "loss": 1.1751, - "num_input_tokens_seen": 5065932800, - "step": 77300, - "train_runtime": 37338.6452, - "train_tokens_per_second": 135675.324 - }, - { - "epoch": 0.774, - "grad_norm": 0.653806209564209, - "learning_rate": 3.695297566998256e-05, - "loss": 1.1709, - "num_input_tokens_seen": 5072486400, - "step": 77400, - "train_runtime": 37386.3122, - "train_tokens_per_second": 135677.634 - }, - { - "epoch": 0.775, - "grad_norm": 0.8704593777656555, - "learning_rate": 3.664068103374307e-05, - "loss": 1.1794, - "num_input_tokens_seen": 5079040000, - "step": 77500, - "train_runtime": 37436.1356, - "train_tokens_per_second": 135672.123 - }, - { - "epoch": 0.776, - "grad_norm": 0.6627979874610901, - "learning_rate": 3.63295279245871e-05, - "loss": 1.175, - "num_input_tokens_seen": 5085593600, - "step": 77600, - "train_runtime": 37484.0969, - "train_tokens_per_second": 135673.366 - }, - { - "epoch": 0.777, - "grad_norm": 0.6232652068138123, - "learning_rate": 3.601951947582291e-05, - "loss": 1.1665, - "num_input_tokens_seen": 5092147200, - "step": 77700, - "train_runtime": 37536.8508, - "train_tokens_per_second": 135657.283 - }, - { - "epoch": 0.778, - "grad_norm": 0.5873488187789917, - "learning_rate": 3.571065880923216e-05, - "loss": 1.1734, - "num_input_tokens_seen": 5098700800, - "step": 77800, - "train_runtime": 37584.0839, - "train_tokens_per_second": 135661.17 - }, - { - "epoch": 0.779, - "grad_norm": 0.56858891248703, - "learning_rate": 3.540294903503841e-05, - "loss": 1.1696, - "num_input_tokens_seen": 5105254400, - "step": 77900, - "train_runtime": 37631.6286, - "train_tokens_per_second": 135663.924 - }, - { - "epoch": 0.78, - "grad_norm": 0.5939886569976807, - "learning_rate": 3.5096393251875566e-05, - "loss": 1.1784, - "num_input_tokens_seen": 5111808000, - "step": 78000, - "train_runtime": 37679.4424, - "train_tokens_per_second": 135665.702 - }, - { - "epoch": 0.781, - "grad_norm": 0.5839298367500305, - "learning_rate": 3.479099454675701e-05, - "loss": 1.1672, - "num_input_tokens_seen": 5118361600, - "step": 78100, - "train_runtime": 37733.7363, - "train_tokens_per_second": 135644.177 - }, - { - "epoch": 0.782, - "grad_norm": 0.6057742238044739, - "learning_rate": 3.448675599504434e-05, - "loss": 1.1767, - "num_input_tokens_seen": 5124915200, - "step": 78200, - "train_runtime": 37781.8162, - "train_tokens_per_second": 135645.019 - }, - { - "epoch": 0.783, - "grad_norm": 0.9875990748405457, - "learning_rate": 3.418368066041633e-05, - "loss": 1.1619, - "num_input_tokens_seen": 5131468800, - "step": 78300, - "train_runtime": 37829.8727, - "train_tokens_per_second": 135645.944 - }, - { - "epoch": 0.784, - "grad_norm": 0.5806832313537598, - "learning_rate": 3.388177159483826e-05, - "loss": 1.1747, - "num_input_tokens_seen": 5138022400, - "step": 78400, - "train_runtime": 37877.7351, - "train_tokens_per_second": 135647.561 - }, - { - "epoch": 0.785, - "grad_norm": 0.7016937136650085, - "learning_rate": 3.3581031838531116e-05, - "loss": 1.1664, - "num_input_tokens_seen": 5144576000, - "step": 78500, - "train_runtime": 37924.0105, - "train_tokens_per_second": 135654.851 - }, - { - "epoch": 0.786, - "grad_norm": 0.7171750664710999, - "learning_rate": 3.328146441994084e-05, - "loss": 1.1905, - "num_input_tokens_seen": 5151129600, - "step": 78600, - "train_runtime": 37971.9481, - "train_tokens_per_second": 135656.184 - }, - { - "epoch": 0.787, - "grad_norm": 0.5550017356872559, - "learning_rate": 3.2983072355708026e-05, - "loss": 1.1741, - "num_input_tokens_seen": 5157683200, - "step": 78700, - "train_runtime": 38021.3003, - "train_tokens_per_second": 135652.467 - }, - { - "epoch": 0.788, - "grad_norm": 0.5833317637443542, - "learning_rate": 3.2685858650637486e-05, - "loss": 1.176, - "num_input_tokens_seen": 5164236800, - "step": 78800, - "train_runtime": 38074.1209, - "train_tokens_per_second": 135636.403 - }, - { - "epoch": 0.789, - "grad_norm": 0.9918714165687561, - "learning_rate": 3.238982629766793e-05, - "loss": 1.1653, - "num_input_tokens_seen": 5170790400, - "step": 78900, - "train_runtime": 38121.5575, - "train_tokens_per_second": 135639.537 - }, - { - "epoch": 0.79, - "grad_norm": 1.2304959297180176, - "learning_rate": 3.209497827784177e-05, - "loss": 1.177, - "num_input_tokens_seen": 5177344000, - "step": 79000, - "train_runtime": 38168.8984, - "train_tokens_per_second": 135643.003 - }, - { - "epoch": 0.791, - "grad_norm": 0.5920888185501099, - "learning_rate": 3.1801317560275394e-05, - "loss": 1.1717, - "num_input_tokens_seen": 5183897600, - "step": 79100, - "train_runtime": 38223.2691, - "train_tokens_per_second": 135621.513 - }, - { - "epoch": 0.792, - "grad_norm": 0.5991621017456055, - "learning_rate": 3.150884710212895e-05, - "loss": 1.1933, - "num_input_tokens_seen": 5190451200, - "step": 79200, - "train_runtime": 38270.5225, - "train_tokens_per_second": 135625.303 - }, - { - "epoch": 0.793, - "grad_norm": 0.6007819175720215, - "learning_rate": 3.121756984857665e-05, - "loss": 1.1721, - "num_input_tokens_seen": 5197004800, - "step": 79300, - "train_runtime": 38316.5535, - "train_tokens_per_second": 135633.41 - }, - { - "epoch": 0.794, - "grad_norm": 0.6040635704994202, - "learning_rate": 3.092748873277725e-05, - "loss": 1.1784, - "num_input_tokens_seen": 5203558400, - "step": 79400, - "train_runtime": 38364.1371, - "train_tokens_per_second": 135636.008 - }, - { - "epoch": 0.795, - "grad_norm": 1.8925070762634277, - "learning_rate": 3.06386066758444e-05, - "loss": 1.179, - "num_input_tokens_seen": 5210112000, - "step": 79500, - "train_runtime": 38412.6561, - "train_tokens_per_second": 135635.297 - }, - { - "epoch": 0.796, - "grad_norm": 0.6026915311813354, - "learning_rate": 3.0350926586817127e-05, - "loss": 1.1706, - "num_input_tokens_seen": 5216665600, - "step": 79600, - "train_runtime": 38465.3514, - "train_tokens_per_second": 135619.861 - }, - { - "epoch": 0.797, - "grad_norm": 0.7981861233711243, - "learning_rate": 3.0064451362630765e-05, - "loss": 1.1796, - "num_input_tokens_seen": 5223219200, - "step": 79700, - "train_runtime": 38512.271, - "train_tokens_per_second": 135624.804 - }, - { - "epoch": 0.798, - "grad_norm": 1.3739973306655884, - "learning_rate": 2.9779183888087683e-05, - "loss": 1.1827, - "num_input_tokens_seen": 5229772800, - "step": 79800, - "train_runtime": 38560.5377, - "train_tokens_per_second": 135624.997 - }, - { - "epoch": 0.799, - "grad_norm": 0.7507041692733765, - "learning_rate": 2.9495127035828103e-05, - "loss": 1.164, - "num_input_tokens_seen": 5236326400, - "step": 79900, - "train_runtime": 38608.5419, - "train_tokens_per_second": 135626.111 - }, - { - "epoch": 0.8, - "grad_norm": 0.5848426818847656, - "learning_rate": 2.921228366630144e-05, - "loss": 1.1746, - "num_input_tokens_seen": 5242880000, - "step": 80000, - "train_runtime": 38660.3487, - "train_tokens_per_second": 135613.883 - }, - { - "epoch": 0.801, - "grad_norm": 0.5851396322250366, - "learning_rate": 2.8930656627737276e-05, - "loss": 1.1999, - "num_input_tokens_seen": 5249433600, - "step": 80100, - "train_runtime": 38707.849, - "train_tokens_per_second": 135616.774 - }, - { - "epoch": 0.802, - "grad_norm": 0.5581755638122559, - "learning_rate": 2.8650248756116727e-05, - "loss": 1.1657, - "num_input_tokens_seen": 5255987200, - "step": 80200, - "train_runtime": 38755.0614, - "train_tokens_per_second": 135620.665 - }, - { - "epoch": 0.803, - "grad_norm": 0.8737390637397766, - "learning_rate": 2.8371062875143968e-05, - "loss": 1.168, - "num_input_tokens_seen": 5262540800, - "step": 80300, - "train_runtime": 38809.0814, - "train_tokens_per_second": 135600.757 - }, - { - "epoch": 0.804, - "grad_norm": 0.6018446683883667, - "learning_rate": 2.809310179621776e-05, - "loss": 1.1603, - "num_input_tokens_seen": 5269094400, - "step": 80400, - "train_runtime": 38856.5205, - "train_tokens_per_second": 135603.866 - }, - { - "epoch": 0.805, - "grad_norm": 0.5673835873603821, - "learning_rate": 2.781636831840303e-05, - "loss": 1.1748, - "num_input_tokens_seen": 5275648000, - "step": 80500, - "train_runtime": 38904.9212, - "train_tokens_per_second": 135603.616 - }, - { - "epoch": 0.806, - "grad_norm": 0.5929433703422546, - "learning_rate": 2.754086522840282e-05, - "loss": 1.1663, - "num_input_tokens_seen": 5282201600, - "step": 80600, - "train_runtime": 38952.3955, - "train_tokens_per_second": 135606.592 - }, - { - "epoch": 0.807, - "grad_norm": 0.555366039276123, - "learning_rate": 2.7266595300530204e-05, - "loss": 1.1665, - "num_input_tokens_seen": 5288755200, - "step": 80700, - "train_runtime": 39001.4372, - "train_tokens_per_second": 135604.11 - }, - { - "epoch": 0.808, - "grad_norm": 0.5364073514938354, - "learning_rate": 2.6993561296680342e-05, - "loss": 1.1687, - "num_input_tokens_seen": 5295308800, - "step": 80800, - "train_runtime": 39048.23, - "train_tokens_per_second": 135609.445 - }, - { - "epoch": 0.809, - "grad_norm": 0.9588598608970642, - "learning_rate": 2.672176596630258e-05, - "loss": 1.1831, - "num_input_tokens_seen": 5301862400, - "step": 80900, - "train_runtime": 39096.7929, - "train_tokens_per_second": 135608.627 - }, - { - "epoch": 0.81, - "grad_norm": 0.6481744050979614, - "learning_rate": 2.6451212046372883e-05, - "loss": 1.1686, - "num_input_tokens_seen": 5308416000, - "step": 81000, - "train_runtime": 39152.1435, - "train_tokens_per_second": 135584.301 - }, - { - "epoch": 0.811, - "grad_norm": 0.5828465819358826, - "learning_rate": 2.6181902261366256e-05, - "loss": 1.1662, - "num_input_tokens_seen": 5314969600, - "step": 81100, - "train_runtime": 39199.715, - "train_tokens_per_second": 135586.945 - }, - { - "epoch": 0.812, - "grad_norm": 0.5715954899787903, - "learning_rate": 2.5913839323229195e-05, - "loss": 1.1623, - "num_input_tokens_seen": 5321523200, - "step": 81200, - "train_runtime": 39246.528, - "train_tokens_per_second": 135592.203 - }, - { - "epoch": 0.813, - "grad_norm": 0.8631576299667358, - "learning_rate": 2.564702593135253e-05, - "loss": 1.1896, - "num_input_tokens_seen": 5328076800, - "step": 81300, - "train_runtime": 39294.7731, - "train_tokens_per_second": 135592.507 - }, - { - "epoch": 0.814, - "grad_norm": 0.5882650017738342, - "learning_rate": 2.538146477254419e-05, - "loss": 1.1728, - "num_input_tokens_seen": 5334630400, - "step": 81400, - "train_runtime": 39341.8017, - "train_tokens_per_second": 135597.003 - }, - { - "epoch": 0.815, - "grad_norm": 0.5567020773887634, - "learning_rate": 2.5117158521002033e-05, - "loss": 1.1669, - "num_input_tokens_seen": 5341184000, - "step": 81500, - "train_runtime": 39389.3033, - "train_tokens_per_second": 135599.86 - }, - { - "epoch": 0.816, - "grad_norm": 0.7412062883377075, - "learning_rate": 2.4854109838287116e-05, - "loss": 1.1629, - "num_input_tokens_seen": 5347737600, - "step": 81600, - "train_runtime": 39443.4282, - "train_tokens_per_second": 135579.939 - }, - { - "epoch": 0.817, - "grad_norm": 0.6353700757026672, - "learning_rate": 2.459232137329679e-05, - "loss": 1.1676, - "num_input_tokens_seen": 5354291200, - "step": 81700, - "train_runtime": 39490.3956, - "train_tokens_per_second": 135584.643 - }, - { - "epoch": 0.818, - "grad_norm": 0.6541226506233215, - "learning_rate": 2.4331795762237894e-05, - "loss": 1.1669, - "num_input_tokens_seen": 5360844800, - "step": 81800, - "train_runtime": 39539.3049, - "train_tokens_per_second": 135582.677 - }, - { - "epoch": 0.819, - "grad_norm": 0.684333086013794, - "learning_rate": 2.4072535628600514e-05, - "loss": 1.1623, - "num_input_tokens_seen": 5367398400, - "step": 81900, - "train_runtime": 39587.3713, - "train_tokens_per_second": 135583.602 - }, - { - "epoch": 0.82, - "grad_norm": 0.5568915605545044, - "learning_rate": 2.3814543583131306e-05, - "loss": 1.1662, - "num_input_tokens_seen": 5373952000, - "step": 82000, - "train_runtime": 39636.1132, - "train_tokens_per_second": 135582.214 - }, - { - "epoch": 0.821, - "grad_norm": 0.6357592940330505, - "learning_rate": 2.3557822223807287e-05, - "loss": 1.1617, - "num_input_tokens_seen": 5380505600, - "step": 82100, - "train_runtime": 39683.9299, - "train_tokens_per_second": 135583.991 - }, - { - "epoch": 0.822, - "grad_norm": 0.6660736203193665, - "learning_rate": 2.3302374135809727e-05, - "loss": 1.1788, - "num_input_tokens_seen": 5387059200, - "step": 82200, - "train_runtime": 39731.7683, - "train_tokens_per_second": 135585.69 - }, - { - "epoch": 0.823, - "grad_norm": 0.6093869805335999, - "learning_rate": 2.304820189149798e-05, - "loss": 1.1823, - "num_input_tokens_seen": 5393612800, - "step": 82300, - "train_runtime": 39780.5498, - "train_tokens_per_second": 135584.169 - }, - { - "epoch": 0.824, - "grad_norm": 1.0343610048294067, - "learning_rate": 2.2795308050383787e-05, - "loss": 1.1942, - "num_input_tokens_seen": 5400166400, - "step": 82400, - "train_runtime": 39833.9775, - "train_tokens_per_second": 135566.839 - }, - { - "epoch": 0.825, - "grad_norm": 0.5363211035728455, - "learning_rate": 2.2543695159105248e-05, - "loss": 1.1659, - "num_input_tokens_seen": 5406720000, - "step": 82500, - "train_runtime": 39881.8503, - "train_tokens_per_second": 135568.434 - }, - { - "epoch": 0.826, - "grad_norm": 0.9732265472412109, - "learning_rate": 2.2293365751401443e-05, - "loss": 1.1757, - "num_input_tokens_seen": 5413273600, - "step": 82600, - "train_runtime": 39929.975, - "train_tokens_per_second": 135569.171 - }, - { - "epoch": 0.827, - "grad_norm": 0.5309200286865234, - "learning_rate": 2.2044322348086735e-05, - "loss": 1.1651, - "num_input_tokens_seen": 5419827200, - "step": 82700, - "train_runtime": 39978.229, - "train_tokens_per_second": 135569.467 - }, - { - "epoch": 0.828, - "grad_norm": 0.543769121170044, - "learning_rate": 2.1796567457025372e-05, - "loss": 1.1685, - "num_input_tokens_seen": 5426380800, - "step": 82800, - "train_runtime": 40026.0125, - "train_tokens_per_second": 135571.356 - }, - { - "epoch": 0.829, - "grad_norm": 0.5210631489753723, - "learning_rate": 2.15501035731064e-05, - "loss": 1.1778, - "num_input_tokens_seen": 5432934400, - "step": 82900, - "train_runtime": 40075.0654, - "train_tokens_per_second": 135568.947 - }, - { - "epoch": 0.83, - "grad_norm": 1.3538480997085571, - "learning_rate": 2.1304933178218426e-05, - "loss": 1.1655, - "num_input_tokens_seen": 5439488000, - "step": 83000, - "train_runtime": 40123.2015, - "train_tokens_per_second": 135569.64 - }, - { - "epoch": 0.831, - "grad_norm": 1.2901802062988281, - "learning_rate": 2.1061058741224518e-05, - "loss": 1.1668, - "num_input_tokens_seen": 5446041600, - "step": 83100, - "train_runtime": 40170.8312, - "train_tokens_per_second": 135572.042 - }, - { - "epoch": 0.832, - "grad_norm": 0.6960340142250061, - "learning_rate": 2.0818482717937596e-05, - "loss": 1.163, - "num_input_tokens_seen": 5452595200, - "step": 83200, - "train_runtime": 40225.6882, - "train_tokens_per_second": 135550.079 - }, - { - "epoch": 0.833, - "grad_norm": 0.537268340587616, - "learning_rate": 2.0577207551095552e-05, - "loss": 1.1689, - "num_input_tokens_seen": 5459148800, - "step": 83300, - "train_runtime": 40273.4785, - "train_tokens_per_second": 135551.956 - }, - { - "epoch": 0.834, - "grad_norm": 0.564239501953125, - "learning_rate": 2.0337235670336584e-05, - "loss": 1.1662, - "num_input_tokens_seen": 5465702400, - "step": 83400, - "train_runtime": 40320.9705, - "train_tokens_per_second": 135554.832 - }, - { - "epoch": 0.835, - "grad_norm": 0.520041823387146, - "learning_rate": 2.0098569492174887e-05, - "loss": 1.1642, - "num_input_tokens_seen": 5472256000, - "step": 83500, - "train_runtime": 40369.1468, - "train_tokens_per_second": 135555.404 - }, - { - "epoch": 0.836, - "grad_norm": 0.616112232208252, - "learning_rate": 1.9861211419976258e-05, - "loss": 1.1671, - "num_input_tokens_seen": 5478809600, - "step": 83600, - "train_runtime": 40416.0661, - "train_tokens_per_second": 135560.19 - }, - { - "epoch": 0.837, - "grad_norm": 1.3083754777908325, - "learning_rate": 1.962516384393377e-05, - "loss": 1.1778, - "num_input_tokens_seen": 5485363200, - "step": 83700, - "train_runtime": 40465.3405, - "train_tokens_per_second": 135557.075 - }, - { - "epoch": 0.838, - "grad_norm": 0.5721991062164307, - "learning_rate": 1.939042914104396e-05, - "loss": 1.179, - "num_input_tokens_seen": 5491916800, - "step": 83800, - "train_runtime": 40513.1256, - "train_tokens_per_second": 135558.951 - }, - { - "epoch": 0.839, - "grad_norm": 0.8014708161354065, - "learning_rate": 1.9157009675082702e-05, - "loss": 1.1698, - "num_input_tokens_seen": 5498470400, - "step": 83900, - "train_runtime": 40567.2253, - "train_tokens_per_second": 135539.721 - }, - { - "epoch": 0.84, - "grad_norm": 0.7554424405097961, - "learning_rate": 1.8924907796581363e-05, - "loss": 1.1689, - "num_input_tokens_seen": 5505024000, - "step": 84000, - "train_runtime": 40615.2949, - "train_tokens_per_second": 135540.663 - }, - { - "epoch": 0.841, - "grad_norm": 0.6026338934898376, - "learning_rate": 1.869412584280329e-05, - "loss": 1.1727, - "num_input_tokens_seen": 5511577600, - "step": 84100, - "train_runtime": 40664.1179, - "train_tokens_per_second": 135539.091 - }, - { - "epoch": 0.842, - "grad_norm": 0.6569694876670837, - "learning_rate": 1.8464666137720208e-05, - "loss": 1.1717, - "num_input_tokens_seen": 5518131200, - "step": 84200, - "train_runtime": 40713.1869, - "train_tokens_per_second": 135536.705 - }, - { - "epoch": 0.843, - "grad_norm": 0.5886375904083252, - "learning_rate": 1.823653099198884e-05, - "loss": 1.1764, - "num_input_tokens_seen": 5524684800, - "step": 84300, - "train_runtime": 40759.1071, - "train_tokens_per_second": 135544.795 - }, - { - "epoch": 0.844, - "grad_norm": 0.6782867908477783, - "learning_rate": 1.800972270292749e-05, - "loss": 1.1637, - "num_input_tokens_seen": 5531238400, - "step": 84400, - "train_runtime": 40811.77, - "train_tokens_per_second": 135530.471 - }, - { - "epoch": 0.845, - "grad_norm": 0.6513829231262207, - "learning_rate": 1.778424355449317e-05, - "loss": 1.165, - "num_input_tokens_seen": 5537792000, - "step": 84500, - "train_runtime": 40858.6857, - "train_tokens_per_second": 135535.245 - }, - { - "epoch": 0.846, - "grad_norm": 0.6192531585693359, - "learning_rate": 1.756009581725841e-05, - "loss": 1.1589, - "num_input_tokens_seen": 5544345600, - "step": 84600, - "train_runtime": 40906.0609, - "train_tokens_per_second": 135538.487 - }, - { - "epoch": 0.847, - "grad_norm": 0.5640349388122559, - "learning_rate": 1.7337281748388387e-05, - "loss": 1.1653, - "num_input_tokens_seen": 5550899200, - "step": 84700, - "train_runtime": 40952.689, - "train_tokens_per_second": 135544.194 - }, - { - "epoch": 0.848, - "grad_norm": 0.5606239438056946, - "learning_rate": 1.7115803591618312e-05, - "loss": 1.1734, - "num_input_tokens_seen": 5557452800, - "step": 84800, - "train_runtime": 41006.8402, - "train_tokens_per_second": 135525.019 - }, - { - "epoch": 0.849, - "grad_norm": 0.5700273513793945, - "learning_rate": 1.6895663577230816e-05, - "loss": 1.1755, - "num_input_tokens_seen": 5564006400, - "step": 84900, - "train_runtime": 41054.6511, - "train_tokens_per_second": 135526.822 - }, - { - "epoch": 0.85, - "grad_norm": 0.7111489176750183, - "learning_rate": 1.667686392203333e-05, - "loss": 1.1673, - "num_input_tokens_seen": 5570560000, - "step": 85000, - "train_runtime": 41102.7763, - "train_tokens_per_second": 135527.585 - }, - { - "epoch": 0.851, - "grad_norm": 0.5908454060554504, - "learning_rate": 1.6459406829335996e-05, - "loss": 1.1767, - "num_input_tokens_seen": 5577113600, - "step": 85100, - "train_runtime": 41150.3215, - "train_tokens_per_second": 135530.256 - }, - { - "epoch": 0.852, - "grad_norm": 0.6215232610702515, - "learning_rate": 1.624329448892932e-05, - "loss": 1.171, - "num_input_tokens_seen": 5583667200, - "step": 85200, - "train_runtime": 41205.5284, - "train_tokens_per_second": 135507.72 - }, - { - "epoch": 0.853, - "grad_norm": 0.6203814744949341, - "learning_rate": 1.6028529077062163e-05, - "loss": 1.1591, - "num_input_tokens_seen": 5590220800, - "step": 85300, - "train_runtime": 41253.0291, - "train_tokens_per_second": 135510.553 - }, - { - "epoch": 0.854, - "grad_norm": 0.5267207026481628, - "learning_rate": 1.5815112756419805e-05, - "loss": 1.185, - "num_input_tokens_seen": 5596774400, - "step": 85400, - "train_runtime": 41301.2794, - "train_tokens_per_second": 135510.921 - }, - { - "epoch": 0.855, - "grad_norm": 0.5815737843513489, - "learning_rate": 1.5603047676102313e-05, - "loss": 1.173, - "num_input_tokens_seen": 5603328000, - "step": 85500, - "train_runtime": 41349.0127, - "train_tokens_per_second": 135512.982 - }, - { - "epoch": 0.856, - "grad_norm": 0.6342357397079468, - "learning_rate": 1.5392335971602638e-05, - "loss": 1.1568, - "num_input_tokens_seen": 5609881600, - "step": 85600, - "train_runtime": 41397.1556, - "train_tokens_per_second": 135513.697 - }, - { - "epoch": 0.857, - "grad_norm": 0.6623713970184326, - "learning_rate": 1.5182979764785258e-05, - "loss": 1.1649, - "num_input_tokens_seen": 5616435200, - "step": 85700, - "train_runtime": 41450.3243, - "train_tokens_per_second": 135497.98 - }, - { - "epoch": 0.858, - "grad_norm": 0.6217081546783447, - "learning_rate": 1.4974981163864896e-05, - "loss": 1.1772, - "num_input_tokens_seen": 5622988800, - "step": 85800, - "train_runtime": 41497.5379, - "train_tokens_per_second": 135501.745 - }, - { - "epoch": 0.859, - "grad_norm": 0.6180946826934814, - "learning_rate": 1.4768342263385192e-05, - "loss": 1.1601, - "num_input_tokens_seen": 5629542400, - "step": 85900, - "train_runtime": 41546.6611, - "train_tokens_per_second": 135499.274 - }, - { - "epoch": 0.86, - "grad_norm": 0.5609486103057861, - "learning_rate": 1.4563065144197517e-05, - "loss": 1.1866, - "num_input_tokens_seen": 5636096000, - "step": 86000, - "train_runtime": 41594.2678, - "train_tokens_per_second": 135501.748 - }, - { - "epoch": 0.861, - "grad_norm": 0.5352550148963928, - "learning_rate": 1.4359151873440216e-05, - "loss": 1.1732, - "num_input_tokens_seen": 5642649600, - "step": 86100, - "train_runtime": 41640.8053, - "train_tokens_per_second": 135507.696 - }, - { - "epoch": 0.862, - "grad_norm": 0.5788577198982239, - "learning_rate": 1.415660450451767e-05, - "loss": 1.1785, - "num_input_tokens_seen": 5649203200, - "step": 86200, - "train_runtime": 41695.0254, - "train_tokens_per_second": 135488.662 - }, - { - "epoch": 0.863, - "grad_norm": 0.5672028064727783, - "learning_rate": 1.3955425077079595e-05, - "loss": 1.1692, - "num_input_tokens_seen": 5655756800, - "step": 86300, - "train_runtime": 41742.7936, - "train_tokens_per_second": 135490.615 - }, - { - "epoch": 0.864, - "grad_norm": 0.577563464641571, - "learning_rate": 1.375561561700061e-05, - "loss": 1.1662, - "num_input_tokens_seen": 5662310400, - "step": 86400, - "train_runtime": 41789.652, - "train_tokens_per_second": 135495.515 - }, - { - "epoch": 0.865, - "grad_norm": 0.544994592666626, - "learning_rate": 1.3557178136359798e-05, - "loss": 1.1665, - "num_input_tokens_seen": 5668864000, - "step": 86500, - "train_runtime": 41842.8709, - "train_tokens_per_second": 135479.805 - }, - { - "epoch": 0.866, - "grad_norm": 0.5978608727455139, - "learning_rate": 1.3360114633420333e-05, - "loss": 1.1644, - "num_input_tokens_seen": 5675417600, - "step": 86600, - "train_runtime": 41891.5143, - "train_tokens_per_second": 135478.932 - }, - { - "epoch": 0.867, - "grad_norm": 0.6005887985229492, - "learning_rate": 1.3164427092609503e-05, - "loss": 1.1742, - "num_input_tokens_seen": 5681971200, - "step": 86700, - "train_runtime": 41939.4895, - "train_tokens_per_second": 135480.218 - }, - { - "epoch": 0.868, - "grad_norm": 0.5312247276306152, - "learning_rate": 1.2970117484498732e-05, - "loss": 1.1575, - "num_input_tokens_seen": 5688524800, - "step": 86800, - "train_runtime": 41987.1811, - "train_tokens_per_second": 135482.418 - }, - { - "epoch": 0.869, - "grad_norm": 0.9317598342895508, - "learning_rate": 1.2777187765783558e-05, - "loss": 1.1668, - "num_input_tokens_seen": 5695078400, - "step": 86900, - "train_runtime": 42034.5611, - "train_tokens_per_second": 135485.616 - }, - { - "epoch": 0.87, - "grad_norm": 0.5501394271850586, - "learning_rate": 1.2585639879264103e-05, - "loss": 1.1741, - "num_input_tokens_seen": 5701632000, - "step": 87000, - "train_runtime": 42082.1201, - "train_tokens_per_second": 135488.231 - }, - { - "epoch": 0.871, - "grad_norm": 0.6144236326217651, - "learning_rate": 1.2395475753825518e-05, - "loss": 1.1665, - "num_input_tokens_seen": 5708185600, - "step": 87100, - "train_runtime": 42136.7086, - "train_tokens_per_second": 135468.236 - }, - { - "epoch": 0.872, - "grad_norm": 0.6324082612991333, - "learning_rate": 1.2206697304418367e-05, - "loss": 1.1523, - "num_input_tokens_seen": 5714739200, - "step": 87200, - "train_runtime": 42184.2095, - "train_tokens_per_second": 135471.051 - }, - { - "epoch": 0.873, - "grad_norm": 0.6486518979072571, - "learning_rate": 1.2019306432039594e-05, - "loss": 1.1872, - "num_input_tokens_seen": 5721292800, - "step": 87300, - "train_runtime": 42230.9222, - "train_tokens_per_second": 135476.388 - }, - { - "epoch": 0.874, - "grad_norm": 0.5755148530006409, - "learning_rate": 1.1833305023713153e-05, - "loss": 1.1963, - "num_input_tokens_seen": 5727846400, - "step": 87400, - "train_runtime": 42278.9901, - "train_tokens_per_second": 135477.37 - }, - { - "epoch": 0.875, - "grad_norm": 0.6408706307411194, - "learning_rate": 1.1648694952471205e-05, - "loss": 1.163, - "num_input_tokens_seen": 5734400000, - "step": 87500, - "train_runtime": 42326.8376, - "train_tokens_per_second": 135479.056 - }, - { - "epoch": 0.876, - "grad_norm": 0.6233325600624084, - "learning_rate": 1.1465478077335088e-05, - "loss": 1.1591, - "num_input_tokens_seen": 5740953600, - "step": 87600, - "train_runtime": 42379.4952, - "train_tokens_per_second": 135465.36 - }, - { - "epoch": 0.877, - "grad_norm": 0.8282228708267212, - "learning_rate": 1.1283656243296695e-05, - "loss": 1.1799, - "num_input_tokens_seen": 5747507200, - "step": 87700, - "train_runtime": 42427.8149, - "train_tokens_per_second": 135465.548 - }, - { - "epoch": 0.878, - "grad_norm": 0.7755045294761658, - "learning_rate": 1.1103231281299923e-05, - "loss": 1.1565, - "num_input_tokens_seen": 5754060800, - "step": 87800, - "train_runtime": 42474.6192, - "train_tokens_per_second": 135470.568 - }, - { - "epoch": 0.879, - "grad_norm": 0.6230588555335999, - "learning_rate": 1.0924205008222086e-05, - "loss": 1.1673, - "num_input_tokens_seen": 5760614400, - "step": 87900, - "train_runtime": 42522.6205, - "train_tokens_per_second": 135471.764 - }, - { - "epoch": 0.88, - "grad_norm": 0.5966441035270691, - "learning_rate": 1.0746579226855768e-05, - "loss": 1.1628, - "num_input_tokens_seen": 5767168000, - "step": 88000, - "train_runtime": 42576.1454, - "train_tokens_per_second": 135455.381 - }, - { - "epoch": 0.881, - "grad_norm": 0.6604552865028381, - "learning_rate": 1.0570355725890678e-05, - "loss": 1.1769, - "num_input_tokens_seen": 5773721600, - "step": 88100, - "train_runtime": 42624.8502, - "train_tokens_per_second": 135454.355 - }, - { - "epoch": 0.882, - "grad_norm": 0.5727500319480896, - "learning_rate": 1.0395536279895428e-05, - "loss": 1.1571, - "num_input_tokens_seen": 5780275200, - "step": 88200, - "train_runtime": 42673.6883, - "train_tokens_per_second": 135452.909 - }, - { - "epoch": 0.883, - "grad_norm": 0.5748215317726135, - "learning_rate": 1.0222122649299952e-05, - "loss": 1.1666, - "num_input_tokens_seen": 5786828800, - "step": 88300, - "train_runtime": 42720.0242, - "train_tokens_per_second": 135459.399 - }, - { - "epoch": 0.884, - "grad_norm": 0.6671021580696106, - "learning_rate": 1.0050116580377593e-05, - "loss": 1.1887, - "num_input_tokens_seen": 5793382400, - "step": 88400, - "train_runtime": 42766.9841, - "train_tokens_per_second": 135463.899 - }, - { - "epoch": 0.885, - "grad_norm": 0.7352688908576965, - "learning_rate": 9.879519805227515e-06, - "loss": 1.173, - "num_input_tokens_seen": 5799936000, - "step": 88500, - "train_runtime": 42820.4689, - "train_tokens_per_second": 135447.746 - }, - { - "epoch": 0.886, - "grad_norm": 0.5779001712799072, - "learning_rate": 9.710334041757351e-06, - "loss": 1.1612, - "num_input_tokens_seen": 5806489600, - "step": 88600, - "train_runtime": 42866.8877, - "train_tokens_per_second": 135453.958 - }, - { - "epoch": 0.887, - "grad_norm": 0.7246189713478088, - "learning_rate": 9.542560993665932e-06, - "loss": 1.1926, - "num_input_tokens_seen": 5813043200, - "step": 88700, - "train_runtime": 42915.9912, - "train_tokens_per_second": 135451.682 - }, - { - "epoch": 0.888, - "grad_norm": 0.5459685921669006, - "learning_rate": 9.376202350425888e-06, - "loss": 1.1698, - "num_input_tokens_seen": 5819596800, - "step": 88800, - "train_runtime": 42964.4051, - "train_tokens_per_second": 135451.586 - }, - { - "epoch": 0.889, - "grad_norm": 0.5574699640274048, - "learning_rate": 9.211259787266972e-06, - "loss": 1.1627, - "num_input_tokens_seen": 5826150400, - "step": 88900, - "train_runtime": 43011.9797, - "train_tokens_per_second": 135454.133 - }, - { - "epoch": 0.89, - "grad_norm": 0.5637386441230774, - "learning_rate": 9.047734965158966e-06, - "loss": 1.1659, - "num_input_tokens_seen": 5832704000, - "step": 89000, - "train_runtime": 43065.5789, - "train_tokens_per_second": 135437.724 - }, - { - "epoch": 0.891, - "grad_norm": 0.5420241951942444, - "learning_rate": 8.885629530794997e-06, - "loss": 1.1693, - "num_input_tokens_seen": 5839257600, - "step": 89100, - "train_runtime": 43113.8932, - "train_tokens_per_second": 135437.957 - }, - { - "epoch": 0.892, - "grad_norm": 0.5701260566711426, - "learning_rate": 8.724945116574983e-06, - "loss": 1.1592, - "num_input_tokens_seen": 5845811200, - "step": 89200, - "train_runtime": 43161.415, - "train_tokens_per_second": 135440.675 - }, - { - "epoch": 0.893, - "grad_norm": 0.5882892608642578, - "learning_rate": 8.565683340589185e-06, - "loss": 1.1601, - "num_input_tokens_seen": 5852364800, - "step": 89300, - "train_runtime": 43209.5307, - "train_tokens_per_second": 135441.527 - }, - { - "epoch": 0.894, - "grad_norm": 0.5708109736442566, - "learning_rate": 8.40784580660196e-06, - "loss": 1.1684, - "num_input_tokens_seen": 5858918400, - "step": 89400, - "train_runtime": 43257.3597, - "train_tokens_per_second": 135443.273 - }, - { - "epoch": 0.895, - "grad_norm": 0.5796698927879333, - "learning_rate": 8.251434104035465e-06, - "loss": 1.1753, - "num_input_tokens_seen": 5865472000, - "step": 89500, - "train_runtime": 43305.3116, - "train_tokens_per_second": 135444.632 - }, - { - "epoch": 0.896, - "grad_norm": 0.9602819681167603, - "learning_rate": 8.09644980795383e-06, - "loss": 1.1672, - "num_input_tokens_seen": 5872025600, - "step": 89600, - "train_runtime": 43360.788, - "train_tokens_per_second": 135422.484 - }, - { - "epoch": 0.897, - "grad_norm": 0.6962534189224243, - "learning_rate": 7.942894479047252e-06, - "loss": 1.1622, - "num_input_tokens_seen": 5878579200, - "step": 89700, - "train_runtime": 43407.8503, - "train_tokens_per_second": 135426.637 - }, - { - "epoch": 0.898, - "grad_norm": 0.6292552351951599, - "learning_rate": 7.790769663616098e-06, - "loss": 1.1632, - "num_input_tokens_seen": 5885132800, - "step": 89800, - "train_runtime": 43455.9389, - "train_tokens_per_second": 135427.584 - }, - { - "epoch": 0.899, - "grad_norm": 0.5883670449256897, - "learning_rate": 7.64007689355563e-06, - "loss": 1.1632, - "num_input_tokens_seen": 5891686400, - "step": 89900, - "train_runtime": 43504.2315, - "train_tokens_per_second": 135427.893 - }, - { - "epoch": 0.9, - "grad_norm": 0.8059070706367493, - "learning_rate": 7.490817686340361e-06, - "loss": 1.1728, - "num_input_tokens_seen": 5898240000, - "step": 90000, - "train_runtime": 43552.1457, - "train_tokens_per_second": 135429.378 - }, - { - "epoch": 0.901, - "grad_norm": 0.5949374437332153, - "learning_rate": 7.342993545008818e-06, - "loss": 1.1732, - "num_input_tokens_seen": 5904793600, - "step": 90100, - "train_runtime": 43599.6931, - "train_tokens_per_second": 135431.999 - }, - { - "epoch": 0.902, - "grad_norm": 0.6094557642936707, - "learning_rate": 7.196605958148505e-06, - "loss": 1.1713, - "num_input_tokens_seen": 5911347200, - "step": 90200, - "train_runtime": 43653.2541, - "train_tokens_per_second": 135415.957 - }, - { - "epoch": 0.903, - "grad_norm": 0.6275845170021057, - "learning_rate": 7.051656399880778e-06, - "loss": 1.1743, - "num_input_tokens_seen": 5917900800, - "step": 90300, - "train_runtime": 43702.1275, - "train_tokens_per_second": 135414.478 - }, - { - "epoch": 0.904, - "grad_norm": 0.7113337516784668, - "learning_rate": 6.9081463298460815e-06, - "loss": 1.162, - "num_input_tokens_seen": 5924454400, - "step": 90400, - "train_runtime": 43749.6704, - "train_tokens_per_second": 135417.121 - }, - { - "epoch": 0.905, - "grad_norm": 0.6237180233001709, - "learning_rate": 6.766077193189201e-06, - "loss": 1.159, - "num_input_tokens_seen": 5931008000, - "step": 90500, - "train_runtime": 43797.6522, - "train_tokens_per_second": 135418.4 - }, - { - "epoch": 0.906, - "grad_norm": 0.9803968667984009, - "learning_rate": 6.625450420544831e-06, - "loss": 1.1788, - "num_input_tokens_seen": 5937561600, - "step": 90600, - "train_runtime": 43846.1111, - "train_tokens_per_second": 135418.203 - }, - { - "epoch": 0.907, - "grad_norm": 0.5648267269134521, - "learning_rate": 6.486267428022967e-06, - "loss": 1.1581, - "num_input_tokens_seen": 5944115200, - "step": 90700, - "train_runtime": 43893.4216, - "train_tokens_per_second": 135421.55 - }, - { - "epoch": 0.908, - "grad_norm": 0.610898494720459, - "learning_rate": 6.34852961719477e-06, - "loss": 1.1557, - "num_input_tokens_seen": 5950668800, - "step": 90800, - "train_runtime": 43947.4481, - "train_tokens_per_second": 135404.194 - }, - { - "epoch": 0.909, - "grad_norm": 0.732876718044281, - "learning_rate": 6.212238375078521e-06, - "loss": 1.1683, - "num_input_tokens_seen": 5957222400, - "step": 90900, - "train_runtime": 43996.4271, - "train_tokens_per_second": 135402.413 - }, - { - "epoch": 0.91, - "grad_norm": 0.5793011784553528, - "learning_rate": 6.077395074125491e-06, - "loss": 1.1747, - "num_input_tokens_seen": 5963776000, - "step": 91000, - "train_runtime": 44044.5112, - "train_tokens_per_second": 135403.387 - }, - { - "epoch": 0.911, - "grad_norm": 0.6567527651786804, - "learning_rate": 5.944001072206212e-06, - "loss": 1.1594, - "num_input_tokens_seen": 5970329600, - "step": 91100, - "train_runtime": 44091.43, - "train_tokens_per_second": 135407.938 - }, - { - "epoch": 0.912, - "grad_norm": 0.6197203397750854, - "learning_rate": 5.812057712596807e-06, - "loss": 1.1504, - "num_input_tokens_seen": 5976883200, - "step": 91200, - "train_runtime": 44140.2623, - "train_tokens_per_second": 135406.608 - }, - { - "epoch": 0.913, - "grad_norm": 0.6190736889839172, - "learning_rate": 5.681566323965486e-06, - "loss": 1.1645, - "num_input_tokens_seen": 5983436800, - "step": 91300, - "train_runtime": 44194.3429, - "train_tokens_per_second": 135389.202 - }, - { - "epoch": 0.914, - "grad_norm": 0.5632036924362183, - "learning_rate": 5.552528220359004e-06, - "loss": 1.1691, - "num_input_tokens_seen": 5989990400, - "step": 91400, - "train_runtime": 44242.165, - "train_tokens_per_second": 135390.987 - }, - { - "epoch": 0.915, - "grad_norm": 0.6650084257125854, - "learning_rate": 5.424944701189704e-06, - "loss": 1.1587, - "num_input_tokens_seen": 5996544000, - "step": 91500, - "train_runtime": 44290.3253, - "train_tokens_per_second": 135391.735 - }, - { - "epoch": 0.916, - "grad_norm": 0.6665343642234802, - "learning_rate": 5.298817051222182e-06, - "loss": 1.16, - "num_input_tokens_seen": 6003097600, - "step": 91600, - "train_runtime": 44344.1461, - "train_tokens_per_second": 135375.199 - }, - { - "epoch": 0.917, - "grad_norm": 0.9934324026107788, - "learning_rate": 5.174146540560442e-06, - "loss": 1.186, - "num_input_tokens_seen": 6009651200, - "step": 91700, - "train_runtime": 44386.6411, - "train_tokens_per_second": 135393.241 - }, - { - "epoch": 0.918, - "grad_norm": 0.587840735912323, - "learning_rate": 5.050934424635195e-06, - "loss": 1.1685, - "num_input_tokens_seen": 6016204800, - "step": 91800, - "train_runtime": 44440.2445, - "train_tokens_per_second": 135377.401 - }, - { - "epoch": 0.919, - "grad_norm": 0.6308780312538147, - "learning_rate": 4.9291819441910465e-06, - "loss": 1.1593, - "num_input_tokens_seen": 6022758400, - "step": 91900, - "train_runtime": 44487.4748, - "train_tokens_per_second": 135380.99 - }, - { - "epoch": 0.92, - "grad_norm": 0.6875436305999756, - "learning_rate": 4.808890325274129e-06, - "loss": 1.1686, - "num_input_tokens_seen": 6029312000, - "step": 92000, - "train_runtime": 44535.4396, - "train_tokens_per_second": 135382.339 - }, - { - "epoch": 0.921, - "grad_norm": 0.6450539231300354, - "learning_rate": 4.690060779219723e-06, - "loss": 1.1669, - "num_input_tokens_seen": 6035865600, - "step": 92100, - "train_runtime": 44583.0204, - "train_tokens_per_second": 135384.852 - }, - { - "epoch": 0.922, - "grad_norm": 1.0118526220321655, - "learning_rate": 4.572694502640023e-06, - "loss": 1.1601, - "num_input_tokens_seen": 6042419200, - "step": 92200, - "train_runtime": 44632.4327, - "train_tokens_per_second": 135381.803 - }, - { - "epoch": 0.923, - "grad_norm": 0.5630050897598267, - "learning_rate": 4.456792677412141e-06, - "loss": 1.164, - "num_input_tokens_seen": 6048972800, - "step": 92300, - "train_runtime": 44685.5287, - "train_tokens_per_second": 135367.6 - }, - { - "epoch": 0.924, - "grad_norm": 0.5819036364555359, - "learning_rate": 4.342356470666153e-06, - "loss": 1.177, - "num_input_tokens_seen": 6055526400, - "step": 92400, - "train_runtime": 44733.1102, - "train_tokens_per_second": 135370.118 - }, - { - "epoch": 0.925, - "grad_norm": 0.5852016806602478, - "learning_rate": 4.22938703477344e-06, - "loss": 1.1846, - "num_input_tokens_seen": 6062080000, - "step": 92500, - "train_runtime": 44781.2518, - "train_tokens_per_second": 135370.937 - }, - { - "epoch": 0.926, - "grad_norm": 0.7466326355934143, - "learning_rate": 4.117885507334884e-06, - "loss": 1.1564, - "num_input_tokens_seen": 6068633600, - "step": 92600, - "train_runtime": 44829.0669, - "train_tokens_per_second": 135372.739 - }, - { - "epoch": 0.927, - "grad_norm": 0.7777779698371887, - "learning_rate": 4.007853011169687e-06, - "loss": 1.1654, - "num_input_tokens_seen": 6075187200, - "step": 92700, - "train_runtime": 44882.4041, - "train_tokens_per_second": 135357.883 - }, - { - "epoch": 0.928, - "grad_norm": 0.9159000515937805, - "learning_rate": 3.899290654303855e-06, - "loss": 1.1854, - "num_input_tokens_seen": 6081740800, - "step": 92800, - "train_runtime": 44929.6625, - "train_tokens_per_second": 135361.373 - }, - { - "epoch": 0.929, - "grad_norm": 0.5948230028152466, - "learning_rate": 3.7921995299591168e-06, - "loss": 1.1602, - "num_input_tokens_seen": 6088294400, - "step": 92900, - "train_runtime": 44977.4717, - "train_tokens_per_second": 135363.198 - }, - { - "epoch": 0.93, - "grad_norm": 0.5999124646186829, - "learning_rate": 3.686580716541887e-06, - "loss": 1.1484, - "num_input_tokens_seen": 6094848000, - "step": 93000, - "train_runtime": 45026.2424, - "train_tokens_per_second": 135362.128 - }, - { - "epoch": 0.931, - "grad_norm": 0.6015925407409668, - "learning_rate": 3.582435277632456e-06, - "loss": 1.1638, - "num_input_tokens_seen": 6101401600, - "step": 93100, - "train_runtime": 45073.6825, - "train_tokens_per_second": 135365.057 - }, - { - "epoch": 0.932, - "grad_norm": 0.5493288040161133, - "learning_rate": 3.479764261974266e-06, - "loss": 1.1644, - "num_input_tokens_seen": 6107955200, - "step": 93200, - "train_runtime": 45131.734, - "train_tokens_per_second": 135336.152 - }, - { - "epoch": 0.933, - "grad_norm": 0.5847836136817932, - "learning_rate": 3.3785687034632523e-06, - "loss": 1.1528, - "num_input_tokens_seen": 6114508800, - "step": 93300, - "train_runtime": 45180.4411, - "train_tokens_per_second": 135335.305 - }, - { - "epoch": 0.934, - "grad_norm": 0.6086737513542175, - "learning_rate": 3.2788496211376024e-06, - "loss": 1.1525, - "num_input_tokens_seen": 6121062400, - "step": 93400, - "train_runtime": 45228.3556, - "train_tokens_per_second": 135336.833 - }, - { - "epoch": 0.935, - "grad_norm": 0.6097891330718994, - "learning_rate": 3.180608019167363e-06, - "loss": 1.1681, - "num_input_tokens_seen": 6127616000, - "step": 93500, - "train_runtime": 45275.6501, - "train_tokens_per_second": 135340.21 - }, - { - "epoch": 0.936, - "grad_norm": 0.5980057716369629, - "learning_rate": 3.0838448868443665e-06, - "loss": 1.1603, - "num_input_tokens_seen": 6134169600, - "step": 93600, - "train_runtime": 45322.6488, - "train_tokens_per_second": 135344.464 - }, - { - "epoch": 0.937, - "grad_norm": 0.7306444048881531, - "learning_rate": 2.988561198572287e-06, - "loss": 1.1702, - "num_input_tokens_seen": 6140723200, - "step": 93700, - "train_runtime": 45376.9708, - "train_tokens_per_second": 135326.865 - }, - { - "epoch": 0.938, - "grad_norm": 0.9187434911727905, - "learning_rate": 2.8947579138567987e-06, - "loss": 1.1654, - "num_input_tokens_seen": 6147276800, - "step": 93800, - "train_runtime": 45427.1088, - "train_tokens_per_second": 135321.771 - }, - { - "epoch": 0.939, - "grad_norm": 0.6403319835662842, - "learning_rate": 2.8024359772959525e-06, - "loss": 1.1581, - "num_input_tokens_seen": 6153830400, - "step": 93900, - "train_runtime": 45475.34, - "train_tokens_per_second": 135322.362 - }, - { - "epoch": 0.94, - "grad_norm": 0.7088416218757629, - "learning_rate": 2.711596318570597e-06, - "loss": 1.1683, - "num_input_tokens_seen": 6160384000, - "step": 94000, - "train_runtime": 45523.8789, - "train_tokens_per_second": 135322.037 - }, - { - "epoch": 0.941, - "grad_norm": 0.6289553642272949, - "learning_rate": 2.6222398524351206e-06, - "loss": 1.1538, - "num_input_tokens_seen": 6166937600, - "step": 94100, - "train_runtime": 45571.6907, - "train_tokens_per_second": 135323.871 - }, - { - "epoch": 0.942, - "grad_norm": 0.8788822889328003, - "learning_rate": 2.5343674787081435e-06, - "loss": 1.1666, - "num_input_tokens_seen": 6173491200, - "step": 94200, - "train_runtime": 45621.3271, - "train_tokens_per_second": 135320.29 - }, - { - "epoch": 0.943, - "grad_norm": 0.575515866279602, - "learning_rate": 2.4479800822634565e-06, - "loss": 1.1685, - "num_input_tokens_seen": 6180044800, - "step": 94300, - "train_runtime": 45670.6842, - "train_tokens_per_second": 135317.543 - }, - { - "epoch": 0.944, - "grad_norm": 0.5740439891815186, - "learning_rate": 2.3630785330212286e-06, - "loss": 1.1588, - "num_input_tokens_seen": 6186598400, - "step": 94400, - "train_runtime": 45717.875, - "train_tokens_per_second": 135321.215 - }, - { - "epoch": 0.945, - "grad_norm": 0.6576538681983948, - "learning_rate": 2.2796636859390815e-06, - "loss": 1.1492, - "num_input_tokens_seen": 6193152000, - "step": 94500, - "train_runtime": 45766.0209, - "train_tokens_per_second": 135322.055 - }, - { - "epoch": 0.946, - "grad_norm": 0.5781713128089905, - "learning_rate": 2.197736381003612e-06, - "loss": 1.1725, - "num_input_tokens_seen": 6199705600, - "step": 94600, - "train_runtime": 45819.6687, - "train_tokens_per_second": 135306.644 - }, - { - "epoch": 0.947, - "grad_norm": 0.6812490820884705, - "learning_rate": 2.1172974432218826e-06, - "loss": 1.1509, - "num_input_tokens_seen": 6206259200, - "step": 94700, - "train_runtime": 45866.8187, - "train_tokens_per_second": 135310.435 - }, - { - "epoch": 0.948, - "grad_norm": 0.8884466886520386, - "learning_rate": 2.0383476826130786e-06, - "loss": 1.157, - "num_input_tokens_seen": 6212812800, - "step": 94800, - "train_runtime": 45915.7744, - "train_tokens_per_second": 135308.897 - }, - { - "epoch": 0.949, - "grad_norm": 0.6096293926239014, - "learning_rate": 1.96088789420043e-06, - "loss": 1.1609, - "num_input_tokens_seen": 6219366400, - "step": 94900, - "train_runtime": 45963.3824, - "train_tokens_per_second": 135311.33 - }, - { - "epoch": 0.95, - "grad_norm": 0.5762118697166443, - "learning_rate": 1.8849188580031539e-06, - "loss": 1.1621, - "num_input_tokens_seen": 6225920000, - "step": 95000, - "train_runtime": 46012.4538, - "train_tokens_per_second": 135309.454 - }, - { - "epoch": 0.951, - "grad_norm": 0.5296618938446045, - "learning_rate": 1.8104413390286066e-06, - "loss": 1.157, - "num_input_tokens_seen": 6232473600, - "step": 95100, - "train_runtime": 46059.2761, - "train_tokens_per_second": 135314.189 - }, - { - "epoch": 0.952, - "grad_norm": 0.6025533676147461, - "learning_rate": 1.7374560872645438e-06, - "loss": 1.1507, - "num_input_tokens_seen": 6239027200, - "step": 95200, - "train_runtime": 46113.68, - "train_tokens_per_second": 135296.667 - }, - { - "epoch": 0.953, - "grad_norm": 0.616148829460144, - "learning_rate": 1.6659638376716578e-06, - "loss": 1.1711, - "num_input_tokens_seen": 6245580800, - "step": 95300, - "train_runtime": 46162.0494, - "train_tokens_per_second": 135296.87 - }, - { - "epoch": 0.954, - "grad_norm": 0.6661262512207031, - "learning_rate": 1.5959653101761172e-06, - "loss": 1.1604, - "num_input_tokens_seen": 6252134400, - "step": 95400, - "train_runtime": 46208.848, - "train_tokens_per_second": 135301.672 - }, - { - "epoch": 0.955, - "grad_norm": 0.8173303604125977, - "learning_rate": 1.5274612096623063e-06, - "loss": 1.1498, - "num_input_tokens_seen": 6258688000, - "step": 95500, - "train_runtime": 46256.5159, - "train_tokens_per_second": 135303.922 - }, - { - "epoch": 0.956, - "grad_norm": 0.6189817786216736, - "learning_rate": 1.4604522259657635e-06, - "loss": 1.1602, - "num_input_tokens_seen": 6265241600, - "step": 95600, - "train_runtime": 46309.4141, - "train_tokens_per_second": 135290.885 - }, - { - "epoch": 0.957, - "grad_norm": 0.7523248195648193, - "learning_rate": 1.3949390338662047e-06, - "loss": 1.1655, - "num_input_tokens_seen": 6271795200, - "step": 95700, - "train_runtime": 46357.4405, - "train_tokens_per_second": 135292.094 - }, - { - "epoch": 0.958, - "grad_norm": 0.5935103297233582, - "learning_rate": 1.330922293080744e-06, - "loss": 1.1702, - "num_input_tokens_seen": 6278348800, - "step": 95800, - "train_runtime": 46406.0604, - "train_tokens_per_second": 135291.571 - }, - { - "epoch": 0.959, - "grad_norm": 0.8042653203010559, - "learning_rate": 1.2684026482572662e-06, - "loss": 1.1623, - "num_input_tokens_seen": 6284902400, - "step": 95900, - "train_runtime": 46454.8491, - "train_tokens_per_second": 135290.557 - }, - { - "epoch": 0.96, - "grad_norm": 0.5935735106468201, - "learning_rate": 1.2073807289678993e-06, - "loss": 1.1441, - "num_input_tokens_seen": 6291456000, - "step": 96000, - "train_runtime": 46502.688, - "train_tokens_per_second": 135292.308 - }, - { - "epoch": 0.961, - "grad_norm": 0.5718377828598022, - "learning_rate": 1.147857149702669e-06, - "loss": 1.1618, - "num_input_tokens_seen": 6298009600, - "step": 96100, - "train_runtime": 46555.2337, - "train_tokens_per_second": 135280.378 - }, - { - "epoch": 0.962, - "grad_norm": 0.6801995635032654, - "learning_rate": 1.0898325098633697e-06, - "loss": 1.1479, - "num_input_tokens_seen": 6304563200, - "step": 96200, - "train_runtime": 46603.2751, - "train_tokens_per_second": 135281.548 - }, - { - "epoch": 0.963, - "grad_norm": 0.5564619898796082, - "learning_rate": 1.0333073937575043e-06, - "loss": 1.1582, - "num_input_tokens_seen": 6311116800, - "step": 96300, - "train_runtime": 46652.5681, - "train_tokens_per_second": 135279.087 - }, - { - "epoch": 0.964, - "grad_norm": 0.6501321792602539, - "learning_rate": 9.782823705923204e-07, - "loss": 1.1617, - "num_input_tokens_seen": 6317670400, - "step": 96400, - "train_runtime": 46700.1727, - "train_tokens_per_second": 135281.521 - }, - { - "epoch": 0.965, - "grad_norm": 0.6728459596633911, - "learning_rate": 9.247579944692162e-07, - "loss": 1.1592, - "num_input_tokens_seen": 6324224000, - "step": 96500, - "train_runtime": 46748.7553, - "train_tokens_per_second": 135281.12 - }, - { - "epoch": 0.966, - "grad_norm": 0.5893784761428833, - "learning_rate": 8.72734804378078e-07, - "loss": 1.1691, - "num_input_tokens_seen": 6330777600, - "step": 96600, - "train_runtime": 46801.015, - "train_tokens_per_second": 135270.092 - }, - { - "epoch": 0.967, - "grad_norm": 0.8625339269638062, - "learning_rate": 8.222133241918172e-07, - "loss": 1.1518, - "num_input_tokens_seen": 6337331200, - "step": 96700, - "train_runtime": 46847.2237, - "train_tokens_per_second": 135276.559 - }, - { - "epoch": 0.968, - "grad_norm": 0.6501858830451965, - "learning_rate": 7.731940626612088e-07, - "loss": 1.1693, - "num_input_tokens_seen": 6343884800, - "step": 96800, - "train_runtime": 46895.3712, - "train_tokens_per_second": 135277.419 - }, - { - "epoch": 0.969, - "grad_norm": 0.6575475335121155, - "learning_rate": 7.256775134096615e-07, - "loss": 1.1552, - "num_input_tokens_seen": 6350438400, - "step": 96900, - "train_runtime": 46942.8491, - "train_tokens_per_second": 135280.208 - }, - { - "epoch": 0.97, - "grad_norm": 0.5287050604820251, - "learning_rate": 6.796641549283055e-07, - "loss": 1.1946, - "num_input_tokens_seen": 6356992000, - "step": 97000, - "train_runtime": 46991.8919, - "train_tokens_per_second": 135278.486 - }, - { - "epoch": 0.971, - "grad_norm": 0.568566083908081, - "learning_rate": 6.351544505711292e-07, - "loss": 1.1559, - "num_input_tokens_seen": 6363545600, - "step": 97100, - "train_runtime": 47040.0316, - "train_tokens_per_second": 135279.365 - }, - { - "epoch": 0.972, - "grad_norm": 0.9329395890235901, - "learning_rate": 5.921488485503833e-07, - "loss": 1.1603, - "num_input_tokens_seen": 6370099200, - "step": 97200, - "train_runtime": 47092.2725, - "train_tokens_per_second": 135268.46 - }, - { - "epoch": 0.973, - "grad_norm": 0.6256415843963623, - "learning_rate": 5.506477819319843e-07, - "loss": 1.1571, - "num_input_tokens_seen": 6376652800, - "step": 97300, - "train_runtime": 47139.4068, - "train_tokens_per_second": 135272.233 - }, - { - "epoch": 0.974, - "grad_norm": 0.7202081680297852, - "learning_rate": 5.106516686312345e-07, - "loss": 1.1638, - "num_input_tokens_seen": 6383206400, - "step": 97400, - "train_runtime": 47191.9059, - "train_tokens_per_second": 135260.619 - }, - { - "epoch": 0.975, - "grad_norm": 1.2700363397598267, - "learning_rate": 4.721609114085256e-07, - "loss": 1.1649, - "num_input_tokens_seen": 6389760000, - "step": 97500, - "train_runtime": 47240.0777, - "train_tokens_per_second": 135261.42 - }, - { - "epoch": 0.976, - "grad_norm": 0.5555500388145447, - "learning_rate": 4.3517589786539186e-07, - "loss": 1.1505, - "num_input_tokens_seen": 6396313600, - "step": 97600, - "train_runtime": 47287.972, - "train_tokens_per_second": 135263.013 - }, - { - "epoch": 0.977, - "grad_norm": 0.6499391198158264, - "learning_rate": 3.996970004404798e-07, - "loss": 1.153, - "num_input_tokens_seen": 6402867200, - "step": 97700, - "train_runtime": 47335.8726, - "train_tokens_per_second": 135264.586 - }, - { - "epoch": 0.978, - "grad_norm": 0.6353591084480286, - "learning_rate": 3.657245764058847e-07, - "loss": 1.1621, - "num_input_tokens_seen": 6409420800, - "step": 97800, - "train_runtime": 47382.5196, - "train_tokens_per_second": 135269.733 - }, - { - "epoch": 0.979, - "grad_norm": 0.62052321434021, - "learning_rate": 3.3325896786355334e-07, - "loss": 1.1539, - "num_input_tokens_seen": 6415974400, - "step": 97900, - "train_runtime": 47435.6023, - "train_tokens_per_second": 135256.518 - }, - { - "epoch": 0.98, - "grad_norm": 0.5979087352752686, - "learning_rate": 3.023005017418201e-07, - "loss": 1.1615, - "num_input_tokens_seen": 6422528000, - "step": 98000, - "train_runtime": 47484.0018, - "train_tokens_per_second": 135256.671 - }, - { - "epoch": 0.981, - "grad_norm": 1.0899096727371216, - "learning_rate": 2.7284948979205967e-07, - "loss": 1.166, - "num_input_tokens_seen": 6429081600, - "step": 98100, - "train_runtime": 47531.611, - "train_tokens_per_second": 135259.072 - }, - { - "epoch": 0.982, - "grad_norm": 0.6240010857582092, - "learning_rate": 2.449062285856729e-07, - "loss": 1.1565, - "num_input_tokens_seen": 6435635200, - "step": 98200, - "train_runtime": 47578.8884, - "train_tokens_per_second": 135262.412 - }, - { - "epoch": 0.983, - "grad_norm": 0.7941544651985168, - "learning_rate": 2.184709995109557e-07, - "loss": 1.1572, - "num_input_tokens_seen": 6442188800, - "step": 98300, - "train_runtime": 47627.3828, - "train_tokens_per_second": 135262.289 - }, - { - "epoch": 0.984, - "grad_norm": 0.5704551339149475, - "learning_rate": 1.9354406877038487e-07, - "loss": 1.1629, - "num_input_tokens_seen": 6448742400, - "step": 98400, - "train_runtime": 47679.6586, - "train_tokens_per_second": 135251.438 - }, - { - "epoch": 0.985, - "grad_norm": 0.5758212208747864, - "learning_rate": 1.7012568737788668e-07, - "loss": 1.1892, - "num_input_tokens_seen": 6455296000, - "step": 98500, - "train_runtime": 47728.7818, - "train_tokens_per_second": 135249.545 - }, - { - "epoch": 0.986, - "grad_norm": 0.5768951773643494, - "learning_rate": 1.4821609115630574e-07, - "loss": 1.1617, - "num_input_tokens_seen": 6461849600, - "step": 98600, - "train_runtime": 47775.3275, - "train_tokens_per_second": 135254.952 - }, - { - "epoch": 0.987, - "grad_norm": 0.5714033842086792, - "learning_rate": 1.278155007350068e-07, - "loss": 1.1712, - "num_input_tokens_seen": 6468403200, - "step": 98700, - "train_runtime": 47823.1467, - "train_tokens_per_second": 135256.746 - }, - { - "epoch": 0.988, - "grad_norm": 1.029975414276123, - "learning_rate": 1.089241215477099e-07, - "loss": 1.1621, - "num_input_tokens_seen": 6474956800, - "step": 98800, - "train_runtime": 47875.5087, - "train_tokens_per_second": 135245.702 - }, - { - "epoch": 0.989, - "grad_norm": 0.5554516315460205, - "learning_rate": 9.154214383042535e-08, - "loss": 1.1489, - "num_input_tokens_seen": 6481510400, - "step": 98900, - "train_runtime": 47923.8409, - "train_tokens_per_second": 135246.055 - }, - { - "epoch": 0.99, - "grad_norm": 0.6340943574905396, - "learning_rate": 7.566974261945524e-08, - "loss": 1.1721, - "num_input_tokens_seen": 6488064000, - "step": 99000, - "train_runtime": 47972.1937, - "train_tokens_per_second": 135246.348 - }, - { - "epoch": 0.991, - "grad_norm": 0.582399845123291, - "learning_rate": 6.13070777496949e-08, - "loss": 1.1497, - "num_input_tokens_seen": 6494617600, - "step": 99100, - "train_runtime": 48020.3976, - "train_tokens_per_second": 135247.06 - }, - { - "epoch": 0.992, - "grad_norm": 0.6133337020874023, - "learning_rate": 4.845429385303412e-08, - "loss": 1.1601, - "num_input_tokens_seen": 6501171200, - "step": 99200, - "train_runtime": 48068.6895, - "train_tokens_per_second": 135247.523 - }, - { - "epoch": 0.993, - "grad_norm": 0.5691381096839905, - "learning_rate": 3.711152035685838e-08, - "loss": 1.1571, - "num_input_tokens_seen": 6507724800, - "step": 99300, - "train_runtime": 48115.7967, - "train_tokens_per_second": 135251.315 - }, - { - "epoch": 0.994, - "grad_norm": 0.6613404750823975, - "learning_rate": 2.727887148278318e-08, - "loss": 1.1569, - "num_input_tokens_seen": 6514278400, - "step": 99400, - "train_runtime": 48169.6246, - "train_tokens_per_second": 135236.229 - }, - { - "epoch": 0.995, - "grad_norm": 0.5285235047340393, - "learning_rate": 1.8956446245455005e-08, - "loss": 1.1722, - "num_input_tokens_seen": 6520832000, - "step": 99500, - "train_runtime": 48217.4936, - "train_tokens_per_second": 135237.888 - }, - { - "epoch": 0.996, - "grad_norm": 0.8071156144142151, - "learning_rate": 1.2144328451618724e-08, - "loss": 1.1571, - "num_input_tokens_seen": 6527385600, - "step": 99600, - "train_runtime": 48264.7605, - "train_tokens_per_second": 135241.231 - }, - { - "epoch": 0.997, - "grad_norm": 0.5775815844535828, - "learning_rate": 6.84258669920168e-09, - "loss": 1.1634, - "num_input_tokens_seen": 6533939200, - "step": 99700, - "train_runtime": 48314.0709, - "train_tokens_per_second": 135238.846 - }, - { - "epoch": 0.998, - "grad_norm": 0.5299545526504517, - "learning_rate": 3.0512743767141524e-09, - "loss": 1.1563, - "num_input_tokens_seen": 6540492800, - "step": 99800, - "train_runtime": 48364.7142, - "train_tokens_per_second": 135232.74 - }, - { - "epoch": 0.999, - "grad_norm": 0.636650800704956, - "learning_rate": 7.70429662616534e-10, - "loss": 1.1653, - "num_input_tokens_seen": 6547046400, - "step": 99900, - "train_runtime": 48412.6126, - "train_tokens_per_second": 135234.313 - }, - { - "epoch": 1.0, - "grad_norm": 0.5705932974815369, - "learning_rate": 7.552498626495208e-14, - "loss": 1.1814, - "num_input_tokens_seen": 6553600000, - "step": 100000, - "train_runtime": 48460.0302, - "train_tokens_per_second": 135237.225 - } - ], - "logging_steps": 100, - "max_steps": 100000, - "num_input_tokens_seen": 6553600000, - "num_train_epochs": 9223372036854775807, - "save_steps": 1000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.23866185728e+17, - "train_batch_size": 256, - "trial_name": null, - "trial_params": null -}