diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10044 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001, + "grad_norm": 3.9445953369140625, + "learning_rate": 2.97e-05, + "loss": 6.7008, + "num_input_tokens_seen": 6553600, + "step": 100, + "train_runtime": 61.1942, + "train_tokens_per_second": 107095.166 + }, + { + "epoch": 0.002, + "grad_norm": 0.6828203797340393, + "learning_rate": 5.97e-05, + "loss": 3.3177, + "num_input_tokens_seen": 13107200, + "step": 200, + "train_runtime": 107.6856, + "train_tokens_per_second": 121717.274 + }, + { + "epoch": 0.003, + "grad_norm": 16.05720329284668, + "learning_rate": 8.969999999999998e-05, + "loss": 3.0024, + "num_input_tokens_seen": 19660800, + "step": 300, + "train_runtime": 154.3564, + "train_tokens_per_second": 127372.748 + }, + { + "epoch": 0.004, + "grad_norm": 13.74783706665039, + "learning_rate": 0.0001197, + "loss": 2.6797, + "num_input_tokens_seen": 26214400, + "step": 400, + "train_runtime": 200.698, + "train_tokens_per_second": 130616.167 + }, + { + "epoch": 0.005, + "grad_norm": 12.893468856811523, + "learning_rate": 0.00014969999999999998, + "loss": 2.4588, + "num_input_tokens_seen": 32768000, + "step": 500, + "train_runtime": 252.1632, + "train_tokens_per_second": 129947.566 + }, + { + "epoch": 0.006, + "grad_norm": 9.021939277648926, + "learning_rate": 0.00017969999999999998, + "loss": 2.276, + "num_input_tokens_seen": 39321600, + "step": 600, + "train_runtime": 299.2712, + "train_tokens_per_second": 131391.184 + }, + { + "epoch": 0.007, + "grad_norm": 8.669090270996094, + "learning_rate": 0.00020969999999999997, + "loss": 2.1203, + "num_input_tokens_seen": 45875200, + "step": 700, + "train_runtime": 346.3366, + "train_tokens_per_second": 132458.429 + }, + { + "epoch": 0.008, + "grad_norm": 7.335177898406982, + "learning_rate": 0.0002397, + "loss": 1.9886, + "num_input_tokens_seen": 52428800, + "step": 800, + "train_runtime": 393.5299, + "train_tokens_per_second": 133226.965 + }, + { + "epoch": 0.009, + "grad_norm": 6.051175117492676, + "learning_rate": 0.0002697, + "loss": 1.9128, + "num_input_tokens_seen": 58982400, + "step": 900, + "train_runtime": 440.0136, + "train_tokens_per_second": 134046.765 + }, + { + "epoch": 0.01, + "grad_norm": 5.503482818603516, + "learning_rate": 0.00029969999999999997, + "loss": 1.8296, + "num_input_tokens_seen": 65536000, + "step": 1000, + "train_runtime": 492.2662, + "train_tokens_per_second": 133131.222 + }, + { + "epoch": 0.011, + "grad_norm": 2.8459227085113525, + "learning_rate": 0.00029999925978027874, + "loss": 1.779, + "num_input_tokens_seen": 72089600, + "step": 1100, + "train_runtime": 538.0301, + "train_tokens_per_second": 133988.032 + }, + { + "epoch": 0.012, + "grad_norm": 2.292707920074463, + "learning_rate": 0.0002999970091452017, + "loss": 1.7037, + "num_input_tokens_seen": 78643200, + "step": 1200, + "train_runtime": 585.618, + "train_tokens_per_second": 134290.951 + }, + { + "epoch": 0.013, + "grad_norm": 3.362025737762451, + "learning_rate": 0.00029999324804190795, + "loss": 1.6688, + "num_input_tokens_seen": 85196800, + "step": 1300, + "train_runtime": 632.1008, + "train_tokens_per_second": 134783.565 + }, + { + "epoch": 0.014, + "grad_norm": 2.2756998538970947, + "learning_rate": 0.0002999879765082716, + "loss": 1.6397, + "num_input_tokens_seen": 91750400, + "step": 1400, + "train_runtime": 684.3545, + "train_tokens_per_second": 134068.525 + }, + { + "epoch": 0.015, + "grad_norm": 2.5730831623077393, + "learning_rate": 0.000299981194597377, + "loss": 1.605, + "num_input_tokens_seen": 98304000, + "step": 1500, + "train_runtime": 730.5087, + "train_tokens_per_second": 134569.247 + }, + { + "epoch": 0.016, + "grad_norm": 1.7514433860778809, + "learning_rate": 0.0002999729023775179, + "loss": 1.5838, + "num_input_tokens_seen": 104857600, + "step": 1600, + "train_runtime": 781.9407, + "train_tokens_per_second": 134099.179 + }, + { + "epoch": 0.017, + "grad_norm": 1.8343929052352905, + "learning_rate": 0.0002999630999321969, + "loss": 1.6037, + "num_input_tokens_seen": 111411200, + "step": 1700, + "train_runtime": 824.7241, + "train_tokens_per_second": 135089.057 + }, + { + "epoch": 0.018, + "grad_norm": 1.5672227144241333, + "learning_rate": 0.00029995178736012443, + "loss": 1.5627, + "num_input_tokens_seen": 117964800, + "step": 1800, + "train_runtime": 871.9564, + "train_tokens_per_second": 135287.497 + }, + { + "epoch": 0.019, + "grad_norm": 1.6202061176300049, + "learning_rate": 0.0002999389647752181, + "loss": 1.5398, + "num_input_tokens_seen": 124518400, + "step": 1900, + "train_runtime": 923.402, + "train_tokens_per_second": 134847.439 + }, + { + "epoch": 0.02, + "grad_norm": 1.5145666599273682, + "learning_rate": 0.00029992463230660104, + "loss": 1.5389, + "num_input_tokens_seen": 131072000, + "step": 2000, + "train_runtime": 968.9283, + "train_tokens_per_second": 135275.229 + }, + { + "epoch": 0.021, + "grad_norm": 1.0306257009506226, + "learning_rate": 0.00029990879009860117, + "loss": 1.5098, + "num_input_tokens_seen": 137625600, + "step": 2100, + "train_runtime": 1020.8371, + "train_tokens_per_second": 134816.412 + }, + { + "epoch": 0.022, + "grad_norm": 2.0710599422454834, + "learning_rate": 0.0002998914383107493, + "loss": 1.5081, + "num_input_tokens_seen": 144179200, + "step": 2200, + "train_runtime": 1067.2796, + "train_tokens_per_second": 135090.368 + }, + { + "epoch": 0.023, + "grad_norm": 1.4022581577301025, + "learning_rate": 0.0002998725771177778, + "loss": 1.521, + "num_input_tokens_seen": 150732800, + "step": 2300, + "train_runtime": 1114.7094, + "train_tokens_per_second": 135221.616 + }, + { + "epoch": 0.024, + "grad_norm": 1.4328904151916504, + "learning_rate": 0.00029985220670961847, + "loss": 1.4855, + "num_input_tokens_seen": 157286400, + "step": 2400, + "train_runtime": 1160.6217, + "train_tokens_per_second": 135519.092 + }, + { + "epoch": 0.025, + "grad_norm": 1.3760366439819336, + "learning_rate": 0.0002998303272914014, + "loss": 1.4966, + "num_input_tokens_seen": 163840000, + "step": 2500, + "train_runtime": 1212.6489, + "train_tokens_per_second": 135109.18 + }, + { + "epoch": 0.026, + "grad_norm": 0.9530190825462341, + "learning_rate": 0.00029980693908345185, + "loss": 1.4795, + "num_input_tokens_seen": 170393600, + "step": 2600, + "train_runtime": 1258.3106, + "train_tokens_per_second": 135414.576 + }, + { + "epoch": 0.027, + "grad_norm": 0.8715839385986328, + "learning_rate": 0.00029978204232128895, + "loss": 1.4601, + "num_input_tokens_seen": 176947200, + "step": 2700, + "train_runtime": 1304.6837, + "train_tokens_per_second": 135624.597 + }, + { + "epoch": 0.028, + "grad_norm": 1.1879854202270508, + "learning_rate": 0.0002997556372556227, + "loss": 1.487, + "num_input_tokens_seen": 183500800, + "step": 2800, + "train_runtime": 1358.2195, + "train_tokens_per_second": 135103.938 + }, + { + "epoch": 0.029, + "grad_norm": 1.0949848890304565, + "learning_rate": 0.0002997277241523519, + "loss": 1.4658, + "num_input_tokens_seen": 190054400, + "step": 2900, + "train_runtime": 1404.4203, + "train_tokens_per_second": 135325.869 + }, + { + "epoch": 0.03, + "grad_norm": 1.465809941291809, + "learning_rate": 0.00029969830329256125, + "loss": 1.4463, + "num_input_tokens_seen": 196608000, + "step": 3000, + "train_runtime": 1451.3838, + "train_tokens_per_second": 135462.45 + }, + { + "epoch": 0.031, + "grad_norm": 0.9500088095664978, + "learning_rate": 0.00029966737497251836, + "loss": 1.4533, + "num_input_tokens_seen": 203161600, + "step": 3100, + "train_runtime": 1496.7114, + "train_tokens_per_second": 135738.657 + }, + { + "epoch": 0.032, + "grad_norm": 1.3393683433532715, + "learning_rate": 0.0002996349395036711, + "loss": 1.4402, + "num_input_tokens_seen": 209715200, + "step": 3200, + "train_runtime": 1549.2536, + "train_tokens_per_second": 135365.316 + }, + { + "epoch": 0.033, + "grad_norm": 0.7998270988464355, + "learning_rate": 0.00029960099721264435, + "loss": 1.4467, + "num_input_tokens_seen": 216268800, + "step": 3300, + "train_runtime": 1596.5035, + "train_tokens_per_second": 135464.03 + }, + { + "epoch": 0.034, + "grad_norm": 0.8441318273544312, + "learning_rate": 0.0002995655484412365, + "loss": 1.4353, + "num_input_tokens_seen": 222822400, + "step": 3400, + "train_runtime": 1642.6114, + "train_tokens_per_second": 135651.317 + }, + { + "epoch": 0.035, + "grad_norm": 0.7577129006385803, + "learning_rate": 0.00029952859354641636, + "loss": 1.4253, + "num_input_tokens_seen": 229376000, + "step": 3500, + "train_runtime": 1690.0779, + "train_tokens_per_second": 135719.187 + }, + { + "epoch": 0.036, + "grad_norm": 0.8359817862510681, + "learning_rate": 0.00029949013290031924, + "loss": 1.4348, + "num_input_tokens_seen": 235929600, + "step": 3600, + "train_runtime": 1736.0232, + "train_tokens_per_second": 135902.33 + }, + { + "epoch": 0.037, + "grad_norm": 0.7565376162528992, + "learning_rate": 0.00029945016689024353, + "loss": 1.4114, + "num_input_tokens_seen": 242483200, + "step": 3700, + "train_runtime": 1788.0113, + "train_tokens_per_second": 135616.148 + }, + { + "epoch": 0.038, + "grad_norm": 0.9537010788917542, + "learning_rate": 0.0002994086959186464, + "loss": 1.4134, + "num_input_tokens_seen": 249036800, + "step": 3800, + "train_runtime": 1835.9254, + "train_tokens_per_second": 135646.47 + }, + { + "epoch": 0.039, + "grad_norm": 0.8911266922950745, + "learning_rate": 0.00029936572040314014, + "loss": 1.4224, + "num_input_tokens_seen": 255590400, + "step": 3900, + "train_runtime": 1882.537, + "train_tokens_per_second": 135769.123 + }, + { + "epoch": 0.04, + "grad_norm": 0.7832906246185303, + "learning_rate": 0.0002993212407764877, + "loss": 1.4177, + "num_input_tokens_seen": 262144000, + "step": 4000, + "train_runtime": 1928.8118, + "train_tokens_per_second": 135909.579 + }, + { + "epoch": 0.041, + "grad_norm": 0.8426671624183655, + "learning_rate": 0.00029927525748659834, + "loss": 1.4194, + "num_input_tokens_seen": 268697600, + "step": 4100, + "train_runtime": 1981.7143, + "train_tokens_per_second": 135588.467 + }, + { + "epoch": 0.042, + "grad_norm": 0.9675344824790955, + "learning_rate": 0.0002992277709965234, + "loss": 1.4059, + "num_input_tokens_seen": 275251200, + "step": 4200, + "train_runtime": 2027.927, + "train_tokens_per_second": 135730.33 + }, + { + "epoch": 0.043, + "grad_norm": 1.1866440773010254, + "learning_rate": 0.0002991787817844513, + "loss": 1.4065, + "num_input_tokens_seen": 281804800, + "step": 4300, + "train_runtime": 2074.708, + "train_tokens_per_second": 135828.659 + }, + { + "epoch": 0.044, + "grad_norm": 0.8417257070541382, + "learning_rate": 0.0002991282903437028, + "loss": 1.397, + "num_input_tokens_seen": 288358400, + "step": 4400, + "train_runtime": 2126.0513, + "train_tokens_per_second": 135630.972 + }, + { + "epoch": 0.045, + "grad_norm": 0.8226633071899414, + "learning_rate": 0.0002990762971827262, + "loss": 1.3996, + "num_input_tokens_seen": 294912000, + "step": 4500, + "train_runtime": 2172.3837, + "train_tokens_per_second": 135755.024 + }, + { + "epoch": 0.046, + "grad_norm": 0.8411224484443665, + "learning_rate": 0.00029902280282509197, + "loss": 1.4002, + "num_input_tokens_seen": 301465600, + "step": 4600, + "train_runtime": 2220.1775, + "train_tokens_per_second": 135784.456 + }, + { + "epoch": 0.047, + "grad_norm": 0.7082719802856445, + "learning_rate": 0.0002989678078094878, + "loss": 1.3804, + "num_input_tokens_seen": 308019200, + "step": 4700, + "train_runtime": 2266.6848, + "train_tokens_per_second": 135889.739 + }, + { + "epoch": 0.048, + "grad_norm": 0.7628137469291687, + "learning_rate": 0.00029891131268971284, + "loss": 1.3795, + "num_input_tokens_seen": 314572800, + "step": 4800, + "train_runtime": 2318.5885, + "train_tokens_per_second": 135674.269 + }, + { + "epoch": 0.049, + "grad_norm": 0.7231079936027527, + "learning_rate": 0.0002988533180346723, + "loss": 1.3789, + "num_input_tokens_seen": 321126400, + "step": 4900, + "train_runtime": 2364.3453, + "train_tokens_per_second": 135820.432 + }, + { + "epoch": 0.05, + "grad_norm": 0.7210503816604614, + "learning_rate": 0.0002987938244283717, + "loss": 1.3641, + "num_input_tokens_seen": 327680000, + "step": 5000, + "train_runtime": 2410.3286, + "train_tokens_per_second": 135948.267 + }, + { + "epoch": 0.051, + "grad_norm": 0.729364275932312, + "learning_rate": 0.00029873283246991105, + "loss": 1.3756, + "num_input_tokens_seen": 334233600, + "step": 5100, + "train_runtime": 2458.4762, + "train_tokens_per_second": 135951.532 + }, + { + "epoch": 0.052, + "grad_norm": 0.7513293027877808, + "learning_rate": 0.0002986703427734787, + "loss": 1.3778, + "num_input_tokens_seen": 340787200, + "step": 5200, + "train_runtime": 2506.9032, + "train_tokens_per_second": 135939.511 + }, + { + "epoch": 0.053, + "grad_norm": 0.7382386326789856, + "learning_rate": 0.00029860635596834517, + "loss": 1.3807, + "num_input_tokens_seen": 347340800, + "step": 5300, + "train_runtime": 2559.5035, + "train_tokens_per_second": 135706.321 + }, + { + "epoch": 0.054, + "grad_norm": 0.5869194269180298, + "learning_rate": 0.0002985408726988569, + "loss": 1.3695, + "num_input_tokens_seen": 353894400, + "step": 5400, + "train_runtime": 2605.4484, + "train_tokens_per_second": 135828.598 + }, + { + "epoch": 0.055, + "grad_norm": 0.7805973291397095, + "learning_rate": 0.0002984738936244296, + "loss": 1.3746, + "num_input_tokens_seen": 360448000, + "step": 5500, + "train_runtime": 2655.8515, + "train_tokens_per_second": 135718.431 + }, + { + "epoch": 0.056, + "grad_norm": 0.6918448209762573, + "learning_rate": 0.0002984054194195419, + "loss": 1.3855, + "num_input_tokens_seen": 367001600, + "step": 5600, + "train_runtime": 2703.0299, + "train_tokens_per_second": 135774.155 + }, + { + "epoch": 0.057, + "grad_norm": 0.6129201054573059, + "learning_rate": 0.0002983354507737283, + "loss": 1.3816, + "num_input_tokens_seen": 373555200, + "step": 5700, + "train_runtime": 2750.071, + "train_tokens_per_second": 135834.747 + }, + { + "epoch": 0.058, + "grad_norm": 0.7457948923110962, + "learning_rate": 0.00029826398839157215, + "loss": 1.3748, + "num_input_tokens_seen": 380108800, + "step": 5800, + "train_runtime": 2795.4164, + "train_tokens_per_second": 135975.735 + }, + { + "epoch": 0.059, + "grad_norm": 0.6171481013298035, + "learning_rate": 0.000298191032992699, + "loss": 1.3725, + "num_input_tokens_seen": 386662400, + "step": 5900, + "train_runtime": 2842.5021, + "train_tokens_per_second": 136028.889 + }, + { + "epoch": 0.06, + "grad_norm": 0.6233596205711365, + "learning_rate": 0.0002981165853117688, + "loss": 1.3624, + "num_input_tokens_seen": 393216000, + "step": 6000, + "train_runtime": 2892.8273, + "train_tokens_per_second": 135927.922 + }, + { + "epoch": 0.061, + "grad_norm": 0.5645745396614075, + "learning_rate": 0.000298040646098469, + "loss": 1.356, + "num_input_tokens_seen": 399769600, + "step": 6100, + "train_runtime": 2940.1153, + "train_tokens_per_second": 135970.721 + }, + { + "epoch": 0.062, + "grad_norm": 0.6580554246902466, + "learning_rate": 0.0002979632161175064, + "loss": 1.3627, + "num_input_tokens_seen": 406323200, + "step": 6200, + "train_runtime": 2986.9073, + "train_tokens_per_second": 136034.754 + }, + { + "epoch": 0.063, + "grad_norm": 0.6815545558929443, + "learning_rate": 0.0002978842961486003, + "loss": 1.3562, + "num_input_tokens_seen": 412876800, + "step": 6300, + "train_runtime": 3038.4238, + "train_tokens_per_second": 135885.191 + }, + { + "epoch": 0.064, + "grad_norm": 0.9602898955345154, + "learning_rate": 0.0002978038869864738, + "loss": 1.3562, + "num_input_tokens_seen": 419430400, + "step": 6400, + "train_runtime": 3085.1228, + "train_tokens_per_second": 135952.578 + }, + { + "epoch": 0.065, + "grad_norm": 0.7086384892463684, + "learning_rate": 0.0002977219894408463, + "loss": 1.3579, + "num_input_tokens_seen": 425984000, + "step": 6500, + "train_runtime": 3130.8346, + "train_tokens_per_second": 136060.844 + }, + { + "epoch": 0.066, + "grad_norm": 0.5864439010620117, + "learning_rate": 0.0002976386043364251, + "loss": 1.3563, + "num_input_tokens_seen": 432537600, + "step": 6600, + "train_runtime": 3182.4893, + "train_tokens_per_second": 135911.72 + }, + { + "epoch": 0.067, + "grad_norm": 0.6041991114616394, + "learning_rate": 0.00029755373251289733, + "loss": 1.3753, + "num_input_tokens_seen": 439091200, + "step": 6700, + "train_runtime": 3229.4118, + "train_tokens_per_second": 135966.308 + }, + { + "epoch": 0.068, + "grad_norm": 0.7153160572052002, + "learning_rate": 0.0002974673748249213, + "loss": 1.3475, + "num_input_tokens_seen": 445644800, + "step": 6800, + "train_runtime": 3276.7034, + "train_tokens_per_second": 136004.008 + }, + { + "epoch": 0.069, + "grad_norm": 0.5409119725227356, + "learning_rate": 0.00029737953214211804, + "loss": 1.3464, + "num_input_tokens_seen": 452198400, + "step": 6900, + "train_runtime": 3324.3119, + "train_tokens_per_second": 136027.67 + }, + { + "epoch": 0.07, + "grad_norm": 0.6369441151618958, + "learning_rate": 0.0002972902053490623, + "loss": 1.3546, + "num_input_tokens_seen": 458752000, + "step": 7000, + "train_runtime": 3370.6322, + "train_tokens_per_second": 136102.657 + }, + { + "epoch": 0.071, + "grad_norm": 0.8589248061180115, + "learning_rate": 0.00029719939534527393, + "loss": 1.3479, + "num_input_tokens_seen": 465305600, + "step": 7100, + "train_runtime": 3424.7139, + "train_tokens_per_second": 135867.0 + }, + { + "epoch": 0.072, + "grad_norm": 0.8014613389968872, + "learning_rate": 0.00029710710304520866, + "loss": 1.3667, + "num_input_tokens_seen": 471859200, + "step": 7200, + "train_runtime": 3472.985, + "train_tokens_per_second": 135865.601 + }, + { + "epoch": 0.073, + "grad_norm": 0.5970280766487122, + "learning_rate": 0.00029701332937824885, + "loss": 1.3423, + "num_input_tokens_seen": 478412800, + "step": 7300, + "train_runtime": 3519.3052, + "train_tokens_per_second": 135939.558 + }, + { + "epoch": 0.074, + "grad_norm": 0.6963617205619812, + "learning_rate": 0.0002969180752886944, + "loss": 1.3443, + "num_input_tokens_seen": 484966400, + "step": 7400, + "train_runtime": 3565.8739, + "train_tokens_per_second": 136002.118 + }, + { + "epoch": 0.075, + "grad_norm": 0.5769393444061279, + "learning_rate": 0.0002968213417357529, + "loss": 1.3576, + "num_input_tokens_seen": 491520000, + "step": 7500, + "train_runtime": 3611.5043, + "train_tokens_per_second": 136098.411 + }, + { + "epoch": 0.076, + "grad_norm": 0.5492929816246033, + "learning_rate": 0.00029672312969353015, + "loss": 1.3422, + "num_input_tokens_seen": 498073600, + "step": 7600, + "train_runtime": 3664.3633, + "train_tokens_per_second": 135923.642 + }, + { + "epoch": 0.077, + "grad_norm": 0.8065637946128845, + "learning_rate": 0.00029662344015102027, + "loss": 1.3395, + "num_input_tokens_seen": 504627200, + "step": 7700, + "train_runtime": 3711.2689, + "train_tokens_per_second": 135971.608 + }, + { + "epoch": 0.078, + "grad_norm": 0.552871584892273, + "learning_rate": 0.00029652227411209594, + "loss": 1.3427, + "num_input_tokens_seen": 511180800, + "step": 7800, + "train_runtime": 3758.1209, + "train_tokens_per_second": 136020.319 + }, + { + "epoch": 0.079, + "grad_norm": 0.6378001570701599, + "learning_rate": 0.0002964196325954979, + "loss": 1.3339, + "num_input_tokens_seen": 517734400, + "step": 7900, + "train_runtime": 3804.2295, + "train_tokens_per_second": 136094.417 + }, + { + "epoch": 0.08, + "grad_norm": 0.6196131706237793, + "learning_rate": 0.0002963155166348253, + "loss": 1.341, + "num_input_tokens_seen": 524288000, + "step": 8000, + "train_runtime": 3855.6562, + "train_tokens_per_second": 135978.93 + }, + { + "epoch": 0.081, + "grad_norm": 0.5841253399848938, + "learning_rate": 0.0002962099272785246, + "loss": 1.3366, + "num_input_tokens_seen": 530841600, + "step": 8100, + "train_runtime": 3903.5348, + "train_tokens_per_second": 135989.977 + }, + { + "epoch": 0.082, + "grad_norm": 0.5912770628929138, + "learning_rate": 0.0002961028655898794, + "loss": 1.3417, + "num_input_tokens_seen": 537395200, + "step": 8200, + "train_runtime": 3951.3698, + "train_tokens_per_second": 136002.255 + }, + { + "epoch": 0.083, + "grad_norm": 0.5480249524116516, + "learning_rate": 0.0002959943326469998, + "loss": 1.3419, + "num_input_tokens_seen": 543948800, + "step": 8300, + "train_runtime": 3997.3554, + "train_tokens_per_second": 136077.166 + }, + { + "epoch": 0.084, + "grad_norm": 0.49880343675613403, + "learning_rate": 0.0002958843295428112, + "loss": 1.3165, + "num_input_tokens_seen": 550502400, + "step": 8400, + "train_runtime": 4044.3967, + "train_tokens_per_second": 136114.838 + }, + { + "epoch": 0.085, + "grad_norm": 0.5670176148414612, + "learning_rate": 0.0002957728573850438, + "loss": 1.3314, + "num_input_tokens_seen": 557056000, + "step": 8500, + "train_runtime": 4095.7201, + "train_tokens_per_second": 136009.294 + }, + { + "epoch": 0.086, + "grad_norm": 2.3274426460266113, + "learning_rate": 0.0002956599172962209, + "loss": 1.3323, + "num_input_tokens_seen": 563609600, + "step": 8600, + "train_runtime": 4143.1443, + "train_tokens_per_second": 136034.268 + }, + { + "epoch": 0.087, + "grad_norm": 0.7660558819770813, + "learning_rate": 0.0002955455104136479, + "loss": 1.3382, + "num_input_tokens_seen": 570163200, + "step": 8700, + "train_runtime": 4190.7065, + "train_tokens_per_second": 136054.194 + }, + { + "epoch": 0.088, + "grad_norm": 0.5114762783050537, + "learning_rate": 0.00029542963788940096, + "loss": 1.3252, + "num_input_tokens_seen": 576716800, + "step": 8800, + "train_runtime": 4237.8545, + "train_tokens_per_second": 136086.974 + }, + { + "epoch": 0.089, + "grad_norm": 0.6698548197746277, + "learning_rate": 0.00029531230089031505, + "loss": 1.3449, + "num_input_tokens_seen": 583270400, + "step": 8900, + "train_runtime": 4285.2299, + "train_tokens_per_second": 136111.81 + }, + { + "epoch": 0.09, + "grad_norm": 0.5562598705291748, + "learning_rate": 0.0002951935005979724, + "loss": 1.3204, + "num_input_tokens_seen": 589824000, + "step": 9000, + "train_runtime": 4336.4907, + "train_tokens_per_second": 136014.126 + }, + { + "epoch": 0.091, + "grad_norm": 0.6327181458473206, + "learning_rate": 0.0002950732382086907, + "loss": 1.3178, + "num_input_tokens_seen": 596377600, + "step": 9100, + "train_runtime": 4383.0811, + "train_tokens_per_second": 136063.555 + }, + { + "epoch": 0.092, + "grad_norm": 0.6857426166534424, + "learning_rate": 0.0002949515149335108, + "loss": 1.3332, + "num_input_tokens_seen": 602931200, + "step": 9200, + "train_runtime": 4431.4231, + "train_tokens_per_second": 136058.142 + }, + { + "epoch": 0.093, + "grad_norm": 0.6040679812431335, + "learning_rate": 0.0002948283319981848, + "loss": 1.307, + "num_input_tokens_seen": 609484800, + "step": 9300, + "train_runtime": 4478.1663, + "train_tokens_per_second": 136101.423 + }, + { + "epoch": 0.094, + "grad_norm": 1.0060901641845703, + "learning_rate": 0.00029470369064316354, + "loss": 1.3108, + "num_input_tokens_seen": 616038400, + "step": 9400, + "train_runtime": 4524.7167, + "train_tokens_per_second": 136149.607 + }, + { + "epoch": 0.095, + "grad_norm": 0.504460871219635, + "learning_rate": 0.00029457759212358397, + "loss": 1.3169, + "num_input_tokens_seen": 622592000, + "step": 9500, + "train_runtime": 4575.869, + "train_tokens_per_second": 136059.84 + }, + { + "epoch": 0.096, + "grad_norm": 0.5062097907066345, + "learning_rate": 0.00029445003770925686, + "loss": 1.3137, + "num_input_tokens_seen": 629145600, + "step": 9600, + "train_runtime": 4621.4422, + "train_tokens_per_second": 136136.203 + }, + { + "epoch": 0.097, + "grad_norm": 0.5388786792755127, + "learning_rate": 0.00029432102868465367, + "loss": 1.3128, + "num_input_tokens_seen": 635699200, + "step": 9700, + "train_runtime": 4668.6149, + "train_tokens_per_second": 136164.411 + }, + { + "epoch": 0.098, + "grad_norm": 0.5705980062484741, + "learning_rate": 0.0002941905663488939, + "loss": 1.3065, + "num_input_tokens_seen": 642252800, + "step": 9800, + "train_runtime": 4715.2389, + "train_tokens_per_second": 136207.903 + }, + { + "epoch": 0.099, + "grad_norm": 0.5500839352607727, + "learning_rate": 0.0002940586520157318, + "loss": 1.3222, + "num_input_tokens_seen": 648806400, + "step": 9900, + "train_runtime": 4767.1995, + "train_tokens_per_second": 136098.019 + }, + { + "epoch": 0.1, + "grad_norm": 0.5740068554878235, + "learning_rate": 0.00029392528701354325, + "loss": 1.3173, + "num_input_tokens_seen": 655360000, + "step": 10000, + "train_runtime": 4814.2762, + "train_tokens_per_second": 136128.458 + }, + { + "epoch": 0.101, + "grad_norm": 0.47691279649734497, + "learning_rate": 0.00029379047268531243, + "loss": 1.3084, + "num_input_tokens_seen": 661913600, + "step": 10100, + "train_runtime": 4861.0919, + "train_tokens_per_second": 136165.622 + }, + { + "epoch": 0.102, + "grad_norm": 0.5993319153785706, + "learning_rate": 0.00029365421038861795, + "loss": 1.3299, + "num_input_tokens_seen": 668467200, + "step": 10200, + "train_runtime": 4908.6949, + "train_tokens_per_second": 136180.229 + }, + { + "epoch": 0.103, + "grad_norm": 0.556516170501709, + "learning_rate": 0.0002935165014956198, + "loss": 1.316, + "num_input_tokens_seen": 675020800, + "step": 10300, + "train_runtime": 4956.5309, + "train_tokens_per_second": 136188.156 + }, + { + "epoch": 0.104, + "grad_norm": 0.6757346391677856, + "learning_rate": 0.0002933773473930448, + "loss": 1.3048, + "num_input_tokens_seen": 681574400, + "step": 10400, + "train_runtime": 5003.7965, + "train_tokens_per_second": 136211.454 + }, + { + "epoch": 0.105, + "grad_norm": 0.9610360860824585, + "learning_rate": 0.0002932367494821734, + "loss": 1.3043, + "num_input_tokens_seen": 688128000, + "step": 10500, + "train_runtime": 5050.8058, + "train_tokens_per_second": 136241.232 + }, + { + "epoch": 0.106, + "grad_norm": 0.5780071020126343, + "learning_rate": 0.00029309470917882497, + "loss": 1.3015, + "num_input_tokens_seen": 694681600, + "step": 10600, + "train_runtime": 5104.0171, + "train_tokens_per_second": 136104.873 + }, + { + "epoch": 0.107, + "grad_norm": 0.6387894749641418, + "learning_rate": 0.0002929512279133437, + "loss": 1.3342, + "num_input_tokens_seen": 701235200, + "step": 10700, + "train_runtime": 5151.2508, + "train_tokens_per_second": 136129.112 + }, + { + "epoch": 0.108, + "grad_norm": 0.48744165897369385, + "learning_rate": 0.0002928063071305844, + "loss": 1.2999, + "num_input_tokens_seen": 707788800, + "step": 10800, + "train_runtime": 5198.4813, + "train_tokens_per_second": 136152.995 + }, + { + "epoch": 0.109, + "grad_norm": 0.5223510265350342, + "learning_rate": 0.0002926599482898978, + "loss": 1.2996, + "num_input_tokens_seen": 714342400, + "step": 10900, + "train_runtime": 5244.0735, + "train_tokens_per_second": 136218.99 + }, + { + "epoch": 0.11, + "grad_norm": 0.6020687222480774, + "learning_rate": 0.00029251215286511573, + "loss": 1.3029, + "num_input_tokens_seen": 720896000, + "step": 11000, + "train_runtime": 5291.0983, + "train_tokens_per_second": 136246.948 + }, + { + "epoch": 0.111, + "grad_norm": 0.5317751169204712, + "learning_rate": 0.00029236292234453647, + "loss": 1.316, + "num_input_tokens_seen": 727449600, + "step": 11100, + "train_runtime": 5342.4851, + "train_tokens_per_second": 136163.15 + }, + { + "epoch": 0.112, + "grad_norm": 1.2369730472564697, + "learning_rate": 0.0002922122582309097, + "loss": 1.298, + "num_input_tokens_seen": 734003200, + "step": 11200, + "train_runtime": 5391.0041, + "train_tokens_per_second": 136153.338 + }, + { + "epoch": 0.113, + "grad_norm": 0.5294257998466492, + "learning_rate": 0.0002920601620414215, + "loss": 1.316, + "num_input_tokens_seen": 740556800, + "step": 11300, + "train_runtime": 5437.8422, + "train_tokens_per_second": 136185.784 + }, + { + "epoch": 0.114, + "grad_norm": 0.5318885445594788, + "learning_rate": 0.0002919066353076786, + "loss": 1.2993, + "num_input_tokens_seen": 747110400, + "step": 11400, + "train_runtime": 5484.1183, + "train_tokens_per_second": 136231.635 + }, + { + "epoch": 0.115, + "grad_norm": 0.5208443403244019, + "learning_rate": 0.00029175167957569366, + "loss": 1.3066, + "num_input_tokens_seen": 753664000, + "step": 11500, + "train_runtime": 5531.5155, + "train_tokens_per_second": 136249.099 + }, + { + "epoch": 0.116, + "grad_norm": 0.5068408250808716, + "learning_rate": 0.0002915952964058691, + "loss": 1.3041, + "num_input_tokens_seen": 760217600, + "step": 11600, + "train_runtime": 5578.6188, + "train_tokens_per_second": 136273.445 + }, + { + "epoch": 0.117, + "grad_norm": 0.6206523776054382, + "learning_rate": 0.00029143748737298173, + "loss": 1.3061, + "num_input_tokens_seen": 766771200, + "step": 11700, + "train_runtime": 5631.31, + "train_tokens_per_second": 136162.136 + }, + { + "epoch": 0.118, + "grad_norm": 0.5741725564002991, + "learning_rate": 0.00029127825406616677, + "loss": 1.3097, + "num_input_tokens_seen": 773324800, + "step": 11800, + "train_runtime": 5678.817, + "train_tokens_per_second": 136177.096 + }, + { + "epoch": 0.119, + "grad_norm": 0.5251154899597168, + "learning_rate": 0.0002911175980889019, + "loss": 1.3054, + "num_input_tokens_seen": 779878400, + "step": 11900, + "train_runtime": 5725.8659, + "train_tokens_per_second": 136202.701 + }, + { + "epoch": 0.12, + "grad_norm": 0.4509083032608032, + "learning_rate": 0.00029095552105899095, + "loss": 1.301, + "num_input_tokens_seen": 786432000, + "step": 12000, + "train_runtime": 5772.0962, + "train_tokens_per_second": 136247.211 + }, + { + "epoch": 0.121, + "grad_norm": 0.4560108184814453, + "learning_rate": 0.0002907920246085478, + "loss": 1.2981, + "num_input_tokens_seen": 792985600, + "step": 12100, + "train_runtime": 5817.8977, + "train_tokens_per_second": 136301.056 + }, + { + "epoch": 0.122, + "grad_norm": 1.227121114730835, + "learning_rate": 0.00029062711038397996, + "loss": 1.302, + "num_input_tokens_seen": 799539200, + "step": 12200, + "train_runtime": 5870.3451, + "train_tokens_per_second": 136199.693 + }, + { + "epoch": 0.123, + "grad_norm": 0.4861258864402771, + "learning_rate": 0.00029046078004597175, + "loss": 1.318, + "num_input_tokens_seen": 806092800, + "step": 12300, + "train_runtime": 5916.8489, + "train_tokens_per_second": 136236.84 + }, + { + "epoch": 0.124, + "grad_norm": 0.9702387452125549, + "learning_rate": 0.00029029303526946796, + "loss": 1.2869, + "num_input_tokens_seen": 812646400, + "step": 12400, + "train_runtime": 5964.0243, + "train_tokens_per_second": 136258.063 + }, + { + "epoch": 0.125, + "grad_norm": 0.4712119400501251, + "learning_rate": 0.0002901238777436565, + "loss": 1.2924, + "num_input_tokens_seen": 819200000, + "step": 12500, + "train_runtime": 6009.6089, + "train_tokens_per_second": 136315.026 + }, + { + "epoch": 0.126, + "grad_norm": 0.4670332372188568, + "learning_rate": 0.00028995330917195184, + "loss": 1.2942, + "num_input_tokens_seen": 825753600, + "step": 12600, + "train_runtime": 6061.3166, + "train_tokens_per_second": 136233.371 + }, + { + "epoch": 0.127, + "grad_norm": 0.4821685552597046, + "learning_rate": 0.00028978133127197765, + "loss": 1.2856, + "num_input_tokens_seen": 832307200, + "step": 12700, + "train_runtime": 6108.5206, + "train_tokens_per_second": 136253.481 + }, + { + "epoch": 0.128, + "grad_norm": 0.5634518265724182, + "learning_rate": 0.0002896079457755493, + "loss": 1.2982, + "num_input_tokens_seen": 838860800, + "step": 12800, + "train_runtime": 6155.2503, + "train_tokens_per_second": 136283.785 + }, + { + "epoch": 0.129, + "grad_norm": 0.45673057436943054, + "learning_rate": 0.000289433154428657, + "loss": 1.2997, + "num_input_tokens_seen": 845414400, + "step": 12900, + "train_runtime": 6202.1106, + "train_tokens_per_second": 136310.758 + }, + { + "epoch": 0.13, + "grad_norm": 0.4386661648750305, + "learning_rate": 0.0002892569589914476, + "loss": 1.2985, + "num_input_tokens_seen": 851968000, + "step": 13000, + "train_runtime": 6249.4681, + "train_tokens_per_second": 136326.482 + }, + { + "epoch": 0.131, + "grad_norm": 0.4749270975589752, + "learning_rate": 0.0002890793612382072, + "loss": 1.2946, + "num_input_tokens_seen": 858521600, + "step": 13100, + "train_runtime": 6301.6638, + "train_tokens_per_second": 136237.291 + }, + { + "epoch": 0.132, + "grad_norm": 0.5405780673027039, + "learning_rate": 0.0002889003629573432, + "loss": 1.2857, + "num_input_tokens_seen": 865075200, + "step": 13200, + "train_runtime": 6349.664, + "train_tokens_per_second": 136239.523 + }, + { + "epoch": 0.133, + "grad_norm": 0.4045722782611847, + "learning_rate": 0.00028871996595136626, + "loss": 1.3009, + "num_input_tokens_seen": 871628800, + "step": 13300, + "train_runtime": 6396.2349, + "train_tokens_per_second": 136272.169 + }, + { + "epoch": 0.134, + "grad_norm": 0.5851114392280579, + "learning_rate": 0.0002885381720368723, + "loss": 1.3026, + "num_input_tokens_seen": 878182400, + "step": 13400, + "train_runtime": 6442.8884, + "train_tokens_per_second": 136302.594 + }, + { + "epoch": 0.135, + "grad_norm": 0.5135608315467834, + "learning_rate": 0.000288354983044524, + "loss": 1.2778, + "num_input_tokens_seen": 884736000, + "step": 13500, + "train_runtime": 6489.2417, + "train_tokens_per_second": 136338.889 + }, + { + "epoch": 0.136, + "grad_norm": 0.4828953742980957, + "learning_rate": 0.00028817040081903245, + "loss": 1.2864, + "num_input_tokens_seen": 891289600, + "step": 13600, + "train_runtime": 6540.9813, + "train_tokens_per_second": 136262.368 + }, + { + "epoch": 0.137, + "grad_norm": 0.5756350755691528, + "learning_rate": 0.00028798442721913867, + "loss": 1.2858, + "num_input_tokens_seen": 897843200, + "step": 13700, + "train_runtime": 6588.3179, + "train_tokens_per_second": 136278.063 + }, + { + "epoch": 0.138, + "grad_norm": 0.5231483578681946, + "learning_rate": 0.00028779706411759465, + "loss": 1.282, + "num_input_tokens_seen": 904396800, + "step": 13800, + "train_runtime": 6635.0521, + "train_tokens_per_second": 136305.909 + }, + { + "epoch": 0.139, + "grad_norm": 0.5475858449935913, + "learning_rate": 0.00028760831340114484, + "loss": 1.2797, + "num_input_tokens_seen": 910950400, + "step": 13900, + "train_runtime": 6681.4731, + "train_tokens_per_second": 136339.754 + }, + { + "epoch": 0.14, + "grad_norm": 0.7064163684844971, + "learning_rate": 0.00028741817697050683, + "loss": 1.2927, + "num_input_tokens_seen": 917504000, + "step": 14000, + "train_runtime": 6730.4553, + "train_tokens_per_second": 136321.238 + }, + { + "epoch": 0.141, + "grad_norm": 0.5267386436462402, + "learning_rate": 0.00028722665674035233, + "loss": 1.2815, + "num_input_tokens_seen": 924057600, + "step": 14100, + "train_runtime": 6782.7717, + "train_tokens_per_second": 136235.987 + }, + { + "epoch": 0.142, + "grad_norm": 0.5816136598587036, + "learning_rate": 0.0002870337546392879, + "loss": 1.2983, + "num_input_tokens_seen": 930611200, + "step": 14200, + "train_runtime": 6829.7567, + "train_tokens_per_second": 136258.323 + }, + { + "epoch": 0.143, + "grad_norm": 0.4982451796531677, + "learning_rate": 0.00028683947260983576, + "loss": 1.3026, + "num_input_tokens_seen": 937164800, + "step": 14300, + "train_runtime": 6877.8163, + "train_tokens_per_second": 136259.063 + }, + { + "epoch": 0.144, + "grad_norm": 0.49408379197120667, + "learning_rate": 0.00028664381260841356, + "loss": 1.2869, + "num_input_tokens_seen": 943718400, + "step": 14400, + "train_runtime": 6923.5994, + "train_tokens_per_second": 136304.593 + }, + { + "epoch": 0.145, + "grad_norm": 0.4885796904563904, + "learning_rate": 0.0002864467766053154, + "loss": 1.2768, + "num_input_tokens_seen": 950272000, + "step": 14500, + "train_runtime": 6969.9199, + "train_tokens_per_second": 136339.014 + }, + { + "epoch": 0.146, + "grad_norm": 0.5424348711967468, + "learning_rate": 0.00028624836658469165, + "loss": 1.2806, + "num_input_tokens_seen": 956825600, + "step": 14600, + "train_runtime": 7020.7829, + "train_tokens_per_second": 136284.743 + }, + { + "epoch": 0.147, + "grad_norm": 0.4333992898464203, + "learning_rate": 0.00028604858454452906, + "loss": 1.2776, + "num_input_tokens_seen": 963379200, + "step": 14700, + "train_runtime": 7066.7012, + "train_tokens_per_second": 136326.58 + }, + { + "epoch": 0.148, + "grad_norm": 1.3118066787719727, + "learning_rate": 0.00028584743249663057, + "loss": 1.3039, + "num_input_tokens_seen": 969932800, + "step": 14800, + "train_runtime": 7115.8691, + "train_tokens_per_second": 136305.6 + }, + { + "epoch": 0.149, + "grad_norm": 0.5320950150489807, + "learning_rate": 0.000285644912466595, + "loss": 1.2801, + "num_input_tokens_seen": 976486400, + "step": 14900, + "train_runtime": 7162.6662, + "train_tokens_per_second": 136330.016 + }, + { + "epoch": 0.15, + "grad_norm": 0.6902542114257812, + "learning_rate": 0.00028544102649379684, + "loss": 1.2832, + "num_input_tokens_seen": 983040000, + "step": 15000, + "train_runtime": 7209.6657, + "train_tokens_per_second": 136350.29 + }, + { + "epoch": 0.151, + "grad_norm": 0.544683039188385, + "learning_rate": 0.00028523577663136556, + "loss": 1.2948, + "num_input_tokens_seen": 989593600, + "step": 15100, + "train_runtime": 7261.0326, + "train_tokens_per_second": 136288.275 + }, + { + "epoch": 0.152, + "grad_norm": 0.500091552734375, + "learning_rate": 0.000285029164946165, + "loss": 1.2746, + "num_input_tokens_seen": 996147200, + "step": 15200, + "train_runtime": 7306.6445, + "train_tokens_per_second": 136334.427 + }, + { + "epoch": 0.153, + "grad_norm": 0.4995329678058624, + "learning_rate": 0.0002848211935187725, + "loss": 1.2893, + "num_input_tokens_seen": 1002700800, + "step": 15300, + "train_runtime": 7353.2711, + "train_tokens_per_second": 136361.19 + }, + { + "epoch": 0.154, + "grad_norm": 0.42985284328460693, + "learning_rate": 0.0002846118644434581, + "loss": 1.3077, + "num_input_tokens_seen": 1009254400, + "step": 15400, + "train_runtime": 7400.7889, + "train_tokens_per_second": 136371.192 + }, + { + "epoch": 0.155, + "grad_norm": 0.4847468137741089, + "learning_rate": 0.00028440117982816326, + "loss": 1.2723, + "num_input_tokens_seen": 1015808000, + "step": 15500, + "train_runtime": 7452.7433, + "train_tokens_per_second": 136299.877 + }, + { + "epoch": 0.156, + "grad_norm": 0.47867411375045776, + "learning_rate": 0.0002841891417944796, + "loss": 1.2754, + "num_input_tokens_seen": 1022361600, + "step": 15600, + "train_runtime": 7498.8195, + "train_tokens_per_second": 136336.339 + }, + { + "epoch": 0.157, + "grad_norm": 0.43365904688835144, + "learning_rate": 0.0002839757524776279, + "loss": 1.2737, + "num_input_tokens_seen": 1028915200, + "step": 15700, + "train_runtime": 7545.0284, + "train_tokens_per_second": 136369.957 + }, + { + "epoch": 0.158, + "grad_norm": 0.5739541053771973, + "learning_rate": 0.0002837610140264361, + "loss": 1.286, + "num_input_tokens_seen": 1035468800, + "step": 15800, + "train_runtime": 7597.8039, + "train_tokens_per_second": 136285.275 + }, + { + "epoch": 0.159, + "grad_norm": 0.4836307168006897, + "learning_rate": 0.0002835449286033182, + "loss": 1.2779, + "num_input_tokens_seen": 1042022400, + "step": 15900, + "train_runtime": 7643.6023, + "train_tokens_per_second": 136326.088 + }, + { + "epoch": 0.16, + "grad_norm": 0.5712729692459106, + "learning_rate": 0.0002833274983842518, + "loss": 1.2702, + "num_input_tokens_seen": 1048576000, + "step": 16000, + "train_runtime": 7691.0096, + "train_tokens_per_second": 136337.887 + }, + { + "epoch": 0.161, + "grad_norm": 0.48568034172058105, + "learning_rate": 0.0002831087255587569, + "loss": 1.2696, + "num_input_tokens_seen": 1055129600, + "step": 16100, + "train_runtime": 7737.6132, + "train_tokens_per_second": 136363.705 + }, + { + "epoch": 0.162, + "grad_norm": 0.5240116715431213, + "learning_rate": 0.0002828886123298734, + "loss": 1.2636, + "num_input_tokens_seen": 1061683200, + "step": 16200, + "train_runtime": 7790.0975, + "train_tokens_per_second": 136286.253 + }, + { + "epoch": 0.163, + "grad_norm": 0.4505080580711365, + "learning_rate": 0.00028266716091413906, + "loss": 1.2679, + "num_input_tokens_seen": 1068236800, + "step": 16300, + "train_runtime": 7837.0156, + "train_tokens_per_second": 136306.581 + }, + { + "epoch": 0.164, + "grad_norm": 0.38184958696365356, + "learning_rate": 0.0002824443735415673, + "loss": 1.2801, + "num_input_tokens_seen": 1074790400, + "step": 16400, + "train_runtime": 7884.0198, + "train_tokens_per_second": 136325.178 + }, + { + "epoch": 0.165, + "grad_norm": 0.860382616519928, + "learning_rate": 0.0002822202524556243, + "loss": 1.2737, + "num_input_tokens_seen": 1081344000, + "step": 16500, + "train_runtime": 7930.486, + "train_tokens_per_second": 136352.803 + }, + { + "epoch": 0.166, + "grad_norm": 0.771594226360321, + "learning_rate": 0.00028199479991320695, + "loss": 1.2876, + "num_input_tokens_seen": 1087897600, + "step": 16600, + "train_runtime": 7977.0943, + "train_tokens_per_second": 136377.678 + }, + { + "epoch": 0.167, + "grad_norm": 0.4533759653568268, + "learning_rate": 0.00028176801818461994, + "loss": 1.2769, + "num_input_tokens_seen": 1094451200, + "step": 16700, + "train_runtime": 8024.6165, + "train_tokens_per_second": 136386.73 + }, + { + "epoch": 0.168, + "grad_norm": 0.548772394657135, + "learning_rate": 0.00028153990955355273, + "loss": 1.2647, + "num_input_tokens_seen": 1101004800, + "step": 16800, + "train_runtime": 8077.0632, + "train_tokens_per_second": 136312.515 + }, + { + "epoch": 0.169, + "grad_norm": 0.5390068888664246, + "learning_rate": 0.00028131047631705665, + "loss": 1.2799, + "num_input_tokens_seen": 1107558400, + "step": 16900, + "train_runtime": 8123.3347, + "train_tokens_per_second": 136342.824 + }, + { + "epoch": 0.17, + "grad_norm": 0.4429817795753479, + "learning_rate": 0.00028107972078552187, + "loss": 1.2727, + "num_input_tokens_seen": 1114112000, + "step": 17000, + "train_runtime": 8169.0719, + "train_tokens_per_second": 136381.71 + }, + { + "epoch": 0.171, + "grad_norm": 0.6212127208709717, + "learning_rate": 0.0002808476452826541, + "loss": 1.2743, + "num_input_tokens_seen": 1120665600, + "step": 17100, + "train_runtime": 8217.1136, + "train_tokens_per_second": 136381.904 + }, + { + "epoch": 0.172, + "grad_norm": 0.44569867849349976, + "learning_rate": 0.00028061425214545094, + "loss": 1.2628, + "num_input_tokens_seen": 1127219200, + "step": 17200, + "train_runtime": 8268.2495, + "train_tokens_per_second": 136331.057 + }, + { + "epoch": 0.173, + "grad_norm": 0.5025371313095093, + "learning_rate": 0.00028037954372417883, + "loss": 1.2651, + "num_input_tokens_seen": 1133772800, + "step": 17300, + "train_runtime": 8315.4333, + "train_tokens_per_second": 136345.607 + }, + { + "epoch": 0.174, + "grad_norm": 0.5257975459098816, + "learning_rate": 0.0002801435223823488, + "loss": 1.2701, + "num_input_tokens_seen": 1140326400, + "step": 17400, + "train_runtime": 8361.8666, + "train_tokens_per_second": 136372.23 + }, + { + "epoch": 0.175, + "grad_norm": 0.6858969926834106, + "learning_rate": 0.00027990619049669336, + "loss": 1.2759, + "num_input_tokens_seen": 1146880000, + "step": 17500, + "train_runtime": 8408.7431, + "train_tokens_per_second": 136391.371 + }, + { + "epoch": 0.176, + "grad_norm": 0.5586578845977783, + "learning_rate": 0.00027966755045714177, + "loss": 1.2782, + "num_input_tokens_seen": 1153433600, + "step": 17600, + "train_runtime": 8455.5155, + "train_tokens_per_second": 136411.978 + }, + { + "epoch": 0.177, + "grad_norm": 0.583242654800415, + "learning_rate": 0.00027942760466679673, + "loss": 1.287, + "num_input_tokens_seen": 1159987200, + "step": 17700, + "train_runtime": 8508.2754, + "train_tokens_per_second": 136336.349 + }, + { + "epoch": 0.178, + "grad_norm": 0.5521747469902039, + "learning_rate": 0.00027918635554190956, + "loss": 1.2704, + "num_input_tokens_seen": 1166540800, + "step": 17800, + "train_runtime": 8555.5497, + "train_tokens_per_second": 136349.018 + }, + { + "epoch": 0.179, + "grad_norm": 0.6325215697288513, + "learning_rate": 0.00027894380551185636, + "loss": 1.2912, + "num_input_tokens_seen": 1173094400, + "step": 17900, + "train_runtime": 8602.3857, + "train_tokens_per_second": 136368.495 + }, + { + "epoch": 0.18, + "grad_norm": 0.44643789529800415, + "learning_rate": 0.00027869995701911314, + "loss": 1.2762, + "num_input_tokens_seen": 1179648000, + "step": 18000, + "train_runtime": 8649.7648, + "train_tokens_per_second": 136379.2 + }, + { + "epoch": 0.181, + "grad_norm": 0.49556615948677063, + "learning_rate": 0.0002784548125192316, + "loss": 1.2577, + "num_input_tokens_seen": 1186201600, + "step": 18100, + "train_runtime": 8701.0558, + "train_tokens_per_second": 136328.467 + }, + { + "epoch": 0.182, + "grad_norm": 0.5336231589317322, + "learning_rate": 0.0002782083744808141, + "loss": 1.2629, + "num_input_tokens_seen": 1192755200, + "step": 18200, + "train_runtime": 8748.3794, + "train_tokens_per_second": 136340.131 + }, + { + "epoch": 0.183, + "grad_norm": 0.3993295431137085, + "learning_rate": 0.000277960645385489, + "loss": 1.2621, + "num_input_tokens_seen": 1199308800, + "step": 18300, + "train_runtime": 8795.9903, + "train_tokens_per_second": 136347.217 + }, + { + "epoch": 0.184, + "grad_norm": 0.5608197450637817, + "learning_rate": 0.00027771162772788544, + "loss": 1.2746, + "num_input_tokens_seen": 1205862400, + "step": 18400, + "train_runtime": 8844.0918, + "train_tokens_per_second": 136346.663 + }, + { + "epoch": 0.185, + "grad_norm": 0.5299677848815918, + "learning_rate": 0.00027746132401560857, + "loss": 1.2608, + "num_input_tokens_seen": 1212416000, + "step": 18500, + "train_runtime": 8890.974, + "train_tokens_per_second": 136364.812 + }, + { + "epoch": 0.186, + "grad_norm": 0.5247559547424316, + "learning_rate": 0.0002772097367692139, + "loss": 1.2628, + "num_input_tokens_seen": 1218969600, + "step": 18600, + "train_runtime": 8937.3092, + "train_tokens_per_second": 136391.119 + }, + { + "epoch": 0.187, + "grad_norm": 0.4991471469402313, + "learning_rate": 0.00027695686852218226, + "loss": 1.2617, + "num_input_tokens_seen": 1225523200, + "step": 18700, + "train_runtime": 8984.1463, + "train_tokens_per_second": 136409.532 + }, + { + "epoch": 0.188, + "grad_norm": 0.4922790229320526, + "learning_rate": 0.00027670272182089416, + "loss": 1.277, + "num_input_tokens_seen": 1232076800, + "step": 18800, + "train_runtime": 9036.4876, + "train_tokens_per_second": 136344.656 + }, + { + "epoch": 0.189, + "grad_norm": 0.49377188086509705, + "learning_rate": 0.0002764472992246039, + "loss": 1.2767, + "num_input_tokens_seen": 1238630400, + "step": 18900, + "train_runtime": 9084.3866, + "train_tokens_per_second": 136347.169 + }, + { + "epoch": 0.19, + "grad_norm": 0.6417357921600342, + "learning_rate": 0.0002761906033054143, + "loss": 1.2616, + "num_input_tokens_seen": 1245184000, + "step": 19000, + "train_runtime": 9130.7221, + "train_tokens_per_second": 136373.004 + }, + { + "epoch": 0.191, + "grad_norm": 0.44580140709877014, + "learning_rate": 0.00027593263664825045, + "loss": 1.2686, + "num_input_tokens_seen": 1251737600, + "step": 19100, + "train_runtime": 9176.6051, + "train_tokens_per_second": 136405.303 + }, + { + "epoch": 0.192, + "grad_norm": 0.5867856740951538, + "learning_rate": 0.00027567340185083363, + "loss": 1.2638, + "num_input_tokens_seen": 1258291200, + "step": 19200, + "train_runtime": 9229.719, + "train_tokens_per_second": 136330.391 + }, + { + "epoch": 0.193, + "grad_norm": 0.4900195896625519, + "learning_rate": 0.00027541290152365537, + "loss": 1.263, + "num_input_tokens_seen": 1264844800, + "step": 19300, + "train_runtime": 9276.2421, + "train_tokens_per_second": 136353.147 + }, + { + "epoch": 0.194, + "grad_norm": 0.49572521448135376, + "learning_rate": 0.00027515113828995117, + "loss": 1.273, + "num_input_tokens_seen": 1271398400, + "step": 19400, + "train_runtime": 9323.5363, + "train_tokens_per_second": 136364.396 + }, + { + "epoch": 0.195, + "grad_norm": 0.440213680267334, + "learning_rate": 0.00027488811478567374, + "loss": 1.2657, + "num_input_tokens_seen": 1277952000, + "step": 19500, + "train_runtime": 9371.4717, + "train_tokens_per_second": 136366.201 + }, + { + "epoch": 0.196, + "grad_norm": 0.5604475736618042, + "learning_rate": 0.0002746238336594671, + "loss": 1.2619, + "num_input_tokens_seen": 1284505600, + "step": 19600, + "train_runtime": 9417.129, + "train_tokens_per_second": 136400.978 + }, + { + "epoch": 0.197, + "grad_norm": 0.45344123244285583, + "learning_rate": 0.00027435829757263894, + "loss": 1.2573, + "num_input_tokens_seen": 1291059200, + "step": 19700, + "train_runtime": 9468.5748, + "train_tokens_per_second": 136352.009 + }, + { + "epoch": 0.198, + "grad_norm": 0.7260287404060364, + "learning_rate": 0.0002740915091991349, + "loss": 1.2668, + "num_input_tokens_seen": 1297612800, + "step": 19800, + "train_runtime": 9515.3702, + "train_tokens_per_second": 136370.186 + }, + { + "epoch": 0.199, + "grad_norm": 0.47865310311317444, + "learning_rate": 0.0002738234712255109, + "loss": 1.2674, + "num_input_tokens_seen": 1304166400, + "step": 19900, + "train_runtime": 9562.0606, + "train_tokens_per_second": 136389.682 + }, + { + "epoch": 0.2, + "grad_norm": 0.8422930240631104, + "learning_rate": 0.00027355418635090635, + "loss": 1.2671, + "num_input_tokens_seen": 1310720000, + "step": 20000, + "train_runtime": 9614.8867, + "train_tokens_per_second": 136321.939 + }, + { + "epoch": 0.201, + "grad_norm": 0.8500565886497498, + "learning_rate": 0.000273283657287017, + "loss": 1.2722, + "num_input_tokens_seen": 1317273600, + "step": 20100, + "train_runtime": 9662.5316, + "train_tokens_per_second": 136327.999 + }, + { + "epoch": 0.202, + "grad_norm": 0.4511219263076782, + "learning_rate": 0.00027301188675806745, + "loss": 1.257, + "num_input_tokens_seen": 1323827200, + "step": 20200, + "train_runtime": 9710.3614, + "train_tokens_per_second": 136331.404 + }, + { + "epoch": 0.203, + "grad_norm": 0.6040441393852234, + "learning_rate": 0.0002727388775007839, + "loss": 1.2787, + "num_input_tokens_seen": 1330380800, + "step": 20300, + "train_runtime": 9757.2415, + "train_tokens_per_second": 136348.045 + }, + { + "epoch": 0.204, + "grad_norm": 0.531548798084259, + "learning_rate": 0.0002724646322643666, + "loss": 1.2567, + "num_input_tokens_seen": 1336934400, + "step": 20400, + "train_runtime": 9803.907, + "train_tokens_per_second": 136367.512 + }, + { + "epoch": 0.205, + "grad_norm": 0.5128377079963684, + "learning_rate": 0.000272189153810462, + "loss": 1.2634, + "num_input_tokens_seen": 1343488000, + "step": 20500, + "train_runtime": 9849.6975, + "train_tokens_per_second": 136398.909 + }, + { + "epoch": 0.206, + "grad_norm": 0.5763120651245117, + "learning_rate": 0.0002719124449131351, + "loss": 1.2708, + "num_input_tokens_seen": 1350041600, + "step": 20600, + "train_runtime": 9902.5747, + "train_tokens_per_second": 136332.382 + }, + { + "epoch": 0.207, + "grad_norm": 0.5266316533088684, + "learning_rate": 0.00027163450835884144, + "loss": 1.2579, + "num_input_tokens_seen": 1356595200, + "step": 20700, + "train_runtime": 9950.4471, + "train_tokens_per_second": 136335.1 + }, + { + "epoch": 0.208, + "grad_norm": 0.6279749274253845, + "learning_rate": 0.00027135534694639894, + "loss": 1.2566, + "num_input_tokens_seen": 1363148800, + "step": 20800, + "train_runtime": 9997.0613, + "train_tokens_per_second": 136354.951 + }, + { + "epoch": 0.209, + "grad_norm": 0.5421542525291443, + "learning_rate": 0.00027107496348696003, + "loss": 1.2687, + "num_input_tokens_seen": 1369702400, + "step": 20900, + "train_runtime": 10044.3146, + "train_tokens_per_second": 136365.939 + }, + { + "epoch": 0.21, + "grad_norm": 0.5376498699188232, + "learning_rate": 0.00027079336080398296, + "loss": 1.2772, + "num_input_tokens_seen": 1376256000, + "step": 21000, + "train_runtime": 10090.6051, + "train_tokens_per_second": 136389.839 + }, + { + "epoch": 0.211, + "grad_norm": 0.41719597578048706, + "learning_rate": 0.00027051054173320366, + "loss": 1.2502, + "num_input_tokens_seen": 1382809600, + "step": 21100, + "train_runtime": 10143.3243, + "train_tokens_per_second": 136327.063 + }, + { + "epoch": 0.212, + "grad_norm": 0.4714694321155548, + "learning_rate": 0.000270226509122607, + "loss": 1.2537, + "num_input_tokens_seen": 1389363200, + "step": 21200, + "train_runtime": 10188.8874, + "train_tokens_per_second": 136360.639 + }, + { + "epoch": 0.213, + "grad_norm": 0.4616274833679199, + "learning_rate": 0.0002699412658323983, + "loss": 1.2571, + "num_input_tokens_seen": 1395916800, + "step": 21300, + "train_runtime": 10236.5378, + "train_tokens_per_second": 136366.107 + }, + { + "epoch": 0.214, + "grad_norm": 0.4215717911720276, + "learning_rate": 0.00026965481473497423, + "loss": 1.2687, + "num_input_tokens_seen": 1402470400, + "step": 21400, + "train_runtime": 10282.9404, + "train_tokens_per_second": 136388.071 + }, + { + "epoch": 0.215, + "grad_norm": 0.5976271033287048, + "learning_rate": 0.0002693671587148942, + "loss": 1.2573, + "num_input_tokens_seen": 1409024000, + "step": 21500, + "train_runtime": 10329.955, + "train_tokens_per_second": 136401.756 + }, + { + "epoch": 0.216, + "grad_norm": 0.5200098752975464, + "learning_rate": 0.0002690783006688511, + "loss": 1.247, + "num_input_tokens_seen": 1415577600, + "step": 21600, + "train_runtime": 10382.0767, + "train_tokens_per_second": 136348.213 + }, + { + "epoch": 0.217, + "grad_norm": 0.8170623779296875, + "learning_rate": 0.0002687882435056423, + "loss": 1.2562, + "num_input_tokens_seen": 1422131200, + "step": 21700, + "train_runtime": 10429.827, + "train_tokens_per_second": 136352.329 + }, + { + "epoch": 0.218, + "grad_norm": 0.52497398853302, + "learning_rate": 0.0002684969901461402, + "loss": 1.2533, + "num_input_tokens_seen": 1428684800, + "step": 21800, + "train_runtime": 10476.8104, + "train_tokens_per_second": 136366.388 + }, + { + "epoch": 0.219, + "grad_norm": 0.4417087137699127, + "learning_rate": 0.000268204543523263, + "loss": 1.2721, + "num_input_tokens_seen": 1435238400, + "step": 21900, + "train_runtime": 10524.1028, + "train_tokens_per_second": 136376.319 + }, + { + "epoch": 0.22, + "grad_norm": 0.5729189515113831, + "learning_rate": 0.0002679109065819447, + "loss": 1.2654, + "num_input_tokens_seen": 1441792000, + "step": 22000, + "train_runtime": 10572.3447, + "train_tokens_per_second": 136373.911 + }, + { + "epoch": 0.221, + "grad_norm": 0.5111753940582275, + "learning_rate": 0.0002676160822791062, + "loss": 1.2581, + "num_input_tokens_seen": 1448345600, + "step": 22100, + "train_runtime": 10619.3771, + "train_tokens_per_second": 136387.057 + }, + { + "epoch": 0.222, + "grad_norm": 0.4302677512168884, + "learning_rate": 0.00026732007358362496, + "loss": 1.2581, + "num_input_tokens_seen": 1454899200, + "step": 22200, + "train_runtime": 10666.0714, + "train_tokens_per_second": 136404.413 + }, + { + "epoch": 0.223, + "grad_norm": 3.9242477416992188, + "learning_rate": 0.0002670228834763052, + "loss": 1.2872, + "num_input_tokens_seen": 1461452800, + "step": 22300, + "train_runtime": 10719.3985, + "train_tokens_per_second": 136337.203 + }, + { + "epoch": 0.224, + "grad_norm": 0.7662601470947266, + "learning_rate": 0.00026672451494984804, + "loss": 1.2602, + "num_input_tokens_seen": 1468006400, + "step": 22400, + "train_runtime": 10767.2807, + "train_tokens_per_second": 136339.568 + }, + { + "epoch": 0.225, + "grad_norm": 0.48544740676879883, + "learning_rate": 0.0002664249710088213, + "loss": 1.257, + "num_input_tokens_seen": 1474560000, + "step": 22500, + "train_runtime": 10813.982, + "train_tokens_per_second": 136356.802 + }, + { + "epoch": 0.226, + "grad_norm": 0.4495686888694763, + "learning_rate": 0.00026612425466962893, + "loss": 1.2552, + "num_input_tokens_seen": 1481113600, + "step": 22600, + "train_runtime": 10860.2948, + "train_tokens_per_second": 136378.766 + }, + { + "epoch": 0.227, + "grad_norm": 0.5733143091201782, + "learning_rate": 0.00026582236896048134, + "loss": 1.2403, + "num_input_tokens_seen": 1487667200, + "step": 22700, + "train_runtime": 10907.2107, + "train_tokens_per_second": 136393.001 + }, + { + "epoch": 0.228, + "grad_norm": 0.7318263649940491, + "learning_rate": 0.00026551931692136413, + "loss": 1.2468, + "num_input_tokens_seen": 1494220800, + "step": 22800, + "train_runtime": 10953.9499, + "train_tokens_per_second": 136409.315 + }, + { + "epoch": 0.229, + "grad_norm": 0.5192084312438965, + "learning_rate": 0.00026521510160400804, + "loss": 1.2458, + "num_input_tokens_seen": 1500774400, + "step": 22900, + "train_runtime": 11006.6198, + "train_tokens_per_second": 136351.98 + }, + { + "epoch": 0.23, + "grad_norm": 0.4651305079460144, + "learning_rate": 0.00026490972607185793, + "loss": 1.2601, + "num_input_tokens_seen": 1507328000, + "step": 23000, + "train_runtime": 11053.8305, + "train_tokens_per_second": 136362.504 + }, + { + "epoch": 0.231, + "grad_norm": 0.5470275282859802, + "learning_rate": 0.0002646031934000421, + "loss": 1.2405, + "num_input_tokens_seen": 1513881600, + "step": 23100, + "train_runtime": 11099.6418, + "train_tokens_per_second": 136390.132 + }, + { + "epoch": 0.232, + "grad_norm": 0.519235372543335, + "learning_rate": 0.00026429550667534095, + "loss": 1.2586, + "num_input_tokens_seen": 1520435200, + "step": 23200, + "train_runtime": 11152.1986, + "train_tokens_per_second": 136335.018 + }, + { + "epoch": 0.233, + "grad_norm": 0.4892626404762268, + "learning_rate": 0.0002639866689961565, + "loss": 1.2595, + "num_input_tokens_seen": 1526988800, + "step": 23300, + "train_runtime": 11199.2653, + "train_tokens_per_second": 136347.23 + }, + { + "epoch": 0.234, + "grad_norm": 0.4089221656322479, + "learning_rate": 0.00026367668347248083, + "loss": 1.2393, + "num_input_tokens_seen": 1533542400, + "step": 23400, + "train_runtime": 11247.6635, + "train_tokens_per_second": 136343.196 + }, + { + "epoch": 0.235, + "grad_norm": 0.467582106590271, + "learning_rate": 0.0002633655532258646, + "loss": 1.2534, + "num_input_tokens_seen": 1540096000, + "step": 23500, + "train_runtime": 11294.1646, + "train_tokens_per_second": 136362.099 + }, + { + "epoch": 0.236, + "grad_norm": 0.48117080330848694, + "learning_rate": 0.000263053281389386, + "loss": 1.2644, + "num_input_tokens_seen": 1546649600, + "step": 23600, + "train_runtime": 11340.9021, + "train_tokens_per_second": 136378.004 + }, + { + "epoch": 0.237, + "grad_norm": 0.4495629072189331, + "learning_rate": 0.0002627398711076189, + "loss": 1.2442, + "num_input_tokens_seen": 1553203200, + "step": 23700, + "train_runtime": 11387.7566, + "train_tokens_per_second": 136392.377 + }, + { + "epoch": 0.238, + "grad_norm": 0.4376384913921356, + "learning_rate": 0.0002624253255366014, + "loss": 1.2489, + "num_input_tokens_seen": 1559756800, + "step": 23800, + "train_runtime": 11439.8893, + "train_tokens_per_second": 136343.696 + }, + { + "epoch": 0.239, + "grad_norm": 0.4419648349285126, + "learning_rate": 0.0002621096478438039, + "loss": 1.2353, + "num_input_tokens_seen": 1566310400, + "step": 23900, + "train_runtime": 11486.001, + "train_tokens_per_second": 136366.904 + }, + { + "epoch": 0.24, + "grad_norm": 0.669739305973053, + "learning_rate": 0.00026179284120809727, + "loss": 1.2528, + "num_input_tokens_seen": 1572864000, + "step": 24000, + "train_runtime": 11533.9608, + "train_tokens_per_second": 136368.072 + }, + { + "epoch": 0.241, + "grad_norm": 0.4047415554523468, + "learning_rate": 0.0002614749088197208, + "loss": 1.2679, + "num_input_tokens_seen": 1579417600, + "step": 24100, + "train_runtime": 11582.9583, + "train_tokens_per_second": 136357.013 + }, + { + "epoch": 0.242, + "grad_norm": 0.5224933624267578, + "learning_rate": 0.00026115585388025015, + "loss": 1.2425, + "num_input_tokens_seen": 1585971200, + "step": 24200, + "train_runtime": 11630.022, + "train_tokens_per_second": 136368.719 + }, + { + "epoch": 0.243, + "grad_norm": 0.5125856399536133, + "learning_rate": 0.00026083567960256493, + "loss": 1.2423, + "num_input_tokens_seen": 1592524800, + "step": 24300, + "train_runtime": 11677.13, + "train_tokens_per_second": 136379.813 + }, + { + "epoch": 0.244, + "grad_norm": 0.5344144701957703, + "learning_rate": 0.00026051438921081667, + "loss": 1.2431, + "num_input_tokens_seen": 1599078400, + "step": 24400, + "train_runtime": 11723.5349, + "train_tokens_per_second": 136398.997 + }, + { + "epoch": 0.245, + "grad_norm": 0.4386890232563019, + "learning_rate": 0.00026019198594039595, + "loss": 1.2426, + "num_input_tokens_seen": 1605632000, + "step": 24500, + "train_runtime": 11773.1296, + "train_tokens_per_second": 136381.069 + }, + { + "epoch": 0.246, + "grad_norm": 0.4986630082130432, + "learning_rate": 0.00025986847303790026, + "loss": 1.2531, + "num_input_tokens_seen": 1612185600, + "step": 24600, + "train_runtime": 11820.6579, + "train_tokens_per_second": 136387.13 + }, + { + "epoch": 0.247, + "grad_norm": 0.5271715521812439, + "learning_rate": 0.00025954385376110076, + "loss": 1.249, + "num_input_tokens_seen": 1618739200, + "step": 24700, + "train_runtime": 11867.4874, + "train_tokens_per_second": 136401.172 + }, + { + "epoch": 0.248, + "grad_norm": 0.45263609290122986, + "learning_rate": 0.00025921813137891005, + "loss": 1.2507, + "num_input_tokens_seen": 1625292800, + "step": 24800, + "train_runtime": 11919.9131, + "train_tokens_per_second": 136351.061 + }, + { + "epoch": 0.249, + "grad_norm": 0.5932081937789917, + "learning_rate": 0.000258891309171349, + "loss": 1.2438, + "num_input_tokens_seen": 1631846400, + "step": 24900, + "train_runtime": 11962.6395, + "train_tokens_per_second": 136411.902 + }, + { + "epoch": 0.25, + "grad_norm": 0.5539859533309937, + "learning_rate": 0.00025856339042951344, + "loss": 1.2548, + "num_input_tokens_seen": 1638400000, + "step": 25000, + "train_runtime": 12014.9411, + "train_tokens_per_second": 136363.548 + }, + { + "epoch": 0.251, + "grad_norm": 0.5236772298812866, + "learning_rate": 0.0002582343784555415, + "loss": 1.2386, + "num_input_tokens_seen": 1644953600, + "step": 25100, + "train_runtime": 12062.3997, + "train_tokens_per_second": 136370.344 + }, + { + "epoch": 0.252, + "grad_norm": 0.5913048982620239, + "learning_rate": 0.00025790427656258017, + "loss": 1.2354, + "num_input_tokens_seen": 1651507200, + "step": 25200, + "train_runtime": 12108.5333, + "train_tokens_per_second": 136392.01 + }, + { + "epoch": 0.253, + "grad_norm": 0.5929732322692871, + "learning_rate": 0.00025757308807475185, + "loss": 1.2582, + "num_input_tokens_seen": 1658060800, + "step": 25300, + "train_runtime": 12154.8252, + "train_tokens_per_second": 136411.736 + }, + { + "epoch": 0.254, + "grad_norm": 0.4542764723300934, + "learning_rate": 0.00025724081632712086, + "loss": 1.2488, + "num_input_tokens_seen": 1664614400, + "step": 25400, + "train_runtime": 12207.8935, + "train_tokens_per_second": 136355.58 + }, + { + "epoch": 0.255, + "grad_norm": 1.0848513841629028, + "learning_rate": 0.0002569074646656601, + "loss": 1.2375, + "num_input_tokens_seen": 1671168000, + "step": 25500, + "train_runtime": 12254.3162, + "train_tokens_per_second": 136373.827 + }, + { + "epoch": 0.256, + "grad_norm": 0.5190780162811279, + "learning_rate": 0.00025657303644721695, + "loss": 1.236, + "num_input_tokens_seen": 1677721600, + "step": 25600, + "train_runtime": 12301.2378, + "train_tokens_per_second": 136386.405 + }, + { + "epoch": 0.257, + "grad_norm": 0.43418362736701965, + "learning_rate": 0.00025623753503948004, + "loss": 1.2484, + "num_input_tokens_seen": 1684275200, + "step": 25700, + "train_runtime": 12347.684, + "train_tokens_per_second": 136404.138 + }, + { + "epoch": 0.258, + "grad_norm": 0.4586409032344818, + "learning_rate": 0.00025590096382094475, + "loss": 1.2674, + "num_input_tokens_seen": 1690828800, + "step": 25800, + "train_runtime": 12394.5809, + "train_tokens_per_second": 136416.778 + }, + { + "epoch": 0.259, + "grad_norm": 0.5069702863693237, + "learning_rate": 0.00025556332618087945, + "loss": 1.2428, + "num_input_tokens_seen": 1697382400, + "step": 25900, + "train_runtime": 12447.2116, + "train_tokens_per_second": 136366.478 + }, + { + "epoch": 0.26, + "grad_norm": 0.591788649559021, + "learning_rate": 0.00025522462551929155, + "loss": 1.2417, + "num_input_tokens_seen": 1703936000, + "step": 26000, + "train_runtime": 12492.8891, + "train_tokens_per_second": 136392.47 + }, + { + "epoch": 0.261, + "grad_norm": 0.6001791954040527, + "learning_rate": 0.00025488486524689283, + "loss": 1.2407, + "num_input_tokens_seen": 1710489600, + "step": 26100, + "train_runtime": 12539.4548, + "train_tokens_per_second": 136408.61 + }, + { + "epoch": 0.262, + "grad_norm": 0.47005897760391235, + "learning_rate": 0.00025454404878506555, + "loss": 1.2558, + "num_input_tokens_seen": 1717043200, + "step": 26200, + "train_runtime": 12587.1655, + "train_tokens_per_second": 136412.221 + }, + { + "epoch": 0.263, + "grad_norm": 0.42708972096443176, + "learning_rate": 0.0002542021795658276, + "loss": 1.2445, + "num_input_tokens_seen": 1723596800, + "step": 26300, + "train_runtime": 12634.1294, + "train_tokens_per_second": 136423.868 + }, + { + "epoch": 0.264, + "grad_norm": 0.48100486397743225, + "learning_rate": 0.0002538592610317984, + "loss": 1.2416, + "num_input_tokens_seen": 1730150400, + "step": 26400, + "train_runtime": 12686.5075, + "train_tokens_per_second": 136377.202 + }, + { + "epoch": 0.265, + "grad_norm": 0.5689502954483032, + "learning_rate": 0.00025351529663616355, + "loss": 1.2476, + "num_input_tokens_seen": 1736704000, + "step": 26500, + "train_runtime": 12733.1403, + "train_tokens_per_second": 136392.435 + }, + { + "epoch": 0.266, + "grad_norm": 0.3999510705471039, + "learning_rate": 0.00025317028984264087, + "loss": 1.2507, + "num_input_tokens_seen": 1743257600, + "step": 26600, + "train_runtime": 12780.4326, + "train_tokens_per_second": 136400.515 + }, + { + "epoch": 0.267, + "grad_norm": 0.4349440336227417, + "learning_rate": 0.0002528242441254448, + "loss": 1.2359, + "num_input_tokens_seen": 1749811200, + "step": 26700, + "train_runtime": 12826.6298, + "train_tokens_per_second": 136420.184 + }, + { + "epoch": 0.268, + "grad_norm": 0.40468648076057434, + "learning_rate": 0.000252477162969252, + "loss": 1.2463, + "num_input_tokens_seen": 1756364800, + "step": 26800, + "train_runtime": 12873.4848, + "train_tokens_per_second": 136432.739 + }, + { + "epoch": 0.269, + "grad_norm": 0.5858653783798218, + "learning_rate": 0.00025212904986916584, + "loss": 1.2385, + "num_input_tokens_seen": 1762918400, + "step": 26900, + "train_runtime": 12926.2009, + "train_tokens_per_second": 136383.336 + }, + { + "epoch": 0.27, + "grad_norm": 0.4621046483516693, + "learning_rate": 0.00025177990833068133, + "loss": 1.2366, + "num_input_tokens_seen": 1769472000, + "step": 27000, + "train_runtime": 12973.4952, + "train_tokens_per_second": 136391.31 + }, + { + "epoch": 0.271, + "grad_norm": 0.4884892404079437, + "learning_rate": 0.0002514297418696499, + "loss": 1.2436, + "num_input_tokens_seen": 1776025600, + "step": 27100, + "train_runtime": 13021.2871, + "train_tokens_per_second": 136394.013 + }, + { + "epoch": 0.272, + "grad_norm": 0.5108981132507324, + "learning_rate": 0.0002510785540122439, + "loss": 1.2423, + "num_input_tokens_seen": 1782579200, + "step": 27200, + "train_runtime": 13068.0423, + "train_tokens_per_second": 136407.517 + }, + { + "epoch": 0.273, + "grad_norm": 0.3898067772388458, + "learning_rate": 0.0002507263482949212, + "loss": 1.2415, + "num_input_tokens_seen": 1789132800, + "step": 27300, + "train_runtime": 13113.8421, + "train_tokens_per_second": 136430.864 + }, + { + "epoch": 0.274, + "grad_norm": 0.5622383952140808, + "learning_rate": 0.0002503731282643894, + "loss": 1.2378, + "num_input_tokens_seen": 1795686400, + "step": 27400, + "train_runtime": 13161.1635, + "train_tokens_per_second": 136438.272 + }, + { + "epoch": 0.275, + "grad_norm": 0.7748796343803406, + "learning_rate": 0.0002500188974775704, + "loss": 1.248, + "num_input_tokens_seen": 1802240000, + "step": 27500, + "train_runtime": 13209.4471, + "train_tokens_per_second": 136435.688 + }, + { + "epoch": 0.276, + "grad_norm": 0.8867826461791992, + "learning_rate": 0.00024966365950156416, + "loss": 1.2409, + "num_input_tokens_seen": 1808793600, + "step": 27600, + "train_runtime": 13256.4066, + "train_tokens_per_second": 136446.751 + }, + { + "epoch": 0.277, + "grad_norm": 0.49997836351394653, + "learning_rate": 0.00024930741791361326, + "loss": 1.2382, + "num_input_tokens_seen": 1815347200, + "step": 27700, + "train_runtime": 13309.6196, + "train_tokens_per_second": 136393.62 + }, + { + "epoch": 0.278, + "grad_norm": 0.5048521161079407, + "learning_rate": 0.0002489501763010664, + "loss": 1.2351, + "num_input_tokens_seen": 1821900800, + "step": 27800, + "train_runtime": 13356.706, + "train_tokens_per_second": 136403.451 + }, + { + "epoch": 0.279, + "grad_norm": 0.5528578162193298, + "learning_rate": 0.00024859193826134285, + "loss": 1.2404, + "num_input_tokens_seen": 1828454400, + "step": 27900, + "train_runtime": 13405.5813, + "train_tokens_per_second": 136395.01 + }, + { + "epoch": 0.28, + "grad_norm": 0.44376805424690247, + "learning_rate": 0.00024823270740189556, + "loss": 1.2461, + "num_input_tokens_seen": 1835008000, + "step": 28000, + "train_runtime": 13452.7686, + "train_tokens_per_second": 136403.743 + }, + { + "epoch": 0.281, + "grad_norm": 0.5072674751281738, + "learning_rate": 0.00024787248734017527, + "loss": 1.2301, + "num_input_tokens_seen": 1841561600, + "step": 28100, + "train_runtime": 13501.0413, + "train_tokens_per_second": 136401.449 + }, + { + "epoch": 0.282, + "grad_norm": 0.46835577487945557, + "learning_rate": 0.0002475112817035941, + "loss": 1.237, + "num_input_tokens_seen": 1848115200, + "step": 28200, + "train_runtime": 13547.4814, + "train_tokens_per_second": 136417.622 + }, + { + "epoch": 0.283, + "grad_norm": 0.4893036186695099, + "learning_rate": 0.0002471490941294887, + "loss": 1.2612, + "num_input_tokens_seen": 1854668800, + "step": 28300, + "train_runtime": 13593.9904, + "train_tokens_per_second": 136432.993 + }, + { + "epoch": 0.284, + "grad_norm": 0.66542649269104, + "learning_rate": 0.000246785928265084, + "loss": 1.2405, + "num_input_tokens_seen": 1861222400, + "step": 28400, + "train_runtime": 13646.3147, + "train_tokens_per_second": 136390.113 + }, + { + "epoch": 0.285, + "grad_norm": 0.669306755065918, + "learning_rate": 0.0002464217877674562, + "loss": 1.2409, + "num_input_tokens_seen": 1867776000, + "step": 28500, + "train_runtime": 13692.502, + "train_tokens_per_second": 136408.671 + }, + { + "epoch": 0.286, + "grad_norm": 0.43464845418930054, + "learning_rate": 0.0002460566763034961, + "loss": 1.2435, + "num_input_tokens_seen": 1874329600, + "step": 28600, + "train_runtime": 13738.7564, + "train_tokens_per_second": 136426.438 + }, + { + "epoch": 0.287, + "grad_norm": 0.5084187388420105, + "learning_rate": 0.00024569059754987196, + "loss": 1.2572, + "num_input_tokens_seen": 1880883200, + "step": 28700, + "train_runtime": 13785.6191, + "train_tokens_per_second": 136438.065 + }, + { + "epoch": 0.288, + "grad_norm": 0.473603755235672, + "learning_rate": 0.00024532355519299296, + "loss": 1.2459, + "num_input_tokens_seen": 1887436800, + "step": 28800, + "train_runtime": 13838.5181, + "train_tokens_per_second": 136390.095 + }, + { + "epoch": 0.289, + "grad_norm": 0.493012011051178, + "learning_rate": 0.0002449555529289714, + "loss": 1.243, + "num_input_tokens_seen": 1893990400, + "step": 28900, + "train_runtime": 13886.1283, + "train_tokens_per_second": 136394.419 + }, + { + "epoch": 0.29, + "grad_norm": 0.7421333193778992, + "learning_rate": 0.0002445865944635861, + "loss": 1.2455, + "num_input_tokens_seen": 1900544000, + "step": 29000, + "train_runtime": 13931.9406, + "train_tokens_per_second": 136416.315 + }, + { + "epoch": 0.291, + "grad_norm": 0.5027185678482056, + "learning_rate": 0.0002442166835122446, + "loss": 1.2686, + "num_input_tokens_seen": 1907097600, + "step": 29100, + "train_runtime": 13980.446, + "train_tokens_per_second": 136411.785 + }, + { + "epoch": 0.292, + "grad_norm": 0.48427557945251465, + "learning_rate": 0.00024384582379994614, + "loss": 1.2369, + "num_input_tokens_seen": 1913651200, + "step": 29200, + "train_runtime": 14028.0456, + "train_tokens_per_second": 136416.095 + }, + { + "epoch": 0.293, + "grad_norm": 0.6620755195617676, + "learning_rate": 0.00024347401906124388, + "loss": 1.2317, + "num_input_tokens_seen": 1920204800, + "step": 29300, + "train_runtime": 14074.3372, + "train_tokens_per_second": 136433.054 + }, + { + "epoch": 0.294, + "grad_norm": 0.5745883584022522, + "learning_rate": 0.0002431012730402075, + "loss": 1.2443, + "num_input_tokens_seen": 1926758400, + "step": 29400, + "train_runtime": 14125.645, + "train_tokens_per_second": 136401.446 + }, + { + "epoch": 0.295, + "grad_norm": 0.441680908203125, + "learning_rate": 0.00024272758949038517, + "loss": 1.2393, + "num_input_tokens_seen": 1933312000, + "step": 29500, + "train_runtime": 14172.5336, + "train_tokens_per_second": 136412.588 + }, + { + "epoch": 0.296, + "grad_norm": 0.4417046904563904, + "learning_rate": 0.00024235297217476616, + "loss": 1.2371, + "num_input_tokens_seen": 1939865600, + "step": 29600, + "train_runtime": 14220.1572, + "train_tokens_per_second": 136416.608 + }, + { + "epoch": 0.297, + "grad_norm": 0.5888639688491821, + "learning_rate": 0.00024197742486574268, + "loss": 1.2344, + "num_input_tokens_seen": 1946419200, + "step": 29700, + "train_runtime": 14267.366, + "train_tokens_per_second": 136424.565 + }, + { + "epoch": 0.298, + "grad_norm": 0.4625283479690552, + "learning_rate": 0.0002416009513450719, + "loss": 1.2373, + "num_input_tokens_seen": 1952972800, + "step": 29800, + "train_runtime": 14318.8989, + "train_tokens_per_second": 136391.27 + }, + { + "epoch": 0.299, + "grad_norm": 0.47661375999450684, + "learning_rate": 0.00024122355540383806, + "loss": 1.2454, + "num_input_tokens_seen": 1959526400, + "step": 29900, + "train_runtime": 14365.8797, + "train_tokens_per_second": 136401.42 + }, + { + "epoch": 0.3, + "grad_norm": 0.727032482624054, + "learning_rate": 0.00024084524084241405, + "loss": 1.2379, + "num_input_tokens_seen": 1966080000, + "step": 30000, + "train_runtime": 14415.1273, + "train_tokens_per_second": 136390.055 + }, + { + "epoch": 0.301, + "grad_norm": 0.45500555634498596, + "learning_rate": 0.00024046601147042332, + "loss": 1.2358, + "num_input_tokens_seen": 1972633600, + "step": 30100, + "train_runtime": 14461.5845, + "train_tokens_per_second": 136405.08 + }, + { + "epoch": 0.302, + "grad_norm": 0.44596830010414124, + "learning_rate": 0.0002400858711067015, + "loss": 1.2301, + "num_input_tokens_seen": 1979187200, + "step": 30200, + "train_runtime": 14508.0707, + "train_tokens_per_second": 136419.737 + }, + { + "epoch": 0.303, + "grad_norm": 0.4207491874694824, + "learning_rate": 0.00023970482357925772, + "loss": 1.2441, + "num_input_tokens_seen": 1985740800, + "step": 30300, + "train_runtime": 14555.5751, + "train_tokens_per_second": 136424.757 + }, + { + "epoch": 0.304, + "grad_norm": 0.4833202064037323, + "learning_rate": 0.00023932287272523646, + "loss": 1.2351, + "num_input_tokens_seen": 1992294400, + "step": 30400, + "train_runtime": 14601.9546, + "train_tokens_per_second": 136440.255 + }, + { + "epoch": 0.305, + "grad_norm": 0.5268282294273376, + "learning_rate": 0.00023894002239087847, + "loss": 1.2384, + "num_input_tokens_seen": 1998848000, + "step": 30500, + "train_runtime": 14654.2539, + "train_tokens_per_second": 136400.53 + }, + { + "epoch": 0.306, + "grad_norm": 0.4639832377433777, + "learning_rate": 0.0002385562764314825, + "loss": 1.3007, + "num_input_tokens_seen": 2005401600, + "step": 30600, + "train_runtime": 14702.026, + "train_tokens_per_second": 136403.078 + }, + { + "epoch": 0.307, + "grad_norm": 0.526703953742981, + "learning_rate": 0.00023817163871136596, + "loss": 1.2481, + "num_input_tokens_seen": 2011955200, + "step": 30700, + "train_runtime": 14749.4458, + "train_tokens_per_second": 136408.868 + }, + { + "epoch": 0.308, + "grad_norm": 0.43404075503349304, + "learning_rate": 0.00023778611310382652, + "loss": 1.2273, + "num_input_tokens_seen": 2018508800, + "step": 30800, + "train_runtime": 14796.5936, + "train_tokens_per_second": 136417.128 + }, + { + "epoch": 0.309, + "grad_norm": 0.39956456422805786, + "learning_rate": 0.0002373997034911027, + "loss": 1.2275, + "num_input_tokens_seen": 2025062400, + "step": 30900, + "train_runtime": 14843.3887, + "train_tokens_per_second": 136428.578 + }, + { + "epoch": 0.31, + "grad_norm": 0.46024298667907715, + "learning_rate": 0.00023701241376433506, + "loss": 1.2353, + "num_input_tokens_seen": 2031616000, + "step": 31000, + "train_runtime": 14890.8282, + "train_tokens_per_second": 136434.05 + }, + { + "epoch": 0.311, + "grad_norm": 0.38429203629493713, + "learning_rate": 0.0002366242478235268, + "loss": 1.2403, + "num_input_tokens_seen": 2038169600, + "step": 31100, + "train_runtime": 14937.8781, + "train_tokens_per_second": 136443.047 + }, + { + "epoch": 0.312, + "grad_norm": 0.5401485562324524, + "learning_rate": 0.00023623520957750471, + "loss": 1.2273, + "num_input_tokens_seen": 2044723200, + "step": 31200, + "train_runtime": 14990.0842, + "train_tokens_per_second": 136405.051 + }, + { + "epoch": 0.313, + "grad_norm": 0.5360187888145447, + "learning_rate": 0.00023584530294387953, + "loss": 1.2312, + "num_input_tokens_seen": 2051276800, + "step": 31300, + "train_runtime": 15037.4257, + "train_tokens_per_second": 136411.434 + }, + { + "epoch": 0.314, + "grad_norm": 0.4468795359134674, + "learning_rate": 0.00023545453184900682, + "loss": 1.2383, + "num_input_tokens_seen": 2057830400, + "step": 31400, + "train_runtime": 15083.4771, + "train_tokens_per_second": 136429.444 + }, + { + "epoch": 0.315, + "grad_norm": 0.4575517177581787, + "learning_rate": 0.00023506290022794706, + "loss": 1.2354, + "num_input_tokens_seen": 2064384000, + "step": 31500, + "train_runtime": 15131.2692, + "train_tokens_per_second": 136431.648 + }, + { + "epoch": 0.316, + "grad_norm": 0.7983475923538208, + "learning_rate": 0.00023467041202442643, + "loss": 1.2309, + "num_input_tokens_seen": 2070937600, + "step": 31600, + "train_runtime": 15178.6218, + "train_tokens_per_second": 136437.789 + }, + { + "epoch": 0.317, + "grad_norm": 0.4316498339176178, + "learning_rate": 0.00023427707119079669, + "loss": 1.2462, + "num_input_tokens_seen": 2077491200, + "step": 31700, + "train_runtime": 15225.1881, + "train_tokens_per_second": 136450.938 + }, + { + "epoch": 0.318, + "grad_norm": 0.5765666365623474, + "learning_rate": 0.0002338828816879957, + "loss": 1.2367, + "num_input_tokens_seen": 2084044800, + "step": 31800, + "train_runtime": 15277.5735, + "train_tokens_per_second": 136412.029 + }, + { + "epoch": 0.319, + "grad_norm": 0.44825831055641174, + "learning_rate": 0.00023348784748550744, + "loss": 1.2354, + "num_input_tokens_seen": 2090598400, + "step": 31900, + "train_runtime": 15324.8285, + "train_tokens_per_second": 136419.04 + }, + { + "epoch": 0.32, + "grad_norm": 0.5602436661720276, + "learning_rate": 0.00023309197256132184, + "loss": 1.2324, + "num_input_tokens_seen": 2097152000, + "step": 32000, + "train_runtime": 15371.4775, + "train_tokens_per_second": 136431.387 + }, + { + "epoch": 0.321, + "grad_norm": 0.4002476930618286, + "learning_rate": 0.00023269526090189505, + "loss": 1.2396, + "num_input_tokens_seen": 2103705600, + "step": 32100, + "train_runtime": 15419.2672, + "train_tokens_per_second": 136433.565 + }, + { + "epoch": 0.322, + "grad_norm": 0.4306688606739044, + "learning_rate": 0.00023229771650210907, + "loss": 1.2468, + "num_input_tokens_seen": 2110259200, + "step": 32200, + "train_runtime": 15466.1068, + "train_tokens_per_second": 136444.111 + }, + { + "epoch": 0.323, + "grad_norm": 0.584658145904541, + "learning_rate": 0.00023189934336523163, + "loss": 1.2459, + "num_input_tokens_seen": 2116812800, + "step": 32300, + "train_runtime": 15513.277, + "train_tokens_per_second": 136451.686 + }, + { + "epoch": 0.324, + "grad_norm": 0.4049496352672577, + "learning_rate": 0.00023150014550287574, + "loss": 1.2455, + "num_input_tokens_seen": 2123366400, + "step": 32400, + "train_runtime": 15565.7808, + "train_tokens_per_second": 136412.456 + }, + { + "epoch": 0.325, + "grad_norm": 0.45713433623313904, + "learning_rate": 0.00023110012693495943, + "loss": 1.2308, + "num_input_tokens_seen": 2129920000, + "step": 32500, + "train_runtime": 15610.6324, + "train_tokens_per_second": 136440.341 + }, + { + "epoch": 0.326, + "grad_norm": 0.5710960030555725, + "learning_rate": 0.00023069929168966527, + "loss": 1.2434, + "num_input_tokens_seen": 2136473600, + "step": 32600, + "train_runtime": 15657.7335, + "train_tokens_per_second": 136448.458 + }, + { + "epoch": 0.327, + "grad_norm": 0.5807371735572815, + "learning_rate": 0.0002302976438033997, + "loss": 1.2292, + "num_input_tokens_seen": 2143027200, + "step": 32700, + "train_runtime": 15710.1819, + "train_tokens_per_second": 136410.082 + }, + { + "epoch": 0.328, + "grad_norm": 0.4462313652038574, + "learning_rate": 0.0002298951873207525, + "loss": 1.2427, + "num_input_tokens_seen": 2149580800, + "step": 32800, + "train_runtime": 15757.3708, + "train_tokens_per_second": 136417.479 + }, + { + "epoch": 0.329, + "grad_norm": 0.6099971532821655, + "learning_rate": 0.00022949192629445606, + "loss": 1.2313, + "num_input_tokens_seen": 2156134400, + "step": 32900, + "train_runtime": 15804.1823, + "train_tokens_per_second": 136428.089 + }, + { + "epoch": 0.33, + "grad_norm": 0.8630947470664978, + "learning_rate": 0.0002290878647853443, + "loss": 1.247, + "num_input_tokens_seen": 2162688000, + "step": 33000, + "train_runtime": 15852.2039, + "train_tokens_per_second": 136428.223 + }, + { + "epoch": 0.331, + "grad_norm": 0.5154317021369934, + "learning_rate": 0.00022868300686231224, + "loss": 1.2246, + "num_input_tokens_seen": 2169241600, + "step": 33100, + "train_runtime": 15899.5617, + "train_tokens_per_second": 136434.05 + }, + { + "epoch": 0.332, + "grad_norm": 0.5033185482025146, + "learning_rate": 0.00022827735660227457, + "loss": 1.2271, + "num_input_tokens_seen": 2175795200, + "step": 33200, + "train_runtime": 15947.1716, + "train_tokens_per_second": 136437.686 + }, + { + "epoch": 0.333, + "grad_norm": 0.7760284543037415, + "learning_rate": 0.000227870918090125, + "loss": 1.2445, + "num_input_tokens_seen": 2182348800, + "step": 33300, + "train_runtime": 16000.1889, + "train_tokens_per_second": 136395.189 + }, + { + "epoch": 0.334, + "grad_norm": 0.5042400360107422, + "learning_rate": 0.00022746369541869476, + "loss": 1.223, + "num_input_tokens_seen": 2188902400, + "step": 33400, + "train_runtime": 16047.8873, + "train_tokens_per_second": 136398.166 + }, + { + "epoch": 0.335, + "grad_norm": 0.421273410320282, + "learning_rate": 0.00022705569268871163, + "loss": 1.2222, + "num_input_tokens_seen": 2195456000, + "step": 33500, + "train_runtime": 16094.6711, + "train_tokens_per_second": 136408.876 + }, + { + "epoch": 0.336, + "grad_norm": 0.48292359709739685, + "learning_rate": 0.00022664691400875865, + "loss": 1.222, + "num_input_tokens_seen": 2202009600, + "step": 33600, + "train_runtime": 16143.6943, + "train_tokens_per_second": 136400.601 + }, + { + "epoch": 0.337, + "grad_norm": 0.4301004409790039, + "learning_rate": 0.00022623736349523254, + "loss": 1.2308, + "num_input_tokens_seen": 2208563200, + "step": 33700, + "train_runtime": 16189.7469, + "train_tokens_per_second": 136417.401 + }, + { + "epoch": 0.338, + "grad_norm": 0.6592893600463867, + "learning_rate": 0.00022582704527230238, + "loss": 1.2401, + "num_input_tokens_seen": 2215116800, + "step": 33800, + "train_runtime": 16235.6512, + "train_tokens_per_second": 136435.353 + }, + { + "epoch": 0.339, + "grad_norm": 0.6183221340179443, + "learning_rate": 0.0002254159634718682, + "loss": 1.2364, + "num_input_tokens_seen": 2221670400, + "step": 33900, + "train_runtime": 16283.1306, + "train_tokens_per_second": 136440.003 + }, + { + "epoch": 0.34, + "grad_norm": 0.529971182346344, + "learning_rate": 0.00022500412223351915, + "loss": 1.2222, + "num_input_tokens_seen": 2228224000, + "step": 34000, + "train_runtime": 16330.1955, + "train_tokens_per_second": 136448.091 + }, + { + "epoch": 0.341, + "grad_norm": 0.41906896233558655, + "learning_rate": 0.0002245915257044919, + "loss": 1.2261, + "num_input_tokens_seen": 2234777600, + "step": 34100, + "train_runtime": 16381.7912, + "train_tokens_per_second": 136418.391 + }, + { + "epoch": 0.342, + "grad_norm": 0.4326164722442627, + "learning_rate": 0.00022417817803962892, + "loss": 1.2452, + "num_input_tokens_seen": 2241331200, + "step": 34200, + "train_runtime": 16429.3997, + "train_tokens_per_second": 136421.978 + }, + { + "epoch": 0.343, + "grad_norm": 0.8329346179962158, + "learning_rate": 0.0002237640834013366, + "loss": 1.2197, + "num_input_tokens_seen": 2247884800, + "step": 34300, + "train_runtime": 16476.2139, + "train_tokens_per_second": 136432.121 + }, + { + "epoch": 0.344, + "grad_norm": 0.4649752378463745, + "learning_rate": 0.0002233492459595434, + "loss": 1.2255, + "num_input_tokens_seen": 2254438400, + "step": 34400, + "train_runtime": 16523.092, + "train_tokens_per_second": 136441.678 + }, + { + "epoch": 0.345, + "grad_norm": 0.5218563675880432, + "learning_rate": 0.00022293366989165772, + "loss": 1.2365, + "num_input_tokens_seen": 2260992000, + "step": 34500, + "train_runtime": 16575.1624, + "train_tokens_per_second": 136408.437 + }, + { + "epoch": 0.346, + "grad_norm": 0.8002403974533081, + "learning_rate": 0.00022251735938252587, + "loss": 1.2179, + "num_input_tokens_seen": 2267545600, + "step": 34600, + "train_runtime": 16622.274, + "train_tokens_per_second": 136416.088 + }, + { + "epoch": 0.347, + "grad_norm": 0.5648475289344788, + "learning_rate": 0.0002221003186243902, + "loss": 1.2301, + "num_input_tokens_seen": 2274099200, + "step": 34700, + "train_runtime": 16668.9107, + "train_tokens_per_second": 136427.583 + }, + { + "epoch": 0.348, + "grad_norm": 0.4631340801715851, + "learning_rate": 0.00022168255181684643, + "loss": 1.2292, + "num_input_tokens_seen": 2280652800, + "step": 34800, + "train_runtime": 16715.4649, + "train_tokens_per_second": 136439.687 + }, + { + "epoch": 0.349, + "grad_norm": 0.4492770731449127, + "learning_rate": 0.00022126406316680172, + "loss": 1.226, + "num_input_tokens_seen": 2287206400, + "step": 34900, + "train_runtime": 16761.744, + "train_tokens_per_second": 136453.963 + }, + { + "epoch": 0.35, + "grad_norm": 0.5984812378883362, + "learning_rate": 0.00022084485688843208, + "loss": 1.2332, + "num_input_tokens_seen": 2293760000, + "step": 35000, + "train_runtime": 16816.4332, + "train_tokens_per_second": 136399.912 + }, + { + "epoch": 0.351, + "grad_norm": 0.6245887875556946, + "learning_rate": 0.00022042493720314003, + "loss": 1.2324, + "num_input_tokens_seen": 2300313600, + "step": 35100, + "train_runtime": 16864.2018, + "train_tokens_per_second": 136402.163 + }, + { + "epoch": 0.352, + "grad_norm": 0.6719664335250854, + "learning_rate": 0.00022000430833951228, + "loss": 1.2272, + "num_input_tokens_seen": 2306867200, + "step": 35200, + "train_runtime": 16910.313, + "train_tokens_per_second": 136417.77 + }, + { + "epoch": 0.353, + "grad_norm": 0.43880173563957214, + "learning_rate": 0.00021958297453327673, + "loss": 1.2572, + "num_input_tokens_seen": 2313420800, + "step": 35300, + "train_runtime": 16958.9376, + "train_tokens_per_second": 136413.073 + }, + { + "epoch": 0.354, + "grad_norm": 0.6195557713508606, + "learning_rate": 0.00021916094002726012, + "loss": 1.2299, + "num_input_tokens_seen": 2319974400, + "step": 35400, + "train_runtime": 17005.9814, + "train_tokens_per_second": 136421.083 + }, + { + "epoch": 0.355, + "grad_norm": 0.5288188457489014, + "learning_rate": 0.00021873820907134534, + "loss": 1.2157, + "num_input_tokens_seen": 2326528000, + "step": 35500, + "train_runtime": 17053.3579, + "train_tokens_per_second": 136426.387 + }, + { + "epoch": 0.356, + "grad_norm": 0.4962466061115265, + "learning_rate": 0.0002183147859224283, + "loss": 1.2282, + "num_input_tokens_seen": 2333081600, + "step": 35600, + "train_runtime": 17099.0541, + "train_tokens_per_second": 136445.068 + }, + { + "epoch": 0.357, + "grad_norm": 0.4940129518508911, + "learning_rate": 0.00021789067484437544, + "loss": 1.2349, + "num_input_tokens_seen": 2339635200, + "step": 35700, + "train_runtime": 17146.892, + "train_tokens_per_second": 136446.605 + }, + { + "epoch": 0.358, + "grad_norm": 0.5929033160209656, + "learning_rate": 0.00021746588010798068, + "loss": 1.2368, + "num_input_tokens_seen": 2346188800, + "step": 35800, + "train_runtime": 17199.6266, + "train_tokens_per_second": 136409.287 + }, + { + "epoch": 0.359, + "grad_norm": 0.4825666546821594, + "learning_rate": 0.00021704040599092216, + "loss": 1.2215, + "num_input_tokens_seen": 2352742400, + "step": 35900, + "train_runtime": 17246.2748, + "train_tokens_per_second": 136420.324 + }, + { + "epoch": 0.36, + "grad_norm": 0.4572449028491974, + "learning_rate": 0.00021661425677771965, + "loss": 1.2291, + "num_input_tokens_seen": 2359296000, + "step": 36000, + "train_runtime": 17292.1332, + "train_tokens_per_second": 136437.533 + }, + { + "epoch": 0.361, + "grad_norm": 0.467132568359375, + "learning_rate": 0.00021618743675969095, + "loss": 1.2295, + "num_input_tokens_seen": 2365849600, + "step": 36100, + "train_runtime": 17339.1599, + "train_tokens_per_second": 136445.457 + }, + { + "epoch": 0.362, + "grad_norm": 0.4863705635070801, + "learning_rate": 0.0002157599502349089, + "loss": 1.2154, + "num_input_tokens_seen": 2372403200, + "step": 36200, + "train_runtime": 17386.7454, + "train_tokens_per_second": 136448.952 + }, + { + "epoch": 0.363, + "grad_norm": 0.43923652172088623, + "learning_rate": 0.00021533180150815802, + "loss": 1.2268, + "num_input_tokens_seen": 2378956800, + "step": 36300, + "train_runtime": 17439.0785, + "train_tokens_per_second": 136415.282 + }, + { + "epoch": 0.364, + "grad_norm": 0.5028465390205383, + "learning_rate": 0.00021490299489089132, + "loss": 1.2293, + "num_input_tokens_seen": 2385510400, + "step": 36400, + "train_runtime": 17485.9662, + "train_tokens_per_second": 136424.283 + }, + { + "epoch": 0.365, + "grad_norm": 0.4366530478000641, + "learning_rate": 0.00021447353470118656, + "loss": 1.2276, + "num_input_tokens_seen": 2392064000, + "step": 36500, + "train_runtime": 17533.3809, + "train_tokens_per_second": 136429.136 + }, + { + "epoch": 0.366, + "grad_norm": 0.46415793895721436, + "learning_rate": 0.00021404342526370326, + "loss": 1.2227, + "num_input_tokens_seen": 2398617600, + "step": 36600, + "train_runtime": 17580.8443, + "train_tokens_per_second": 136433.584 + }, + { + "epoch": 0.367, + "grad_norm": 0.6382859349250793, + "learning_rate": 0.00021361267090963846, + "loss": 1.2212, + "num_input_tokens_seen": 2405171200, + "step": 36700, + "train_runtime": 17626.7905, + "train_tokens_per_second": 136449.753 + }, + { + "epoch": 0.368, + "grad_norm": 0.6642177700996399, + "learning_rate": 0.0002131812759766839, + "loss": 1.2317, + "num_input_tokens_seen": 2411724800, + "step": 36800, + "train_runtime": 17679.381, + "train_tokens_per_second": 136414.55 + }, + { + "epoch": 0.369, + "grad_norm": 0.4071521461009979, + "learning_rate": 0.00021274924480898169, + "loss": 1.2262, + "num_input_tokens_seen": 2418278400, + "step": 36900, + "train_runtime": 17726.5473, + "train_tokens_per_second": 136421.288 + }, + { + "epoch": 0.37, + "grad_norm": 0.5301467776298523, + "learning_rate": 0.00021231658175708087, + "loss": 1.2192, + "num_input_tokens_seen": 2424832000, + "step": 37000, + "train_runtime": 17772.7667, + "train_tokens_per_second": 136435.258 + }, + { + "epoch": 0.371, + "grad_norm": 0.5216257572174072, + "learning_rate": 0.00021188329117789357, + "loss": 1.213, + "num_input_tokens_seen": 2431385600, + "step": 37100, + "train_runtime": 17824.6083, + "train_tokens_per_second": 136406.116 + }, + { + "epoch": 0.372, + "grad_norm": 0.5098195672035217, + "learning_rate": 0.0002114493774346512, + "loss": 1.2311, + "num_input_tokens_seen": 2437939200, + "step": 37200, + "train_runtime": 17870.9901, + "train_tokens_per_second": 136418.81 + }, + { + "epoch": 0.373, + "grad_norm": 0.47295039892196655, + "learning_rate": 0.00021101484489686025, + "loss": 1.2211, + "num_input_tokens_seen": 2444492800, + "step": 37300, + "train_runtime": 17918.4906, + "train_tokens_per_second": 136422.919 + }, + { + "epoch": 0.374, + "grad_norm": 0.49752944707870483, + "learning_rate": 0.00021057969794025866, + "loss": 1.2292, + "num_input_tokens_seen": 2451046400, + "step": 37400, + "train_runtime": 17965.5373, + "train_tokens_per_second": 136430.453 + }, + { + "epoch": 0.375, + "grad_norm": 0.9500930905342102, + "learning_rate": 0.00021014394094677128, + "loss": 1.2187, + "num_input_tokens_seen": 2457600000, + "step": 37500, + "train_runtime": 18012.267, + "train_tokens_per_second": 136440.349 + }, + { + "epoch": 0.376, + "grad_norm": 0.4800110459327698, + "learning_rate": 0.00020970757830446633, + "loss": 1.2336, + "num_input_tokens_seen": 2464153600, + "step": 37600, + "train_runtime": 18059.6653, + "train_tokens_per_second": 136445.143 + }, + { + "epoch": 0.377, + "grad_norm": 0.48905813694000244, + "learning_rate": 0.00020927061440751072, + "loss": 1.2189, + "num_input_tokens_seen": 2470707200, + "step": 37700, + "train_runtime": 18111.7548, + "train_tokens_per_second": 136414.567 + }, + { + "epoch": 0.378, + "grad_norm": 0.593604564666748, + "learning_rate": 0.00020883305365612602, + "loss": 1.2178, + "num_input_tokens_seen": 2477260800, + "step": 37800, + "train_runtime": 18157.6424, + "train_tokens_per_second": 136430.751 + }, + { + "epoch": 0.379, + "grad_norm": 0.46399399638175964, + "learning_rate": 0.00020839490045654425, + "loss": 1.2141, + "num_input_tokens_seen": 2483814400, + "step": 37900, + "train_runtime": 18204.4326, + "train_tokens_per_second": 136440.089 + }, + { + "epoch": 0.38, + "grad_norm": 0.5679593086242676, + "learning_rate": 0.00020795615922096313, + "loss": 1.2332, + "num_input_tokens_seen": 2490368000, + "step": 38000, + "train_runtime": 18252.6627, + "train_tokens_per_second": 136438.614 + }, + { + "epoch": 0.381, + "grad_norm": 0.48073315620422363, + "learning_rate": 0.00020751683436750207, + "loss": 1.2369, + "num_input_tokens_seen": 2496921600, + "step": 38100, + "train_runtime": 18300.6025, + "train_tokens_per_second": 136439.311 + }, + { + "epoch": 0.382, + "grad_norm": 0.4134567677974701, + "learning_rate": 0.00020707693032015752, + "loss": 1.2168, + "num_input_tokens_seen": 2503475200, + "step": 38200, + "train_runtime": 18351.6848, + "train_tokens_per_second": 136416.641 + }, + { + "epoch": 0.383, + "grad_norm": 0.4675845503807068, + "learning_rate": 0.00020663645150875834, + "loss": 1.2272, + "num_input_tokens_seen": 2510028800, + "step": 38300, + "train_runtime": 18398.2852, + "train_tokens_per_second": 136427.323 + }, + { + "epoch": 0.384, + "grad_norm": 0.4632211923599243, + "learning_rate": 0.00020619540236892125, + "loss": 1.2444, + "num_input_tokens_seen": 2516582400, + "step": 38400, + "train_runtime": 18445.2271, + "train_tokens_per_second": 136435.425 + }, + { + "epoch": 0.385, + "grad_norm": 0.5543389916419983, + "learning_rate": 0.00020575378734200616, + "loss": 1.22, + "num_input_tokens_seen": 2523136000, + "step": 38500, + "train_runtime": 18492.3307, + "train_tokens_per_second": 136442.292 + }, + { + "epoch": 0.386, + "grad_norm": 0.5775281190872192, + "learning_rate": 0.0002053116108750715, + "loss": 1.2277, + "num_input_tokens_seen": 2529689600, + "step": 38600, + "train_runtime": 18544.2017, + "train_tokens_per_second": 136414.047 + }, + { + "epoch": 0.387, + "grad_norm": 0.5202789306640625, + "learning_rate": 0.0002048688774208294, + "loss": 1.2203, + "num_input_tokens_seen": 2536243200, + "step": 38700, + "train_runtime": 18591.8641, + "train_tokens_per_second": 136416.832 + }, + { + "epoch": 0.388, + "grad_norm": 0.44833704829216003, + "learning_rate": 0.0002044255914376009, + "loss": 1.2209, + "num_input_tokens_seen": 2542796800, + "step": 38800, + "train_runtime": 18637.8905, + "train_tokens_per_second": 136431.577 + }, + { + "epoch": 0.389, + "grad_norm": 0.5180789828300476, + "learning_rate": 0.00020398175738927082, + "loss": 1.2105, + "num_input_tokens_seen": 2549350400, + "step": 38900, + "train_runtime": 18684.0663, + "train_tokens_per_second": 136445.159 + }, + { + "epoch": 0.39, + "grad_norm": 0.6083468794822693, + "learning_rate": 0.00020353737974524312, + "loss": 1.2136, + "num_input_tokens_seen": 2555904000, + "step": 39000, + "train_runtime": 18730.572, + "train_tokens_per_second": 136456.27 + }, + { + "epoch": 0.391, + "grad_norm": 0.39693883061408997, + "learning_rate": 0.00020309246298039584, + "loss": 1.2285, + "num_input_tokens_seen": 2562457600, + "step": 39100, + "train_runtime": 18784.1544, + "train_tokens_per_second": 136415.914 + }, + { + "epoch": 0.392, + "grad_norm": 0.5166248679161072, + "learning_rate": 0.0002026470115750357, + "loss": 1.223, + "num_input_tokens_seen": 2569011200, + "step": 39200, + "train_runtime": 18830.687, + "train_tokens_per_second": 136426.844 + }, + { + "epoch": 0.393, + "grad_norm": 0.4967111051082611, + "learning_rate": 0.0002022010300148535, + "loss": 1.2163, + "num_input_tokens_seen": 2575564800, + "step": 39300, + "train_runtime": 18876.8963, + "train_tokens_per_second": 136440.057 + }, + { + "epoch": 0.394, + "grad_norm": 0.627816915512085, + "learning_rate": 0.0002017545227908786, + "loss": 1.2328, + "num_input_tokens_seen": 2582118400, + "step": 39400, + "train_runtime": 18923.6736, + "train_tokens_per_second": 136449.109 + }, + { + "epoch": 0.395, + "grad_norm": 0.489969938993454, + "learning_rate": 0.00020130749439943376, + "loss": 1.224, + "num_input_tokens_seen": 2588672000, + "step": 39500, + "train_runtime": 18970.0964, + "train_tokens_per_second": 136460.666 + }, + { + "epoch": 0.396, + "grad_norm": 0.6713995933532715, + "learning_rate": 0.00020085994934208998, + "loss": 1.2156, + "num_input_tokens_seen": 2595225600, + "step": 39600, + "train_runtime": 19023.1241, + "train_tokens_per_second": 136424.784 + }, + { + "epoch": 0.397, + "grad_norm": 0.4549367427825928, + "learning_rate": 0.00020041189212562094, + "loss": 1.2094, + "num_input_tokens_seen": 2601779200, + "step": 39700, + "train_runtime": 19070.6234, + "train_tokens_per_second": 136428.639 + }, + { + "epoch": 0.398, + "grad_norm": 0.47548773884773254, + "learning_rate": 0.0001999633272619579, + "loss": 1.2244, + "num_input_tokens_seen": 2608332800, + "step": 39800, + "train_runtime": 19117.4992, + "train_tokens_per_second": 136436.925 + }, + { + "epoch": 0.399, + "grad_norm": 0.46569159626960754, + "learning_rate": 0.00019951425926814404, + "loss": 1.2189, + "num_input_tokens_seen": 2614886400, + "step": 39900, + "train_runtime": 19164.3173, + "train_tokens_per_second": 136445.581 + }, + { + "epoch": 0.4, + "grad_norm": 0.5518438220024109, + "learning_rate": 0.00019906469266628904, + "loss": 1.2097, + "num_input_tokens_seen": 2621440000, + "step": 40000, + "train_runtime": 19211.1586, + "train_tokens_per_second": 136454.029 + }, + { + "epoch": 0.401, + "grad_norm": 0.4615115821361542, + "learning_rate": 0.0001986146319835236, + "loss": 1.2177, + "num_input_tokens_seen": 2627993600, + "step": 40100, + "train_runtime": 19263.5816, + "train_tokens_per_second": 136422.897 + }, + { + "epoch": 0.402, + "grad_norm": 0.4154411554336548, + "learning_rate": 0.00019816408175195383, + "loss": 1.2262, + "num_input_tokens_seen": 2634547200, + "step": 40200, + "train_runtime": 19310.6242, + "train_tokens_per_second": 136429.935 + }, + { + "epoch": 0.403, + "grad_norm": 0.48504838347435, + "learning_rate": 0.0001977130465086155, + "loss": 1.2205, + "num_input_tokens_seen": 2641100800, + "step": 40300, + "train_runtime": 19356.9428, + "train_tokens_per_second": 136442.042 + }, + { + "epoch": 0.404, + "grad_norm": 0.477006196975708, + "learning_rate": 0.0001972615307954286, + "loss": 1.2099, + "num_input_tokens_seen": 2647654400, + "step": 40400, + "train_runtime": 19403.4467, + "train_tokens_per_second": 136452.788 + }, + { + "epoch": 0.405, + "grad_norm": 0.46401214599609375, + "learning_rate": 0.00019680953915915124, + "loss": 1.2142, + "num_input_tokens_seen": 2654208000, + "step": 40500, + "train_runtime": 19456.0604, + "train_tokens_per_second": 136420.629 + }, + { + "epoch": 0.406, + "grad_norm": 0.4205267131328583, + "learning_rate": 0.00019635707615133427, + "loss": 1.2233, + "num_input_tokens_seen": 2660761600, + "step": 40600, + "train_runtime": 19503.129, + "train_tokens_per_second": 136427.422 + }, + { + "epoch": 0.407, + "grad_norm": 0.7298253178596497, + "learning_rate": 0.00019590414632827513, + "loss": 1.2143, + "num_input_tokens_seen": 2667315200, + "step": 40700, + "train_runtime": 19550.1113, + "train_tokens_per_second": 136434.783 + }, + { + "epoch": 0.408, + "grad_norm": 0.47734642028808594, + "learning_rate": 0.00019545075425097204, + "loss": 1.222, + "num_input_tokens_seen": 2673868800, + "step": 40800, + "train_runtime": 19596.9887, + "train_tokens_per_second": 136442.84 + }, + { + "epoch": 0.409, + "grad_norm": 0.4535351097583771, + "learning_rate": 0.00019499690448507827, + "loss": 1.2373, + "num_input_tokens_seen": 2680422400, + "step": 40900, + "train_runtime": 19649.1805, + "train_tokens_per_second": 136413.954 + }, + { + "epoch": 0.41, + "grad_norm": 0.572079062461853, + "learning_rate": 0.00019454260160085588, + "loss": 1.2125, + "num_input_tokens_seen": 2686976000, + "step": 41000, + "train_runtime": 19697.7854, + "train_tokens_per_second": 136410.056 + }, + { + "epoch": 0.411, + "grad_norm": 0.4487378001213074, + "learning_rate": 0.0001940878501731299, + "loss": 1.2124, + "num_input_tokens_seen": 2693529600, + "step": 41100, + "train_runtime": 19744.9135, + "train_tokens_per_second": 136416.379 + }, + { + "epoch": 0.412, + "grad_norm": 0.47419917583465576, + "learning_rate": 0.00019363265478124214, + "loss": 1.2037, + "num_input_tokens_seen": 2700083200, + "step": 41200, + "train_runtime": 19791.8314, + "train_tokens_per_second": 136424.121 + }, + { + "epoch": 0.413, + "grad_norm": 0.6295040845870972, + "learning_rate": 0.00019317702000900516, + "loss": 1.2246, + "num_input_tokens_seen": 2706636800, + "step": 41300, + "train_runtime": 19838.5236, + "train_tokens_per_second": 136433.379 + }, + { + "epoch": 0.414, + "grad_norm": 0.53326016664505, + "learning_rate": 0.000192720950444656, + "loss": 1.2192, + "num_input_tokens_seen": 2713190400, + "step": 41400, + "train_runtime": 19885.4264, + "train_tokens_per_second": 136441.147 + }, + { + "epoch": 0.415, + "grad_norm": 0.49727046489715576, + "learning_rate": 0.00019226445068081018, + "loss": 1.2279, + "num_input_tokens_seen": 2719744000, + "step": 41500, + "train_runtime": 19937.4737, + "train_tokens_per_second": 136413.672 + }, + { + "epoch": 0.416, + "grad_norm": 0.47963398694992065, + "learning_rate": 0.00019180752531441523, + "loss": 1.2226, + "num_input_tokens_seen": 2726297600, + "step": 41600, + "train_runtime": 19984.6667, + "train_tokens_per_second": 136419.468 + }, + { + "epoch": 0.417, + "grad_norm": 0.4789304733276367, + "learning_rate": 0.00019135017894670456, + "loss": 1.2222, + "num_input_tokens_seen": 2732851200, + "step": 41700, + "train_runtime": 20032.7071, + "train_tokens_per_second": 136419.465 + }, + { + "epoch": 0.418, + "grad_norm": 0.6693325638771057, + "learning_rate": 0.0001908924161831509, + "loss": 1.2366, + "num_input_tokens_seen": 2739404800, + "step": 41800, + "train_runtime": 20078.7138, + "train_tokens_per_second": 136433.281 + }, + { + "epoch": 0.419, + "grad_norm": 0.41989439725875854, + "learning_rate": 0.0001904342416334203, + "loss": 1.2212, + "num_input_tokens_seen": 2745958400, + "step": 41900, + "train_runtime": 20125.0521, + "train_tokens_per_second": 136444.785 + }, + { + "epoch": 0.42, + "grad_norm": 0.5444014072418213, + "learning_rate": 0.00018997565991132532, + "loss": 1.2164, + "num_input_tokens_seen": 2752512000, + "step": 42000, + "train_runtime": 20177.4596, + "train_tokens_per_second": 136415.191 + }, + { + "epoch": 0.421, + "grad_norm": 0.5790873169898987, + "learning_rate": 0.0001895166756347789, + "loss": 1.215, + "num_input_tokens_seen": 2759065600, + "step": 42100, + "train_runtime": 20224.878, + "train_tokens_per_second": 136419.394 + }, + { + "epoch": 0.422, + "grad_norm": 0.4666343927383423, + "learning_rate": 0.0001890572934257475, + "loss": 1.2229, + "num_input_tokens_seen": 2765619200, + "step": 42200, + "train_runtime": 20270.922, + "train_tokens_per_second": 136432.827 + }, + { + "epoch": 0.423, + "grad_norm": 0.4322357177734375, + "learning_rate": 0.00018859751791020497, + "loss": 1.2258, + "num_input_tokens_seen": 2772172800, + "step": 42300, + "train_runtime": 20317.4494, + "train_tokens_per_second": 136442.954 + }, + { + "epoch": 0.424, + "grad_norm": 0.6240208148956299, + "learning_rate": 0.0001881373537180856, + "loss": 1.221, + "num_input_tokens_seen": 2778726400, + "step": 42400, + "train_runtime": 20364.5753, + "train_tokens_per_second": 136449.023 + }, + { + "epoch": 0.425, + "grad_norm": 0.5865579843521118, + "learning_rate": 0.00018767680548323766, + "loss": 1.2244, + "num_input_tokens_seen": 2785280000, + "step": 42500, + "train_runtime": 20417.9029, + "train_tokens_per_second": 136413.617 + }, + { + "epoch": 0.426, + "grad_norm": 0.5201649069786072, + "learning_rate": 0.0001872158778433768, + "loss": 1.2076, + "num_input_tokens_seen": 2791833600, + "step": 42600, + "train_runtime": 20464.7135, + "train_tokens_per_second": 136421.827 + }, + { + "epoch": 0.427, + "grad_norm": 0.5092735290527344, + "learning_rate": 0.0001867545754400392, + "loss": 1.2057, + "num_input_tokens_seen": 2798387200, + "step": 42700, + "train_runtime": 20511.0273, + "train_tokens_per_second": 136433.303 + }, + { + "epoch": 0.428, + "grad_norm": 0.4439486265182495, + "learning_rate": 0.000186292902918535, + "loss": 1.209, + "num_input_tokens_seen": 2804940800, + "step": 42800, + "train_runtime": 20558.3684, + "train_tokens_per_second": 136437.909 + }, + { + "epoch": 0.429, + "grad_norm": 0.4466177225112915, + "learning_rate": 0.00018583086492790136, + "loss": 1.218, + "num_input_tokens_seen": 2811494400, + "step": 42900, + "train_runtime": 20605.5543, + "train_tokens_per_second": 136443.522 + }, + { + "epoch": 0.43, + "grad_norm": 0.5813594460487366, + "learning_rate": 0.00018536846612085566, + "loss": 1.2161, + "num_input_tokens_seen": 2818048000, + "step": 43000, + "train_runtime": 20658.6134, + "train_tokens_per_second": 136410.317 + }, + { + "epoch": 0.431, + "grad_norm": 0.49140629172325134, + "learning_rate": 0.00018490571115374878, + "loss": 1.227, + "num_input_tokens_seen": 2824601600, + "step": 43100, + "train_runtime": 20705.6255, + "train_tokens_per_second": 136417.11 + }, + { + "epoch": 0.432, + "grad_norm": 0.4938826858997345, + "learning_rate": 0.00018444260468651816, + "loss": 1.2252, + "num_input_tokens_seen": 2831155200, + "step": 43200, + "train_runtime": 20752.3571, + "train_tokens_per_second": 136425.717 + }, + { + "epoch": 0.433, + "grad_norm": 0.5228791832923889, + "learning_rate": 0.00018397915138264068, + "loss": 1.2274, + "num_input_tokens_seen": 2837708800, + "step": 43300, + "train_runtime": 20799.4436, + "train_tokens_per_second": 136431.957 + }, + { + "epoch": 0.434, + "grad_norm": 0.46896296739578247, + "learning_rate": 0.00018351535590908606, + "loss": 1.2043, + "num_input_tokens_seen": 2844262400, + "step": 43400, + "train_runtime": 20845.6184, + "train_tokens_per_second": 136444.137 + }, + { + "epoch": 0.435, + "grad_norm": 0.4269004464149475, + "learning_rate": 0.00018305122293626948, + "loss": 1.2213, + "num_input_tokens_seen": 2850816000, + "step": 43500, + "train_runtime": 20897.7485, + "train_tokens_per_second": 136417.375 + }, + { + "epoch": 0.436, + "grad_norm": 0.6213890314102173, + "learning_rate": 0.00018258675713800492, + "loss": 1.2096, + "num_input_tokens_seen": 2857369600, + "step": 43600, + "train_runtime": 20944.9642, + "train_tokens_per_second": 136422.749 + }, + { + "epoch": 0.437, + "grad_norm": 0.4281384348869324, + "learning_rate": 0.00018212196319145773, + "loss": 1.2111, + "num_input_tokens_seen": 2863923200, + "step": 43700, + "train_runtime": 20992.0443, + "train_tokens_per_second": 136428.98 + }, + { + "epoch": 0.438, + "grad_norm": 1.044310212135315, + "learning_rate": 0.00018165684577709778, + "loss": 1.2142, + "num_input_tokens_seen": 2870476800, + "step": 43800, + "train_runtime": 21039.718, + "train_tokens_per_second": 136431.334 + }, + { + "epoch": 0.439, + "grad_norm": 0.445425808429718, + "learning_rate": 0.0001811914095786524, + "loss": 1.218, + "num_input_tokens_seen": 2877030400, + "step": 43900, + "train_runtime": 21088.215, + "train_tokens_per_second": 136428.351 + }, + { + "epoch": 0.44, + "grad_norm": 0.43947216868400574, + "learning_rate": 0.0001807256592830588, + "loss": 1.2124, + "num_input_tokens_seen": 2883584000, + "step": 44000, + "train_runtime": 21136.0286, + "train_tokens_per_second": 136429.793 + }, + { + "epoch": 0.441, + "grad_norm": 0.5147203803062439, + "learning_rate": 0.00018025959958041732, + "loss": 1.2227, + "num_input_tokens_seen": 2890137600, + "step": 44100, + "train_runtime": 21182.9913, + "train_tokens_per_second": 136436.708 + }, + { + "epoch": 0.442, + "grad_norm": 0.473652184009552, + "learning_rate": 0.00017979323516394407, + "loss": 1.2277, + "num_input_tokens_seen": 2896691200, + "step": 44200, + "train_runtime": 21236.5796, + "train_tokens_per_second": 136401.024 + }, + { + "epoch": 0.443, + "grad_norm": 0.4356568157672882, + "learning_rate": 0.00017932657072992344, + "loss": 1.2018, + "num_input_tokens_seen": 2903244800, + "step": 44300, + "train_runtime": 21282.9387, + "train_tokens_per_second": 136411.838 + }, + { + "epoch": 0.444, + "grad_norm": 0.4458017647266388, + "learning_rate": 0.00017885961097766117, + "loss": 1.2124, + "num_input_tokens_seen": 2909798400, + "step": 44400, + "train_runtime": 21331.1223, + "train_tokens_per_second": 136410.938 + }, + { + "epoch": 0.445, + "grad_norm": 0.5065773725509644, + "learning_rate": 0.00017839236060943674, + "loss": 1.2262, + "num_input_tokens_seen": 2916352000, + "step": 44500, + "train_runtime": 21377.5493, + "train_tokens_per_second": 136421.25 + }, + { + "epoch": 0.446, + "grad_norm": 0.5424425601959229, + "learning_rate": 0.0001779248243304562, + "loss": 1.2171, + "num_input_tokens_seen": 2922905600, + "step": 44600, + "train_runtime": 21424.9021, + "train_tokens_per_second": 136425.622 + }, + { + "epoch": 0.447, + "grad_norm": 0.4595748484134674, + "learning_rate": 0.00017745700684880465, + "loss": 1.2039, + "num_input_tokens_seen": 2929459200, + "step": 44700, + "train_runtime": 21472.2167, + "train_tokens_per_second": 136430.218 + }, + { + "epoch": 0.448, + "grad_norm": 0.5353960990905762, + "learning_rate": 0.000176988912875399, + "loss": 1.2075, + "num_input_tokens_seen": 2936012800, + "step": 44800, + "train_runtime": 21524.5148, + "train_tokens_per_second": 136403.205 + }, + { + "epoch": 0.449, + "grad_norm": 0.4949302673339844, + "learning_rate": 0.00017652054712394028, + "loss": 1.2174, + "num_input_tokens_seen": 2942566400, + "step": 44900, + "train_runtime": 21571.6626, + "train_tokens_per_second": 136408.883 + }, + { + "epoch": 0.45, + "grad_norm": 0.5596060752868652, + "learning_rate": 0.0001760519143108665, + "loss": 1.2178, + "num_input_tokens_seen": 2949120000, + "step": 45000, + "train_runtime": 21618.3195, + "train_tokens_per_second": 136417.634 + }, + { + "epoch": 0.451, + "grad_norm": 0.5348083972930908, + "learning_rate": 0.00017558301915530483, + "loss": 1.215, + "num_input_tokens_seen": 2955673600, + "step": 45100, + "train_runtime": 21666.1069, + "train_tokens_per_second": 136419.229 + }, + { + "epoch": 0.452, + "grad_norm": 0.46748441457748413, + "learning_rate": 0.00017511386637902428, + "loss": 1.2104, + "num_input_tokens_seen": 2962227200, + "step": 45200, + "train_runtime": 21713.1957, + "train_tokens_per_second": 136425.206 + }, + { + "epoch": 0.453, + "grad_norm": 0.47188806533813477, + "learning_rate": 0.00017464446070638814, + "loss": 1.213, + "num_input_tokens_seen": 2968780800, + "step": 45300, + "train_runtime": 21760.1393, + "train_tokens_per_second": 136432.068 + }, + { + "epoch": 0.454, + "grad_norm": 0.5225762128829956, + "learning_rate": 0.00017417480686430622, + "loss": 1.2152, + "num_input_tokens_seen": 2975334400, + "step": 45400, + "train_runtime": 21812.7666, + "train_tokens_per_second": 136403.348 + }, + { + "epoch": 0.455, + "grad_norm": 0.5889186263084412, + "learning_rate": 0.00017370490958218765, + "loss": 1.2214, + "num_input_tokens_seen": 2981888000, + "step": 45500, + "train_runtime": 21859.0263, + "train_tokens_per_second": 136414.493 + }, + { + "epoch": 0.456, + "grad_norm": 0.6613258719444275, + "learning_rate": 0.00017323477359189272, + "loss": 1.2334, + "num_input_tokens_seen": 2988441600, + "step": 45600, + "train_runtime": 21905.9003, + "train_tokens_per_second": 136421.766 + }, + { + "epoch": 0.457, + "grad_norm": 0.4657646715641022, + "learning_rate": 0.00017276440362768564, + "loss": 1.2132, + "num_input_tokens_seen": 2994995200, + "step": 45700, + "train_runtime": 21952.9851, + "train_tokens_per_second": 136427.697 + }, + { + "epoch": 0.458, + "grad_norm": 0.8410550355911255, + "learning_rate": 0.0001722938044261868, + "loss": 1.2073, + "num_input_tokens_seen": 3001548800, + "step": 45800, + "train_runtime": 22005.352, + "train_tokens_per_second": 136400.854 + }, + { + "epoch": 0.459, + "grad_norm": 0.7687750458717346, + "learning_rate": 0.0001718229807263249, + "loss": 1.2116, + "num_input_tokens_seen": 3008102400, + "step": 45900, + "train_runtime": 22051.2762, + "train_tokens_per_second": 136413.982 + }, + { + "epoch": 0.46, + "grad_norm": 0.40700653195381165, + "learning_rate": 0.0001713519372692894, + "loss": 1.2082, + "num_input_tokens_seen": 3014656000, + "step": 46000, + "train_runtime": 22102.8898, + "train_tokens_per_second": 136391.939 + }, + { + "epoch": 0.461, + "grad_norm": 0.44239944219589233, + "learning_rate": 0.0001708806787984826, + "loss": 1.2177, + "num_input_tokens_seen": 3021209600, + "step": 46100, + "train_runtime": 22149.1222, + "train_tokens_per_second": 136403.13 + }, + { + "epoch": 0.462, + "grad_norm": 0.4981868267059326, + "learning_rate": 0.00017040921005947212, + "loss": 1.2073, + "num_input_tokens_seen": 3027763200, + "step": 46200, + "train_runtime": 22195.5009, + "train_tokens_per_second": 136413.376 + }, + { + "epoch": 0.463, + "grad_norm": 0.5651112198829651, + "learning_rate": 0.0001699375357999429, + "loss": 1.2098, + "num_input_tokens_seen": 3034316800, + "step": 46300, + "train_runtime": 22241.367, + "train_tokens_per_second": 136426.722 + }, + { + "epoch": 0.464, + "grad_norm": 1.1314237117767334, + "learning_rate": 0.0001694656607696496, + "loss": 1.2335, + "num_input_tokens_seen": 3040870400, + "step": 46400, + "train_runtime": 22294.8896, + "train_tokens_per_second": 136393.158 + }, + { + "epoch": 0.465, + "grad_norm": 0.568980872631073, + "learning_rate": 0.0001689935897203684, + "loss": 1.2096, + "num_input_tokens_seen": 3047424000, + "step": 46500, + "train_runtime": 22342.7849, + "train_tokens_per_second": 136394.098 + }, + { + "epoch": 0.466, + "grad_norm": 0.7110226154327393, + "learning_rate": 0.0001685213274058496, + "loss": 1.2136, + "num_input_tokens_seen": 3053977600, + "step": 46600, + "train_runtime": 22393.3193, + "train_tokens_per_second": 136378.96 + }, + { + "epoch": 0.467, + "grad_norm": 0.5052018761634827, + "learning_rate": 0.00016804887858176944, + "loss": 1.2237, + "num_input_tokens_seen": 3060531200, + "step": 46700, + "train_runtime": 22441.2606, + "train_tokens_per_second": 136379.647 + }, + { + "epoch": 0.468, + "grad_norm": 0.4663156270980835, + "learning_rate": 0.00016757624800568238, + "loss": 1.2071, + "num_input_tokens_seen": 3067084800, + "step": 46800, + "train_runtime": 22487.9084, + "train_tokens_per_second": 136388.175 + }, + { + "epoch": 0.469, + "grad_norm": 0.5441033840179443, + "learning_rate": 0.00016710344043697301, + "loss": 1.2078, + "num_input_tokens_seen": 3073638400, + "step": 46900, + "train_runtime": 22534.6023, + "train_tokens_per_second": 136396.39 + }, + { + "epoch": 0.47, + "grad_norm": 0.4578142464160919, + "learning_rate": 0.0001666304606368083, + "loss": 1.1956, + "num_input_tokens_seen": 3080192000, + "step": 47000, + "train_runtime": 22587.0441, + "train_tokens_per_second": 136369.858 + }, + { + "epoch": 0.471, + "grad_norm": 0.6252749562263489, + "learning_rate": 0.00016615731336808962, + "loss": 1.1911, + "num_input_tokens_seen": 3086745600, + "step": 47100, + "train_runtime": 22634.7186, + "train_tokens_per_second": 136372.166 + }, + { + "epoch": 0.472, + "grad_norm": 0.45418813824653625, + "learning_rate": 0.0001656840033954047, + "loss": 1.22, + "num_input_tokens_seen": 3093299200, + "step": 47200, + "train_runtime": 22681.221, + "train_tokens_per_second": 136381.511 + }, + { + "epoch": 0.473, + "grad_norm": 0.55946284532547, + "learning_rate": 0.00016521053548497973, + "loss": 1.2073, + "num_input_tokens_seen": 3099852800, + "step": 47300, + "train_runtime": 22728.7635, + "train_tokens_per_second": 136384.577 + }, + { + "epoch": 0.474, + "grad_norm": 0.508859395980835, + "learning_rate": 0.0001647369144046313, + "loss": 1.1957, + "num_input_tokens_seen": 3106406400, + "step": 47400, + "train_runtime": 22775.8652, + "train_tokens_per_second": 136390.27 + }, + { + "epoch": 0.475, + "grad_norm": 0.5557622313499451, + "learning_rate": 0.00016426314492371842, + "loss": 1.1996, + "num_input_tokens_seen": 3112960000, + "step": 47500, + "train_runtime": 22823.5391, + "train_tokens_per_second": 136392.519 + }, + { + "epoch": 0.476, + "grad_norm": 0.5686858296394348, + "learning_rate": 0.0001637892318130945, + "loss": 1.201, + "num_input_tokens_seen": 3119513600, + "step": 47600, + "train_runtime": 22875.0526, + "train_tokens_per_second": 136371.866 + }, + { + "epoch": 0.477, + "grad_norm": 0.47568413615226746, + "learning_rate": 0.00016331517984505934, + "loss": 1.2132, + "num_input_tokens_seen": 3126067200, + "step": 47700, + "train_runtime": 22923.2754, + "train_tokens_per_second": 136370.878 + }, + { + "epoch": 0.478, + "grad_norm": 0.40612325072288513, + "learning_rate": 0.00016284099379331092, + "loss": 1.2085, + "num_input_tokens_seen": 3132620800, + "step": 47800, + "train_runtime": 22970.2831, + "train_tokens_per_second": 136377.109 + }, + { + "epoch": 0.479, + "grad_norm": 0.491755872964859, + "learning_rate": 0.00016236667843289759, + "loss": 1.206, + "num_input_tokens_seen": 3139174400, + "step": 47900, + "train_runtime": 23016.8676, + "train_tokens_per_second": 136385.821 + }, + { + "epoch": 0.48, + "grad_norm": 1.2421867847442627, + "learning_rate": 0.00016189223854016973, + "loss": 1.1991, + "num_input_tokens_seen": 3145728000, + "step": 48000, + "train_runtime": 23070.1067, + "train_tokens_per_second": 136355.156 + }, + { + "epoch": 0.481, + "grad_norm": 0.44709935784339905, + "learning_rate": 0.00016141767889273182, + "loss": 1.1987, + "num_input_tokens_seen": 3152281600, + "step": 48100, + "train_runtime": 23117.6704, + "train_tokens_per_second": 136358.1 + }, + { + "epoch": 0.482, + "grad_norm": 0.6956078410148621, + "learning_rate": 0.00016094300426939417, + "loss": 1.206, + "num_input_tokens_seen": 3158835200, + "step": 48200, + "train_runtime": 23164.6084, + "train_tokens_per_second": 136364.714 + }, + { + "epoch": 0.483, + "grad_norm": 0.4756148159503937, + "learning_rate": 0.00016046821945012505, + "loss": 1.213, + "num_input_tokens_seen": 3165388800, + "step": 48300, + "train_runtime": 23212.4256, + "train_tokens_per_second": 136366.137 + }, + { + "epoch": 0.484, + "grad_norm": 0.4668136239051819, + "learning_rate": 0.00015999332921600226, + "loss": 1.2027, + "num_input_tokens_seen": 3171942400, + "step": 48400, + "train_runtime": 23260.1957, + "train_tokens_per_second": 136367.829 + }, + { + "epoch": 0.485, + "grad_norm": 0.48166415095329285, + "learning_rate": 0.00015951833834916532, + "loss": 1.1885, + "num_input_tokens_seen": 3178496000, + "step": 48500, + "train_runtime": 23308.4042, + "train_tokens_per_second": 136366.951 + }, + { + "epoch": 0.486, + "grad_norm": 1.4835230112075806, + "learning_rate": 0.00015904325163276672, + "loss": 1.2144, + "num_input_tokens_seen": 3185049600, + "step": 48600, + "train_runtime": 23355.0119, + "train_tokens_per_second": 136375.422 + }, + { + "epoch": 0.487, + "grad_norm": 0.47993043065071106, + "learning_rate": 0.00015856807385092466, + "loss": 1.2092, + "num_input_tokens_seen": 3191603200, + "step": 48700, + "train_runtime": 23408.2289, + "train_tokens_per_second": 136345.352 + }, + { + "epoch": 0.488, + "grad_norm": 0.4617721736431122, + "learning_rate": 0.00015809280978867405, + "loss": 1.2079, + "num_input_tokens_seen": 3198156800, + "step": 48800, + "train_runtime": 23456.3091, + "train_tokens_per_second": 136345.27 + }, + { + "epoch": 0.489, + "grad_norm": 0.4698822796344757, + "learning_rate": 0.0001576174642319187, + "loss": 1.2221, + "num_input_tokens_seen": 3204710400, + "step": 48900, + "train_runtime": 23502.92, + "train_tokens_per_second": 136353.713 + }, + { + "epoch": 0.49, + "grad_norm": 0.5454009771347046, + "learning_rate": 0.0001571420419673831, + "loss": 1.201, + "num_input_tokens_seen": 3211264000, + "step": 49000, + "train_runtime": 23550.5868, + "train_tokens_per_second": 136356.008 + }, + { + "epoch": 0.491, + "grad_norm": 0.9021556973457336, + "learning_rate": 0.0001566665477825642, + "loss": 1.2047, + "num_input_tokens_seen": 3217817600, + "step": 49100, + "train_runtime": 23597.4655, + "train_tokens_per_second": 136362.848 + }, + { + "epoch": 0.492, + "grad_norm": 0.4959240257740021, + "learning_rate": 0.0001561909864656831, + "loss": 1.2042, + "num_input_tokens_seen": 3224371200, + "step": 49200, + "train_runtime": 23650.6048, + "train_tokens_per_second": 136333.562 + }, + { + "epoch": 0.493, + "grad_norm": 0.554251492023468, + "learning_rate": 0.00015571536280563705, + "loss": 1.2163, + "num_input_tokens_seen": 3230924800, + "step": 49300, + "train_runtime": 23697.3685, + "train_tokens_per_second": 136341.079 + }, + { + "epoch": 0.494, + "grad_norm": 0.5000952482223511, + "learning_rate": 0.000155239681591951, + "loss": 1.2086, + "num_input_tokens_seen": 3237478400, + "step": 49400, + "train_runtime": 23745.4988, + "train_tokens_per_second": 136340.72 + }, + { + "epoch": 0.495, + "grad_norm": 0.7438832521438599, + "learning_rate": 0.00015476394761472953, + "loss": 1.1999, + "num_input_tokens_seen": 3244032000, + "step": 49500, + "train_runtime": 23793.3349, + "train_tokens_per_second": 136342.048 + }, + { + "epoch": 0.496, + "grad_norm": 0.4872761368751526, + "learning_rate": 0.00015428816566460843, + "loss": 1.194, + "num_input_tokens_seen": 3250585600, + "step": 49600, + "train_runtime": 23839.649, + "train_tokens_per_second": 136352.074 + }, + { + "epoch": 0.497, + "grad_norm": 0.48635321855545044, + "learning_rate": 0.00015381234053270669, + "loss": 1.1957, + "num_input_tokens_seen": 3257139200, + "step": 49700, + "train_runtime": 23886.4418, + "train_tokens_per_second": 136359.33 + }, + { + "epoch": 0.498, + "grad_norm": 0.899361252784729, + "learning_rate": 0.0001533364770105781, + "loss": 1.201, + "num_input_tokens_seen": 3263692800, + "step": 49800, + "train_runtime": 23933.6337, + "train_tokens_per_second": 136364.283 + }, + { + "epoch": 0.499, + "grad_norm": 0.5460925698280334, + "learning_rate": 0.0001528605798901631, + "loss": 1.2086, + "num_input_tokens_seen": 3270246400, + "step": 49900, + "train_runtime": 23985.6033, + "train_tokens_per_second": 136342.053 + }, + { + "epoch": 0.5, + "grad_norm": 0.4763907194137573, + "learning_rate": 0.00015238465396374027, + "loss": 1.1987, + "num_input_tokens_seen": 3276800000, + "step": 50000, + "train_runtime": 24033.0829, + "train_tokens_per_second": 136345.388 + }, + { + "epoch": 0.501, + "grad_norm": 0.4716530442237854, + "learning_rate": 0.00015190870402387858, + "loss": 1.2083, + "num_input_tokens_seen": 3283353600, + "step": 50100, + "train_runtime": 24080.0017, + "train_tokens_per_second": 136351.884 + }, + { + "epoch": 0.502, + "grad_norm": 0.65655517578125, + "learning_rate": 0.00015143273486338857, + "loss": 1.2026, + "num_input_tokens_seen": 3289907200, + "step": 50200, + "train_runtime": 24132.759, + "train_tokens_per_second": 136325.366 + }, + { + "epoch": 0.503, + "grad_norm": 0.494205117225647, + "learning_rate": 0.00015095675127527438, + "loss": 1.208, + "num_input_tokens_seen": 3296460800, + "step": 50300, + "train_runtime": 24179.9126, + "train_tokens_per_second": 136330.551 + }, + { + "epoch": 0.504, + "grad_norm": 0.485307902097702, + "learning_rate": 0.00015048075805268547, + "loss": 1.1965, + "num_input_tokens_seen": 3303014400, + "step": 50400, + "train_runtime": 24227.2685, + "train_tokens_per_second": 136334.577 + }, + { + "epoch": 0.505, + "grad_norm": 0.4843132793903351, + "learning_rate": 0.00015000475998886825, + "loss": 1.2028, + "num_input_tokens_seen": 3309568000, + "step": 50500, + "train_runtime": 24274.7092, + "train_tokens_per_second": 136338.111 + }, + { + "epoch": 0.506, + "grad_norm": 0.4654887020587921, + "learning_rate": 0.00014952876187711804, + "loss": 1.2151, + "num_input_tokens_seen": 3316121600, + "step": 50600, + "train_runtime": 24321.273, + "train_tokens_per_second": 136346.547 + }, + { + "epoch": 0.507, + "grad_norm": 0.4625457525253296, + "learning_rate": 0.00014905276851073053, + "loss": 1.209, + "num_input_tokens_seen": 3322675200, + "step": 50700, + "train_runtime": 24374.7609, + "train_tokens_per_second": 136316.217 + }, + { + "epoch": 0.508, + "grad_norm": 0.527594268321991, + "learning_rate": 0.00014857678468295352, + "loss": 1.2043, + "num_input_tokens_seen": 3329228800, + "step": 50800, + "train_runtime": 24422.59, + "train_tokens_per_second": 136317.598 + }, + { + "epoch": 0.509, + "grad_norm": 0.4604775011539459, + "learning_rate": 0.00014810081518693902, + "loss": 1.1895, + "num_input_tokens_seen": 3335782400, + "step": 50900, + "train_runtime": 24468.7673, + "train_tokens_per_second": 136328.176 + }, + { + "epoch": 0.51, + "grad_norm": 0.4973219335079193, + "learning_rate": 0.0001476248648156945, + "loss": 1.1977, + "num_input_tokens_seen": 3342336000, + "step": 51000, + "train_runtime": 24516.9703, + "train_tokens_per_second": 136327.448 + }, + { + "epoch": 0.511, + "grad_norm": 0.42552006244659424, + "learning_rate": 0.00014714893836203485, + "loss": 1.2109, + "num_input_tokens_seen": 3348889600, + "step": 51100, + "train_runtime": 24564.5614, + "train_tokens_per_second": 136330.12 + }, + { + "epoch": 0.512, + "grad_norm": 0.5027197003364563, + "learning_rate": 0.0001466730406185343, + "loss": 1.1949, + "num_input_tokens_seen": 3355443200, + "step": 51200, + "train_runtime": 24611.9784, + "train_tokens_per_second": 136333.745 + }, + { + "epoch": 0.513, + "grad_norm": 0.6097121238708496, + "learning_rate": 0.0001461971763774778, + "loss": 1.2, + "num_input_tokens_seen": 3361996800, + "step": 51300, + "train_runtime": 24665.0046, + "train_tokens_per_second": 136306.352 + }, + { + "epoch": 0.514, + "grad_norm": 0.9953346848487854, + "learning_rate": 0.0001457213504308129, + "loss": 1.1919, + "num_input_tokens_seen": 3368550400, + "step": 51400, + "train_runtime": 24711.3817, + "train_tokens_per_second": 136315.745 + }, + { + "epoch": 0.515, + "grad_norm": 0.5582478642463684, + "learning_rate": 0.00014524556757010177, + "loss": 1.1924, + "num_input_tokens_seen": 3375104000, + "step": 51500, + "train_runtime": 24758.0554, + "train_tokens_per_second": 136323.469 + }, + { + "epoch": 0.516, + "grad_norm": 0.5084798336029053, + "learning_rate": 0.00014476983258647234, + "loss": 1.2068, + "num_input_tokens_seen": 3381657600, + "step": 51600, + "train_runtime": 24807.6959, + "train_tokens_per_second": 136314.86 + }, + { + "epoch": 0.517, + "grad_norm": 0.6907379627227783, + "learning_rate": 0.0001442941502705707, + "loss": 1.1945, + "num_input_tokens_seen": 3388211200, + "step": 51700, + "train_runtime": 24855.3849, + "train_tokens_per_second": 136316.988 + }, + { + "epoch": 0.518, + "grad_norm": 0.6037150025367737, + "learning_rate": 0.0001438185254125125, + "loss": 1.2053, + "num_input_tokens_seen": 3394764800, + "step": 51800, + "train_runtime": 24901.8712, + "train_tokens_per_second": 136325.691 + }, + { + "epoch": 0.519, + "grad_norm": 0.6816796064376831, + "learning_rate": 0.00014334296280183473, + "loss": 1.2019, + "num_input_tokens_seen": 3401318400, + "step": 51900, + "train_runtime": 24955.4949, + "train_tokens_per_second": 136295.37 + }, + { + "epoch": 0.52, + "grad_norm": 0.5201036930084229, + "learning_rate": 0.00014286746722744768, + "loss": 1.206, + "num_input_tokens_seen": 3407872000, + "step": 52000, + "train_runtime": 25002.9753, + "train_tokens_per_second": 136298.659 + }, + { + "epoch": 0.521, + "grad_norm": 0.5104642510414124, + "learning_rate": 0.00014239204347758647, + "loss": 1.2029, + "num_input_tokens_seen": 3414425600, + "step": 52100, + "train_runtime": 25051.9745, + "train_tokens_per_second": 136293.672 + }, + { + "epoch": 0.522, + "grad_norm": 0.4965505003929138, + "learning_rate": 0.00014191669633976294, + "loss": 1.1961, + "num_input_tokens_seen": 3420979200, + "step": 52200, + "train_runtime": 25099.3949, + "train_tokens_per_second": 136297.278 + }, + { + "epoch": 0.523, + "grad_norm": 0.5390327572822571, + "learning_rate": 0.00014144143060071756, + "loss": 1.194, + "num_input_tokens_seen": 3427532800, + "step": 52300, + "train_runtime": 25146.6291, + "train_tokens_per_second": 136301.879 + }, + { + "epoch": 0.524, + "grad_norm": 2.647089719772339, + "learning_rate": 0.000140966251046371, + "loss": 1.2006, + "num_input_tokens_seen": 3434086400, + "step": 52400, + "train_runtime": 25194.2742, + "train_tokens_per_second": 136304.24 + }, + { + "epoch": 0.525, + "grad_norm": 0.46030643582344055, + "learning_rate": 0.0001404911624617761, + "loss": 1.2071, + "num_input_tokens_seen": 3440640000, + "step": 52500, + "train_runtime": 25247.7567, + "train_tokens_per_second": 136275.077 + }, + { + "epoch": 0.526, + "grad_norm": 0.487699031829834, + "learning_rate": 0.00014001616963106966, + "loss": 1.2046, + "num_input_tokens_seen": 3447193600, + "step": 52600, + "train_runtime": 25295.5054, + "train_tokens_per_second": 136276.921 + }, + { + "epoch": 0.527, + "grad_norm": 0.4782906472682953, + "learning_rate": 0.00013954127733742416, + "loss": 1.1891, + "num_input_tokens_seen": 3453747200, + "step": 52700, + "train_runtime": 25344.1317, + "train_tokens_per_second": 136274.039 + }, + { + "epoch": 0.528, + "grad_norm": 0.595632016658783, + "learning_rate": 0.0001390664903629998, + "loss": 1.1867, + "num_input_tokens_seen": 3460300800, + "step": 52800, + "train_runtime": 25391.6777, + "train_tokens_per_second": 136276.966 + }, + { + "epoch": 0.529, + "grad_norm": 0.5201537609100342, + "learning_rate": 0.0001385918134888961, + "loss": 1.1955, + "num_input_tokens_seen": 3466854400, + "step": 52900, + "train_runtime": 25439.3874, + "train_tokens_per_second": 136279.005 + }, + { + "epoch": 0.53, + "grad_norm": 0.4726644456386566, + "learning_rate": 0.00013811725149510387, + "loss": 1.206, + "num_input_tokens_seen": 3473408000, + "step": 53000, + "train_runtime": 25492.0415, + "train_tokens_per_second": 136254.603 + }, + { + "epoch": 0.531, + "grad_norm": 0.5846008062362671, + "learning_rate": 0.0001376428091604572, + "loss": 1.2117, + "num_input_tokens_seen": 3479961600, + "step": 53100, + "train_runtime": 25540.3083, + "train_tokens_per_second": 136253.704 + }, + { + "epoch": 0.532, + "grad_norm": 0.4758647382259369, + "learning_rate": 0.00013716849126258512, + "loss": 1.2042, + "num_input_tokens_seen": 3486515200, + "step": 53200, + "train_runtime": 25589.0853, + "train_tokens_per_second": 136250.091 + }, + { + "epoch": 0.533, + "grad_norm": 0.4607105255126953, + "learning_rate": 0.00013669430257786354, + "loss": 1.1992, + "num_input_tokens_seen": 3493068800, + "step": 53300, + "train_runtime": 25636.4376, + "train_tokens_per_second": 136254.063 + }, + { + "epoch": 0.534, + "grad_norm": 0.6885077357292175, + "learning_rate": 0.00013622024788136728, + "loss": 1.2006, + "num_input_tokens_seen": 3499622400, + "step": 53400, + "train_runtime": 25684.4816, + "train_tokens_per_second": 136254.352 + }, + { + "epoch": 0.535, + "grad_norm": 0.6578366160392761, + "learning_rate": 0.00013574633194682185, + "loss": 1.1948, + "num_input_tokens_seen": 3506176000, + "step": 53500, + "train_runtime": 25730.7322, + "train_tokens_per_second": 136264.136 + }, + { + "epoch": 0.536, + "grad_norm": 0.4718693196773529, + "learning_rate": 0.0001352725595465555, + "loss": 1.2, + "num_input_tokens_seen": 3512729600, + "step": 53600, + "train_runtime": 25783.9922, + "train_tokens_per_second": 136236.839 + }, + { + "epoch": 0.537, + "grad_norm": 0.5561531186103821, + "learning_rate": 0.000134798935451451, + "loss": 1.2052, + "num_input_tokens_seen": 3519283200, + "step": 53700, + "train_runtime": 25832.8858, + "train_tokens_per_second": 136232.677 + }, + { + "epoch": 0.538, + "grad_norm": 0.5250628590583801, + "learning_rate": 0.00013432546443089768, + "loss": 1.2, + "num_input_tokens_seen": 3525836800, + "step": 53800, + "train_runtime": 25880.0084, + "train_tokens_per_second": 136237.854 + }, + { + "epoch": 0.539, + "grad_norm": 0.5457636117935181, + "learning_rate": 0.0001338521512527436, + "loss": 1.1944, + "num_input_tokens_seen": 3532390400, + "step": 53900, + "train_runtime": 25927.8228, + "train_tokens_per_second": 136239.376 + }, + { + "epoch": 0.54, + "grad_norm": 0.4437522292137146, + "learning_rate": 0.00013337900068324712, + "loss": 1.1912, + "num_input_tokens_seen": 3538944000, + "step": 54000, + "train_runtime": 25975.6777, + "train_tokens_per_second": 136240.68 + }, + { + "epoch": 0.541, + "grad_norm": 0.5343025326728821, + "learning_rate": 0.00013290601748702918, + "loss": 1.188, + "num_input_tokens_seen": 3545497600, + "step": 54100, + "train_runtime": 26027.6243, + "train_tokens_per_second": 136220.562 + }, + { + "epoch": 0.542, + "grad_norm": 0.4907335042953491, + "learning_rate": 0.00013243320642702543, + "loss": 1.1909, + "num_input_tokens_seen": 3552051200, + "step": 54200, + "train_runtime": 26075.5648, + "train_tokens_per_second": 136221.448 + }, + { + "epoch": 0.543, + "grad_norm": 0.7268043160438538, + "learning_rate": 0.0001319605722644379, + "loss": 1.1911, + "num_input_tokens_seen": 3558604800, + "step": 54300, + "train_runtime": 26122.2114, + "train_tokens_per_second": 136229.079 + }, + { + "epoch": 0.544, + "grad_norm": 1.3769776821136475, + "learning_rate": 0.0001314881197586874, + "loss": 1.224, + "num_input_tokens_seen": 3565158400, + "step": 54400, + "train_runtime": 26170.2324, + "train_tokens_per_second": 136229.528 + }, + { + "epoch": 0.545, + "grad_norm": 0.7141419649124146, + "learning_rate": 0.0001310158536673654, + "loss": 1.2025, + "num_input_tokens_seen": 3571712000, + "step": 54500, + "train_runtime": 26217.6992, + "train_tokens_per_second": 136232.854 + }, + { + "epoch": 0.546, + "grad_norm": 0.5124280452728271, + "learning_rate": 0.0001305437787461862, + "loss": 1.1972, + "num_input_tokens_seen": 3578265600, + "step": 54600, + "train_runtime": 26264.9719, + "train_tokens_per_second": 136237.176 + }, + { + "epoch": 0.547, + "grad_norm": 0.5609524250030518, + "learning_rate": 0.00013007189974893903, + "loss": 1.1924, + "num_input_tokens_seen": 3584819200, + "step": 54700, + "train_runtime": 26319.2824, + "train_tokens_per_second": 136205.051 + }, + { + "epoch": 0.548, + "grad_norm": 0.5220986604690552, + "learning_rate": 0.00012960022142744016, + "loss": 1.188, + "num_input_tokens_seen": 3591372800, + "step": 54800, + "train_runtime": 26367.119, + "train_tokens_per_second": 136206.493 + }, + { + "epoch": 0.549, + "grad_norm": 0.5159165263175964, + "learning_rate": 0.00012912874853148506, + "loss": 1.1891, + "num_input_tokens_seen": 3597926400, + "step": 54900, + "train_runtime": 26415.2651, + "train_tokens_per_second": 136206.333 + }, + { + "epoch": 0.55, + "grad_norm": 0.5019519925117493, + "learning_rate": 0.00012865748580880053, + "loss": 1.1827, + "num_input_tokens_seen": 3604480000, + "step": 55000, + "train_runtime": 26462.5595, + "train_tokens_per_second": 136210.558 + }, + { + "epoch": 0.551, + "grad_norm": 0.5309172868728638, + "learning_rate": 0.0001281864380049969, + "loss": 1.1876, + "num_input_tokens_seen": 3611033600, + "step": 55100, + "train_runtime": 26514.9513, + "train_tokens_per_second": 136188.581 + }, + { + "epoch": 0.552, + "grad_norm": 0.5431755781173706, + "learning_rate": 0.00012771560986352042, + "loss": 1.2038, + "num_input_tokens_seen": 3617587200, + "step": 55200, + "train_runtime": 26562.7975, + "train_tokens_per_second": 136189.993 + }, + { + "epoch": 0.553, + "grad_norm": 0.5063371658325195, + "learning_rate": 0.0001272450061256052, + "loss": 1.1837, + "num_input_tokens_seen": 3624140800, + "step": 55300, + "train_runtime": 26609.2594, + "train_tokens_per_second": 136198.484 + }, + { + "epoch": 0.554, + "grad_norm": 0.502314567565918, + "learning_rate": 0.00012677463153022565, + "loss": 1.1988, + "num_input_tokens_seen": 3630694400, + "step": 55400, + "train_runtime": 26655.8656, + "train_tokens_per_second": 136206.209 + }, + { + "epoch": 0.555, + "grad_norm": 0.5824739336967468, + "learning_rate": 0.0001263044908140488, + "loss": 1.1917, + "num_input_tokens_seen": 3637248000, + "step": 55500, + "train_runtime": 26707.6694, + "train_tokens_per_second": 136187.398 + }, + { + "epoch": 0.556, + "grad_norm": 0.5498598217964172, + "learning_rate": 0.00012583458871138632, + "loss": 1.1908, + "num_input_tokens_seen": 3643801600, + "step": 55600, + "train_runtime": 26755.8413, + "train_tokens_per_second": 136187.144 + }, + { + "epoch": 0.557, + "grad_norm": 0.5867239832878113, + "learning_rate": 0.00012536492995414723, + "loss": 1.193, + "num_input_tokens_seen": 3650355200, + "step": 55700, + "train_runtime": 26804.5182, + "train_tokens_per_second": 136184.324 + }, + { + "epoch": 0.558, + "grad_norm": 0.5584626197814941, + "learning_rate": 0.00012489551927179007, + "loss": 1.1833, + "num_input_tokens_seen": 3656908800, + "step": 55800, + "train_runtime": 26850.8981, + "train_tokens_per_second": 136193.165 + }, + { + "epoch": 0.559, + "grad_norm": 0.48578086495399475, + "learning_rate": 0.00012442636139127508, + "loss": 1.1919, + "num_input_tokens_seen": 3663462400, + "step": 55900, + "train_runtime": 26898.2376, + "train_tokens_per_second": 136197.116 + }, + { + "epoch": 0.56, + "grad_norm": 0.5344805121421814, + "learning_rate": 0.00012395746103701695, + "loss": 1.1978, + "num_input_tokens_seen": 3670016000, + "step": 56000, + "train_runtime": 26951.1383, + "train_tokens_per_second": 136172.95 + }, + { + "epoch": 0.561, + "grad_norm": 0.5378079414367676, + "learning_rate": 0.00012348882293083708, + "loss": 1.192, + "num_input_tokens_seen": 3676569600, + "step": 56100, + "train_runtime": 26999.7429, + "train_tokens_per_second": 136170.541 + }, + { + "epoch": 0.562, + "grad_norm": 0.6195780038833618, + "learning_rate": 0.00012302045179191594, + "loss": 1.1919, + "num_input_tokens_seen": 3683123200, + "step": 56200, + "train_runtime": 27047.827, + "train_tokens_per_second": 136170.761 + }, + { + "epoch": 0.563, + "grad_norm": 0.5348559617996216, + "learning_rate": 0.00012255235233674572, + "loss": 1.1875, + "num_input_tokens_seen": 3689676800, + "step": 56300, + "train_runtime": 27094.1422, + "train_tokens_per_second": 136179.871 + }, + { + "epoch": 0.564, + "grad_norm": 0.48098888993263245, + "learning_rate": 0.00012208452927908278, + "loss": 1.1818, + "num_input_tokens_seen": 3696230400, + "step": 56400, + "train_runtime": 27141.6856, + "train_tokens_per_second": 136182.787 + }, + { + "epoch": 0.565, + "grad_norm": 0.585021436214447, + "learning_rate": 0.00012161698732990003, + "loss": 1.1887, + "num_input_tokens_seen": 3702784000, + "step": 56500, + "train_runtime": 27194.4825, + "train_tokens_per_second": 136159.385 + }, + { + "epoch": 0.566, + "grad_norm": 0.5269266963005066, + "learning_rate": 0.00012114973119733987, + "loss": 1.187, + "num_input_tokens_seen": 3709337600, + "step": 56600, + "train_runtime": 27242.6521, + "train_tokens_per_second": 136159.196 + }, + { + "epoch": 0.567, + "grad_norm": 0.5563040971755981, + "learning_rate": 0.00012068276558666616, + "loss": 1.1996, + "num_input_tokens_seen": 3715891200, + "step": 56700, + "train_runtime": 27290.3101, + "train_tokens_per_second": 136161.56 + }, + { + "epoch": 0.568, + "grad_norm": 0.6131460666656494, + "learning_rate": 0.00012021609520021752, + "loss": 1.195, + "num_input_tokens_seen": 3722444800, + "step": 56800, + "train_runtime": 27337.7804, + "train_tokens_per_second": 136164.851 + }, + { + "epoch": 0.569, + "grad_norm": 0.5921023488044739, + "learning_rate": 0.00011974972473735957, + "loss": 1.2018, + "num_input_tokens_seen": 3728998400, + "step": 56900, + "train_runtime": 27384.9126, + "train_tokens_per_second": 136169.812 + }, + { + "epoch": 0.57, + "grad_norm": 0.4582422375679016, + "learning_rate": 0.00011928365889443764, + "loss": 1.1914, + "num_input_tokens_seen": 3735552000, + "step": 57000, + "train_runtime": 27436.2125, + "train_tokens_per_second": 136154.07 + }, + { + "epoch": 0.571, + "grad_norm": 0.6521887183189392, + "learning_rate": 0.00011881790236472966, + "loss": 1.2041, + "num_input_tokens_seen": 3742105600, + "step": 57100, + "train_runtime": 27484.9505, + "train_tokens_per_second": 136151.076 + }, + { + "epoch": 0.572, + "grad_norm": 0.5971055030822754, + "learning_rate": 0.00011835245983839869, + "loss": 1.1992, + "num_input_tokens_seen": 3748659200, + "step": 57200, + "train_runtime": 27531.7756, + "train_tokens_per_second": 136157.553 + }, + { + "epoch": 0.573, + "grad_norm": 0.5187013745307922, + "learning_rate": 0.00011788733600244575, + "loss": 1.193, + "num_input_tokens_seen": 3755212800, + "step": 57300, + "train_runtime": 27579.3239, + "train_tokens_per_second": 136160.437 + }, + { + "epoch": 0.574, + "grad_norm": 0.5805628299713135, + "learning_rate": 0.00011742253554066278, + "loss": 1.1925, + "num_input_tokens_seen": 3761766400, + "step": 57400, + "train_runtime": 27633.4529, + "train_tokens_per_second": 136130.885 + }, + { + "epoch": 0.575, + "grad_norm": 0.5242844223976135, + "learning_rate": 0.00011695806313358523, + "loss": 1.1991, + "num_input_tokens_seen": 3768320000, + "step": 57500, + "train_runtime": 27681.3237, + "train_tokens_per_second": 136132.218 + }, + { + "epoch": 0.576, + "grad_norm": 0.7652018666267395, + "learning_rate": 0.00011649392345844506, + "loss": 1.192, + "num_input_tokens_seen": 3774873600, + "step": 57600, + "train_runtime": 27728.8266, + "train_tokens_per_second": 136135.353 + }, + { + "epoch": 0.577, + "grad_norm": 0.5232011675834656, + "learning_rate": 0.00011603012118912372, + "loss": 1.2019, + "num_input_tokens_seen": 3781427200, + "step": 57700, + "train_runtime": 27778.1555, + "train_tokens_per_second": 136129.528 + }, + { + "epoch": 0.578, + "grad_norm": 0.5537053942680359, + "learning_rate": 0.00011556666099610485, + "loss": 1.1948, + "num_input_tokens_seen": 3787980800, + "step": 57800, + "train_runtime": 27824.9287, + "train_tokens_per_second": 136136.227 + }, + { + "epoch": 0.579, + "grad_norm": 0.6031852960586548, + "learning_rate": 0.00011510354754642745, + "loss": 1.1888, + "num_input_tokens_seen": 3794534400, + "step": 57900, + "train_runtime": 27872.2044, + "train_tokens_per_second": 136140.448 + }, + { + "epoch": 0.58, + "grad_norm": 0.5748854875564575, + "learning_rate": 0.00011464078550363887, + "loss": 1.1921, + "num_input_tokens_seen": 3801088000, + "step": 58000, + "train_runtime": 27925.2055, + "train_tokens_per_second": 136116.742 + }, + { + "epoch": 0.581, + "grad_norm": 0.5586141347885132, + "learning_rate": 0.0001141783795277477, + "loss": 1.2024, + "num_input_tokens_seen": 3807641600, + "step": 58100, + "train_runtime": 27972.7534, + "train_tokens_per_second": 136119.657 + }, + { + "epoch": 0.582, + "grad_norm": 0.4893476366996765, + "learning_rate": 0.00011371633427517696, + "loss": 1.2034, + "num_input_tokens_seen": 3814195200, + "step": 58200, + "train_runtime": 28020.2529, + "train_tokens_per_second": 136122.797 + }, + { + "epoch": 0.583, + "grad_norm": 0.5007518529891968, + "learning_rate": 0.00011325465439871731, + "loss": 1.1885, + "num_input_tokens_seen": 3820748800, + "step": 58300, + "train_runtime": 28067.154, + "train_tokens_per_second": 136128.829 + }, + { + "epoch": 0.584, + "grad_norm": 0.5260310769081116, + "learning_rate": 0.00011279334454747989, + "loss": 1.1931, + "num_input_tokens_seen": 3827302400, + "step": 58400, + "train_runtime": 28120.6157, + "train_tokens_per_second": 136103.08 + }, + { + "epoch": 0.585, + "grad_norm": 0.5364392399787903, + "learning_rate": 0.00011233240936684981, + "loss": 1.1928, + "num_input_tokens_seen": 3833856000, + "step": 58500, + "train_runtime": 28168.5149, + "train_tokens_per_second": 136104.3 + }, + { + "epoch": 0.586, + "grad_norm": 0.49333399534225464, + "learning_rate": 0.00011187185349843916, + "loss": 1.1935, + "num_input_tokens_seen": 3840409600, + "step": 58600, + "train_runtime": 28215.0596, + "train_tokens_per_second": 136112.05 + }, + { + "epoch": 0.587, + "grad_norm": 0.5711957216262817, + "learning_rate": 0.00011141168158004053, + "loss": 1.1812, + "num_input_tokens_seen": 3846963200, + "step": 58700, + "train_runtime": 28264.2863, + "train_tokens_per_second": 136106.858 + }, + { + "epoch": 0.588, + "grad_norm": 1.0157184600830078, + "learning_rate": 0.00011095189824557998, + "loss": 1.1929, + "num_input_tokens_seen": 3853516800, + "step": 58800, + "train_runtime": 28311.6057, + "train_tokens_per_second": 136110.853 + }, + { + "epoch": 0.589, + "grad_norm": 0.552700936794281, + "learning_rate": 0.00011049250812507054, + "loss": 1.1909, + "num_input_tokens_seen": 3860070400, + "step": 58900, + "train_runtime": 28359.0956, + "train_tokens_per_second": 136114.016 + }, + { + "epoch": 0.59, + "grad_norm": 0.46860748529434204, + "learning_rate": 0.00011003351584456571, + "loss": 1.1972, + "num_input_tokens_seen": 3866624000, + "step": 59000, + "train_runtime": 28412.8978, + "train_tokens_per_second": 136086.929 + }, + { + "epoch": 0.591, + "grad_norm": 0.5399055480957031, + "learning_rate": 0.0001095749260261126, + "loss": 1.1895, + "num_input_tokens_seen": 3873177600, + "step": 59100, + "train_runtime": 28462.0603, + "train_tokens_per_second": 136082.123 + }, + { + "epoch": 0.592, + "grad_norm": 0.49921005964279175, + "learning_rate": 0.00010911674328770559, + "loss": 1.1968, + "num_input_tokens_seen": 3879731200, + "step": 59200, + "train_runtime": 28510.9551, + "train_tokens_per_second": 136078.612 + }, + { + "epoch": 0.593, + "grad_norm": 0.5357686877250671, + "learning_rate": 0.00010865897224323979, + "loss": 1.1889, + "num_input_tokens_seen": 3886284800, + "step": 59300, + "train_runtime": 28558.3344, + "train_tokens_per_second": 136082.334 + }, + { + "epoch": 0.594, + "grad_norm": 0.5710283517837524, + "learning_rate": 0.00010820161750246453, + "loss": 1.1864, + "num_input_tokens_seen": 3892838400, + "step": 59400, + "train_runtime": 28606.454, + "train_tokens_per_second": 136082.522 + }, + { + "epoch": 0.595, + "grad_norm": 0.6333475112915039, + "learning_rate": 0.00010774468367093696, + "loss": 1.2009, + "num_input_tokens_seen": 3899392000, + "step": 59500, + "train_runtime": 28653.986, + "train_tokens_per_second": 136085.5 + }, + { + "epoch": 0.596, + "grad_norm": 0.5585243701934814, + "learning_rate": 0.00010728817534997573, + "loss": 1.1877, + "num_input_tokens_seen": 3905945600, + "step": 59600, + "train_runtime": 28701.832, + "train_tokens_per_second": 136086.979 + }, + { + "epoch": 0.597, + "grad_norm": 0.5805736184120178, + "learning_rate": 0.00010683209713661453, + "loss": 1.211, + "num_input_tokens_seen": 3912499200, + "step": 59700, + "train_runtime": 28751.7229, + "train_tokens_per_second": 136078.774 + }, + { + "epoch": 0.598, + "grad_norm": 0.5607670545578003, + "learning_rate": 0.00010637645362355589, + "loss": 1.196, + "num_input_tokens_seen": 3919052800, + "step": 59800, + "train_runtime": 28798.1873, + "train_tokens_per_second": 136086.788 + }, + { + "epoch": 0.599, + "grad_norm": 0.4962175488471985, + "learning_rate": 0.00010592124939912497, + "loss": 1.1889, + "num_input_tokens_seen": 3925606400, + "step": 59900, + "train_runtime": 28852.3337, + "train_tokens_per_second": 136058.54 + }, + { + "epoch": 0.6, + "grad_norm": 0.6488810777664185, + "learning_rate": 0.00010546648904722326, + "loss": 1.1968, + "num_input_tokens_seen": 3932160000, + "step": 60000, + "train_runtime": 28898.713, + "train_tokens_per_second": 136066.959 + }, + { + "epoch": 0.601, + "grad_norm": 0.9370976686477661, + "learning_rate": 0.0001050121771472824, + "loss": 1.183, + "num_input_tokens_seen": 3938713600, + "step": 60100, + "train_runtime": 28946.5523, + "train_tokens_per_second": 136068.488 + }, + { + "epoch": 0.602, + "grad_norm": 0.5040610432624817, + "learning_rate": 0.0001045583182742182, + "loss": 1.2023, + "num_input_tokens_seen": 3945267200, + "step": 60200, + "train_runtime": 28994.2594, + "train_tokens_per_second": 136070.632 + }, + { + "epoch": 0.603, + "grad_norm": 0.5120612382888794, + "learning_rate": 0.00010410491699838448, + "loss": 1.1865, + "num_input_tokens_seen": 3951820800, + "step": 60300, + "train_runtime": 29042.095, + "train_tokens_per_second": 136072.167 + }, + { + "epoch": 0.604, + "grad_norm": 0.8983064889907837, + "learning_rate": 0.00010365197788552707, + "loss": 1.1734, + "num_input_tokens_seen": 3958374400, + "step": 60400, + "train_runtime": 29090.1772, + "train_tokens_per_second": 136072.543 + }, + { + "epoch": 0.605, + "grad_norm": 0.5155735015869141, + "learning_rate": 0.00010319950549673778, + "loss": 1.1923, + "num_input_tokens_seen": 3964928000, + "step": 60500, + "train_runtime": 29143.642, + "train_tokens_per_second": 136047.787 + }, + { + "epoch": 0.606, + "grad_norm": 1.5562913417816162, + "learning_rate": 0.00010274750438840855, + "loss": 1.1877, + "num_input_tokens_seen": 3971481600, + "step": 60600, + "train_runtime": 29191.8256, + "train_tokens_per_second": 136047.73 + }, + { + "epoch": 0.607, + "grad_norm": 0.5603190064430237, + "learning_rate": 0.00010229597911218554, + "loss": 1.1862, + "num_input_tokens_seen": 3978035200, + "step": 60700, + "train_runtime": 29240.4534, + "train_tokens_per_second": 136045.606 + }, + { + "epoch": 0.608, + "grad_norm": 0.550956130027771, + "learning_rate": 0.00010184493421492324, + "loss": 1.1869, + "num_input_tokens_seen": 3984588800, + "step": 60800, + "train_runtime": 29287.1822, + "train_tokens_per_second": 136052.31 + }, + { + "epoch": 0.609, + "grad_norm": 0.5152813196182251, + "learning_rate": 0.0001013943742386388, + "loss": 1.1902, + "num_input_tokens_seen": 3991142400, + "step": 60900, + "train_runtime": 29335.0152, + "train_tokens_per_second": 136053.872 + }, + { + "epoch": 0.61, + "grad_norm": 0.5258508324623108, + "learning_rate": 0.00010094430372046616, + "loss": 1.1843, + "num_input_tokens_seen": 3997696000, + "step": 61000, + "train_runtime": 29387.778, + "train_tokens_per_second": 136032.605 + }, + { + "epoch": 0.611, + "grad_norm": 0.5804030895233154, + "learning_rate": 0.0001004947271926104, + "loss": 1.1872, + "num_input_tokens_seen": 4004249600, + "step": 61100, + "train_runtime": 29435.5024, + "train_tokens_per_second": 136034.695 + }, + { + "epoch": 0.612, + "grad_norm": 0.5679774284362793, + "learning_rate": 0.00010004564918230222, + "loss": 1.1933, + "num_input_tokens_seen": 4010803200, + "step": 61200, + "train_runtime": 29483.504, + "train_tokens_per_second": 136035.5 + }, + { + "epoch": 0.613, + "grad_norm": 0.611191987991333, + "learning_rate": 9.959707421175217e-05, + "loss": 1.1926, + "num_input_tokens_seen": 4017356800, + "step": 61300, + "train_runtime": 29529.7223, + "train_tokens_per_second": 136044.517 + }, + { + "epoch": 0.614, + "grad_norm": 0.5725626945495605, + "learning_rate": 9.914900679810522e-05, + "loss": 1.1812, + "num_input_tokens_seen": 4023910400, + "step": 61400, + "train_runtime": 29577.4052, + "train_tokens_per_second": 136046.769 + }, + { + "epoch": 0.615, + "grad_norm": 0.6058773398399353, + "learning_rate": 9.870145145339529e-05, + "loss": 1.1904, + "num_input_tokens_seen": 4030464000, + "step": 61500, + "train_runtime": 29630.1636, + "train_tokens_per_second": 136025.708 + }, + { + "epoch": 0.616, + "grad_norm": 0.5151665806770325, + "learning_rate": 9.825441268449969e-05, + "loss": 1.1783, + "num_input_tokens_seen": 4037017600, + "step": 61600, + "train_runtime": 29677.4813, + "train_tokens_per_second": 136029.657 + }, + { + "epoch": 0.617, + "grad_norm": 0.5461622476577759, + "learning_rate": 9.780789499309391e-05, + "loss": 1.1825, + "num_input_tokens_seen": 4043571200, + "step": 61700, + "train_runtime": 29725.432, + "train_tokens_per_second": 136030.696 + }, + { + "epoch": 0.618, + "grad_norm": 0.8243169784545898, + "learning_rate": 9.736190287560608e-05, + "loss": 1.1933, + "num_input_tokens_seen": 4050124800, + "step": 61800, + "train_runtime": 29772.1739, + "train_tokens_per_second": 136037.255 + }, + { + "epoch": 0.619, + "grad_norm": 0.4877258539199829, + "learning_rate": 9.691644082317186e-05, + "loss": 1.1881, + "num_input_tokens_seen": 4056678400, + "step": 61900, + "train_runtime": 29825.721, + "train_tokens_per_second": 136012.752 + }, + { + "epoch": 0.62, + "grad_norm": 0.5376379489898682, + "learning_rate": 9.647151332158926e-05, + "loss": 1.1812, + "num_input_tokens_seen": 4063232000, + "step": 62000, + "train_runtime": 29872.1612, + "train_tokens_per_second": 136020.691 + }, + { + "epoch": 0.621, + "grad_norm": 0.5128985643386841, + "learning_rate": 9.60271248512732e-05, + "loss": 1.1719, + "num_input_tokens_seen": 4069785600, + "step": 62100, + "train_runtime": 29919.8698, + "train_tokens_per_second": 136022.838 + }, + { + "epoch": 0.622, + "grad_norm": 0.6911051273345947, + "learning_rate": 9.558327988721068e-05, + "loss": 1.199, + "num_input_tokens_seen": 4076339200, + "step": 62200, + "train_runtime": 29967.7263, + "train_tokens_per_second": 136024.307 + }, + { + "epoch": 0.623, + "grad_norm": 0.5334423184394836, + "learning_rate": 9.513998289891559e-05, + "loss": 1.1922, + "num_input_tokens_seen": 4082892800, + "step": 62300, + "train_runtime": 30014.7483, + "train_tokens_per_second": 136029.553 + }, + { + "epoch": 0.624, + "grad_norm": 0.47934290766716003, + "learning_rate": 9.469723835038361e-05, + "loss": 1.1864, + "num_input_tokens_seen": 4089446400, + "step": 62400, + "train_runtime": 30062.3944, + "train_tokens_per_second": 136031.959 + }, + { + "epoch": 0.625, + "grad_norm": 0.6690011620521545, + "learning_rate": 9.42550507000475e-05, + "loss": 1.1887, + "num_input_tokens_seen": 4096000000, + "step": 62500, + "train_runtime": 30115.1503, + "train_tokens_per_second": 136011.275 + }, + { + "epoch": 0.626, + "grad_norm": 0.5379562973976135, + "learning_rate": 9.381342440073194e-05, + "loss": 1.1873, + "num_input_tokens_seen": 4102553600, + "step": 62600, + "train_runtime": 30162.8214, + "train_tokens_per_second": 136013.589 + }, + { + "epoch": 0.627, + "grad_norm": 0.5619449615478516, + "learning_rate": 9.337236389960886e-05, + "loss": 1.184, + "num_input_tokens_seen": 4109107200, + "step": 62700, + "train_runtime": 30211.3171, + "train_tokens_per_second": 136012.183 + }, + { + "epoch": 0.628, + "grad_norm": 0.9017994999885559, + "learning_rate": 9.293187363815265e-05, + "loss": 1.1869, + "num_input_tokens_seen": 4115660800, + "step": 62800, + "train_runtime": 30263.5761, + "train_tokens_per_second": 135993.869 + }, + { + "epoch": 0.629, + "grad_norm": 0.6502019762992859, + "learning_rate": 9.249195805209533e-05, + "loss": 1.1944, + "num_input_tokens_seen": 4122214400, + "step": 62900, + "train_runtime": 30310.6247, + "train_tokens_per_second": 135998.992 + }, + { + "epoch": 0.63, + "grad_norm": 0.5749123096466064, + "learning_rate": 9.205262157138192e-05, + "loss": 1.1896, + "num_input_tokens_seen": 4128768000, + "step": 63000, + "train_runtime": 30359.0787, + "train_tokens_per_second": 135997.803 + }, + { + "epoch": 0.631, + "grad_norm": 0.4843611419200897, + "learning_rate": 9.161386862012601e-05, + "loss": 1.1932, + "num_input_tokens_seen": 4135321600, + "step": 63100, + "train_runtime": 30406.8492, + "train_tokens_per_second": 135999.675 + }, + { + "epoch": 0.632, + "grad_norm": 0.634504497051239, + "learning_rate": 9.11757036165649e-05, + "loss": 1.181, + "num_input_tokens_seen": 4141875200, + "step": 63200, + "train_runtime": 30453.794, + "train_tokens_per_second": 136005.228 + }, + { + "epoch": 0.633, + "grad_norm": 0.605948269367218, + "learning_rate": 9.073813097301521e-05, + "loss": 1.1742, + "num_input_tokens_seen": 4148428800, + "step": 63300, + "train_runtime": 30506.719, + "train_tokens_per_second": 135984.102 + }, + { + "epoch": 0.634, + "grad_norm": 0.5731847882270813, + "learning_rate": 9.030115509582883e-05, + "loss": 1.1809, + "num_input_tokens_seen": 4154982400, + "step": 63400, + "train_runtime": 30554.7018, + "train_tokens_per_second": 135985.042 + }, + { + "epoch": 0.635, + "grad_norm": 0.9707246422767639, + "learning_rate": 8.986478038534775e-05, + "loss": 1.1981, + "num_input_tokens_seen": 4161536000, + "step": 63500, + "train_runtime": 30602.1945, + "train_tokens_per_second": 135988.156 + }, + { + "epoch": 0.636, + "grad_norm": 0.7120965719223022, + "learning_rate": 8.942901123586059e-05, + "loss": 1.1816, + "num_input_tokens_seen": 4168089600, + "step": 63600, + "train_runtime": 30649.6499, + "train_tokens_per_second": 135991.426 + }, + { + "epoch": 0.637, + "grad_norm": 0.5136720538139343, + "learning_rate": 8.899385203555781e-05, + "loss": 1.177, + "num_input_tokens_seen": 4174643200, + "step": 63700, + "train_runtime": 30696.6221, + "train_tokens_per_second": 135996.827 + }, + { + "epoch": 0.638, + "grad_norm": 0.5284336805343628, + "learning_rate": 8.855930716648774e-05, + "loss": 1.184, + "num_input_tokens_seen": 4181196800, + "step": 63800, + "train_runtime": 30745.5123, + "train_tokens_per_second": 135993.727 + }, + { + "epoch": 0.639, + "grad_norm": 0.5269259810447693, + "learning_rate": 8.812538100451239e-05, + "loss": 1.2174, + "num_input_tokens_seen": 4187750400, + "step": 63900, + "train_runtime": 30792.1632, + "train_tokens_per_second": 136000.526 + }, + { + "epoch": 0.64, + "grad_norm": 0.5354572534561157, + "learning_rate": 8.769207791926338e-05, + "loss": 1.1771, + "num_input_tokens_seen": 4194304000, + "step": 64000, + "train_runtime": 30846.5823, + "train_tokens_per_second": 135973.054 + }, + { + "epoch": 0.641, + "grad_norm": 0.7058772444725037, + "learning_rate": 8.725940227409797e-05, + "loss": 1.179, + "num_input_tokens_seen": 4200857600, + "step": 64100, + "train_runtime": 30893.4429, + "train_tokens_per_second": 135978.94 + }, + { + "epoch": 0.642, + "grad_norm": 0.5777366161346436, + "learning_rate": 8.682735842605509e-05, + "loss": 1.182, + "num_input_tokens_seen": 4207411200, + "step": 64200, + "train_runtime": 30940.3826, + "train_tokens_per_second": 135984.459 + }, + { + "epoch": 0.643, + "grad_norm": 0.5608710646629333, + "learning_rate": 8.639595072581158e-05, + "loss": 1.1904, + "num_input_tokens_seen": 4213964800, + "step": 64300, + "train_runtime": 30988.4894, + "train_tokens_per_second": 135984.841 + }, + { + "epoch": 0.644, + "grad_norm": 0.6048064231872559, + "learning_rate": 8.596518351763806e-05, + "loss": 1.1851, + "num_input_tokens_seen": 4220518400, + "step": 64400, + "train_runtime": 31041.3711, + "train_tokens_per_second": 135964.304 + }, + { + "epoch": 0.645, + "grad_norm": 0.47835734486579895, + "learning_rate": 8.553506113935561e-05, + "loss": 1.1803, + "num_input_tokens_seen": 4227072000, + "step": 64500, + "train_runtime": 31089.9624, + "train_tokens_per_second": 135962.596 + }, + { + "epoch": 0.646, + "grad_norm": 1.1150704622268677, + "learning_rate": 8.510558792229183e-05, + "loss": 1.1878, + "num_input_tokens_seen": 4233625600, + "step": 64600, + "train_runtime": 31137.4325, + "train_tokens_per_second": 135965.79 + }, + { + "epoch": 0.647, + "grad_norm": 0.6650880575180054, + "learning_rate": 8.467676819123716e-05, + "loss": 1.1951, + "num_input_tokens_seen": 4240179200, + "step": 64700, + "train_runtime": 31185.0957, + "train_tokens_per_second": 135968.132 + }, + { + "epoch": 0.648, + "grad_norm": 0.7750310897827148, + "learning_rate": 8.424860626440158e-05, + "loss": 1.1829, + "num_input_tokens_seen": 4246732800, + "step": 64800, + "train_runtime": 31237.5852, + "train_tokens_per_second": 135949.459 + }, + { + "epoch": 0.649, + "grad_norm": 0.595783531665802, + "learning_rate": 8.382110645337102e-05, + "loss": 1.1856, + "num_input_tokens_seen": 4253286400, + "step": 64900, + "train_runtime": 31285.0064, + "train_tokens_per_second": 135952.87 + }, + { + "epoch": 0.65, + "grad_norm": 0.6093938946723938, + "learning_rate": 8.339427306306365e-05, + "loss": 1.1842, + "num_input_tokens_seen": 4259840000, + "step": 65000, + "train_runtime": 31332.1176, + "train_tokens_per_second": 135957.615 + }, + { + "epoch": 0.651, + "grad_norm": 0.6823499798774719, + "learning_rate": 8.296811039168716e-05, + "loss": 1.1818, + "num_input_tokens_seen": 4266393600, + "step": 65100, + "train_runtime": 31381.0925, + "train_tokens_per_second": 135954.273 + }, + { + "epoch": 0.652, + "grad_norm": 0.5052744746208191, + "learning_rate": 8.254262273069477e-05, + "loss": 1.2034, + "num_input_tokens_seen": 4272947200, + "step": 65200, + "train_runtime": 31428.8012, + "train_tokens_per_second": 135956.417 + }, + { + "epoch": 0.653, + "grad_norm": 0.5003641247749329, + "learning_rate": 8.211781436474263e-05, + "loss": 1.177, + "num_input_tokens_seen": 4279500800, + "step": 65300, + "train_runtime": 31476.0702, + "train_tokens_per_second": 135960.454 + }, + { + "epoch": 0.654, + "grad_norm": 0.5675527453422546, + "learning_rate": 8.169368957164613e-05, + "loss": 1.1707, + "num_input_tokens_seen": 4286054400, + "step": 65400, + "train_runtime": 31524.8831, + "train_tokens_per_second": 135957.82 + }, + { + "epoch": 0.655, + "grad_norm": 0.5109818577766418, + "learning_rate": 8.127025262233731e-05, + "loss": 1.187, + "num_input_tokens_seen": 4292608000, + "step": 65500, + "train_runtime": 31578.0721, + "train_tokens_per_second": 135936.354 + }, + { + "epoch": 0.656, + "grad_norm": 0.6228885054588318, + "learning_rate": 8.084750778082159e-05, + "loss": 1.1944, + "num_input_tokens_seen": 4299161600, + "step": 65600, + "train_runtime": 31626.6624, + "train_tokens_per_second": 135934.723 + }, + { + "epoch": 0.657, + "grad_norm": 0.6139951348304749, + "learning_rate": 8.042545930413473e-05, + "loss": 1.1788, + "num_input_tokens_seen": 4305715200, + "step": 65700, + "train_runtime": 31673.2442, + "train_tokens_per_second": 135941.717 + }, + { + "epoch": 0.658, + "grad_norm": 0.6792371273040771, + "learning_rate": 8.000411144230025e-05, + "loss": 1.2019, + "num_input_tokens_seen": 4312268800, + "step": 65800, + "train_runtime": 31721.455, + "train_tokens_per_second": 135941.709 + }, + { + "epoch": 0.659, + "grad_norm": 0.546470582485199, + "learning_rate": 7.95834684382865e-05, + "loss": 1.1905, + "num_input_tokens_seen": 4318822400, + "step": 65900, + "train_runtime": 31770.1998, + "train_tokens_per_second": 135939.416 + }, + { + "epoch": 0.66, + "grad_norm": 0.5273057818412781, + "learning_rate": 7.916353452796378e-05, + "loss": 1.1769, + "num_input_tokens_seen": 4325376000, + "step": 66000, + "train_runtime": 31818.123, + "train_tokens_per_second": 135940.64 + }, + { + "epoch": 0.661, + "grad_norm": 0.5213398933410645, + "learning_rate": 7.874431394006188e-05, + "loss": 1.1834, + "num_input_tokens_seen": 4331929600, + "step": 66100, + "train_runtime": 31870.8187, + "train_tokens_per_second": 135921.504 + }, + { + "epoch": 0.662, + "grad_norm": 0.5762707591056824, + "learning_rate": 7.832581089612762e-05, + "loss": 1.1875, + "num_input_tokens_seen": 4338483200, + "step": 66200, + "train_runtime": 31918.6258, + "train_tokens_per_second": 135923.245 + }, + { + "epoch": 0.663, + "grad_norm": 0.6153529286384583, + "learning_rate": 7.790802961048183e-05, + "loss": 1.1895, + "num_input_tokens_seen": 4345036800, + "step": 66300, + "train_runtime": 31967.5441, + "train_tokens_per_second": 135920.257 + }, + { + "epoch": 0.664, + "grad_norm": 0.6668293476104736, + "learning_rate": 7.749097429017749e-05, + "loss": 1.1835, + "num_input_tokens_seen": 4351590400, + "step": 66400, + "train_runtime": 32014.502, + "train_tokens_per_second": 135925.6 + }, + { + "epoch": 0.665, + "grad_norm": 0.49117180705070496, + "learning_rate": 7.70746491349571e-05, + "loss": 1.1762, + "num_input_tokens_seen": 4358144000, + "step": 66500, + "train_runtime": 32062.234, + "train_tokens_per_second": 135927.646 + }, + { + "epoch": 0.666, + "grad_norm": 0.5580335259437561, + "learning_rate": 7.665905833721025e-05, + "loss": 1.1751, + "num_input_tokens_seen": 4364697600, + "step": 66600, + "train_runtime": 32116.4057, + "train_tokens_per_second": 135902.431 + }, + { + "epoch": 0.667, + "grad_norm": 0.4941908121109009, + "learning_rate": 7.624420608193171e-05, + "loss": 1.1991, + "num_input_tokens_seen": 4371251200, + "step": 66700, + "train_runtime": 32164.7962, + "train_tokens_per_second": 135901.722 + }, + { + "epoch": 0.668, + "grad_norm": 0.5203377604484558, + "learning_rate": 7.583009654667912e-05, + "loss": 1.1892, + "num_input_tokens_seen": 4377804800, + "step": 66800, + "train_runtime": 32211.7614, + "train_tokens_per_second": 135907.029 + }, + { + "epoch": 0.669, + "grad_norm": 0.5924380421638489, + "learning_rate": 7.541673390153087e-05, + "loss": 1.1749, + "num_input_tokens_seen": 4384358400, + "step": 66900, + "train_runtime": 32259.5523, + "train_tokens_per_second": 135908.842 + }, + { + "epoch": 0.67, + "grad_norm": 0.5180861353874207, + "learning_rate": 7.500412230904416e-05, + "loss": 1.1833, + "num_input_tokens_seen": 4390912000, + "step": 67000, + "train_runtime": 32305.7062, + "train_tokens_per_second": 135917.536 + }, + { + "epoch": 0.671, + "grad_norm": 0.5575404167175293, + "learning_rate": 7.459226592421318e-05, + "loss": 1.1908, + "num_input_tokens_seen": 4397465600, + "step": 67100, + "train_runtime": 32353.5616, + "train_tokens_per_second": 135919.058 + }, + { + "epoch": 0.672, + "grad_norm": 0.519868016242981, + "learning_rate": 7.418116889442721e-05, + "loss": 1.191, + "num_input_tokens_seen": 4404019200, + "step": 67200, + "train_runtime": 32407.2129, + "train_tokens_per_second": 135896.265 + }, + { + "epoch": 0.673, + "grad_norm": 0.5036019086837769, + "learning_rate": 7.377083535942868e-05, + "loss": 1.1771, + "num_input_tokens_seen": 4410572800, + "step": 67300, + "train_runtime": 32454.4825, + "train_tokens_per_second": 135900.266 + }, + { + "epoch": 0.674, + "grad_norm": 0.5349675416946411, + "learning_rate": 7.336126945127178e-05, + "loss": 1.1834, + "num_input_tokens_seen": 4417126400, + "step": 67400, + "train_runtime": 32501.8427, + "train_tokens_per_second": 135903.876 + }, + { + "epoch": 0.675, + "grad_norm": 0.675538957118988, + "learning_rate": 7.29524752942807e-05, + "loss": 1.1852, + "num_input_tokens_seen": 4423680000, + "step": 67500, + "train_runtime": 32550.3797, + "train_tokens_per_second": 135902.562 + }, + { + "epoch": 0.676, + "grad_norm": 0.5116747617721558, + "learning_rate": 7.254445700500798e-05, + "loss": 1.1816, + "num_input_tokens_seen": 4430233600, + "step": 67600, + "train_runtime": 32598.0387, + "train_tokens_per_second": 135904.913 + }, + { + "epoch": 0.677, + "grad_norm": 0.5892815589904785, + "learning_rate": 7.213721869219329e-05, + "loss": 1.1827, + "num_input_tokens_seen": 4436787200, + "step": 67700, + "train_runtime": 32650.3715, + "train_tokens_per_second": 135887.802 + }, + { + "epoch": 0.678, + "grad_norm": 0.6862092614173889, + "learning_rate": 7.173076445672198e-05, + "loss": 1.1801, + "num_input_tokens_seen": 4443340800, + "step": 67800, + "train_runtime": 32698.6817, + "train_tokens_per_second": 135887.46 + }, + { + "epoch": 0.679, + "grad_norm": 0.8308249115943909, + "learning_rate": 7.132509839158359e-05, + "loss": 1.1887, + "num_input_tokens_seen": 4449894400, + "step": 67900, + "train_runtime": 32745.9782, + "train_tokens_per_second": 135891.326 + }, + { + "epoch": 0.68, + "grad_norm": 0.5063105225563049, + "learning_rate": 7.092022458183096e-05, + "loss": 1.1949, + "num_input_tokens_seen": 4456448000, + "step": 68000, + "train_runtime": 32794.3077, + "train_tokens_per_second": 135890.9 + }, + { + "epoch": 0.681, + "grad_norm": 0.6090216040611267, + "learning_rate": 7.051614710453888e-05, + "loss": 1.1827, + "num_input_tokens_seen": 4463001600, + "step": 68100, + "train_runtime": 32841.7871, + "train_tokens_per_second": 135893.993 + }, + { + "epoch": 0.682, + "grad_norm": 0.5802315473556519, + "learning_rate": 7.011287002876296e-05, + "loss": 1.1808, + "num_input_tokens_seen": 4469555200, + "step": 68200, + "train_runtime": 32889.3297, + "train_tokens_per_second": 135896.816 + }, + { + "epoch": 0.683, + "grad_norm": 0.5431249141693115, + "learning_rate": 6.971039741549894e-05, + "loss": 1.1872, + "num_input_tokens_seen": 4476108800, + "step": 68300, + "train_runtime": 32943.0615, + "train_tokens_per_second": 135874.099 + }, + { + "epoch": 0.684, + "grad_norm": 0.8621413111686707, + "learning_rate": 6.930873331764162e-05, + "loss": 1.1776, + "num_input_tokens_seen": 4482662400, + "step": 68400, + "train_runtime": 32991.0019, + "train_tokens_per_second": 135875.304 + }, + { + "epoch": 0.685, + "grad_norm": 0.6102387309074402, + "learning_rate": 6.890788177994391e-05, + "loss": 1.18, + "num_input_tokens_seen": 4489216000, + "step": 68500, + "train_runtime": 33039.2288, + "train_tokens_per_second": 135875.326 + }, + { + "epoch": 0.686, + "grad_norm": 0.5266649723052979, + "learning_rate": 6.850784683897641e-05, + "loss": 1.1743, + "num_input_tokens_seen": 4495769600, + "step": 68600, + "train_runtime": 33086.8363, + "train_tokens_per_second": 135877.893 + }, + { + "epoch": 0.687, + "grad_norm": 0.5879511833190918, + "learning_rate": 6.810863252308653e-05, + "loss": 1.1803, + "num_input_tokens_seen": 4502323200, + "step": 68700, + "train_runtime": 33133.6328, + "train_tokens_per_second": 135883.778 + }, + { + "epoch": 0.688, + "grad_norm": 0.5183672308921814, + "learning_rate": 6.771024285235792e-05, + "loss": 1.1834, + "num_input_tokens_seen": 4508876800, + "step": 68800, + "train_runtime": 33182.6281, + "train_tokens_per_second": 135880.642 + }, + { + "epoch": 0.689, + "grad_norm": 0.5091114640235901, + "learning_rate": 6.73126818385702e-05, + "loss": 1.1913, + "num_input_tokens_seen": 4515430400, + "step": 68900, + "train_runtime": 33236.4019, + "train_tokens_per_second": 135857.979 + }, + { + "epoch": 0.69, + "grad_norm": 0.7696628570556641, + "learning_rate": 6.691595348515837e-05, + "loss": 1.1786, + "num_input_tokens_seen": 4521984000, + "step": 69000, + "train_runtime": 33285.7582, + "train_tokens_per_second": 135853.417 + }, + { + "epoch": 0.691, + "grad_norm": 0.5338857769966125, + "learning_rate": 6.65200617871726e-05, + "loss": 1.1832, + "num_input_tokens_seen": 4528537600, + "step": 69100, + "train_runtime": 33332.8826, + "train_tokens_per_second": 135857.965 + }, + { + "epoch": 0.692, + "grad_norm": 0.7705228328704834, + "learning_rate": 6.612501073123775e-05, + "loss": 1.1762, + "num_input_tokens_seen": 4535091200, + "step": 69200, + "train_runtime": 33380.8611, + "train_tokens_per_second": 135859.024 + }, + { + "epoch": 0.693, + "grad_norm": 0.5423911213874817, + "learning_rate": 6.573080429551368e-05, + "loss": 1.19, + "num_input_tokens_seen": 4541644800, + "step": 69300, + "train_runtime": 33429.7481, + "train_tokens_per_second": 135856.387 + }, + { + "epoch": 0.694, + "grad_norm": 0.5332856774330139, + "learning_rate": 6.533744644965482e-05, + "loss": 1.1753, + "num_input_tokens_seen": 4548198400, + "step": 69400, + "train_runtime": 33476.6955, + "train_tokens_per_second": 135861.629 + }, + { + "epoch": 0.695, + "grad_norm": 0.5862846970558167, + "learning_rate": 6.494494115477023e-05, + "loss": 1.1799, + "num_input_tokens_seen": 4554752000, + "step": 69500, + "train_runtime": 33523.7618, + "train_tokens_per_second": 135866.375 + }, + { + "epoch": 0.696, + "grad_norm": 0.658592164516449, + "learning_rate": 6.455329236338394e-05, + "loss": 1.1846, + "num_input_tokens_seen": 4561305600, + "step": 69600, + "train_runtime": 33571.8888, + "train_tokens_per_second": 135866.815 + }, + { + "epoch": 0.697, + "grad_norm": 0.5558256506919861, + "learning_rate": 6.416250401939496e-05, + "loss": 1.1873, + "num_input_tokens_seen": 4567859200, + "step": 69700, + "train_runtime": 33620.7189, + "train_tokens_per_second": 135864.412 + }, + { + "epoch": 0.698, + "grad_norm": 0.5283026099205017, + "learning_rate": 6.377258005803746e-05, + "loss": 1.1743, + "num_input_tokens_seen": 4574412800, + "step": 69800, + "train_runtime": 33674.4741, + "train_tokens_per_second": 135842.145 + }, + { + "epoch": 0.699, + "grad_norm": 0.802412211894989, + "learning_rate": 6.338352440584149e-05, + "loss": 1.1782, + "num_input_tokens_seen": 4580966400, + "step": 69900, + "train_runtime": 33722.7187, + "train_tokens_per_second": 135842.144 + }, + { + "epoch": 0.7, + "grad_norm": 0.5585867762565613, + "learning_rate": 6.299534098059318e-05, + "loss": 1.1809, + "num_input_tokens_seen": 4587520000, + "step": 70000, + "train_runtime": 33770.2671, + "train_tokens_per_second": 135844.943 + }, + { + "epoch": 0.701, + "grad_norm": 0.6285941004753113, + "learning_rate": 6.260803369129522e-05, + "loss": 1.1807, + "num_input_tokens_seen": 4594073600, + "step": 70100, + "train_runtime": 33818.7011, + "train_tokens_per_second": 135844.176 + }, + { + "epoch": 0.702, + "grad_norm": 0.9580085277557373, + "learning_rate": 6.222160643812774e-05, + "loss": 1.1802, + "num_input_tokens_seen": 4600627200, + "step": 70200, + "train_runtime": 33866.618, + "train_tokens_per_second": 135845.487 + }, + { + "epoch": 0.703, + "grad_norm": 0.6520081162452698, + "learning_rate": 6.183606311240901e-05, + "loss": 1.1879, + "num_input_tokens_seen": 4607180800, + "step": 70300, + "train_runtime": 33915.2388, + "train_tokens_per_second": 135843.973 + }, + { + "epoch": 0.704, + "grad_norm": 0.520710289478302, + "learning_rate": 6.145140759655585e-05, + "loss": 1.179, + "num_input_tokens_seen": 4613734400, + "step": 70400, + "train_runtime": 33968.6026, + "train_tokens_per_second": 135823.497 + }, + { + "epoch": 0.705, + "grad_norm": 0.5945906639099121, + "learning_rate": 6.10676437640451e-05, + "loss": 1.192, + "num_input_tokens_seen": 4620288000, + "step": 70500, + "train_runtime": 34016.7254, + "train_tokens_per_second": 135824.008 + }, + { + "epoch": 0.706, + "grad_norm": 0.5285692811012268, + "learning_rate": 6.068477547937436e-05, + "loss": 1.1855, + "num_input_tokens_seen": 4626841600, + "step": 70600, + "train_runtime": 34064.6033, + "train_tokens_per_second": 135825.495 + }, + { + "epoch": 0.707, + "grad_norm": 0.6492000222206116, + "learning_rate": 6.030280659802294e-05, + "loss": 1.192, + "num_input_tokens_seen": 4633395200, + "step": 70700, + "train_runtime": 34111.1694, + "train_tokens_per_second": 135832.2 + }, + { + "epoch": 0.708, + "grad_norm": 0.5521112084388733, + "learning_rate": 5.9921740966413204e-05, + "loss": 1.1781, + "num_input_tokens_seen": 4639948800, + "step": 70800, + "train_runtime": 34162.8893, + "train_tokens_per_second": 135818.395 + }, + { + "epoch": 0.709, + "grad_norm": 0.9012600183486938, + "learning_rate": 5.954158242187197e-05, + "loss": 1.1748, + "num_input_tokens_seen": 4646502400, + "step": 70900, + "train_runtime": 34211.5739, + "train_tokens_per_second": 135816.68 + }, + { + "epoch": 0.71, + "grad_norm": 0.4976861774921417, + "learning_rate": 5.91623347925914e-05, + "loss": 1.1902, + "num_input_tokens_seen": 4653056000, + "step": 71000, + "train_runtime": 34258.4131, + "train_tokens_per_second": 135822.287 + }, + { + "epoch": 0.711, + "grad_norm": 0.5690837502479553, + "learning_rate": 5.8784001897590996e-05, + "loss": 1.1767, + "num_input_tokens_seen": 4659609600, + "step": 71100, + "train_runtime": 34307.7023, + "train_tokens_per_second": 135818.177 + }, + { + "epoch": 0.712, + "grad_norm": 0.5648302435874939, + "learning_rate": 5.840658754667877e-05, + "loss": 1.182, + "num_input_tokens_seen": 4666163200, + "step": 71200, + "train_runtime": 34355.8058, + "train_tokens_per_second": 135818.768 + }, + { + "epoch": 0.713, + "grad_norm": 0.5309351086616516, + "learning_rate": 5.8030095540413144e-05, + "loss": 1.1755, + "num_input_tokens_seen": 4672716800, + "step": 71300, + "train_runtime": 34402.7961, + "train_tokens_per_second": 135823.751 + }, + { + "epoch": 0.714, + "grad_norm": 1.0066486597061157, + "learning_rate": 5.7654529670064326e-05, + "loss": 1.2073, + "num_input_tokens_seen": 4679270400, + "step": 71400, + "train_runtime": 34458.8447, + "train_tokens_per_second": 135793.015 + }, + { + "epoch": 0.715, + "grad_norm": 0.625823974609375, + "learning_rate": 5.7279893717576485e-05, + "loss": 1.2012, + "num_input_tokens_seen": 4685824000, + "step": 71500, + "train_runtime": 34506.5957, + "train_tokens_per_second": 135795.024 + }, + { + "epoch": 0.716, + "grad_norm": 0.512055516242981, + "learning_rate": 5.690619145552958e-05, + "loss": 1.1702, + "num_input_tokens_seen": 4692377600, + "step": 71600, + "train_runtime": 34554.5393, + "train_tokens_per_second": 135796.271 + }, + { + "epoch": 0.717, + "grad_norm": 0.749454915523529, + "learning_rate": 5.6533426647101135e-05, + "loss": 1.1812, + "num_input_tokens_seen": 4698931200, + "step": 71700, + "train_runtime": 34601.4153, + "train_tokens_per_second": 135801.705 + }, + { + "epoch": 0.718, + "grad_norm": 0.5417782068252563, + "learning_rate": 5.6161603046028674e-05, + "loss": 1.1681, + "num_input_tokens_seen": 4705484800, + "step": 71800, + "train_runtime": 34650.0822, + "train_tokens_per_second": 135800.105 + }, + { + "epoch": 0.719, + "grad_norm": 0.7127480506896973, + "learning_rate": 5.579072439657179e-05, + "loss": 1.1946, + "num_input_tokens_seen": 4712038400, + "step": 71900, + "train_runtime": 34698.539, + "train_tokens_per_second": 135799.331 + }, + { + "epoch": 0.72, + "grad_norm": 0.5434790253639221, + "learning_rate": 5.542079443347431e-05, + "loss": 1.1761, + "num_input_tokens_seen": 4718592000, + "step": 72000, + "train_runtime": 34745.7766, + "train_tokens_per_second": 135803.325 + }, + { + "epoch": 0.721, + "grad_norm": 0.5872786045074463, + "learning_rate": 5.505181688192682e-05, + "loss": 1.1758, + "num_input_tokens_seen": 4725145600, + "step": 72100, + "train_runtime": 34797.942, + "train_tokens_per_second": 135788.076 + }, + { + "epoch": 0.722, + "grad_norm": 0.5440493822097778, + "learning_rate": 5.468379545752925e-05, + "loss": 1.2086, + "num_input_tokens_seen": 4731699200, + "step": 72200, + "train_runtime": 34846.6082, + "train_tokens_per_second": 135786.507 + }, + { + "epoch": 0.723, + "grad_norm": 0.5699992775917053, + "learning_rate": 5.4316733866253166e-05, + "loss": 1.1705, + "num_input_tokens_seen": 4738252800, + "step": 72300, + "train_runtime": 34894.2941, + "train_tokens_per_second": 135788.756 + }, + { + "epoch": 0.724, + "grad_norm": 0.7067492604255676, + "learning_rate": 5.3950635804404754e-05, + "loss": 1.1788, + "num_input_tokens_seen": 4744806400, + "step": 72400, + "train_runtime": 34943.1279, + "train_tokens_per_second": 135786.539 + }, + { + "epoch": 0.725, + "grad_norm": 0.4926595389842987, + "learning_rate": 5.358550495858751e-05, + "loss": 1.1712, + "num_input_tokens_seen": 4751360000, + "step": 72500, + "train_runtime": 34988.8033, + "train_tokens_per_second": 135796.585 + }, + { + "epoch": 0.726, + "grad_norm": 0.6217764616012573, + "learning_rate": 5.322134500566487e-05, + "loss": 1.199, + "num_input_tokens_seen": 4757913600, + "step": 72600, + "train_runtime": 35043.098, + "train_tokens_per_second": 135773.201 + }, + { + "epoch": 0.727, + "grad_norm": 0.5704054236412048, + "learning_rate": 5.285815961272359e-05, + "loss": 1.1782, + "num_input_tokens_seen": 4764467200, + "step": 72700, + "train_runtime": 35090.0359, + "train_tokens_per_second": 135778.351 + }, + { + "epoch": 0.728, + "grad_norm": 0.6081520915031433, + "learning_rate": 5.249595243703658e-05, + "loss": 1.1679, + "num_input_tokens_seen": 4771020800, + "step": 72800, + "train_runtime": 35136.6254, + "train_tokens_per_second": 135784.833 + }, + { + "epoch": 0.729, + "grad_norm": 0.6235555410385132, + "learning_rate": 5.213472712602598e-05, + "loss": 1.1707, + "num_input_tokens_seen": 4777574400, + "step": 72900, + "train_runtime": 35185.4188, + "train_tokens_per_second": 135782.792 + }, + { + "epoch": 0.73, + "grad_norm": 0.5777461528778076, + "learning_rate": 5.17744873172267e-05, + "loss": 1.1816, + "num_input_tokens_seen": 4784128000, + "step": 73000, + "train_runtime": 35238.2318, + "train_tokens_per_second": 135765.268 + }, + { + "epoch": 0.731, + "grad_norm": 0.569218635559082, + "learning_rate": 5.1415236638249694e-05, + "loss": 1.1757, + "num_input_tokens_seen": 4790681600, + "step": 73100, + "train_runtime": 35286.0257, + "train_tokens_per_second": 135767.106 + }, + { + "epoch": 0.732, + "grad_norm": 1.2679173946380615, + "learning_rate": 5.105697870674519e-05, + "loss": 1.1686, + "num_input_tokens_seen": 4797235200, + "step": 73200, + "train_runtime": 35333.5517, + "train_tokens_per_second": 135769.969 + }, + { + "epoch": 0.733, + "grad_norm": 0.5663115382194519, + "learning_rate": 5.069971713036664e-05, + "loss": 1.1699, + "num_input_tokens_seen": 4803788800, + "step": 73300, + "train_runtime": 35380.3642, + "train_tokens_per_second": 135775.561 + }, + { + "epoch": 0.734, + "grad_norm": 0.5404617190361023, + "learning_rate": 5.034345550673415e-05, + "loss": 1.1916, + "num_input_tokens_seen": 4810342400, + "step": 73400, + "train_runtime": 35434.8234, + "train_tokens_per_second": 135751.838 + }, + { + "epoch": 0.735, + "grad_norm": 0.7994534373283386, + "learning_rate": 4.998819742339835e-05, + "loss": 1.1842, + "num_input_tokens_seen": 4816896000, + "step": 73500, + "train_runtime": 35482.3263, + "train_tokens_per_second": 135754.797 + }, + { + "epoch": 0.736, + "grad_norm": 0.6482565402984619, + "learning_rate": 4.963394645780411e-05, + "loss": 1.1789, + "num_input_tokens_seen": 4823449600, + "step": 73600, + "train_runtime": 35530.782, + "train_tokens_per_second": 135754.107 + }, + { + "epoch": 0.737, + "grad_norm": 0.5401994585990906, + "learning_rate": 4.928070617725482e-05, + "loss": 1.1832, + "num_input_tokens_seen": 4830003200, + "step": 73700, + "train_runtime": 35578.1016, + "train_tokens_per_second": 135757.755 + }, + { + "epoch": 0.738, + "grad_norm": 0.5170857906341553, + "learning_rate": 4.892848013887613e-05, + "loss": 1.1804, + "num_input_tokens_seen": 4836556800, + "step": 73800, + "train_runtime": 35625.1017, + "train_tokens_per_second": 135762.61 + }, + { + "epoch": 0.739, + "grad_norm": 0.5744811296463013, + "learning_rate": 4.857727188958031e-05, + "loss": 1.181, + "num_input_tokens_seen": 4843110400, + "step": 73900, + "train_runtime": 35672.7413, + "train_tokens_per_second": 135765.019 + }, + { + "epoch": 0.74, + "grad_norm": 0.6613340377807617, + "learning_rate": 4.822708496603052e-05, + "loss": 1.1879, + "num_input_tokens_seen": 4849664000, + "step": 74000, + "train_runtime": 35721.0554, + "train_tokens_per_second": 135764.858 + }, + { + "epoch": 0.741, + "grad_norm": 0.5571849346160889, + "learning_rate": 4.7877922894605304e-05, + "loss": 1.1781, + "num_input_tokens_seen": 4856217600, + "step": 74100, + "train_runtime": 35771.1997, + "train_tokens_per_second": 135757.75 + }, + { + "epoch": 0.742, + "grad_norm": 0.6960323452949524, + "learning_rate": 4.752978919136273e-05, + "loss": 1.1702, + "num_input_tokens_seen": 4862771200, + "step": 74200, + "train_runtime": 35823.4168, + "train_tokens_per_second": 135742.808 + }, + { + "epoch": 0.743, + "grad_norm": 0.5823075175285339, + "learning_rate": 4.7182687362005337e-05, + "loss": 1.1762, + "num_input_tokens_seen": 4869324800, + "step": 74300, + "train_runtime": 35872.0393, + "train_tokens_per_second": 135741.511 + }, + { + "epoch": 0.744, + "grad_norm": 0.5310567021369934, + "learning_rate": 4.6836620901844794e-05, + "loss": 1.1737, + "num_input_tokens_seen": 4875878400, + "step": 74400, + "train_runtime": 35918.2124, + "train_tokens_per_second": 135749.473 + }, + { + "epoch": 0.745, + "grad_norm": 0.560118556022644, + "learning_rate": 4.64915932957664e-05, + "loss": 1.1746, + "num_input_tokens_seen": 4882432000, + "step": 74500, + "train_runtime": 35972.3831, + "train_tokens_per_second": 135727.232 + }, + { + "epoch": 0.746, + "grad_norm": 0.5729120969772339, + "learning_rate": 4.614760801819433e-05, + "loss": 1.1729, + "num_input_tokens_seen": 4888985600, + "step": 74600, + "train_runtime": 36018.4093, + "train_tokens_per_second": 135735.744 + }, + { + "epoch": 0.747, + "grad_norm": 0.5329717397689819, + "learning_rate": 4.58046685330566e-05, + "loss": 1.1969, + "num_input_tokens_seen": 4895539200, + "step": 74700, + "train_runtime": 36066.8487, + "train_tokens_per_second": 135735.152 + }, + { + "epoch": 0.748, + "grad_norm": 0.5714908838272095, + "learning_rate": 4.546277829374993e-05, + "loss": 1.172, + "num_input_tokens_seen": 4902092800, + "step": 74800, + "train_runtime": 36115.3648, + "train_tokens_per_second": 135734.273 + }, + { + "epoch": 0.749, + "grad_norm": 0.5672817826271057, + "learning_rate": 4.5121940743105246e-05, + "loss": 1.1813, + "num_input_tokens_seen": 4908646400, + "step": 74900, + "train_runtime": 36164.0493, + "train_tokens_per_second": 135732.765 + }, + { + "epoch": 0.75, + "grad_norm": 0.5890370607376099, + "learning_rate": 4.478215931335295e-05, + "loss": 1.1667, + "num_input_tokens_seen": 4915200000, + "step": 75000, + "train_runtime": 36215.8524, + "train_tokens_per_second": 135719.572 + }, + { + "epoch": 0.751, + "grad_norm": 0.6215245127677917, + "learning_rate": 4.4443437426088205e-05, + "loss": 1.179, + "num_input_tokens_seen": 4921753600, + "step": 75100, + "train_runtime": 36264.1849, + "train_tokens_per_second": 135719.405 + }, + { + "epoch": 0.752, + "grad_norm": 1.4719446897506714, + "learning_rate": 4.410577849223666e-05, + "loss": 1.1847, + "num_input_tokens_seen": 4928307200, + "step": 75200, + "train_runtime": 36312.9781, + "train_tokens_per_second": 135717.516 + }, + { + "epoch": 0.753, + "grad_norm": 1.3475043773651123, + "learning_rate": 4.376918591202006e-05, + "loss": 1.1745, + "num_input_tokens_seen": 4934860800, + "step": 75300, + "train_runtime": 36359.7761, + "train_tokens_per_second": 135723.08 + }, + { + "epoch": 0.754, + "grad_norm": 0.9558594822883606, + "learning_rate": 4.3433663074922046e-05, + "loss": 1.181, + "num_input_tokens_seen": 4941414400, + "step": 75400, + "train_runtime": 36406.8385, + "train_tokens_per_second": 135727.644 + }, + { + "epoch": 0.755, + "grad_norm": 0.5916360020637512, + "learning_rate": 4.309921335965367e-05, + "loss": 1.1706, + "num_input_tokens_seen": 4947968000, + "step": 75500, + "train_runtime": 36460.2599, + "train_tokens_per_second": 135708.522 + }, + { + "epoch": 0.756, + "grad_norm": 0.5985275506973267, + "learning_rate": 4.276584013411992e-05, + "loss": 1.1758, + "num_input_tokens_seen": 4954521600, + "step": 75600, + "train_runtime": 36507.6786, + "train_tokens_per_second": 135711.768 + }, + { + "epoch": 0.757, + "grad_norm": 0.5550095438957214, + "learning_rate": 4.243354675538555e-05, + "loss": 1.1705, + "num_input_tokens_seen": 4961075200, + "step": 75700, + "train_runtime": 36554.9962, + "train_tokens_per_second": 135715.38 + }, + { + "epoch": 0.758, + "grad_norm": 0.5496001243591309, + "learning_rate": 4.210233656964111e-05, + "loss": 1.1746, + "num_input_tokens_seen": 4967628800, + "step": 75800, + "train_runtime": 36602.3493, + "train_tokens_per_second": 135718.851 + }, + { + "epoch": 0.759, + "grad_norm": 0.570070743560791, + "learning_rate": 4.1772212912169516e-05, + "loss": 1.1771, + "num_input_tokens_seen": 4974182400, + "step": 75900, + "train_runtime": 36656.3482, + "train_tokens_per_second": 135697.707 + }, + { + "epoch": 0.76, + "grad_norm": 0.7570028305053711, + "learning_rate": 4.14431791073124e-05, + "loss": 1.1756, + "num_input_tokens_seen": 4980736000, + "step": 76000, + "train_runtime": 36704.1036, + "train_tokens_per_second": 135699.704 + }, + { + "epoch": 0.761, + "grad_norm": 0.6243161559104919, + "learning_rate": 4.111523846843639e-05, + "loss": 1.1667, + "num_input_tokens_seen": 4987289600, + "step": 76100, + "train_runtime": 36753.037, + "train_tokens_per_second": 135697.347 + }, + { + "epoch": 0.762, + "grad_norm": 0.5531216263771057, + "learning_rate": 4.078839429790019e-05, + "loss": 1.1755, + "num_input_tokens_seen": 4993843200, + "step": 76200, + "train_runtime": 36800.3039, + "train_tokens_per_second": 135701.14 + }, + { + "epoch": 0.763, + "grad_norm": 0.5894837379455566, + "learning_rate": 4.046264988702097e-05, + "loss": 1.1778, + "num_input_tokens_seen": 5000396800, + "step": 76300, + "train_runtime": 36847.8696, + "train_tokens_per_second": 135703.824 + }, + { + "epoch": 0.764, + "grad_norm": 0.6210083365440369, + "learning_rate": 4.013800851604123e-05, + "loss": 1.1729, + "num_input_tokens_seen": 5006950400, + "step": 76400, + "train_runtime": 36901.2456, + "train_tokens_per_second": 135685.133 + }, + { + "epoch": 0.765, + "grad_norm": 0.5929700136184692, + "learning_rate": 3.981447345409606e-05, + "loss": 1.171, + "num_input_tokens_seen": 5013504000, + "step": 76500, + "train_runtime": 36949.2788, + "train_tokens_per_second": 135686.113 + }, + { + "epoch": 0.766, + "grad_norm": 0.5809143781661987, + "learning_rate": 3.949204795917995e-05, + "loss": 1.1775, + "num_input_tokens_seen": 5020057600, + "step": 76600, + "train_runtime": 36996.6957, + "train_tokens_per_second": 135689.35 + }, + { + "epoch": 0.767, + "grad_norm": 0.5398791432380676, + "learning_rate": 3.917073527811399e-05, + "loss": 1.1765, + "num_input_tokens_seen": 5026611200, + "step": 76700, + "train_runtime": 37044.9859, + "train_tokens_per_second": 135689.381 + }, + { + "epoch": 0.768, + "grad_norm": 0.8559983372688293, + "learning_rate": 3.885053864651334e-05, + "loss": 1.1661, + "num_input_tokens_seen": 5033164800, + "step": 76800, + "train_runtime": 37092.5707, + "train_tokens_per_second": 135691.992 + }, + { + "epoch": 0.769, + "grad_norm": 1.0961577892303467, + "learning_rate": 3.8531461288754564e-05, + "loss": 1.1734, + "num_input_tokens_seen": 5039718400, + "step": 76900, + "train_runtime": 37145.642, + "train_tokens_per_second": 135674.554 + }, + { + "epoch": 0.77, + "grad_norm": 0.5564078688621521, + "learning_rate": 3.821350641794305e-05, + "loss": 1.1783, + "num_input_tokens_seen": 5046272000, + "step": 77000, + "train_runtime": 37194.2194, + "train_tokens_per_second": 135673.556 + }, + { + "epoch": 0.771, + "grad_norm": 0.6036384701728821, + "learning_rate": 3.789667723588087e-05, + "loss": 1.1651, + "num_input_tokens_seen": 5052825600, + "step": 77100, + "train_runtime": 37242.6728, + "train_tokens_per_second": 135673.012 + }, + { + "epoch": 0.772, + "grad_norm": 1.4465519189834595, + "learning_rate": 3.758097693303431e-05, + "loss": 1.1783, + "num_input_tokens_seen": 5059379200, + "step": 77200, + "train_runtime": 37290.7014, + "train_tokens_per_second": 135674.015 + }, + { + "epoch": 0.773, + "grad_norm": 0.5566693544387817, + "learning_rate": 3.7266408688502005e-05, + "loss": 1.1751, + "num_input_tokens_seen": 5065932800, + "step": 77300, + "train_runtime": 37338.6452, + "train_tokens_per_second": 135675.324 + }, + { + "epoch": 0.774, + "grad_norm": 0.653806209564209, + "learning_rate": 3.695297566998256e-05, + "loss": 1.1709, + "num_input_tokens_seen": 5072486400, + "step": 77400, + "train_runtime": 37386.3122, + "train_tokens_per_second": 135677.634 + }, + { + "epoch": 0.775, + "grad_norm": 0.8704593777656555, + "learning_rate": 3.664068103374307e-05, + "loss": 1.1794, + "num_input_tokens_seen": 5079040000, + "step": 77500, + "train_runtime": 37436.1356, + "train_tokens_per_second": 135672.123 + }, + { + "epoch": 0.776, + "grad_norm": 0.6627979874610901, + "learning_rate": 3.63295279245871e-05, + "loss": 1.175, + "num_input_tokens_seen": 5085593600, + "step": 77600, + "train_runtime": 37484.0969, + "train_tokens_per_second": 135673.366 + }, + { + "epoch": 0.777, + "grad_norm": 0.6232652068138123, + "learning_rate": 3.601951947582291e-05, + "loss": 1.1665, + "num_input_tokens_seen": 5092147200, + "step": 77700, + "train_runtime": 37536.8508, + "train_tokens_per_second": 135657.283 + }, + { + "epoch": 0.778, + "grad_norm": 0.5873488187789917, + "learning_rate": 3.571065880923216e-05, + "loss": 1.1734, + "num_input_tokens_seen": 5098700800, + "step": 77800, + "train_runtime": 37584.0839, + "train_tokens_per_second": 135661.17 + }, + { + "epoch": 0.779, + "grad_norm": 0.56858891248703, + "learning_rate": 3.540294903503841e-05, + "loss": 1.1696, + "num_input_tokens_seen": 5105254400, + "step": 77900, + "train_runtime": 37631.6286, + "train_tokens_per_second": 135663.924 + }, + { + "epoch": 0.78, + "grad_norm": 0.5939886569976807, + "learning_rate": 3.5096393251875566e-05, + "loss": 1.1784, + "num_input_tokens_seen": 5111808000, + "step": 78000, + "train_runtime": 37679.4424, + "train_tokens_per_second": 135665.702 + }, + { + "epoch": 0.781, + "grad_norm": 0.5839298367500305, + "learning_rate": 3.479099454675701e-05, + "loss": 1.1672, + "num_input_tokens_seen": 5118361600, + "step": 78100, + "train_runtime": 37733.7363, + "train_tokens_per_second": 135644.177 + }, + { + "epoch": 0.782, + "grad_norm": 0.6057742238044739, + "learning_rate": 3.448675599504434e-05, + "loss": 1.1767, + "num_input_tokens_seen": 5124915200, + "step": 78200, + "train_runtime": 37781.8162, + "train_tokens_per_second": 135645.019 + }, + { + "epoch": 0.783, + "grad_norm": 0.9875990748405457, + "learning_rate": 3.418368066041633e-05, + "loss": 1.1619, + "num_input_tokens_seen": 5131468800, + "step": 78300, + "train_runtime": 37829.8727, + "train_tokens_per_second": 135645.944 + }, + { + "epoch": 0.784, + "grad_norm": 0.5806832313537598, + "learning_rate": 3.388177159483826e-05, + "loss": 1.1747, + "num_input_tokens_seen": 5138022400, + "step": 78400, + "train_runtime": 37877.7351, + "train_tokens_per_second": 135647.561 + }, + { + "epoch": 0.785, + "grad_norm": 0.7016937136650085, + "learning_rate": 3.3581031838531116e-05, + "loss": 1.1664, + "num_input_tokens_seen": 5144576000, + "step": 78500, + "train_runtime": 37924.0105, + "train_tokens_per_second": 135654.851 + }, + { + "epoch": 0.786, + "grad_norm": 0.7171750664710999, + "learning_rate": 3.328146441994084e-05, + "loss": 1.1905, + "num_input_tokens_seen": 5151129600, + "step": 78600, + "train_runtime": 37971.9481, + "train_tokens_per_second": 135656.184 + }, + { + "epoch": 0.787, + "grad_norm": 0.5550017356872559, + "learning_rate": 3.2983072355708026e-05, + "loss": 1.1741, + "num_input_tokens_seen": 5157683200, + "step": 78700, + "train_runtime": 38021.3003, + "train_tokens_per_second": 135652.467 + }, + { + "epoch": 0.788, + "grad_norm": 0.5833317637443542, + "learning_rate": 3.2685858650637486e-05, + "loss": 1.176, + "num_input_tokens_seen": 5164236800, + "step": 78800, + "train_runtime": 38074.1209, + "train_tokens_per_second": 135636.403 + }, + { + "epoch": 0.789, + "grad_norm": 0.9918714165687561, + "learning_rate": 3.238982629766793e-05, + "loss": 1.1653, + "num_input_tokens_seen": 5170790400, + "step": 78900, + "train_runtime": 38121.5575, + "train_tokens_per_second": 135639.537 + }, + { + "epoch": 0.79, + "grad_norm": 1.2304959297180176, + "learning_rate": 3.209497827784177e-05, + "loss": 1.177, + "num_input_tokens_seen": 5177344000, + "step": 79000, + "train_runtime": 38168.8984, + "train_tokens_per_second": 135643.003 + }, + { + "epoch": 0.791, + "grad_norm": 0.5920888185501099, + "learning_rate": 3.1801317560275394e-05, + "loss": 1.1717, + "num_input_tokens_seen": 5183897600, + "step": 79100, + "train_runtime": 38223.2691, + "train_tokens_per_second": 135621.513 + }, + { + "epoch": 0.792, + "grad_norm": 0.5991621017456055, + "learning_rate": 3.150884710212895e-05, + "loss": 1.1933, + "num_input_tokens_seen": 5190451200, + "step": 79200, + "train_runtime": 38270.5225, + "train_tokens_per_second": 135625.303 + }, + { + "epoch": 0.793, + "grad_norm": 0.6007819175720215, + "learning_rate": 3.121756984857665e-05, + "loss": 1.1721, + "num_input_tokens_seen": 5197004800, + "step": 79300, + "train_runtime": 38316.5535, + "train_tokens_per_second": 135633.41 + }, + { + "epoch": 0.794, + "grad_norm": 0.6040635704994202, + "learning_rate": 3.092748873277725e-05, + "loss": 1.1784, + "num_input_tokens_seen": 5203558400, + "step": 79400, + "train_runtime": 38364.1371, + "train_tokens_per_second": 135636.008 + }, + { + "epoch": 0.795, + "grad_norm": 1.8925070762634277, + "learning_rate": 3.06386066758444e-05, + "loss": 1.179, + "num_input_tokens_seen": 5210112000, + "step": 79500, + "train_runtime": 38412.6561, + "train_tokens_per_second": 135635.297 + }, + { + "epoch": 0.796, + "grad_norm": 0.6026915311813354, + "learning_rate": 3.0350926586817127e-05, + "loss": 1.1706, + "num_input_tokens_seen": 5216665600, + "step": 79600, + "train_runtime": 38465.3514, + "train_tokens_per_second": 135619.861 + }, + { + "epoch": 0.797, + "grad_norm": 0.7981861233711243, + "learning_rate": 3.0064451362630765e-05, + "loss": 1.1796, + "num_input_tokens_seen": 5223219200, + "step": 79700, + "train_runtime": 38512.271, + "train_tokens_per_second": 135624.804 + }, + { + "epoch": 0.798, + "grad_norm": 1.3739973306655884, + "learning_rate": 2.9779183888087683e-05, + "loss": 1.1827, + "num_input_tokens_seen": 5229772800, + "step": 79800, + "train_runtime": 38560.5377, + "train_tokens_per_second": 135624.997 + }, + { + "epoch": 0.799, + "grad_norm": 0.7507041692733765, + "learning_rate": 2.9495127035828103e-05, + "loss": 1.164, + "num_input_tokens_seen": 5236326400, + "step": 79900, + "train_runtime": 38608.5419, + "train_tokens_per_second": 135626.111 + }, + { + "epoch": 0.8, + "grad_norm": 0.5848426818847656, + "learning_rate": 2.921228366630144e-05, + "loss": 1.1746, + "num_input_tokens_seen": 5242880000, + "step": 80000, + "train_runtime": 38660.3487, + "train_tokens_per_second": 135613.883 + }, + { + "epoch": 0.801, + "grad_norm": 0.5851396322250366, + "learning_rate": 2.8930656627737276e-05, + "loss": 1.1999, + "num_input_tokens_seen": 5249433600, + "step": 80100, + "train_runtime": 38707.849, + "train_tokens_per_second": 135616.774 + }, + { + "epoch": 0.802, + "grad_norm": 0.5581755638122559, + "learning_rate": 2.8650248756116727e-05, + "loss": 1.1657, + "num_input_tokens_seen": 5255987200, + "step": 80200, + "train_runtime": 38755.0614, + "train_tokens_per_second": 135620.665 + }, + { + "epoch": 0.803, + "grad_norm": 0.8737390637397766, + "learning_rate": 2.8371062875143968e-05, + "loss": 1.168, + "num_input_tokens_seen": 5262540800, + "step": 80300, + "train_runtime": 38809.0814, + "train_tokens_per_second": 135600.757 + }, + { + "epoch": 0.804, + "grad_norm": 0.6018446683883667, + "learning_rate": 2.809310179621776e-05, + "loss": 1.1603, + "num_input_tokens_seen": 5269094400, + "step": 80400, + "train_runtime": 38856.5205, + "train_tokens_per_second": 135603.866 + }, + { + "epoch": 0.805, + "grad_norm": 0.5673835873603821, + "learning_rate": 2.781636831840303e-05, + "loss": 1.1748, + "num_input_tokens_seen": 5275648000, + "step": 80500, + "train_runtime": 38904.9212, + "train_tokens_per_second": 135603.616 + }, + { + "epoch": 0.806, + "grad_norm": 0.5929433703422546, + "learning_rate": 2.754086522840282e-05, + "loss": 1.1663, + "num_input_tokens_seen": 5282201600, + "step": 80600, + "train_runtime": 38952.3955, + "train_tokens_per_second": 135606.592 + }, + { + "epoch": 0.807, + "grad_norm": 0.555366039276123, + "learning_rate": 2.7266595300530204e-05, + "loss": 1.1665, + "num_input_tokens_seen": 5288755200, + "step": 80700, + "train_runtime": 39001.4372, + "train_tokens_per_second": 135604.11 + }, + { + "epoch": 0.808, + "grad_norm": 0.5364073514938354, + "learning_rate": 2.6993561296680342e-05, + "loss": 1.1687, + "num_input_tokens_seen": 5295308800, + "step": 80800, + "train_runtime": 39048.23, + "train_tokens_per_second": 135609.445 + }, + { + "epoch": 0.809, + "grad_norm": 0.9588598608970642, + "learning_rate": 2.672176596630258e-05, + "loss": 1.1831, + "num_input_tokens_seen": 5301862400, + "step": 80900, + "train_runtime": 39096.7929, + "train_tokens_per_second": 135608.627 + }, + { + "epoch": 0.81, + "grad_norm": 0.6481744050979614, + "learning_rate": 2.6451212046372883e-05, + "loss": 1.1686, + "num_input_tokens_seen": 5308416000, + "step": 81000, + "train_runtime": 39152.1435, + "train_tokens_per_second": 135584.301 + }, + { + "epoch": 0.811, + "grad_norm": 0.5828465819358826, + "learning_rate": 2.6181902261366256e-05, + "loss": 1.1662, + "num_input_tokens_seen": 5314969600, + "step": 81100, + "train_runtime": 39199.715, + "train_tokens_per_second": 135586.945 + }, + { + "epoch": 0.812, + "grad_norm": 0.5715954899787903, + "learning_rate": 2.5913839323229195e-05, + "loss": 1.1623, + "num_input_tokens_seen": 5321523200, + "step": 81200, + "train_runtime": 39246.528, + "train_tokens_per_second": 135592.203 + }, + { + "epoch": 0.813, + "grad_norm": 0.8631576299667358, + "learning_rate": 2.564702593135253e-05, + "loss": 1.1896, + "num_input_tokens_seen": 5328076800, + "step": 81300, + "train_runtime": 39294.7731, + "train_tokens_per_second": 135592.507 + }, + { + "epoch": 0.814, + "grad_norm": 0.5882650017738342, + "learning_rate": 2.538146477254419e-05, + "loss": 1.1728, + "num_input_tokens_seen": 5334630400, + "step": 81400, + "train_runtime": 39341.8017, + "train_tokens_per_second": 135597.003 + }, + { + "epoch": 0.815, + "grad_norm": 0.5567020773887634, + "learning_rate": 2.5117158521002033e-05, + "loss": 1.1669, + "num_input_tokens_seen": 5341184000, + "step": 81500, + "train_runtime": 39389.3033, + "train_tokens_per_second": 135599.86 + }, + { + "epoch": 0.816, + "grad_norm": 0.7412062883377075, + "learning_rate": 2.4854109838287116e-05, + "loss": 1.1629, + "num_input_tokens_seen": 5347737600, + "step": 81600, + "train_runtime": 39443.4282, + "train_tokens_per_second": 135579.939 + }, + { + "epoch": 0.817, + "grad_norm": 0.6353700757026672, + "learning_rate": 2.459232137329679e-05, + "loss": 1.1676, + "num_input_tokens_seen": 5354291200, + "step": 81700, + "train_runtime": 39490.3956, + "train_tokens_per_second": 135584.643 + }, + { + "epoch": 0.818, + "grad_norm": 0.6541226506233215, + "learning_rate": 2.4331795762237894e-05, + "loss": 1.1669, + "num_input_tokens_seen": 5360844800, + "step": 81800, + "train_runtime": 39539.3049, + "train_tokens_per_second": 135582.677 + }, + { + "epoch": 0.819, + "grad_norm": 0.684333086013794, + "learning_rate": 2.4072535628600514e-05, + "loss": 1.1623, + "num_input_tokens_seen": 5367398400, + "step": 81900, + "train_runtime": 39587.3713, + "train_tokens_per_second": 135583.602 + }, + { + "epoch": 0.82, + "grad_norm": 0.5568915605545044, + "learning_rate": 2.3814543583131306e-05, + "loss": 1.1662, + "num_input_tokens_seen": 5373952000, + "step": 82000, + "train_runtime": 39636.1132, + "train_tokens_per_second": 135582.214 + }, + { + "epoch": 0.821, + "grad_norm": 0.6357592940330505, + "learning_rate": 2.3557822223807287e-05, + "loss": 1.1617, + "num_input_tokens_seen": 5380505600, + "step": 82100, + "train_runtime": 39683.9299, + "train_tokens_per_second": 135583.991 + }, + { + "epoch": 0.822, + "grad_norm": 0.6660736203193665, + "learning_rate": 2.3302374135809727e-05, + "loss": 1.1788, + "num_input_tokens_seen": 5387059200, + "step": 82200, + "train_runtime": 39731.7683, + "train_tokens_per_second": 135585.69 + }, + { + "epoch": 0.823, + "grad_norm": 0.6093869805335999, + "learning_rate": 2.304820189149798e-05, + "loss": 1.1823, + "num_input_tokens_seen": 5393612800, + "step": 82300, + "train_runtime": 39780.5498, + "train_tokens_per_second": 135584.169 + }, + { + "epoch": 0.824, + "grad_norm": 1.0343610048294067, + "learning_rate": 2.2795308050383787e-05, + "loss": 1.1942, + "num_input_tokens_seen": 5400166400, + "step": 82400, + "train_runtime": 39833.9775, + "train_tokens_per_second": 135566.839 + }, + { + "epoch": 0.825, + "grad_norm": 0.5363211035728455, + "learning_rate": 2.2543695159105248e-05, + "loss": 1.1659, + "num_input_tokens_seen": 5406720000, + "step": 82500, + "train_runtime": 39881.8503, + "train_tokens_per_second": 135568.434 + }, + { + "epoch": 0.826, + "grad_norm": 0.9732265472412109, + "learning_rate": 2.2293365751401443e-05, + "loss": 1.1757, + "num_input_tokens_seen": 5413273600, + "step": 82600, + "train_runtime": 39929.975, + "train_tokens_per_second": 135569.171 + }, + { + "epoch": 0.827, + "grad_norm": 0.5309200286865234, + "learning_rate": 2.2044322348086735e-05, + "loss": 1.1651, + "num_input_tokens_seen": 5419827200, + "step": 82700, + "train_runtime": 39978.229, + "train_tokens_per_second": 135569.467 + }, + { + "epoch": 0.828, + "grad_norm": 0.543769121170044, + "learning_rate": 2.1796567457025372e-05, + "loss": 1.1685, + "num_input_tokens_seen": 5426380800, + "step": 82800, + "train_runtime": 40026.0125, + "train_tokens_per_second": 135571.356 + }, + { + "epoch": 0.829, + "grad_norm": 0.5210631489753723, + "learning_rate": 2.15501035731064e-05, + "loss": 1.1778, + "num_input_tokens_seen": 5432934400, + "step": 82900, + "train_runtime": 40075.0654, + "train_tokens_per_second": 135568.947 + }, + { + "epoch": 0.83, + "grad_norm": 1.3538480997085571, + "learning_rate": 2.1304933178218426e-05, + "loss": 1.1655, + "num_input_tokens_seen": 5439488000, + "step": 83000, + "train_runtime": 40123.2015, + "train_tokens_per_second": 135569.64 + }, + { + "epoch": 0.831, + "grad_norm": 1.2901802062988281, + "learning_rate": 2.1061058741224518e-05, + "loss": 1.1668, + "num_input_tokens_seen": 5446041600, + "step": 83100, + "train_runtime": 40170.8312, + "train_tokens_per_second": 135572.042 + }, + { + "epoch": 0.832, + "grad_norm": 0.6960340142250061, + "learning_rate": 2.0818482717937596e-05, + "loss": 1.163, + "num_input_tokens_seen": 5452595200, + "step": 83200, + "train_runtime": 40225.6882, + "train_tokens_per_second": 135550.079 + }, + { + "epoch": 0.833, + "grad_norm": 0.537268340587616, + "learning_rate": 2.0577207551095552e-05, + "loss": 1.1689, + "num_input_tokens_seen": 5459148800, + "step": 83300, + "train_runtime": 40273.4785, + "train_tokens_per_second": 135551.956 + }, + { + "epoch": 0.834, + "grad_norm": 0.564239501953125, + "learning_rate": 2.0337235670336584e-05, + "loss": 1.1662, + "num_input_tokens_seen": 5465702400, + "step": 83400, + "train_runtime": 40320.9705, + "train_tokens_per_second": 135554.832 + }, + { + "epoch": 0.835, + "grad_norm": 0.520041823387146, + "learning_rate": 2.0098569492174887e-05, + "loss": 1.1642, + "num_input_tokens_seen": 5472256000, + "step": 83500, + "train_runtime": 40369.1468, + "train_tokens_per_second": 135555.404 + }, + { + "epoch": 0.836, + "grad_norm": 0.616112232208252, + "learning_rate": 1.9861211419976258e-05, + "loss": 1.1671, + "num_input_tokens_seen": 5478809600, + "step": 83600, + "train_runtime": 40416.0661, + "train_tokens_per_second": 135560.19 + }, + { + "epoch": 0.837, + "grad_norm": 1.3083754777908325, + "learning_rate": 1.962516384393377e-05, + "loss": 1.1778, + "num_input_tokens_seen": 5485363200, + "step": 83700, + "train_runtime": 40465.3405, + "train_tokens_per_second": 135557.075 + }, + { + "epoch": 0.838, + "grad_norm": 0.5721991062164307, + "learning_rate": 1.939042914104396e-05, + "loss": 1.179, + "num_input_tokens_seen": 5491916800, + "step": 83800, + "train_runtime": 40513.1256, + "train_tokens_per_second": 135558.951 + }, + { + "epoch": 0.839, + "grad_norm": 0.8014708161354065, + "learning_rate": 1.9157009675082702e-05, + "loss": 1.1698, + "num_input_tokens_seen": 5498470400, + "step": 83900, + "train_runtime": 40567.2253, + "train_tokens_per_second": 135539.721 + }, + { + "epoch": 0.84, + "grad_norm": 0.7554424405097961, + "learning_rate": 1.8924907796581363e-05, + "loss": 1.1689, + "num_input_tokens_seen": 5505024000, + "step": 84000, + "train_runtime": 40615.2949, + "train_tokens_per_second": 135540.663 + }, + { + "epoch": 0.841, + "grad_norm": 0.6026338934898376, + "learning_rate": 1.869412584280329e-05, + "loss": 1.1727, + "num_input_tokens_seen": 5511577600, + "step": 84100, + "train_runtime": 40664.1179, + "train_tokens_per_second": 135539.091 + }, + { + "epoch": 0.842, + "grad_norm": 0.6569694876670837, + "learning_rate": 1.8464666137720208e-05, + "loss": 1.1717, + "num_input_tokens_seen": 5518131200, + "step": 84200, + "train_runtime": 40713.1869, + "train_tokens_per_second": 135536.705 + }, + { + "epoch": 0.843, + "grad_norm": 0.5886375904083252, + "learning_rate": 1.823653099198884e-05, + "loss": 1.1764, + "num_input_tokens_seen": 5524684800, + "step": 84300, + "train_runtime": 40759.1071, + "train_tokens_per_second": 135544.795 + }, + { + "epoch": 0.844, + "grad_norm": 0.6782867908477783, + "learning_rate": 1.800972270292749e-05, + "loss": 1.1637, + "num_input_tokens_seen": 5531238400, + "step": 84400, + "train_runtime": 40811.77, + "train_tokens_per_second": 135530.471 + }, + { + "epoch": 0.845, + "grad_norm": 0.6513829231262207, + "learning_rate": 1.778424355449317e-05, + "loss": 1.165, + "num_input_tokens_seen": 5537792000, + "step": 84500, + "train_runtime": 40858.6857, + "train_tokens_per_second": 135535.245 + }, + { + "epoch": 0.846, + "grad_norm": 0.6192531585693359, + "learning_rate": 1.756009581725841e-05, + "loss": 1.1589, + "num_input_tokens_seen": 5544345600, + "step": 84600, + "train_runtime": 40906.0609, + "train_tokens_per_second": 135538.487 + }, + { + "epoch": 0.847, + "grad_norm": 0.5640349388122559, + "learning_rate": 1.7337281748388387e-05, + "loss": 1.1653, + "num_input_tokens_seen": 5550899200, + "step": 84700, + "train_runtime": 40952.689, + "train_tokens_per_second": 135544.194 + }, + { + "epoch": 0.848, + "grad_norm": 0.5606239438056946, + "learning_rate": 1.7115803591618312e-05, + "loss": 1.1734, + "num_input_tokens_seen": 5557452800, + "step": 84800, + "train_runtime": 41006.8402, + "train_tokens_per_second": 135525.019 + }, + { + "epoch": 0.849, + "grad_norm": 0.5700273513793945, + "learning_rate": 1.6895663577230816e-05, + "loss": 1.1755, + "num_input_tokens_seen": 5564006400, + "step": 84900, + "train_runtime": 41054.6511, + "train_tokens_per_second": 135526.822 + }, + { + "epoch": 0.85, + "grad_norm": 0.7111489176750183, + "learning_rate": 1.667686392203333e-05, + "loss": 1.1673, + "num_input_tokens_seen": 5570560000, + "step": 85000, + "train_runtime": 41102.7763, + "train_tokens_per_second": 135527.585 + }, + { + "epoch": 0.851, + "grad_norm": 0.5908454060554504, + "learning_rate": 1.6459406829335996e-05, + "loss": 1.1767, + "num_input_tokens_seen": 5577113600, + "step": 85100, + "train_runtime": 41150.3215, + "train_tokens_per_second": 135530.256 + }, + { + "epoch": 0.852, + "grad_norm": 0.6215232610702515, + "learning_rate": 1.624329448892932e-05, + "loss": 1.171, + "num_input_tokens_seen": 5583667200, + "step": 85200, + "train_runtime": 41205.5284, + "train_tokens_per_second": 135507.72 + }, + { + "epoch": 0.853, + "grad_norm": 0.6203814744949341, + "learning_rate": 1.6028529077062163e-05, + "loss": 1.1591, + "num_input_tokens_seen": 5590220800, + "step": 85300, + "train_runtime": 41253.0291, + "train_tokens_per_second": 135510.553 + }, + { + "epoch": 0.854, + "grad_norm": 0.5267207026481628, + "learning_rate": 1.5815112756419805e-05, + "loss": 1.185, + "num_input_tokens_seen": 5596774400, + "step": 85400, + "train_runtime": 41301.2794, + "train_tokens_per_second": 135510.921 + }, + { + "epoch": 0.855, + "grad_norm": 0.5815737843513489, + "learning_rate": 1.5603047676102313e-05, + "loss": 1.173, + "num_input_tokens_seen": 5603328000, + "step": 85500, + "train_runtime": 41349.0127, + "train_tokens_per_second": 135512.982 + }, + { + "epoch": 0.856, + "grad_norm": 0.6342357397079468, + "learning_rate": 1.5392335971602638e-05, + "loss": 1.1568, + "num_input_tokens_seen": 5609881600, + "step": 85600, + "train_runtime": 41397.1556, + "train_tokens_per_second": 135513.697 + }, + { + "epoch": 0.857, + "grad_norm": 0.6623713970184326, + "learning_rate": 1.5182979764785258e-05, + "loss": 1.1649, + "num_input_tokens_seen": 5616435200, + "step": 85700, + "train_runtime": 41450.3243, + "train_tokens_per_second": 135497.98 + }, + { + "epoch": 0.858, + "grad_norm": 0.6217081546783447, + "learning_rate": 1.4974981163864896e-05, + "loss": 1.1772, + "num_input_tokens_seen": 5622988800, + "step": 85800, + "train_runtime": 41497.5379, + "train_tokens_per_second": 135501.745 + }, + { + "epoch": 0.859, + "grad_norm": 0.6180946826934814, + "learning_rate": 1.4768342263385192e-05, + "loss": 1.1601, + "num_input_tokens_seen": 5629542400, + "step": 85900, + "train_runtime": 41546.6611, + "train_tokens_per_second": 135499.274 + }, + { + "epoch": 0.86, + "grad_norm": 0.5609486103057861, + "learning_rate": 1.4563065144197517e-05, + "loss": 1.1866, + "num_input_tokens_seen": 5636096000, + "step": 86000, + "train_runtime": 41594.2678, + "train_tokens_per_second": 135501.748 + }, + { + "epoch": 0.861, + "grad_norm": 0.5352550148963928, + "learning_rate": 1.4359151873440216e-05, + "loss": 1.1732, + "num_input_tokens_seen": 5642649600, + "step": 86100, + "train_runtime": 41640.8053, + "train_tokens_per_second": 135507.696 + }, + { + "epoch": 0.862, + "grad_norm": 0.5788577198982239, + "learning_rate": 1.415660450451767e-05, + "loss": 1.1785, + "num_input_tokens_seen": 5649203200, + "step": 86200, + "train_runtime": 41695.0254, + "train_tokens_per_second": 135488.662 + }, + { + "epoch": 0.863, + "grad_norm": 0.5672028064727783, + "learning_rate": 1.3955425077079595e-05, + "loss": 1.1692, + "num_input_tokens_seen": 5655756800, + "step": 86300, + "train_runtime": 41742.7936, + "train_tokens_per_second": 135490.615 + }, + { + "epoch": 0.864, + "grad_norm": 0.577563464641571, + "learning_rate": 1.375561561700061e-05, + "loss": 1.1662, + "num_input_tokens_seen": 5662310400, + "step": 86400, + "train_runtime": 41789.652, + "train_tokens_per_second": 135495.515 + }, + { + "epoch": 0.865, + "grad_norm": 0.544994592666626, + "learning_rate": 1.3557178136359798e-05, + "loss": 1.1665, + "num_input_tokens_seen": 5668864000, + "step": 86500, + "train_runtime": 41842.8709, + "train_tokens_per_second": 135479.805 + }, + { + "epoch": 0.866, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.3360114633420333e-05, + "loss": 1.1644, + "num_input_tokens_seen": 5675417600, + "step": 86600, + "train_runtime": 41891.5143, + "train_tokens_per_second": 135478.932 + }, + { + "epoch": 0.867, + "grad_norm": 0.6005887985229492, + "learning_rate": 1.3164427092609503e-05, + "loss": 1.1742, + "num_input_tokens_seen": 5681971200, + "step": 86700, + "train_runtime": 41939.4895, + "train_tokens_per_second": 135480.218 + }, + { + "epoch": 0.868, + "grad_norm": 0.5312247276306152, + "learning_rate": 1.2970117484498732e-05, + "loss": 1.1575, + "num_input_tokens_seen": 5688524800, + "step": 86800, + "train_runtime": 41987.1811, + "train_tokens_per_second": 135482.418 + }, + { + "epoch": 0.869, + "grad_norm": 0.9317598342895508, + "learning_rate": 1.2777187765783558e-05, + "loss": 1.1668, + "num_input_tokens_seen": 5695078400, + "step": 86900, + "train_runtime": 42034.5611, + "train_tokens_per_second": 135485.616 + }, + { + "epoch": 0.87, + "grad_norm": 0.5501394271850586, + "learning_rate": 1.2585639879264103e-05, + "loss": 1.1741, + "num_input_tokens_seen": 5701632000, + "step": 87000, + "train_runtime": 42082.1201, + "train_tokens_per_second": 135488.231 + }, + { + "epoch": 0.871, + "grad_norm": 0.6144236326217651, + "learning_rate": 1.2395475753825518e-05, + "loss": 1.1665, + "num_input_tokens_seen": 5708185600, + "step": 87100, + "train_runtime": 42136.7086, + "train_tokens_per_second": 135468.236 + }, + { + "epoch": 0.872, + "grad_norm": 0.6324082612991333, + "learning_rate": 1.2206697304418367e-05, + "loss": 1.1523, + "num_input_tokens_seen": 5714739200, + "step": 87200, + "train_runtime": 42184.2095, + "train_tokens_per_second": 135471.051 + }, + { + "epoch": 0.873, + "grad_norm": 0.6486518979072571, + "learning_rate": 1.2019306432039594e-05, + "loss": 1.1872, + "num_input_tokens_seen": 5721292800, + "step": 87300, + "train_runtime": 42230.9222, + "train_tokens_per_second": 135476.388 + }, + { + "epoch": 0.874, + "grad_norm": 0.5755148530006409, + "learning_rate": 1.1833305023713153e-05, + "loss": 1.1963, + "num_input_tokens_seen": 5727846400, + "step": 87400, + "train_runtime": 42278.9901, + "train_tokens_per_second": 135477.37 + }, + { + "epoch": 0.875, + "grad_norm": 0.6408706307411194, + "learning_rate": 1.1648694952471205e-05, + "loss": 1.163, + "num_input_tokens_seen": 5734400000, + "step": 87500, + "train_runtime": 42326.8376, + "train_tokens_per_second": 135479.056 + }, + { + "epoch": 0.876, + "grad_norm": 0.6233325600624084, + "learning_rate": 1.1465478077335088e-05, + "loss": 1.1591, + "num_input_tokens_seen": 5740953600, + "step": 87600, + "train_runtime": 42379.4952, + "train_tokens_per_second": 135465.36 + }, + { + "epoch": 0.877, + "grad_norm": 0.8282228708267212, + "learning_rate": 1.1283656243296695e-05, + "loss": 1.1799, + "num_input_tokens_seen": 5747507200, + "step": 87700, + "train_runtime": 42427.8149, + "train_tokens_per_second": 135465.548 + }, + { + "epoch": 0.878, + "grad_norm": 0.7755045294761658, + "learning_rate": 1.1103231281299923e-05, + "loss": 1.1565, + "num_input_tokens_seen": 5754060800, + "step": 87800, + "train_runtime": 42474.6192, + "train_tokens_per_second": 135470.568 + }, + { + "epoch": 0.879, + "grad_norm": 0.6230588555335999, + "learning_rate": 1.0924205008222086e-05, + "loss": 1.1673, + "num_input_tokens_seen": 5760614400, + "step": 87900, + "train_runtime": 42522.6205, + "train_tokens_per_second": 135471.764 + }, + { + "epoch": 0.88, + "grad_norm": 0.5966441035270691, + "learning_rate": 1.0746579226855768e-05, + "loss": 1.1628, + "num_input_tokens_seen": 5767168000, + "step": 88000, + "train_runtime": 42576.1454, + "train_tokens_per_second": 135455.381 + }, + { + "epoch": 0.881, + "grad_norm": 0.6604552865028381, + "learning_rate": 1.0570355725890678e-05, + "loss": 1.1769, + "num_input_tokens_seen": 5773721600, + "step": 88100, + "train_runtime": 42624.8502, + "train_tokens_per_second": 135454.355 + }, + { + "epoch": 0.882, + "grad_norm": 0.5727500319480896, + "learning_rate": 1.0395536279895428e-05, + "loss": 1.1571, + "num_input_tokens_seen": 5780275200, + "step": 88200, + "train_runtime": 42673.6883, + "train_tokens_per_second": 135452.909 + }, + { + "epoch": 0.883, + "grad_norm": 0.5748215317726135, + "learning_rate": 1.0222122649299952e-05, + "loss": 1.1666, + "num_input_tokens_seen": 5786828800, + "step": 88300, + "train_runtime": 42720.0242, + "train_tokens_per_second": 135459.399 + }, + { + "epoch": 0.884, + "grad_norm": 0.6671021580696106, + "learning_rate": 1.0050116580377593e-05, + "loss": 1.1887, + "num_input_tokens_seen": 5793382400, + "step": 88400, + "train_runtime": 42766.9841, + "train_tokens_per_second": 135463.899 + }, + { + "epoch": 0.885, + "grad_norm": 0.7352688908576965, + "learning_rate": 9.879519805227515e-06, + "loss": 1.173, + "num_input_tokens_seen": 5799936000, + "step": 88500, + "train_runtime": 42820.4689, + "train_tokens_per_second": 135447.746 + }, + { + "epoch": 0.886, + "grad_norm": 0.5779001712799072, + "learning_rate": 9.710334041757351e-06, + "loss": 1.1612, + "num_input_tokens_seen": 5806489600, + "step": 88600, + "train_runtime": 42866.8877, + "train_tokens_per_second": 135453.958 + }, + { + "epoch": 0.887, + "grad_norm": 0.7246189713478088, + "learning_rate": 9.542560993665932e-06, + "loss": 1.1926, + "num_input_tokens_seen": 5813043200, + "step": 88700, + "train_runtime": 42915.9912, + "train_tokens_per_second": 135451.682 + }, + { + "epoch": 0.888, + "grad_norm": 0.5459685921669006, + "learning_rate": 9.376202350425888e-06, + "loss": 1.1698, + "num_input_tokens_seen": 5819596800, + "step": 88800, + "train_runtime": 42964.4051, + "train_tokens_per_second": 135451.586 + }, + { + "epoch": 0.889, + "grad_norm": 0.5574699640274048, + "learning_rate": 9.211259787266972e-06, + "loss": 1.1627, + "num_input_tokens_seen": 5826150400, + "step": 88900, + "train_runtime": 43011.9797, + "train_tokens_per_second": 135454.133 + }, + { + "epoch": 0.89, + "grad_norm": 0.5637386441230774, + "learning_rate": 9.047734965158966e-06, + "loss": 1.1659, + "num_input_tokens_seen": 5832704000, + "step": 89000, + "train_runtime": 43065.5789, + "train_tokens_per_second": 135437.724 + }, + { + "epoch": 0.891, + "grad_norm": 0.5420241951942444, + "learning_rate": 8.885629530794997e-06, + "loss": 1.1693, + "num_input_tokens_seen": 5839257600, + "step": 89100, + "train_runtime": 43113.8932, + "train_tokens_per_second": 135437.957 + }, + { + "epoch": 0.892, + "grad_norm": 0.5701260566711426, + "learning_rate": 8.724945116574983e-06, + "loss": 1.1592, + "num_input_tokens_seen": 5845811200, + "step": 89200, + "train_runtime": 43161.415, + "train_tokens_per_second": 135440.675 + }, + { + "epoch": 0.893, + "grad_norm": 0.5882892608642578, + "learning_rate": 8.565683340589185e-06, + "loss": 1.1601, + "num_input_tokens_seen": 5852364800, + "step": 89300, + "train_runtime": 43209.5307, + "train_tokens_per_second": 135441.527 + }, + { + "epoch": 0.894, + "grad_norm": 0.5708109736442566, + "learning_rate": 8.40784580660196e-06, + "loss": 1.1684, + "num_input_tokens_seen": 5858918400, + "step": 89400, + "train_runtime": 43257.3597, + "train_tokens_per_second": 135443.273 + }, + { + "epoch": 0.895, + "grad_norm": 0.5796698927879333, + "learning_rate": 8.251434104035465e-06, + "loss": 1.1753, + "num_input_tokens_seen": 5865472000, + "step": 89500, + "train_runtime": 43305.3116, + "train_tokens_per_second": 135444.632 + }, + { + "epoch": 0.896, + "grad_norm": 0.9602819681167603, + "learning_rate": 8.09644980795383e-06, + "loss": 1.1672, + "num_input_tokens_seen": 5872025600, + "step": 89600, + "train_runtime": 43360.788, + "train_tokens_per_second": 135422.484 + }, + { + "epoch": 0.897, + "grad_norm": 0.6962534189224243, + "learning_rate": 7.942894479047252e-06, + "loss": 1.1622, + "num_input_tokens_seen": 5878579200, + "step": 89700, + "train_runtime": 43407.8503, + "train_tokens_per_second": 135426.637 + }, + { + "epoch": 0.898, + "grad_norm": 0.6292552351951599, + "learning_rate": 7.790769663616098e-06, + "loss": 1.1632, + "num_input_tokens_seen": 5885132800, + "step": 89800, + "train_runtime": 43455.9389, + "train_tokens_per_second": 135427.584 + }, + { + "epoch": 0.899, + "grad_norm": 0.5883670449256897, + "learning_rate": 7.64007689355563e-06, + "loss": 1.1632, + "num_input_tokens_seen": 5891686400, + "step": 89900, + "train_runtime": 43504.2315, + "train_tokens_per_second": 135427.893 + }, + { + "epoch": 0.9, + "grad_norm": 0.8059070706367493, + "learning_rate": 7.490817686340361e-06, + "loss": 1.1728, + "num_input_tokens_seen": 5898240000, + "step": 90000, + "train_runtime": 43552.1457, + "train_tokens_per_second": 135429.378 + }, + { + "epoch": 0.901, + "grad_norm": 0.5949374437332153, + "learning_rate": 7.342993545008818e-06, + "loss": 1.1732, + "num_input_tokens_seen": 5904793600, + "step": 90100, + "train_runtime": 43599.6931, + "train_tokens_per_second": 135431.999 + }, + { + "epoch": 0.902, + "grad_norm": 0.6094557642936707, + "learning_rate": 7.196605958148505e-06, + "loss": 1.1713, + "num_input_tokens_seen": 5911347200, + "step": 90200, + "train_runtime": 43653.2541, + "train_tokens_per_second": 135415.957 + }, + { + "epoch": 0.903, + "grad_norm": 0.6275845170021057, + "learning_rate": 7.051656399880778e-06, + "loss": 1.1743, + "num_input_tokens_seen": 5917900800, + "step": 90300, + "train_runtime": 43702.1275, + "train_tokens_per_second": 135414.478 + }, + { + "epoch": 0.904, + "grad_norm": 0.7113337516784668, + "learning_rate": 6.9081463298460815e-06, + "loss": 1.162, + "num_input_tokens_seen": 5924454400, + "step": 90400, + "train_runtime": 43749.6704, + "train_tokens_per_second": 135417.121 + }, + { + "epoch": 0.905, + "grad_norm": 0.6237180233001709, + "learning_rate": 6.766077193189201e-06, + "loss": 1.159, + "num_input_tokens_seen": 5931008000, + "step": 90500, + "train_runtime": 43797.6522, + "train_tokens_per_second": 135418.4 + }, + { + "epoch": 0.906, + "grad_norm": 0.9803968667984009, + "learning_rate": 6.625450420544831e-06, + "loss": 1.1788, + "num_input_tokens_seen": 5937561600, + "step": 90600, + "train_runtime": 43846.1111, + "train_tokens_per_second": 135418.203 + }, + { + "epoch": 0.907, + "grad_norm": 0.5648267269134521, + "learning_rate": 6.486267428022967e-06, + "loss": 1.1581, + "num_input_tokens_seen": 5944115200, + "step": 90700, + "train_runtime": 43893.4216, + "train_tokens_per_second": 135421.55 + }, + { + "epoch": 0.908, + "grad_norm": 0.610898494720459, + "learning_rate": 6.34852961719477e-06, + "loss": 1.1557, + "num_input_tokens_seen": 5950668800, + "step": 90800, + "train_runtime": 43947.4481, + "train_tokens_per_second": 135404.194 + }, + { + "epoch": 0.909, + "grad_norm": 0.732876718044281, + "learning_rate": 6.212238375078521e-06, + "loss": 1.1683, + "num_input_tokens_seen": 5957222400, + "step": 90900, + "train_runtime": 43996.4271, + "train_tokens_per_second": 135402.413 + }, + { + "epoch": 0.91, + "grad_norm": 0.5793011784553528, + "learning_rate": 6.077395074125491e-06, + "loss": 1.1747, + "num_input_tokens_seen": 5963776000, + "step": 91000, + "train_runtime": 44044.5112, + "train_tokens_per_second": 135403.387 + }, + { + "epoch": 0.911, + "grad_norm": 0.6567527651786804, + "learning_rate": 5.944001072206212e-06, + "loss": 1.1594, + "num_input_tokens_seen": 5970329600, + "step": 91100, + "train_runtime": 44091.43, + "train_tokens_per_second": 135407.938 + }, + { + "epoch": 0.912, + "grad_norm": 0.6197203397750854, + "learning_rate": 5.812057712596807e-06, + "loss": 1.1504, + "num_input_tokens_seen": 5976883200, + "step": 91200, + "train_runtime": 44140.2623, + "train_tokens_per_second": 135406.608 + }, + { + "epoch": 0.913, + "grad_norm": 0.6190736889839172, + "learning_rate": 5.681566323965486e-06, + "loss": 1.1645, + "num_input_tokens_seen": 5983436800, + "step": 91300, + "train_runtime": 44194.3429, + "train_tokens_per_second": 135389.202 + }, + { + "epoch": 0.914, + "grad_norm": 0.5632036924362183, + "learning_rate": 5.552528220359004e-06, + "loss": 1.1691, + "num_input_tokens_seen": 5989990400, + "step": 91400, + "train_runtime": 44242.165, + "train_tokens_per_second": 135390.987 + }, + { + "epoch": 0.915, + "grad_norm": 0.6650084257125854, + "learning_rate": 5.424944701189704e-06, + "loss": 1.1587, + "num_input_tokens_seen": 5996544000, + "step": 91500, + "train_runtime": 44290.3253, + "train_tokens_per_second": 135391.735 + }, + { + "epoch": 0.916, + "grad_norm": 0.6665343642234802, + "learning_rate": 5.298817051222182e-06, + "loss": 1.16, + "num_input_tokens_seen": 6003097600, + "step": 91600, + "train_runtime": 44344.1461, + "train_tokens_per_second": 135375.199 + }, + { + "epoch": 0.917, + "grad_norm": 0.9934324026107788, + "learning_rate": 5.174146540560442e-06, + "loss": 1.186, + "num_input_tokens_seen": 6009651200, + "step": 91700, + "train_runtime": 44386.6411, + "train_tokens_per_second": 135393.241 + }, + { + "epoch": 0.918, + "grad_norm": 0.587840735912323, + "learning_rate": 5.050934424635195e-06, + "loss": 1.1685, + "num_input_tokens_seen": 6016204800, + "step": 91800, + "train_runtime": 44440.2445, + "train_tokens_per_second": 135377.401 + }, + { + "epoch": 0.919, + "grad_norm": 0.6308780312538147, + "learning_rate": 4.9291819441910465e-06, + "loss": 1.1593, + "num_input_tokens_seen": 6022758400, + "step": 91900, + "train_runtime": 44487.4748, + "train_tokens_per_second": 135380.99 + }, + { + "epoch": 0.92, + "grad_norm": 0.6875436305999756, + "learning_rate": 4.808890325274129e-06, + "loss": 1.1686, + "num_input_tokens_seen": 6029312000, + "step": 92000, + "train_runtime": 44535.4396, + "train_tokens_per_second": 135382.339 + }, + { + "epoch": 0.921, + "grad_norm": 0.6450539231300354, + "learning_rate": 4.690060779219723e-06, + "loss": 1.1669, + "num_input_tokens_seen": 6035865600, + "step": 92100, + "train_runtime": 44583.0204, + "train_tokens_per_second": 135384.852 + }, + { + "epoch": 0.922, + "grad_norm": 1.0118526220321655, + "learning_rate": 4.572694502640023e-06, + "loss": 1.1601, + "num_input_tokens_seen": 6042419200, + "step": 92200, + "train_runtime": 44632.4327, + "train_tokens_per_second": 135381.803 + }, + { + "epoch": 0.923, + "grad_norm": 0.5630050897598267, + "learning_rate": 4.456792677412141e-06, + "loss": 1.164, + "num_input_tokens_seen": 6048972800, + "step": 92300, + "train_runtime": 44685.5287, + "train_tokens_per_second": 135367.6 + }, + { + "epoch": 0.924, + "grad_norm": 0.5819036364555359, + "learning_rate": 4.342356470666153e-06, + "loss": 1.177, + "num_input_tokens_seen": 6055526400, + "step": 92400, + "train_runtime": 44733.1102, + "train_tokens_per_second": 135370.118 + }, + { + "epoch": 0.925, + "grad_norm": 0.5852016806602478, + "learning_rate": 4.22938703477344e-06, + "loss": 1.1846, + "num_input_tokens_seen": 6062080000, + "step": 92500, + "train_runtime": 44781.2518, + "train_tokens_per_second": 135370.937 + }, + { + "epoch": 0.926, + "grad_norm": 0.7466326355934143, + "learning_rate": 4.117885507334884e-06, + "loss": 1.1564, + "num_input_tokens_seen": 6068633600, + "step": 92600, + "train_runtime": 44829.0669, + "train_tokens_per_second": 135372.739 + }, + { + "epoch": 0.927, + "grad_norm": 0.7777779698371887, + "learning_rate": 4.007853011169687e-06, + "loss": 1.1654, + "num_input_tokens_seen": 6075187200, + "step": 92700, + "train_runtime": 44882.4041, + "train_tokens_per_second": 135357.883 + }, + { + "epoch": 0.928, + "grad_norm": 0.9159000515937805, + "learning_rate": 3.899290654303855e-06, + "loss": 1.1854, + "num_input_tokens_seen": 6081740800, + "step": 92800, + "train_runtime": 44929.6625, + "train_tokens_per_second": 135361.373 + }, + { + "epoch": 0.929, + "grad_norm": 0.5948230028152466, + "learning_rate": 3.7921995299591168e-06, + "loss": 1.1602, + "num_input_tokens_seen": 6088294400, + "step": 92900, + "train_runtime": 44977.4717, + "train_tokens_per_second": 135363.198 + }, + { + "epoch": 0.93, + "grad_norm": 0.5999124646186829, + "learning_rate": 3.686580716541887e-06, + "loss": 1.1484, + "num_input_tokens_seen": 6094848000, + "step": 93000, + "train_runtime": 45026.2424, + "train_tokens_per_second": 135362.128 + }, + { + "epoch": 0.931, + "grad_norm": 0.6015925407409668, + "learning_rate": 3.582435277632456e-06, + "loss": 1.1638, + "num_input_tokens_seen": 6101401600, + "step": 93100, + "train_runtime": 45073.6825, + "train_tokens_per_second": 135365.057 + }, + { + "epoch": 0.932, + "grad_norm": 0.5493288040161133, + "learning_rate": 3.479764261974266e-06, + "loss": 1.1644, + "num_input_tokens_seen": 6107955200, + "step": 93200, + "train_runtime": 45131.734, + "train_tokens_per_second": 135336.152 + }, + { + "epoch": 0.933, + "grad_norm": 0.5847836136817932, + "learning_rate": 3.3785687034632523e-06, + "loss": 1.1528, + "num_input_tokens_seen": 6114508800, + "step": 93300, + "train_runtime": 45180.4411, + "train_tokens_per_second": 135335.305 + }, + { + "epoch": 0.934, + "grad_norm": 0.6086737513542175, + "learning_rate": 3.2788496211376024e-06, + "loss": 1.1525, + "num_input_tokens_seen": 6121062400, + "step": 93400, + "train_runtime": 45228.3556, + "train_tokens_per_second": 135336.833 + }, + { + "epoch": 0.935, + "grad_norm": 0.6097891330718994, + "learning_rate": 3.180608019167363e-06, + "loss": 1.1681, + "num_input_tokens_seen": 6127616000, + "step": 93500, + "train_runtime": 45275.6501, + "train_tokens_per_second": 135340.21 + }, + { + "epoch": 0.936, + "grad_norm": 0.5980057716369629, + "learning_rate": 3.0838448868443665e-06, + "loss": 1.1603, + "num_input_tokens_seen": 6134169600, + "step": 93600, + "train_runtime": 45322.6488, + "train_tokens_per_second": 135344.464 + }, + { + "epoch": 0.937, + "grad_norm": 0.7306444048881531, + "learning_rate": 2.988561198572287e-06, + "loss": 1.1702, + "num_input_tokens_seen": 6140723200, + "step": 93700, + "train_runtime": 45376.9708, + "train_tokens_per_second": 135326.865 + }, + { + "epoch": 0.938, + "grad_norm": 0.9187434911727905, + "learning_rate": 2.8947579138567987e-06, + "loss": 1.1654, + "num_input_tokens_seen": 6147276800, + "step": 93800, + "train_runtime": 45427.1088, + "train_tokens_per_second": 135321.771 + }, + { + "epoch": 0.939, + "grad_norm": 0.6403319835662842, + "learning_rate": 2.8024359772959525e-06, + "loss": 1.1581, + "num_input_tokens_seen": 6153830400, + "step": 93900, + "train_runtime": 45475.34, + "train_tokens_per_second": 135322.362 + }, + { + "epoch": 0.94, + "grad_norm": 0.7088416218757629, + "learning_rate": 2.711596318570597e-06, + "loss": 1.1683, + "num_input_tokens_seen": 6160384000, + "step": 94000, + "train_runtime": 45523.8789, + "train_tokens_per_second": 135322.037 + }, + { + "epoch": 0.941, + "grad_norm": 0.6289553642272949, + "learning_rate": 2.6222398524351206e-06, + "loss": 1.1538, + "num_input_tokens_seen": 6166937600, + "step": 94100, + "train_runtime": 45571.6907, + "train_tokens_per_second": 135323.871 + }, + { + "epoch": 0.942, + "grad_norm": 0.8788822889328003, + "learning_rate": 2.5343674787081435e-06, + "loss": 1.1666, + "num_input_tokens_seen": 6173491200, + "step": 94200, + "train_runtime": 45621.3271, + "train_tokens_per_second": 135320.29 + }, + { + "epoch": 0.943, + "grad_norm": 0.575515866279602, + "learning_rate": 2.4479800822634565e-06, + "loss": 1.1685, + "num_input_tokens_seen": 6180044800, + "step": 94300, + "train_runtime": 45670.6842, + "train_tokens_per_second": 135317.543 + }, + { + "epoch": 0.944, + "grad_norm": 0.5740439891815186, + "learning_rate": 2.3630785330212286e-06, + "loss": 1.1588, + "num_input_tokens_seen": 6186598400, + "step": 94400, + "train_runtime": 45717.875, + "train_tokens_per_second": 135321.215 + }, + { + "epoch": 0.945, + "grad_norm": 0.6576538681983948, + "learning_rate": 2.2796636859390815e-06, + "loss": 1.1492, + "num_input_tokens_seen": 6193152000, + "step": 94500, + "train_runtime": 45766.0209, + "train_tokens_per_second": 135322.055 + }, + { + "epoch": 0.946, + "grad_norm": 0.5781713128089905, + "learning_rate": 2.197736381003612e-06, + "loss": 1.1725, + "num_input_tokens_seen": 6199705600, + "step": 94600, + "train_runtime": 45819.6687, + "train_tokens_per_second": 135306.644 + }, + { + "epoch": 0.947, + "grad_norm": 0.6812490820884705, + "learning_rate": 2.1172974432218826e-06, + "loss": 1.1509, + "num_input_tokens_seen": 6206259200, + "step": 94700, + "train_runtime": 45866.8187, + "train_tokens_per_second": 135310.435 + }, + { + "epoch": 0.948, + "grad_norm": 0.8884466886520386, + "learning_rate": 2.0383476826130786e-06, + "loss": 1.157, + "num_input_tokens_seen": 6212812800, + "step": 94800, + "train_runtime": 45915.7744, + "train_tokens_per_second": 135308.897 + }, + { + "epoch": 0.949, + "grad_norm": 0.6096293926239014, + "learning_rate": 1.96088789420043e-06, + "loss": 1.1609, + "num_input_tokens_seen": 6219366400, + "step": 94900, + "train_runtime": 45963.3824, + "train_tokens_per_second": 135311.33 + }, + { + "epoch": 0.95, + "grad_norm": 0.5762118697166443, + "learning_rate": 1.8849188580031539e-06, + "loss": 1.1621, + "num_input_tokens_seen": 6225920000, + "step": 95000, + "train_runtime": 46012.4538, + "train_tokens_per_second": 135309.454 + }, + { + "epoch": 0.951, + "grad_norm": 0.5296618938446045, + "learning_rate": 1.8104413390286066e-06, + "loss": 1.157, + "num_input_tokens_seen": 6232473600, + "step": 95100, + "train_runtime": 46059.2761, + "train_tokens_per_second": 135314.189 + }, + { + "epoch": 0.952, + "grad_norm": 0.6025533676147461, + "learning_rate": 1.7374560872645438e-06, + "loss": 1.1507, + "num_input_tokens_seen": 6239027200, + "step": 95200, + "train_runtime": 46113.68, + "train_tokens_per_second": 135296.667 + }, + { + "epoch": 0.953, + "grad_norm": 0.616148829460144, + "learning_rate": 1.6659638376716578e-06, + "loss": 1.1711, + "num_input_tokens_seen": 6245580800, + "step": 95300, + "train_runtime": 46162.0494, + "train_tokens_per_second": 135296.87 + }, + { + "epoch": 0.954, + "grad_norm": 0.6661262512207031, + "learning_rate": 1.5959653101761172e-06, + "loss": 1.1604, + "num_input_tokens_seen": 6252134400, + "step": 95400, + "train_runtime": 46208.848, + "train_tokens_per_second": 135301.672 + }, + { + "epoch": 0.955, + "grad_norm": 0.8173303604125977, + "learning_rate": 1.5274612096623063e-06, + "loss": 1.1498, + "num_input_tokens_seen": 6258688000, + "step": 95500, + "train_runtime": 46256.5159, + "train_tokens_per_second": 135303.922 + }, + { + "epoch": 0.956, + "grad_norm": 0.6189817786216736, + "learning_rate": 1.4604522259657635e-06, + "loss": 1.1602, + "num_input_tokens_seen": 6265241600, + "step": 95600, + "train_runtime": 46309.4141, + "train_tokens_per_second": 135290.885 + }, + { + "epoch": 0.957, + "grad_norm": 0.7523248195648193, + "learning_rate": 1.3949390338662047e-06, + "loss": 1.1655, + "num_input_tokens_seen": 6271795200, + "step": 95700, + "train_runtime": 46357.4405, + "train_tokens_per_second": 135292.094 + }, + { + "epoch": 0.958, + "grad_norm": 0.5935103297233582, + "learning_rate": 1.330922293080744e-06, + "loss": 1.1702, + "num_input_tokens_seen": 6278348800, + "step": 95800, + "train_runtime": 46406.0604, + "train_tokens_per_second": 135291.571 + }, + { + "epoch": 0.959, + "grad_norm": 0.8042653203010559, + "learning_rate": 1.2684026482572662e-06, + "loss": 1.1623, + "num_input_tokens_seen": 6284902400, + "step": 95900, + "train_runtime": 46454.8491, + "train_tokens_per_second": 135290.557 + }, + { + "epoch": 0.96, + "grad_norm": 0.5935735106468201, + "learning_rate": 1.2073807289678993e-06, + "loss": 1.1441, + "num_input_tokens_seen": 6291456000, + "step": 96000, + "train_runtime": 46502.688, + "train_tokens_per_second": 135292.308 + }, + { + "epoch": 0.961, + "grad_norm": 0.5718377828598022, + "learning_rate": 1.147857149702669e-06, + "loss": 1.1618, + "num_input_tokens_seen": 6298009600, + "step": 96100, + "train_runtime": 46555.2337, + "train_tokens_per_second": 135280.378 + }, + { + "epoch": 0.962, + "grad_norm": 0.6801995635032654, + "learning_rate": 1.0898325098633697e-06, + "loss": 1.1479, + "num_input_tokens_seen": 6304563200, + "step": 96200, + "train_runtime": 46603.2751, + "train_tokens_per_second": 135281.548 + }, + { + "epoch": 0.963, + "grad_norm": 0.5564619898796082, + "learning_rate": 1.0333073937575043e-06, + "loss": 1.1582, + "num_input_tokens_seen": 6311116800, + "step": 96300, + "train_runtime": 46652.5681, + "train_tokens_per_second": 135279.087 + }, + { + "epoch": 0.964, + "grad_norm": 0.6501321792602539, + "learning_rate": 9.782823705923204e-07, + "loss": 1.1617, + "num_input_tokens_seen": 6317670400, + "step": 96400, + "train_runtime": 46700.1727, + "train_tokens_per_second": 135281.521 + }, + { + "epoch": 0.965, + "grad_norm": 0.6728459596633911, + "learning_rate": 9.247579944692162e-07, + "loss": 1.1592, + "num_input_tokens_seen": 6324224000, + "step": 96500, + "train_runtime": 46748.7553, + "train_tokens_per_second": 135281.12 + }, + { + "epoch": 0.966, + "grad_norm": 0.5893784761428833, + "learning_rate": 8.72734804378078e-07, + "loss": 1.1691, + "num_input_tokens_seen": 6330777600, + "step": 96600, + "train_runtime": 46801.015, + "train_tokens_per_second": 135270.092 + }, + { + "epoch": 0.967, + "grad_norm": 0.8625339269638062, + "learning_rate": 8.222133241918172e-07, + "loss": 1.1518, + "num_input_tokens_seen": 6337331200, + "step": 96700, + "train_runtime": 46847.2237, + "train_tokens_per_second": 135276.559 + }, + { + "epoch": 0.968, + "grad_norm": 0.6501858830451965, + "learning_rate": 7.731940626612088e-07, + "loss": 1.1693, + "num_input_tokens_seen": 6343884800, + "step": 96800, + "train_runtime": 46895.3712, + "train_tokens_per_second": 135277.419 + }, + { + "epoch": 0.969, + "grad_norm": 0.6575475335121155, + "learning_rate": 7.256775134096615e-07, + "loss": 1.1552, + "num_input_tokens_seen": 6350438400, + "step": 96900, + "train_runtime": 46942.8491, + "train_tokens_per_second": 135280.208 + }, + { + "epoch": 0.97, + "grad_norm": 0.5287050604820251, + "learning_rate": 6.796641549283055e-07, + "loss": 1.1946, + "num_input_tokens_seen": 6356992000, + "step": 97000, + "train_runtime": 46991.8919, + "train_tokens_per_second": 135278.486 + }, + { + "epoch": 0.971, + "grad_norm": 0.568566083908081, + "learning_rate": 6.351544505711292e-07, + "loss": 1.1559, + "num_input_tokens_seen": 6363545600, + "step": 97100, + "train_runtime": 47040.0316, + "train_tokens_per_second": 135279.365 + }, + { + "epoch": 0.972, + "grad_norm": 0.9329395890235901, + "learning_rate": 5.921488485503833e-07, + "loss": 1.1603, + "num_input_tokens_seen": 6370099200, + "step": 97200, + "train_runtime": 47092.2725, + "train_tokens_per_second": 135268.46 + }, + { + "epoch": 0.973, + "grad_norm": 0.6256415843963623, + "learning_rate": 5.506477819319843e-07, + "loss": 1.1571, + "num_input_tokens_seen": 6376652800, + "step": 97300, + "train_runtime": 47139.4068, + "train_tokens_per_second": 135272.233 + }, + { + "epoch": 0.974, + "grad_norm": 0.7202081680297852, + "learning_rate": 5.106516686312345e-07, + "loss": 1.1638, + "num_input_tokens_seen": 6383206400, + "step": 97400, + "train_runtime": 47191.9059, + "train_tokens_per_second": 135260.619 + }, + { + "epoch": 0.975, + "grad_norm": 1.2700363397598267, + "learning_rate": 4.721609114085256e-07, + "loss": 1.1649, + "num_input_tokens_seen": 6389760000, + "step": 97500, + "train_runtime": 47240.0777, + "train_tokens_per_second": 135261.42 + }, + { + "epoch": 0.976, + "grad_norm": 0.5555500388145447, + "learning_rate": 4.3517589786539186e-07, + "loss": 1.1505, + "num_input_tokens_seen": 6396313600, + "step": 97600, + "train_runtime": 47287.972, + "train_tokens_per_second": 135263.013 + }, + { + "epoch": 0.977, + "grad_norm": 0.6499391198158264, + "learning_rate": 3.996970004404798e-07, + "loss": 1.153, + "num_input_tokens_seen": 6402867200, + "step": 97700, + "train_runtime": 47335.8726, + "train_tokens_per_second": 135264.586 + }, + { + "epoch": 0.978, + "grad_norm": 0.6353591084480286, + "learning_rate": 3.657245764058847e-07, + "loss": 1.1621, + "num_input_tokens_seen": 6409420800, + "step": 97800, + "train_runtime": 47382.5196, + "train_tokens_per_second": 135269.733 + }, + { + "epoch": 0.979, + "grad_norm": 0.62052321434021, + "learning_rate": 3.3325896786355334e-07, + "loss": 1.1539, + "num_input_tokens_seen": 6415974400, + "step": 97900, + "train_runtime": 47435.6023, + "train_tokens_per_second": 135256.518 + }, + { + "epoch": 0.98, + "grad_norm": 0.5979087352752686, + "learning_rate": 3.023005017418201e-07, + "loss": 1.1615, + "num_input_tokens_seen": 6422528000, + "step": 98000, + "train_runtime": 47484.0018, + "train_tokens_per_second": 135256.671 + }, + { + "epoch": 0.981, + "grad_norm": 1.0899096727371216, + "learning_rate": 2.7284948979205967e-07, + "loss": 1.166, + "num_input_tokens_seen": 6429081600, + "step": 98100, + "train_runtime": 47531.611, + "train_tokens_per_second": 135259.072 + }, + { + "epoch": 0.982, + "grad_norm": 0.6240010857582092, + "learning_rate": 2.449062285856729e-07, + "loss": 1.1565, + "num_input_tokens_seen": 6435635200, + "step": 98200, + "train_runtime": 47578.8884, + "train_tokens_per_second": 135262.412 + }, + { + "epoch": 0.983, + "grad_norm": 0.7941544651985168, + "learning_rate": 2.184709995109557e-07, + "loss": 1.1572, + "num_input_tokens_seen": 6442188800, + "step": 98300, + "train_runtime": 47627.3828, + "train_tokens_per_second": 135262.289 + }, + { + "epoch": 0.984, + "grad_norm": 0.5704551339149475, + "learning_rate": 1.9354406877038487e-07, + "loss": 1.1629, + "num_input_tokens_seen": 6448742400, + "step": 98400, + "train_runtime": 47679.6586, + "train_tokens_per_second": 135251.438 + }, + { + "epoch": 0.985, + "grad_norm": 0.5758212208747864, + "learning_rate": 1.7012568737788668e-07, + "loss": 1.1892, + "num_input_tokens_seen": 6455296000, + "step": 98500, + "train_runtime": 47728.7818, + "train_tokens_per_second": 135249.545 + }, + { + "epoch": 0.986, + "grad_norm": 0.5768951773643494, + "learning_rate": 1.4821609115630574e-07, + "loss": 1.1617, + "num_input_tokens_seen": 6461849600, + "step": 98600, + "train_runtime": 47775.3275, + "train_tokens_per_second": 135254.952 + }, + { + "epoch": 0.987, + "grad_norm": 0.5714033842086792, + "learning_rate": 1.278155007350068e-07, + "loss": 1.1712, + "num_input_tokens_seen": 6468403200, + "step": 98700, + "train_runtime": 47823.1467, + "train_tokens_per_second": 135256.746 + }, + { + "epoch": 0.988, + "grad_norm": 1.029975414276123, + "learning_rate": 1.089241215477099e-07, + "loss": 1.1621, + "num_input_tokens_seen": 6474956800, + "step": 98800, + "train_runtime": 47875.5087, + "train_tokens_per_second": 135245.702 + }, + { + "epoch": 0.989, + "grad_norm": 0.5554516315460205, + "learning_rate": 9.154214383042535e-08, + "loss": 1.1489, + "num_input_tokens_seen": 6481510400, + "step": 98900, + "train_runtime": 47923.8409, + "train_tokens_per_second": 135246.055 + }, + { + "epoch": 0.99, + "grad_norm": 0.6340943574905396, + "learning_rate": 7.566974261945524e-08, + "loss": 1.1721, + "num_input_tokens_seen": 6488064000, + "step": 99000, + "train_runtime": 47972.1937, + "train_tokens_per_second": 135246.348 + }, + { + "epoch": 0.991, + "grad_norm": 0.582399845123291, + "learning_rate": 6.13070777496949e-08, + "loss": 1.1497, + "num_input_tokens_seen": 6494617600, + "step": 99100, + "train_runtime": 48020.3976, + "train_tokens_per_second": 135247.06 + }, + { + "epoch": 0.992, + "grad_norm": 0.6133337020874023, + "learning_rate": 4.845429385303412e-08, + "loss": 1.1601, + "num_input_tokens_seen": 6501171200, + "step": 99200, + "train_runtime": 48068.6895, + "train_tokens_per_second": 135247.523 + }, + { + "epoch": 0.993, + "grad_norm": 0.5691381096839905, + "learning_rate": 3.711152035685838e-08, + "loss": 1.1571, + "num_input_tokens_seen": 6507724800, + "step": 99300, + "train_runtime": 48115.7967, + "train_tokens_per_second": 135251.315 + }, + { + "epoch": 0.994, + "grad_norm": 0.6613404750823975, + "learning_rate": 2.727887148278318e-08, + "loss": 1.1569, + "num_input_tokens_seen": 6514278400, + "step": 99400, + "train_runtime": 48169.6246, + "train_tokens_per_second": 135236.229 + }, + { + "epoch": 0.995, + "grad_norm": 0.5285235047340393, + "learning_rate": 1.8956446245455005e-08, + "loss": 1.1722, + "num_input_tokens_seen": 6520832000, + "step": 99500, + "train_runtime": 48217.4936, + "train_tokens_per_second": 135237.888 + }, + { + "epoch": 0.996, + "grad_norm": 0.8071156144142151, + "learning_rate": 1.2144328451618724e-08, + "loss": 1.1571, + "num_input_tokens_seen": 6527385600, + "step": 99600, + "train_runtime": 48264.7605, + "train_tokens_per_second": 135241.231 + }, + { + "epoch": 0.997, + "grad_norm": 0.5775815844535828, + "learning_rate": 6.84258669920168e-09, + "loss": 1.1634, + "num_input_tokens_seen": 6533939200, + "step": 99700, + "train_runtime": 48314.0709, + "train_tokens_per_second": 135238.846 + }, + { + "epoch": 0.998, + "grad_norm": 0.5299545526504517, + "learning_rate": 3.0512743767141524e-09, + "loss": 1.1563, + "num_input_tokens_seen": 6540492800, + "step": 99800, + "train_runtime": 48364.7142, + "train_tokens_per_second": 135232.74 + }, + { + "epoch": 0.999, + "grad_norm": 0.636650800704956, + "learning_rate": 7.70429662616534e-10, + "loss": 1.1653, + "num_input_tokens_seen": 6547046400, + "step": 99900, + "train_runtime": 48412.6126, + "train_tokens_per_second": 135234.313 + }, + { + "epoch": 1.0, + "grad_norm": 0.5705932974815369, + "learning_rate": 7.552498626495208e-14, + "loss": 1.1814, + "num_input_tokens_seen": 6553600000, + "step": 100000, + "train_runtime": 48460.0302, + "train_tokens_per_second": 135237.225 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 6553600000, + "step": 100000, + "total_flos": 1.23866185728e+17, + "train_loss": 1.241861473388672, + "train_runtime": 48460.2218, + "train_samples_per_second": 528.268, + "train_steps_per_second": 2.064 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 6553600000, + "num_train_epochs": 9223372036854775807, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.23866185728e+17, + "train_batch_size": 256, + "trial_name": null, + "trial_params": null +}